1/*	$NetBSD: journal.c,v 1.12 2024/02/21 22:52:06 christos Exp $	*/
2
3/*
4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5 *
6 * SPDX-License-Identifier: MPL-2.0
7 *
8 * This Source Code Form is subject to the terms of the Mozilla Public
9 * License, v. 2.0. If a copy of the MPL was not distributed with this
10 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11 *
12 * See the COPYRIGHT file distributed with this work for additional
13 * information regarding copyright ownership.
14 */
15
16#include <errno.h>
17#include <inttypes.h>
18#include <stdbool.h>
19#include <stdlib.h>
20#include <unistd.h>
21
22#include <isc/dir.h>
23#include <isc/file.h>
24#include <isc/mem.h>
25#include <isc/print.h>
26#include <isc/result.h>
27#include <isc/serial.h>
28#include <isc/stdio.h>
29#include <isc/string.h>
30#include <isc/util.h>
31
32#include <dns/compress.h>
33#include <dns/db.h>
34#include <dns/dbiterator.h>
35#include <dns/diff.h>
36#include <dns/fixedname.h>
37#include <dns/journal.h>
38#include <dns/log.h>
39#include <dns/rdataset.h>
40#include <dns/rdatasetiter.h>
41#include <dns/soa.h>
42
43/*! \file
44 * \brief Journaling.
45 *
46 * A journal file consists of
47 *
48 *   \li A fixed-size header of type journal_rawheader_t.
49 *
50 *   \li The index.  This is an unordered array of index entries
51 *     of type journal_rawpos_t giving the locations
52 *     of some arbitrary subset of the journal's addressable
53 *     transactions.  The index entries are used as hints to
54 *     speed up the process of locating a transaction with a given
55 *     serial number.  Unused index entries have an "offset"
56 *     field of zero.  The size of the index can vary between
57 *     journal files, but does not change during the lifetime
58 *     of a file.  The size can be zero.
59 *
60 *   \li The journal data.  This  consists of one or more transactions.
61 *     Each transaction begins with a transaction header of type
62 *     journal_rawxhdr_t.  The transaction header is followed by a
63 *     sequence of RRs, similar in structure to an IXFR difference
64 *     sequence (RFC1995).  That is, the pre-transaction SOA,
65 *     zero or more other deleted RRs, the post-transaction SOA,
66 *     and zero or more other added RRs.  Unlike in IXFR, each RR
67 *     is prefixed with a 32-bit length.
68 *
69 *     The journal data part grows as new transactions are
70 *     appended to the file.  Only those transactions
71 *     whose serial number is current-(2^31-1) to current
72 *     are considered "addressable" and may be pointed
73 *     to from the header or index.  They may be preceded
74 *     by old transactions that are no longer addressable,
75 *     and they may be followed by transactions that were
76 *     appended to the journal but never committed by updating
77 *     the "end" position in the header.  The latter will
78 *     be overwritten when new transactions are added.
79 */
80
81/**************************************************************************/
82/*
83 * Miscellaneous utilities.
84 */
85
/*%
 * The leading isc_log_write() arguments (log context, category and
 * module) shared by the messages logged from this file.
 */
#define JOURNAL_COMMON_LOGARGS \
	dns_lctx, DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL

/*% JOURNAL_COMMON_LOGARGS followed by the debug log level 'n'. */
#define JOURNAL_DEBUG_LOGARGS(n) JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(n)

/*%
 * Store 'code' in the local variable 'result' and jump to the local
 * 'failure' label unless it is ISC_R_SUCCESS.
 *
 * It would be non-sensical (or at least obtuse) to use FAIL() with an
 * ISC_R_SUCCESS code, but the test is there to keep the Solaris compiler
 * from complaining about "end-of-loop code not reached".
 */
#define FAIL(code)                           \
	do {                                 \
		result = (code);             \
		if (result != ISC_R_SUCCESS) \
			goto failure;        \
	} while (0)

/*%
 * Evaluate 'op' once, store its value in the local variable 'result'
 * and jump to the local 'failure' label if it is not ISC_R_SUCCESS.
 */
#define CHECK(op)                            \
	do {                                 \
		result = (op);               \
		if (result != ISC_R_SUCCESS) \
			goto failure;        \
	} while (0)

/*%
 * Bit in the 'flags' byte of the on-disk header that carries the
 * in-core 'serialset' value.
 */
#define JOURNAL_SERIALSET 0x01U

/* Forward declaration. */
static isc_result_t
index_to_disk(dns_journal_t *);
114
/*
 * Assemble a 32-bit unsigned value from the four bytes at 'p', which are
 * interpreted in big-endian (most significant byte first) order.
 */
static uint32_t
decode_uint32(unsigned char *p) {
	uint32_t value = 0;

	for (int i = 0; i < 4; i++) {
		value = (value << 8) | (uint32_t)p[i];
	}
	return (value);
}
120
/*
 * Store 'val' into the four bytes at 'p' in big-endian (most significant
 * byte first) order.
 */
static void
encode_uint32(uint32_t val, unsigned char *p) {
	for (int i = 3; i >= 0; i--) {
		p[i] = (uint8_t)(val & 0xffU);
		val >>= 8;
	}
}
128
129isc_result_t
130dns_db_createsoatuple(dns_db_t *db, dns_dbversion_t *ver, isc_mem_t *mctx,
131		      dns_diffop_t op, dns_difftuple_t **tp) {
132	isc_result_t result;
133	dns_dbnode_t *node;
134	dns_rdataset_t rdataset;
135	dns_rdata_t rdata = DNS_RDATA_INIT;
136	dns_fixedname_t fixed;
137	dns_name_t *zonename;
138
139	zonename = dns_fixedname_initname(&fixed);
140	dns_name_copy(dns_db_origin(db), zonename);
141
142	node = NULL;
143	result = dns_db_findnode(db, zonename, false, &node);
144	if (result != ISC_R_SUCCESS) {
145		goto nonode;
146	}
147
148	dns_rdataset_init(&rdataset);
149	result = dns_db_findrdataset(db, node, ver, dns_rdatatype_soa, 0,
150				     (isc_stdtime_t)0, &rdataset, NULL);
151	if (result != ISC_R_SUCCESS) {
152		goto freenode;
153	}
154
155	result = dns_rdataset_first(&rdataset);
156	if (result != ISC_R_SUCCESS) {
157		goto freenode;
158	}
159
160	dns_rdataset_current(&rdataset, &rdata);
161	dns_rdataset_getownercase(&rdataset, zonename);
162
163	result = dns_difftuple_create(mctx, op, zonename, rdataset.ttl, &rdata,
164				      tp);
165
166	dns_rdataset_disassociate(&rdataset);
167	dns_db_detachnode(db, &node);
168	return (result);
169
170freenode:
171	dns_db_detachnode(db, &node);
172nonode:
173	UNEXPECTED_ERROR("missing SOA");
174	return (result);
175}
176
177/* Journaling */
178
/*%
 * On-disk representation of a "pointer" to a journal entry.
 * These are used in the journal header to locate the beginning
 * and end of the journal, and in the journal index to locate
 * other transactions.
 *
 * Both fields are 32-bit values stored in big-endian order; the
 * in-core counterpart is journal_pos_t.
 */
typedef struct {
	unsigned char serial[4]; /*%< SOA serial before update. */
	/*
	 * XXXRTH  Should offset be 8 bytes?
	 * XXXDCL ... probably, since isc_offset_t is 8 bytes on many OSs.
	 * XXXAG  ... but we will not be able to seek >2G anyway on many
	 *            platforms as long as we are using fseek() rather
	 *            than lseek().
	 */
	unsigned char offset[4]; /*%< Offset from beginning of file. */
} journal_rawpos_t;
196
/*%
 * The header is of a fixed size, with some spare room for future
 * extensions.
 */
#define JOURNAL_HEADER_SIZE 64 /* Bytes. */

/*%
 * Transaction header layout versions: version 1 is read as a
 * journal_rawxhdr_ver1_t (no record count), version 2 as a
 * journal_rawxhdr_t.
 */
typedef enum {
	XHDR_VERSION1 = 1,
	XHDR_VERSION2 = 2,
} xhdr_version_t;
207
/*%
 * The on-disk representation of the journal header.
 * All numbers are stored in big-endian order.
 *
 * The union with 'pad' fixes the size of the type at
 * JOURNAL_HEADER_SIZE bytes regardless of how many fields 'h' uses.
 */
typedef union {
	struct {
		/*% File format version ID. */
		unsigned char format[16];
		/*% Position of the first addressable transaction */
		journal_rawpos_t begin;
		/*% Position of the next (yet nonexistent) transaction. */
		journal_rawpos_t end;
		/*% Number of index entries following the header. */
		unsigned char index_size[4];
		/*% Source serial number. */
		unsigned char sourceserial[4];
		/*% Flag bits; JOURNAL_SERIALSET is the only one used here. */
		unsigned char flags;
	} h;
	/* Pad the header to a fixed size. */
	unsigned char pad[JOURNAL_HEADER_SIZE];
} journal_rawheader_t;
229
/*%
 * The on-disk representation of the transaction header, version 2.
 * There is one of these at the beginning of each transaction.
 * Each field is a 32-bit value stored in big-endian order.
 */
typedef struct {
	unsigned char size[4];	  /*%< In bytes, excluding header. */
	unsigned char count[4];	  /*%< Number of records in transaction */
	unsigned char serial0[4]; /*%< SOA serial before update. */
	unsigned char serial1[4]; /*%< SOA serial after update. */
} journal_rawxhdr_t;
240
/*%
 * Old-style raw transaction header, version 1, used for backward
 * compatibility mode.  It is the version 2 header without the
 * 'count' field.
 */
typedef struct {
	unsigned char size[4];	  /*%< In bytes, excluding header. */
	unsigned char serial0[4]; /*%< SOA serial before update. */
	unsigned char serial1[4]; /*%< SOA serial after update. */
} journal_rawxhdr_ver1_t;
250
/*%
 * The on-disk representation of the RR header.
 * There is one of these at the beginning of each RR.
 * The size is a 32-bit value stored in big-endian order.
 */
typedef struct {
	unsigned char size[4]; /*%< In bytes, excluding header. */
} journal_rawrrhdr_t;
258
/*%
 * The in-core representation of a journal position, i.e. the decoded
 * form of a journal_rawpos_t.
 */
typedef struct {
	uint32_t serial;     /*%< SOA serial before update. */
	isc_offset_t offset; /*%< Offset from beginning of file. */
} journal_pos_t;

/*% A position with an offset of zero is treated as unused/invalid. */
#define POS_VALID(pos)	    ((pos).offset != 0)
/*% Mark 'pos' as unused by zeroing both its offset and its serial. */
#define POS_INVALIDATE(pos) ((pos).offset = 0, (pos).serial = 0)
269
/*%
 * The in-core representation of the journal header, i.e. the decoded
 * form of a journal_rawheader_t.
 */
typedef struct {
	unsigned char format[16]; /*%< File format version ID. */
	journal_pos_t begin;	  /*%< First addressable transaction. */
	journal_pos_t end;	  /*%< Next (yet nonexistent) transaction. */
	uint32_t index_size;	  /*%< Number of index entries. */
	uint32_t sourceserial;	  /*%< Source serial number. */
	bool serialset;		  /*%< JOURNAL_SERIALSET flag from disk. */
} journal_header_t;
278
/*%
 * The in-core representation of the transaction header.
 */
typedef struct {
	uint32_t size;	  /*%< In bytes, excluding header. */
	uint32_t count;	  /*%< Number of records; 0 when read from a
			   *   version 1 header */
	uint32_t serial0; /*%< SOA serial before update. */
	uint32_t serial1; /*%< SOA serial after update. */
} journal_xhdr_t;
288
/*%
 * The in-core representation of the RR header.
 */
typedef struct {
	uint32_t size; /*%< In bytes, excluding header. */
} journal_rrhdr_t;
295
/*%
 * Initial contents to store in the header of a newly created
 * journal file.
 *
 * The header starts with the magic string ";BIND LOG V9.2\n"
 * to identify the file as a BIND 9 journal file.  An ASCII
 * identification string is used rather than a binary magic
 * number to be consistent with BIND 8 (BIND 8 journal files
 * are ASCII text files).
 *
 * 'journal_header_ver1' carries the older magic string
 * ";BIND LOG V9\n"; it is written when a journal is created with
 * 'downgrade' set and is used to recognize old-format files on open.
 */

static journal_header_t journal_header_ver1 = {
	";BIND LOG V9\n", { 0, 0 }, { 0, 0 }, 0, 0, 0
};
static journal_header_t initial_journal_header = {
	";BIND LOG V9.2\n", { 0, 0 }, { 0, 0 }, 0, 0, 0
};

/*% True when the header's begin and end offsets coincide. */
#define JOURNAL_EMPTY(h) ((h)->begin.offset == (h)->end.offset)
315
/*%
 * State of an open journal.  journal_open() starts a journal in
 * JOURNAL_STATE_INVALID and leaves it in JOURNAL_STATE_WRITE or
 * JOURNAL_STATE_READ depending on whether it was opened writable.
 * NOTE(review): the TRANSACTION and INLINE states are not set in the
 * code visible here; presumably they are used while a transaction is
 * being written -- confirm against the rest of the file.
 */
typedef enum {
	JOURNAL_STATE_INVALID,
	JOURNAL_STATE_READ,
	JOURNAL_STATE_WRITE,
	JOURNAL_STATE_TRANSACTION,
	JOURNAL_STATE_INLINE
} journal_state_t;
323
/*%
 * An open journal file: the file handle and current offset, the decoded
 * header and index, and the per-transaction (writing) and per-iteration
 * (reading) state.
 */
struct dns_journal {
	unsigned int magic; /*%< JOUR */
	isc_mem_t *mctx;    /*%< Memory context */
	journal_state_t state;
	xhdr_version_t xhdr_version; /*%< Expected transaction header version */
	bool header_ver1;	     /*%< Transaction header compatibility
				      *   mode is allowed */
	bool recovered;		     /*%< A recoverable error was found
				      *   while reading the journal */
	char *filename;		     /*%< Journal file name */
	FILE *fp;		     /*%< File handle */
	isc_offset_t offset;	     /*%< Current file offset */
	journal_xhdr_t curxhdr;	     /*%< Current transaction header */
	journal_header_t header;     /*%< In-core journal header */
	unsigned char *rawindex;     /*%< In-core buffer for journal index
				      * in on-disk format */
	journal_pos_t *index;	     /*%< In-core journal index */

	/*% Current transaction state (when writing). */
	struct {
		unsigned int n_soa;   /*%< Number of SOAs seen */
		unsigned int n_rr;    /*%< Number of RRs to write */
		journal_pos_t pos[2]; /*%< Begin/end position */
	} x;

	/*% Iteration state (when reading). */
	struct {
		/* These define the part of the journal we iterate over. */
		journal_pos_t bpos; /*%< Position before first, */
		journal_pos_t cpos; /*%< before current, */
		journal_pos_t epos; /*%< and after last transaction */
		/* The rest is iterator state. */
		uint32_t current_serial; /*%< Current SOA serial */
		isc_buffer_t source;	 /*%< Data from disk */
		isc_buffer_t target;	 /*%< Data from _fromwire check */
		dns_decompress_t dctx;	 /*%< Dummy decompression ctx */
		dns_name_t name;	 /*%< Current domain name */
		dns_rdata_t rdata;	 /*%< Current rdata */
		uint32_t ttl;		 /*%< Current TTL */
		unsigned int xsize;	 /*%< Size of transaction data */
		unsigned int xpos;	 /*%< Current position in it */
		isc_result_t result;	 /*%< Result of last call */
	} it;
};

/*% Magic number stored in a valid journal object, and its validity test. */
#define DNS_JOURNAL_MAGIC    ISC_MAGIC('J', 'O', 'U', 'R')
#define DNS_JOURNAL_VALID(t) ISC_MAGIC_VALID(t, DNS_JOURNAL_MAGIC)
371
372static void
373journal_pos_decode(journal_rawpos_t *raw, journal_pos_t *cooked) {
374	cooked->serial = decode_uint32(raw->serial);
375	cooked->offset = decode_uint32(raw->offset);
376}
377
378static void
379journal_pos_encode(journal_rawpos_t *raw, journal_pos_t *cooked) {
380	encode_uint32(cooked->serial, raw->serial);
381	encode_uint32(cooked->offset, raw->offset);
382}
383
384static void
385journal_header_decode(journal_rawheader_t *raw, journal_header_t *cooked) {
386	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
387
388	memmove(cooked->format, raw->h.format, sizeof(cooked->format));
389	journal_pos_decode(&raw->h.begin, &cooked->begin);
390	journal_pos_decode(&raw->h.end, &cooked->end);
391	cooked->index_size = decode_uint32(raw->h.index_size);
392	cooked->sourceserial = decode_uint32(raw->h.sourceserial);
393	cooked->serialset = ((raw->h.flags & JOURNAL_SERIALSET) != 0);
394}
395
396static void
397journal_header_encode(journal_header_t *cooked, journal_rawheader_t *raw) {
398	unsigned char flags = 0;
399
400	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
401
402	memset(raw->pad, 0, sizeof(raw->pad));
403	memmove(raw->h.format, cooked->format, sizeof(raw->h.format));
404	journal_pos_encode(&raw->h.begin, &cooked->begin);
405	journal_pos_encode(&raw->h.end, &cooked->end);
406	encode_uint32(cooked->index_size, raw->h.index_size);
407	encode_uint32(cooked->sourceserial, raw->h.sourceserial);
408	if (cooked->serialset) {
409		flags |= JOURNAL_SERIALSET;
410	}
411	raw->h.flags = flags;
412}
413
414/*
415 * Journal file I/O subroutines, with error checking and reporting.
416 */
417static isc_result_t
418journal_seek(dns_journal_t *j, uint32_t offset) {
419	isc_result_t result;
420
421	result = isc_stdio_seek(j->fp, (off_t)offset, SEEK_SET);
422	if (result != ISC_R_SUCCESS) {
423		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
424			      "%s: seek: %s", j->filename,
425			      isc_result_totext(result));
426		return (ISC_R_UNEXPECTED);
427	}
428	j->offset = offset;
429	return (ISC_R_SUCCESS);
430}
431
432static isc_result_t
433journal_read(dns_journal_t *j, void *mem, size_t nbytes) {
434	isc_result_t result;
435
436	result = isc_stdio_read(mem, 1, nbytes, j->fp, NULL);
437	if (result != ISC_R_SUCCESS) {
438		if (result == ISC_R_EOF) {
439			return (ISC_R_NOMORE);
440		}
441		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
442			      "%s: read: %s", j->filename,
443			      isc_result_totext(result));
444		return (ISC_R_UNEXPECTED);
445	}
446	j->offset += (isc_offset_t)nbytes;
447	return (ISC_R_SUCCESS);
448}
449
450static isc_result_t
451journal_write(dns_journal_t *j, void *mem, size_t nbytes) {
452	isc_result_t result;
453
454	result = isc_stdio_write(mem, 1, nbytes, j->fp, NULL);
455	if (result != ISC_R_SUCCESS) {
456		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
457			      "%s: write: %s", j->filename,
458			      isc_result_totext(result));
459		return (ISC_R_UNEXPECTED);
460	}
461	j->offset += (isc_offset_t)nbytes;
462	return (ISC_R_SUCCESS);
463}
464
465static isc_result_t
466journal_fsync(dns_journal_t *j) {
467	isc_result_t result;
468
469	result = isc_stdio_flush(j->fp);
470	if (result != ISC_R_SUCCESS) {
471		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
472			      "%s: flush: %s", j->filename,
473			      isc_result_totext(result));
474		return (ISC_R_UNEXPECTED);
475	}
476	result = isc_stdio_sync(j->fp);
477	if (result != ISC_R_SUCCESS) {
478		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
479			      "%s: fsync: %s", j->filename,
480			      isc_result_totext(result));
481		return (ISC_R_UNEXPECTED);
482	}
483	return (ISC_R_SUCCESS);
484}
485
486/*
487 * Read/write a transaction header at the current file position.
488 */
489static isc_result_t
490journal_read_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr) {
491	isc_result_t result;
492
493	j->it.cpos.offset = j->offset;
494
495	switch (j->xhdr_version) {
496	case XHDR_VERSION1: {
497		journal_rawxhdr_ver1_t raw;
498		result = journal_read(j, &raw, sizeof(raw));
499		if (result != ISC_R_SUCCESS) {
500			return (result);
501		}
502		xhdr->size = decode_uint32(raw.size);
503		xhdr->count = 0;
504		xhdr->serial0 = decode_uint32(raw.serial0);
505		xhdr->serial1 = decode_uint32(raw.serial1);
506		j->curxhdr = *xhdr;
507		return (ISC_R_SUCCESS);
508	}
509
510	case XHDR_VERSION2: {
511		journal_rawxhdr_t raw;
512		result = journal_read(j, &raw, sizeof(raw));
513		if (result != ISC_R_SUCCESS) {
514			return (result);
515		}
516		xhdr->size = decode_uint32(raw.size);
517		xhdr->count = decode_uint32(raw.count);
518		xhdr->serial0 = decode_uint32(raw.serial0);
519		xhdr->serial1 = decode_uint32(raw.serial1);
520		j->curxhdr = *xhdr;
521		return (ISC_R_SUCCESS);
522	}
523
524	default:
525		return (ISC_R_NOTIMPLEMENTED);
526	}
527}
528
529static isc_result_t
530journal_write_xhdr(dns_journal_t *j, uint32_t size, uint32_t count,
531		   uint32_t serial0, uint32_t serial1) {
532	if (j->header_ver1) {
533		journal_rawxhdr_ver1_t raw;
534		encode_uint32(size, raw.size);
535		encode_uint32(serial0, raw.serial0);
536		encode_uint32(serial1, raw.serial1);
537		return (journal_write(j, &raw, sizeof(raw)));
538	} else {
539		journal_rawxhdr_t raw;
540		encode_uint32(size, raw.size);
541		encode_uint32(count, raw.count);
542		encode_uint32(serial0, raw.serial0);
543		encode_uint32(serial1, raw.serial1);
544		return (journal_write(j, &raw, sizeof(raw)));
545	}
546}
547
548/*
549 * Read an RR header at the current file position.
550 */
551
552static isc_result_t
553journal_read_rrhdr(dns_journal_t *j, journal_rrhdr_t *rrhdr) {
554	journal_rawrrhdr_t raw;
555	isc_result_t result;
556
557	result = journal_read(j, &raw, sizeof(raw));
558	if (result != ISC_R_SUCCESS) {
559		return (result);
560	}
561	rrhdr->size = decode_uint32(raw.size);
562	return (ISC_R_SUCCESS);
563}
564
565static isc_result_t
566journal_file_create(isc_mem_t *mctx, bool downgrade, const char *filename) {
567	FILE *fp = NULL;
568	isc_result_t result;
569	journal_header_t header;
570	journal_rawheader_t rawheader;
571	int index_size = 56; /* XXX configurable */
572	int size;
573	void *mem = NULL; /* Memory for temporary index image. */
574
575	INSIST(sizeof(journal_rawheader_t) == JOURNAL_HEADER_SIZE);
576
577	result = isc_stdio_open(filename, "wb", &fp);
578	if (result != ISC_R_SUCCESS) {
579		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
580			      "%s: create: %s", filename,
581			      isc_result_totext(result));
582		return (ISC_R_UNEXPECTED);
583	}
584
585	if (downgrade) {
586		header = journal_header_ver1;
587	} else {
588		header = initial_journal_header;
589	}
590	header.index_size = index_size;
591	journal_header_encode(&header, &rawheader);
592
593	size = sizeof(journal_rawheader_t) +
594	       index_size * sizeof(journal_rawpos_t);
595
596	mem = isc_mem_get(mctx, size);
597	memset(mem, 0, size);
598	memmove(mem, &rawheader, sizeof(rawheader));
599
600	result = isc_stdio_write(mem, 1, (size_t)size, fp, NULL);
601	if (result != ISC_R_SUCCESS) {
602		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
603			      "%s: write: %s", filename,
604			      isc_result_totext(result));
605		(void)isc_stdio_close(fp);
606		(void)isc_file_remove(filename);
607		isc_mem_put(mctx, mem, size);
608		return (ISC_R_UNEXPECTED);
609	}
610	isc_mem_put(mctx, mem, size);
611
612	result = isc_stdio_close(fp);
613	if (result != ISC_R_SUCCESS) {
614		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
615			      "%s: close: %s", filename,
616			      isc_result_totext(result));
617		(void)isc_file_remove(filename);
618		return (ISC_R_UNEXPECTED);
619	}
620
621	return (ISC_R_SUCCESS);
622}
623
624static isc_result_t
625journal_open(isc_mem_t *mctx, const char *filename, bool writable, bool create,
626	     bool downgrade, dns_journal_t **journalp) {
627	FILE *fp = NULL;
628	isc_result_t result;
629	journal_rawheader_t rawheader;
630	dns_journal_t *j;
631
632	REQUIRE(journalp != NULL && *journalp == NULL);
633
634	j = isc_mem_get(mctx, sizeof(*j));
635	*j = (dns_journal_t){ .state = JOURNAL_STATE_INVALID,
636			      .filename = isc_mem_strdup(mctx, filename),
637			      .xhdr_version = XHDR_VERSION2 };
638	isc_mem_attach(mctx, &j->mctx);
639
640	result = isc_stdio_open(j->filename, writable ? "rb+" : "rb", &fp);
641	if (result == ISC_R_FILENOTFOUND) {
642		if (create) {
643			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(1),
644				      "journal file %s does not exist, "
645				      "creating it",
646				      j->filename);
647			CHECK(journal_file_create(mctx, downgrade, filename));
648			/*
649			 * Retry.
650			 */
651			result = isc_stdio_open(j->filename, "rb+", &fp);
652		} else {
653			FAIL(ISC_R_NOTFOUND);
654		}
655	}
656	if (result != ISC_R_SUCCESS) {
657		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
658			      "%s: open: %s", j->filename,
659			      isc_result_totext(result));
660		FAIL(ISC_R_UNEXPECTED);
661	}
662
663	j->fp = fp;
664
665	/*
666	 * Set magic early so that seek/read can succeed.
667	 */
668	j->magic = DNS_JOURNAL_MAGIC;
669
670	CHECK(journal_seek(j, 0));
671	CHECK(journal_read(j, &rawheader, sizeof(rawheader)));
672
673	if (memcmp(rawheader.h.format, journal_header_ver1.format,
674		   sizeof(journal_header_ver1.format)) == 0)
675	{
676		/*
677		 * The file header says it's the old format, but it
678		 * still might have the new xhdr format because we
679		 * forgot to change the format string when we introduced
680		 * the new xhdr.  When we first try to read it, we assume
681		 * it uses the new xhdr format. If that fails, we'll be
682		 * called a second time with compat set to true, in which
683		 * case we can lower xhdr_version to 1 if we find a
684		 * corrupt transaction.
685		 */
686		j->header_ver1 = true;
687	} else if (memcmp(rawheader.h.format, initial_journal_header.format,
688			  sizeof(initial_journal_header.format)) == 0)
689	{
690		/*
691		 * File header says this is format version 2; all
692		 * transactions have to match.
693		 */
694		j->header_ver1 = false;
695	} else {
696		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
697			      "%s: journal format not recognized", j->filename);
698		FAIL(ISC_R_UNEXPECTED);
699	}
700	journal_header_decode(&rawheader, &j->header);
701
702	/*
703	 * If there is an index, read the raw index into a dynamically
704	 * allocated buffer and then convert it into a cooked index.
705	 */
706	if (j->header.index_size != 0) {
707		unsigned int i;
708		unsigned int rawbytes;
709		unsigned char *p;
710
711		rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
712		j->rawindex = isc_mem_get(mctx, rawbytes);
713
714		CHECK(journal_read(j, j->rawindex, rawbytes));
715
716		j->index = isc_mem_get(mctx, j->header.index_size *
717						     sizeof(journal_pos_t));
718
719		p = j->rawindex;
720		for (i = 0; i < j->header.index_size; i++) {
721			j->index[i].serial = decode_uint32(p);
722			p += 4;
723			j->index[i].offset = decode_uint32(p);
724			p += 4;
725		}
726		INSIST(p == j->rawindex + rawbytes);
727	}
728	j->offset = -1; /* Invalid, must seek explicitly. */
729
730	/*
731	 * Initialize the iterator.
732	 */
733	dns_name_init(&j->it.name, NULL);
734	dns_rdata_init(&j->it.rdata);
735
736	/*
737	 * Set up empty initial buffers for unchecked and checked
738	 * wire format RR data.  They will be reallocated
739	 * later.
740	 */
741	isc_buffer_init(&j->it.source, NULL, 0);
742	isc_buffer_init(&j->it.target, NULL, 0);
743	dns_decompress_init(&j->it.dctx, -1, DNS_DECOMPRESS_NONE);
744
745	j->state = writable ? JOURNAL_STATE_WRITE : JOURNAL_STATE_READ;
746
747	*journalp = j;
748	return (ISC_R_SUCCESS);
749
750failure:
751	j->magic = 0;
752	if (j->rawindex != NULL) {
753		isc_mem_put(j->mctx, j->rawindex,
754			    j->header.index_size * sizeof(journal_rawpos_t));
755	}
756	if (j->index != NULL) {
757		isc_mem_put(j->mctx, j->index,
758			    j->header.index_size * sizeof(journal_pos_t));
759	}
760	isc_mem_free(j->mctx, j->filename);
761	if (j->fp != NULL) {
762		(void)isc_stdio_close(j->fp);
763	}
764	isc_mem_putanddetach(&j->mctx, j, sizeof(*j));
765	return (result);
766}
767
768isc_result_t
769dns_journal_open(isc_mem_t *mctx, const char *filename, unsigned int mode,
770		 dns_journal_t **journalp) {
771	isc_result_t result;
772	size_t namelen;
773	char backup[1024];
774	bool writable, create;
775
776	create = ((mode & DNS_JOURNAL_CREATE) != 0);
777	writable = ((mode & (DNS_JOURNAL_WRITE | DNS_JOURNAL_CREATE)) != 0);
778
779	result = journal_open(mctx, filename, writable, create, false,
780			      journalp);
781	if (result == ISC_R_NOTFOUND) {
782		namelen = strlen(filename);
783		if (namelen > 4U && strcmp(filename + namelen - 4, ".jnl") == 0)
784		{
785			namelen -= 4;
786		}
787
788		result = snprintf(backup, sizeof(backup), "%.*s.jbk",
789				  (int)namelen, filename);
790		if (result >= sizeof(backup)) {
791			return (ISC_R_NOSPACE);
792		}
793		result = journal_open(mctx, backup, writable, writable, false,
794				      journalp);
795	}
796	return (result);
797}
798
799/*
800 * A comparison function defining the sorting order for
801 * entries in the IXFR-style journal file.
802 *
803 * The IXFR format requires that deletions are sorted before
804 * additions, and within either one, SOA records are sorted
805 * before others.
806 *
807 * Also sort the non-SOA records by type as a courtesy to the
808 * server receiving the IXFR - it may help reduce the amount of
809 * rdataset merging it has to do.
810 */
811static int
812ixfr_order(const void *av, const void *bv) {
813	dns_difftuple_t const *const *ap = av;
814	dns_difftuple_t const *const *bp = bv;
815	dns_difftuple_t const *a = *ap;
816	dns_difftuple_t const *b = *bp;
817	int r;
818	int bop = 0, aop = 0;
819
820	switch (a->op) {
821	case DNS_DIFFOP_DEL:
822	case DNS_DIFFOP_DELRESIGN:
823		aop = 1;
824		break;
825	case DNS_DIFFOP_ADD:
826	case DNS_DIFFOP_ADDRESIGN:
827		aop = 0;
828		break;
829	default:
830		UNREACHABLE();
831	}
832
833	switch (b->op) {
834	case DNS_DIFFOP_DEL:
835	case DNS_DIFFOP_DELRESIGN:
836		bop = 1;
837		break;
838	case DNS_DIFFOP_ADD:
839	case DNS_DIFFOP_ADDRESIGN:
840		bop = 0;
841		break;
842	default:
843		UNREACHABLE();
844	}
845
846	r = bop - aop;
847	if (r != 0) {
848		return (r);
849	}
850
851	r = (b->rdata.type == dns_rdatatype_soa) -
852	    (a->rdata.type == dns_rdatatype_soa);
853	if (r != 0) {
854		return (r);
855	}
856
857	r = (a->rdata.type - b->rdata.type);
858	return (r);
859}
860
/*
 * Called by journal_next() for journals whose file header is the old
 * format, after the transaction header at 'offset' has been read into
 * '*xhdr'.  'serial' is the serial the transaction is expected to start
 * from.  If the header looks as though it was read with the wrong
 * layout, switch j->xhdr_version and re-read it, or rearrange the
 * fields of '*xhdr' in place; every such correction sets j->recovered.
 *
 * Returns ISC_R_SUCCESS, or the result of a failed seek/read.  No
 * consistency check is done here; the caller validates the serials
 * afterwards.
 */
static isc_result_t
maybe_fixup_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr, uint32_t serial,
		 isc_offset_t offset) {
	isc_result_t result = ISC_R_SUCCESS;

	/*
	 * Handle mixture of version 1 and version 2
	 * transaction headers in a version 1 journal.
	 */
	if ((xhdr->serial0 != serial ||
	     isc_serial_le(xhdr->serial1, xhdr->serial0)))
	{
		/*
		 * A version 2 header read with the version 1 layout puts
		 * its 'serial0' into our 'serial1', so a match there
		 * suggests the header is really version 2.
		 */
		if (j->xhdr_version == XHDR_VERSION1 && xhdr->serial1 == serial)
		{
			isc_log_write(
				JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(3),
				"%s: XHDR_VERSION1 -> XHDR_VERSION2 at %u",
				j->filename, serial);
			j->xhdr_version = XHDR_VERSION2;
			CHECK(journal_seek(j, offset));
			CHECK(journal_read_xhdr(j, xhdr));
			j->recovered = true;
		/*
		 * Conversely, a version 1 header read with the version 2
		 * layout puts its 'serial0' into our 'count'.
		 */
		} else if (j->xhdr_version == XHDR_VERSION2 &&
			   xhdr->count == serial)
		{
			isc_log_write(
				JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(3),
				"%s: XHDR_VERSION2 -> XHDR_VERSION1 at %u",
				j->filename, serial);
			j->xhdr_version = XHDR_VERSION1;
			CHECK(journal_seek(j, offset));
			CHECK(journal_read_xhdr(j, xhdr));
			j->recovered = true;
		}
	}

	/*
	 * Handle <size, serial0, serial1, 0> transaction header.
	 */
	if (j->xhdr_version == XHDR_VERSION1) {
		uint32_t value;

		/*
		 * Peek at the four bytes following the 12-byte version 1
		 * header.  Only zero/non-zero matters, so the raw bytes
		 * are not byte-swapped.
		 */
		CHECK(journal_read(j, &value, sizeof(value)));
		if (value != 0L) {
			/* Not a padded header: step back to just after it. */
			CHECK(journal_seek(j, offset + 12));
		} else {
			/*
			 * The extra zero word is left consumed and the
			 * header is treated as 16 bytes (version 2 size)
			 * from here on.
			 */
			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(3),
				      "%s: XHDR_VERSION1 count zero at %u",
				      j->filename, serial);
			j->xhdr_version = XHDR_VERSION2;
			j->recovered = true;
		}
	} else if (j->xhdr_version == XHDR_VERSION2 && xhdr->count == serial &&
		   xhdr->serial1 == 0U &&
		   isc_serial_gt(xhdr->serial0, xhdr->count))
	{
		/*
		 * The same <size, serial0, serial1, 0> header read with
		 * the version 2 layout: move each value to the field it
		 * belongs in and use a record count of zero.
		 */
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(3),
			      "%s: XHDR_VERSION2 count zero at %u", j->filename,
			      serial);
		xhdr->serial1 = xhdr->serial0;
		xhdr->serial0 = xhdr->count;
		xhdr->count = 0;
		j->recovered = true;
	}

failure:
	return (result);
}
929
930/*
931 * Advance '*pos' to the next journal transaction.
932 *
933 * Requires:
934 *	*pos refers to a valid journal transaction.
935 *
936 * Ensures:
937 *	When ISC_R_SUCCESS is returned,
938 *	*pos refers to the next journal transaction.
939 *
940 * Returns one of:
941 *
942 *    ISC_R_SUCCESS
943 *    ISC_R_NOMORE 	*pos pointed at the last transaction
944 *    Other results due to file errors are possible.
945 */
946static isc_result_t
947journal_next(dns_journal_t *j, journal_pos_t *pos) {
948	isc_result_t result;
949	journal_xhdr_t xhdr;
950	size_t hdrsize;
951
952	REQUIRE(DNS_JOURNAL_VALID(j));
953
954	result = journal_seek(j, pos->offset);
955	if (result != ISC_R_SUCCESS) {
956		return (result);
957	}
958
959	if (pos->serial == j->header.end.serial) {
960		return (ISC_R_NOMORE);
961	}
962
963	/*
964	 * Read the header of the current transaction.
965	 * This will return ISC_R_NOMORE if we are at EOF.
966	 */
967	result = journal_read_xhdr(j, &xhdr);
968	if (result != ISC_R_SUCCESS) {
969		return (result);
970	}
971
972	if (j->header_ver1) {
973		CHECK(maybe_fixup_xhdr(j, &xhdr, pos->serial, pos->offset));
974	}
975
976	/*
977	 * Check serial number consistency.
978	 */
979	if (xhdr.serial0 != pos->serial ||
980	    isc_serial_le(xhdr.serial1, xhdr.serial0))
981	{
982		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
983			      "%s: journal file corrupt: "
984			      "expected serial %u, got %u",
985			      j->filename, pos->serial, xhdr.serial0);
986		return (ISC_R_UNEXPECTED);
987	}
988
989	/*
990	 * Check for offset wraparound.
991	 */
992	hdrsize = (j->xhdr_version == XHDR_VERSION2)
993			  ? sizeof(journal_rawxhdr_t)
994			  : sizeof(journal_rawxhdr_ver1_t);
995
996	if ((isc_offset_t)(pos->offset + hdrsize + xhdr.size) < pos->offset) {
997		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
998			      "%s: offset too large", j->filename);
999		return (ISC_R_UNEXPECTED);
1000	}
1001
1002	pos->offset += hdrsize + xhdr.size;
1003	pos->serial = xhdr.serial1;
1004	return (ISC_R_SUCCESS);
1005
1006failure:
1007	return (result);
1008}
1009
1010/*
1011 * If the index of the journal 'j' contains an entry "better"
1012 * than '*best_guess', replace '*best_guess' with it.
1013 *
1014 * "Better" means having a serial number closer to 'serial'
1015 * but not greater than 'serial'.
1016 */
1017static void
1018index_find(dns_journal_t *j, uint32_t serial, journal_pos_t *best_guess) {
1019	unsigned int i;
1020	if (j->index == NULL) {
1021		return;
1022	}
1023	for (i = 0; i < j->header.index_size; i++) {
1024		if (POS_VALID(j->index[i]) &&
1025		    DNS_SERIAL_GE(serial, j->index[i].serial) &&
1026		    DNS_SERIAL_GT(j->index[i].serial, best_guess->serial))
1027		{
1028			*best_guess = j->index[i];
1029		}
1030	}
1031}
1032
1033/*
1034 * Add a new index entry.  If there is no room, make room by removing
1035 * the odd-numbered entries and compacting the others into the first
1036 * half of the index.  This decimates old index entries exponentially
1037 * over time, so that the index always contains a much larger fraction
1038 * of recent serial numbers than of old ones.  This is deliberate -
1039 * most index searches are for outgoing IXFR, and IXFR tends to request
1040 * recent versions more often than old ones.
1041 */
1042static void
1043index_add(dns_journal_t *j, journal_pos_t *pos) {
1044	unsigned int i;
1045
1046	if (j->index == NULL) {
1047		return;
1048	}
1049
1050	/*
1051	 * Search for a vacant position.
1052	 */
1053	for (i = 0; i < j->header.index_size; i++) {
1054		if (!POS_VALID(j->index[i])) {
1055			break;
1056		}
1057	}
1058	if (i == j->header.index_size) {
1059		unsigned int k = 0;
1060		/*
1061		 * Found no vacant position.  Make some room.
1062		 */
1063		for (i = 0; i < j->header.index_size; i += 2) {
1064			j->index[k++] = j->index[i];
1065		}
1066		i = k; /* 'i' identifies the first vacant position. */
1067		while (k < j->header.index_size) {
1068			POS_INVALIDATE(j->index[k]);
1069			k++;
1070		}
1071	}
1072	INSIST(i < j->header.index_size);
1073	INSIST(!POS_VALID(j->index[i]));
1074
1075	/*
1076	 * Store the new index entry.
1077	 */
1078	j->index[i] = *pos;
1079}
1080
1081/*
1082 * Invalidate any existing index entries that could become
1083 * ambiguous when a new transaction with number 'serial' is added.
1084 */
1085static void
1086index_invalidate(dns_journal_t *j, uint32_t serial) {
1087	unsigned int i;
1088	if (j->index == NULL) {
1089		return;
1090	}
1091	for (i = 0; i < j->header.index_size; i++) {
1092		if (!DNS_SERIAL_GT(serial, j->index[i].serial)) {
1093			POS_INVALIDATE(j->index[i]);
1094		}
1095	}
1096}
1097
1098/*
1099 * Try to find a transaction with initial serial number 'serial'
1100 * in the journal 'j'.
1101 *
1102 * If found, store its position at '*pos' and return ISC_R_SUCCESS.
1103 *
1104 * If 'serial' is current (= the ending serial number of the
1105 * last transaction in the journal), set '*pos' to
1106 * the position immediately following the last transaction and
1107 * return ISC_R_SUCCESS.
1108 *
1109 * If 'serial' is within the range of addressable serial numbers
1110 * covered by the journal but that particular serial number is missing
1111 * (from the journal, not just from the index), return ISC_R_NOTFOUND.
1112 *
1113 * If 'serial' is outside the range of addressable serial numbers
1114 * covered by the journal, return ISC_R_RANGE.
1115 *
1116 */
1117static isc_result_t
1118journal_find(dns_journal_t *j, uint32_t serial, journal_pos_t *pos) {
1119	isc_result_t result;
1120	journal_pos_t current_pos;
1121
1122	REQUIRE(DNS_JOURNAL_VALID(j));
1123
1124	if (DNS_SERIAL_GT(j->header.begin.serial, serial)) {
1125		return (ISC_R_RANGE);
1126	}
1127	if (DNS_SERIAL_GT(serial, j->header.end.serial)) {
1128		return (ISC_R_RANGE);
1129	}
1130	if (serial == j->header.end.serial) {
1131		*pos = j->header.end;
1132		return (ISC_R_SUCCESS);
1133	}
1134
1135	current_pos = j->header.begin;
1136	index_find(j, serial, &current_pos);
1137
1138	while (current_pos.serial != serial) {
1139		if (DNS_SERIAL_GT(current_pos.serial, serial)) {
1140			return (ISC_R_NOTFOUND);
1141		}
1142		result = journal_next(j, &current_pos);
1143		if (result != ISC_R_SUCCESS) {
1144			return (result);
1145		}
1146	}
1147	*pos = current_pos;
1148	return (ISC_R_SUCCESS);
1149}
1150
1151isc_result_t
1152dns_journal_begin_transaction(dns_journal_t *j) {
1153	uint32_t offset;
1154	isc_result_t result;
1155
1156	REQUIRE(DNS_JOURNAL_VALID(j));
1157	REQUIRE(j->state == JOURNAL_STATE_WRITE ||
1158		j->state == JOURNAL_STATE_INLINE);
1159
1160	/*
1161	 * Find the file offset where the new transaction should
1162	 * be written, and seek there.
1163	 */
1164	if (JOURNAL_EMPTY(&j->header)) {
1165		offset = sizeof(journal_rawheader_t) +
1166			 j->header.index_size * sizeof(journal_rawpos_t);
1167	} else {
1168		offset = j->header.end.offset;
1169	}
1170	j->x.pos[0].offset = offset;
1171	j->x.pos[1].offset = offset; /* Initial value, will be incremented. */
1172	j->x.n_soa = 0;
1173
1174	CHECK(journal_seek(j, offset));
1175
1176	/*
1177	 * Write a dummy transaction header of all zeroes to reserve
1178	 * space.  It will be filled in when the transaction is
1179	 * finished.
1180	 */
1181	CHECK(journal_write_xhdr(j, 0, 0, 0, 0));
1182	j->x.pos[1].offset = j->offset;
1183
1184	j->state = JOURNAL_STATE_TRANSACTION;
1185	result = ISC_R_SUCCESS;
1186failure:
1187	return (result);
1188}
1189
1190isc_result_t
1191dns_journal_writediff(dns_journal_t *j, dns_diff_t *diff) {
1192	dns_difftuple_t *t;
1193	isc_buffer_t buffer;
1194	void *mem = NULL;
1195	uint64_t size = 0;
1196	uint32_t rrcount = 0;
1197	isc_result_t result;
1198	isc_region_t used;
1199
1200	REQUIRE(DNS_DIFF_VALID(diff));
1201	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
1202
1203	isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "writing to journal");
1204	(void)dns_diff_print(diff, NULL);
1205
1206	/*
1207	 * Pass 1: determine the buffer size needed, and
1208	 * keep track of SOA serial numbers.
1209	 */
1210	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
1211	     t = ISC_LIST_NEXT(t, link))
1212	{
1213		if (t->rdata.type == dns_rdatatype_soa) {
1214			if (j->x.n_soa < 2) {
1215				j->x.pos[j->x.n_soa].serial =
1216					dns_soa_getserial(&t->rdata);
1217			}
1218			j->x.n_soa++;
1219		}
1220		size += sizeof(journal_rawrrhdr_t);
1221		size += t->name.length; /* XXX should have access macro? */
1222		size += 10;
1223		size += t->rdata.length;
1224	}
1225
1226	if (size >= DNS_JOURNAL_SIZE_MAX) {
1227		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1228			      "dns_journal_writediff: %s: journal entry "
1229			      "too big to be stored: %" PRIu64 " bytes",
1230			      j->filename, size);
1231		return (ISC_R_NOSPACE);
1232	}
1233
1234	mem = isc_mem_get(j->mctx, size);
1235
1236	isc_buffer_init(&buffer, mem, size);
1237
1238	/*
1239	 * Pass 2.  Write RRs to buffer.
1240	 */
1241	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
1242	     t = ISC_LIST_NEXT(t, link))
1243	{
1244		/*
1245		 * Write the RR header.
1246		 */
1247		isc_buffer_putuint32(&buffer,
1248				     t->name.length + 10 + t->rdata.length);
1249		/*
1250		 * Write the owner name, RR header, and RR data.
1251		 */
1252		isc_buffer_putmem(&buffer, t->name.ndata, t->name.length);
1253		isc_buffer_putuint16(&buffer, t->rdata.type);
1254		isc_buffer_putuint16(&buffer, t->rdata.rdclass);
1255		isc_buffer_putuint32(&buffer, t->ttl);
1256		INSIST(t->rdata.length < 65536);
1257		isc_buffer_putuint16(&buffer, (uint16_t)t->rdata.length);
1258		INSIST(isc_buffer_availablelength(&buffer) >= t->rdata.length);
1259		isc_buffer_putmem(&buffer, t->rdata.data, t->rdata.length);
1260
1261		rrcount++;
1262	}
1263
1264	isc_buffer_usedregion(&buffer, &used);
1265	INSIST(used.length == size);
1266
1267	j->x.pos[1].offset += used.length;
1268	j->x.n_rr = rrcount;
1269
1270	/*
1271	 * Write the buffer contents to the journal file.
1272	 */
1273	CHECK(journal_write(j, used.base, used.length));
1274
1275	result = ISC_R_SUCCESS;
1276
1277failure:
1278	if (mem != NULL) {
1279		isc_mem_put(j->mctx, mem, size);
1280	}
1281	return (result);
1282}
1283
1284isc_result_t
1285dns_journal_commit(dns_journal_t *j) {
1286	isc_result_t result;
1287	journal_rawheader_t rawheader;
1288	uint64_t total;
1289
1290	REQUIRE(DNS_JOURNAL_VALID(j));
1291	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION ||
1292		j->state == JOURNAL_STATE_INLINE);
1293
1294	/*
1295	 * Just write out a updated header.
1296	 */
1297	if (j->state == JOURNAL_STATE_INLINE) {
1298		CHECK(journal_fsync(j));
1299		journal_header_encode(&j->header, &rawheader);
1300		CHECK(journal_seek(j, 0));
1301		CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
1302		CHECK(journal_fsync(j));
1303		j->state = JOURNAL_STATE_WRITE;
1304		return (ISC_R_SUCCESS);
1305	}
1306
1307	/*
1308	 * Perform some basic consistency checks.
1309	 */
1310	if (j->x.n_soa != 2) {
1311		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1312			      "%s: malformed transaction: %d SOAs", j->filename,
1313			      j->x.n_soa);
1314		return (ISC_R_UNEXPECTED);
1315	}
1316	if (!DNS_SERIAL_GT(j->x.pos[1].serial, j->x.pos[0].serial)) {
1317		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1318			      "%s: malformed transaction: serial number "
1319			      "did not increase",
1320			      j->filename);
1321		return (ISC_R_UNEXPECTED);
1322	}
1323	if (!JOURNAL_EMPTY(&j->header)) {
1324		if (j->x.pos[0].serial != j->header.end.serial) {
1325			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1326				      "malformed transaction: "
1327				      "%s last serial %u != "
1328				      "transaction first serial %u",
1329				      j->filename, j->header.end.serial,
1330				      j->x.pos[0].serial);
1331			return (ISC_R_UNEXPECTED);
1332		}
1333	}
1334
1335	/*
1336	 * We currently don't support huge journal entries.
1337	 */
1338	total = j->x.pos[1].offset - j->x.pos[0].offset;
1339	if (total >= DNS_JOURNAL_SIZE_MAX) {
1340		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1341			      "transaction too big to be stored in journal: "
1342			      "%" PRIu64 "b (max is %" PRIu64 "b)",
1343			      total, (uint64_t)DNS_JOURNAL_SIZE_MAX);
1344		return (ISC_R_UNEXPECTED);
1345	}
1346
1347	/*
1348	 * Some old journal entries may become non-addressable
1349	 * when we increment the current serial number.  Purge them
1350	 * by stepping header.begin forward to the first addressable
1351	 * transaction.  Also purge them from the index.
1352	 */
1353	if (!JOURNAL_EMPTY(&j->header)) {
1354		while (!DNS_SERIAL_GT(j->x.pos[1].serial,
1355				      j->header.begin.serial))
1356		{
1357			CHECK(journal_next(j, &j->header.begin));
1358		}
1359		index_invalidate(j, j->x.pos[1].serial);
1360	}
1361#ifdef notyet
1362	if (DNS_SERIAL_GT(last_dumped_serial, j->x.pos[1].serial)) {
1363		force_dump(...);
1364	}
1365#endif /* ifdef notyet */
1366
1367	/*
1368	 * Commit the transaction data to stable storage.
1369	 */
1370	CHECK(journal_fsync(j));
1371
1372	if (j->state == JOURNAL_STATE_TRANSACTION) {
1373		isc_offset_t offset;
1374		offset = (j->x.pos[1].offset - j->x.pos[0].offset) -
1375			 (j->header_ver1 ? sizeof(journal_rawxhdr_ver1_t)
1376					 : sizeof(journal_rawxhdr_t));
1377		/*
1378		 * Update the transaction header.
1379		 */
1380		CHECK(journal_seek(j, j->x.pos[0].offset));
1381		CHECK(journal_write_xhdr(j, offset, j->x.n_rr,
1382					 j->x.pos[0].serial,
1383					 j->x.pos[1].serial));
1384	}
1385
1386	/*
1387	 * Update the journal header.
1388	 */
1389	if (JOURNAL_EMPTY(&j->header)) {
1390		j->header.begin = j->x.pos[0];
1391	}
1392	j->header.end = j->x.pos[1];
1393	journal_header_encode(&j->header, &rawheader);
1394	CHECK(journal_seek(j, 0));
1395	CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
1396
1397	/*
1398	 * Update the index.
1399	 */
1400	index_add(j, &j->x.pos[0]);
1401
1402	/*
1403	 * Convert the index into on-disk format and write
1404	 * it to disk.
1405	 */
1406	CHECK(index_to_disk(j));
1407
1408	/*
1409	 * Commit the header to stable storage.
1410	 */
1411	CHECK(journal_fsync(j));
1412
1413	/*
1414	 * We no longer have a transaction open.
1415	 */
1416	j->state = JOURNAL_STATE_WRITE;
1417
1418	result = ISC_R_SUCCESS;
1419
1420failure:
1421	return (result);
1422}
1423
1424isc_result_t
1425dns_journal_write_transaction(dns_journal_t *j, dns_diff_t *diff) {
1426	isc_result_t result;
1427
1428	CHECK(dns_diff_sort(diff, ixfr_order));
1429	CHECK(dns_journal_begin_transaction(j));
1430	CHECK(dns_journal_writediff(j, diff));
1431	CHECK(dns_journal_commit(j));
1432	result = ISC_R_SUCCESS;
1433failure:
1434	return (result);
1435}
1436
1437void
1438dns_journal_destroy(dns_journal_t **journalp) {
1439	dns_journal_t *j = NULL;
1440
1441	REQUIRE(journalp != NULL);
1442	REQUIRE(DNS_JOURNAL_VALID(*journalp));
1443
1444	j = *journalp;
1445	*journalp = NULL;
1446
1447	j->it.result = ISC_R_FAILURE;
1448	dns_name_invalidate(&j->it.name);
1449	dns_decompress_invalidate(&j->it.dctx);
1450	if (j->rawindex != NULL) {
1451		isc_mem_put(j->mctx, j->rawindex,
1452			    j->header.index_size * sizeof(journal_rawpos_t));
1453	}
1454	if (j->index != NULL) {
1455		isc_mem_put(j->mctx, j->index,
1456			    j->header.index_size * sizeof(journal_pos_t));
1457	}
1458	if (j->it.target.base != NULL) {
1459		isc_mem_put(j->mctx, j->it.target.base, j->it.target.length);
1460	}
1461	if (j->it.source.base != NULL) {
1462		isc_mem_put(j->mctx, j->it.source.base, j->it.source.length);
1463	}
1464	if (j->filename != NULL) {
1465		isc_mem_free(j->mctx, j->filename);
1466	}
1467	if (j->fp != NULL) {
1468		(void)isc_stdio_close(j->fp);
1469	}
1470	j->magic = 0;
1471	isc_mem_putanddetach(&j->mctx, j, sizeof(*j));
1472}
1473
1474/*
1475 * Roll the open journal 'j' into the database 'db'.
1476 * A new database version will be created.
1477 */
1478
1479/* XXX Share code with incoming IXFR? */
1480
1481isc_result_t
1482dns_journal_rollforward(dns_journal_t *j, dns_db_t *db, unsigned int options) {
1483	isc_buffer_t source; /* Transaction data from disk */
1484	isc_buffer_t target; /* Ditto after _fromwire check */
1485	uint32_t db_serial;  /* Database SOA serial */
1486	uint32_t end_serial; /* Last journal SOA serial */
1487	isc_result_t result;
1488	dns_dbversion_t *ver = NULL;
1489	journal_pos_t pos;
1490	dns_diff_t diff;
1491	unsigned int n_soa = 0;
1492	unsigned int n_put = 0;
1493	dns_diffop_t op;
1494
1495	REQUIRE(DNS_JOURNAL_VALID(j));
1496	REQUIRE(DNS_DB_VALID(db));
1497
1498	dns_diff_init(j->mctx, &diff);
1499
1500	/*
1501	 * Set up empty initial buffers for unchecked and checked
1502	 * wire format transaction data.  They will be reallocated
1503	 * later.
1504	 */
1505	isc_buffer_init(&source, NULL, 0);
1506	isc_buffer_init(&target, NULL, 0);
1507
1508	/*
1509	 * Create the new database version.
1510	 */
1511	CHECK(dns_db_newversion(db, &ver));
1512
1513	/*
1514	 * Get the current database SOA serial number.
1515	 */
1516	CHECK(dns_db_getsoaserial(db, ver, &db_serial));
1517
1518	/*
1519	 * Locate a journal entry for the current database serial.
1520	 */
1521	CHECK(journal_find(j, db_serial, &pos));
1522
1523	end_serial = dns_journal_last_serial(j);
1524
1525	/*
1526	 * If we're reading a version 1 file, scan all the transactions
1527	 * to see if the journal needs rewriting: if any outdated
1528	 * transaction headers are found, j->recovered will be set.
1529	 */
1530	if (j->header_ver1) {
1531		uint32_t start_serial = dns_journal_first_serial(j);
1532
1533		CHECK(dns_journal_iter_init(j, start_serial, db_serial, NULL));
1534		for (result = dns_journal_first_rr(j); result == ISC_R_SUCCESS;
1535		     result = dns_journal_next_rr(j))
1536		{
1537			continue;
1538		}
1539	}
1540
1541	if (db_serial == end_serial) {
1542		CHECK(DNS_R_UPTODATE);
1543	}
1544
1545	CHECK(dns_journal_iter_init(j, db_serial, end_serial, NULL));
1546	for (result = dns_journal_first_rr(j); result == ISC_R_SUCCESS;
1547	     result = dns_journal_next_rr(j))
1548	{
1549		dns_name_t *name = NULL;
1550		dns_rdata_t *rdata = NULL;
1551		dns_difftuple_t *tuple = NULL;
1552		uint32_t ttl;
1553
1554		dns_journal_current_rr(j, &name, &ttl, &rdata);
1555
1556		if (rdata->type == dns_rdatatype_soa) {
1557			n_soa++;
1558			if (n_soa == 2) {
1559				db_serial = j->it.current_serial;
1560			}
1561		}
1562
1563		if (n_soa == 3) {
1564			n_soa = 1;
1565		}
1566		if (n_soa == 0) {
1567			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1568				      "%s: journal file corrupt: missing "
1569				      "initial SOA",
1570				      j->filename);
1571			FAIL(ISC_R_UNEXPECTED);
1572		}
1573		if ((options & DNS_JOURNALOPT_RESIGN) != 0) {
1574			op = (n_soa == 1) ? DNS_DIFFOP_DELRESIGN
1575					  : DNS_DIFFOP_ADDRESIGN;
1576		} else {
1577			op = (n_soa == 1) ? DNS_DIFFOP_DEL : DNS_DIFFOP_ADD;
1578		}
1579
1580		CHECK(dns_difftuple_create(diff.mctx, op, name, ttl, rdata,
1581					   &tuple));
1582		dns_diff_append(&diff, &tuple);
1583
1584		if (++n_put > 100) {
1585			isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1586				      "%s: applying diff to database (%u)",
1587				      j->filename, db_serial);
1588			(void)dns_diff_print(&diff, NULL);
1589			CHECK(dns_diff_apply(&diff, db, ver));
1590			dns_diff_clear(&diff);
1591			n_put = 0;
1592		}
1593	}
1594	if (result == ISC_R_NOMORE) {
1595		result = ISC_R_SUCCESS;
1596	}
1597	CHECK(result);
1598
1599	if (n_put != 0) {
1600		isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1601			      "%s: applying final diff to database (%u)",
1602			      j->filename, db_serial);
1603		(void)dns_diff_print(&diff, NULL);
1604		CHECK(dns_diff_apply(&diff, db, ver));
1605		dns_diff_clear(&diff);
1606	}
1607
1608failure:
1609	if (ver != NULL) {
1610		dns_db_closeversion(db, &ver,
1611				    result == ISC_R_SUCCESS ? true : false);
1612	}
1613
1614	if (source.base != NULL) {
1615		isc_mem_put(j->mctx, source.base, source.length);
1616	}
1617	if (target.base != NULL) {
1618		isc_mem_put(j->mctx, target.base, target.length);
1619	}
1620
1621	dns_diff_clear(&diff);
1622
1623	INSIST(ver == NULL);
1624
1625	return (result);
1626}
1627
1628isc_result_t
1629dns_journal_print(isc_mem_t *mctx, uint32_t flags, const char *filename,
1630		  FILE *file) {
1631	dns_journal_t *j = NULL;
1632	isc_buffer_t source;   /* Transaction data from disk */
1633	isc_buffer_t target;   /* Ditto after _fromwire check */
1634	uint32_t start_serial; /* Database SOA serial */
1635	uint32_t end_serial;   /* Last journal SOA serial */
1636	isc_result_t result;
1637	dns_diff_t diff;
1638	unsigned int n_soa = 0;
1639	unsigned int n_put = 0;
1640	bool printxhdr = ((flags & DNS_JOURNAL_PRINTXHDR) != 0);
1641
1642	REQUIRE(filename != NULL);
1643
1644	result = dns_journal_open(mctx, filename, DNS_JOURNAL_READ, &j);
1645	if (result == ISC_R_NOTFOUND) {
1646		isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file");
1647		return (DNS_R_NOJOURNAL);
1648	} else if (result != ISC_R_SUCCESS) {
1649		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1650			      "journal open failure: %s: %s",
1651			      isc_result_totext(result), filename);
1652		return (result);
1653	}
1654
1655	if (printxhdr) {
1656		fprintf(file, "Journal format = %sHeader version = %d\n",
1657			j->header.format + 1, j->header_ver1 ? 1 : 2);
1658		fprintf(file, "Start serial = %u\n", j->header.begin.serial);
1659		fprintf(file, "End serial = %u\n", j->header.end.serial);
1660		fprintf(file, "Index (size = %u):\n", j->header.index_size);
1661		for (uint32_t i = 0; i < j->header.index_size; i++) {
1662			if (j->index[i].offset == 0) {
1663				fputc('\n', file);
1664				break;
1665			}
1666			fprintf(file, "%lld", (long long)j->index[i].offset);
1667			fputc((i + 1) % 8 == 0 ? '\n' : ' ', file);
1668		}
1669	}
1670	if (j->header.serialset) {
1671		fprintf(file, "Source serial = %u\n", j->header.sourceserial);
1672	}
1673	dns_diff_init(j->mctx, &diff);
1674
1675	/*
1676	 * Set up empty initial buffers for unchecked and checked
1677	 * wire format transaction data.  They will be reallocated
1678	 * later.
1679	 */
1680	isc_buffer_init(&source, NULL, 0);
1681	isc_buffer_init(&target, NULL, 0);
1682
1683	start_serial = dns_journal_first_serial(j);
1684	end_serial = dns_journal_last_serial(j);
1685
1686	CHECK(dns_journal_iter_init(j, start_serial, end_serial, NULL));
1687
1688	for (result = dns_journal_first_rr(j); result == ISC_R_SUCCESS;
1689	     result = dns_journal_next_rr(j))
1690	{
1691		dns_name_t *name = NULL;
1692		dns_rdata_t *rdata = NULL;
1693		dns_difftuple_t *tuple = NULL;
1694		static uint32_t i = 0;
1695		bool print = false;
1696		uint32_t ttl;
1697
1698		dns_journal_current_rr(j, &name, &ttl, &rdata);
1699
1700		if (rdata->type == dns_rdatatype_soa) {
1701			n_soa++;
1702			if (n_soa == 3) {
1703				n_soa = 1;
1704			}
1705			if (n_soa == 1) {
1706				print = printxhdr;
1707			}
1708		}
1709		if (n_soa == 0) {
1710			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1711				      "%s: journal file corrupt: missing "
1712				      "initial SOA",
1713				      j->filename);
1714			FAIL(ISC_R_UNEXPECTED);
1715		}
1716
1717		if (print) {
1718			fprintf(file,
1719				"Transaction: version %d offset %lld size %u "
1720				"rrcount %u start %u end %u\n",
1721				j->xhdr_version, (long long)j->it.cpos.offset,
1722				j->curxhdr.size, j->curxhdr.count,
1723				j->curxhdr.serial0, j->curxhdr.serial1);
1724			if (j->it.cpos.offset > j->index[i].offset) {
1725				fprintf(file,
1726					"ERROR: Offset mismatch, "
1727					"expected %lld\n",
1728					(long long)j->index[i].offset);
1729			} else if (j->it.cpos.offset == j->index[i].offset) {
1730				i++;
1731			}
1732		}
1733		CHECK(dns_difftuple_create(
1734			diff.mctx, n_soa == 1 ? DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1735			name, ttl, rdata, &tuple));
1736		dns_diff_append(&diff, &tuple);
1737
1738		if (++n_put > 100 || printxhdr) {
1739			result = dns_diff_print(&diff, file);
1740			dns_diff_clear(&diff);
1741			n_put = 0;
1742			if (result != ISC_R_SUCCESS) {
1743				break;
1744			}
1745		}
1746	}
1747	if (result == ISC_R_NOMORE) {
1748		result = ISC_R_SUCCESS;
1749	}
1750	CHECK(result);
1751
1752	if (n_put != 0) {
1753		result = dns_diff_print(&diff, file);
1754		dns_diff_clear(&diff);
1755	}
1756	goto cleanup;
1757
1758failure:
1759	isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1760		      "%s: cannot print: journal file corrupt", j->filename);
1761
1762cleanup:
1763	if (source.base != NULL) {
1764		isc_mem_put(j->mctx, source.base, source.length);
1765	}
1766	if (target.base != NULL) {
1767		isc_mem_put(j->mctx, target.base, target.length);
1768	}
1769
1770	dns_diff_clear(&diff);
1771	dns_journal_destroy(&j);
1772
1773	return (result);
1774}
1775
1776/**************************************************************************/
1777/*
1778 * Miscellaneous accessors.
1779 */
1780bool
1781dns_journal_empty(dns_journal_t *j) {
1782	return (JOURNAL_EMPTY(&j->header));
1783}
1784
1785bool
1786dns_journal_recovered(dns_journal_t *j) {
1787	return (j->recovered);
1788}
1789
1790uint32_t
1791dns_journal_first_serial(dns_journal_t *j) {
1792	return (j->header.begin.serial);
1793}
1794
1795uint32_t
1796dns_journal_last_serial(dns_journal_t *j) {
1797	return (j->header.end.serial);
1798}
1799
1800void
1801dns_journal_set_sourceserial(dns_journal_t *j, uint32_t sourceserial) {
1802	REQUIRE(j->state == JOURNAL_STATE_WRITE ||
1803		j->state == JOURNAL_STATE_INLINE ||
1804		j->state == JOURNAL_STATE_TRANSACTION);
1805
1806	j->header.sourceserial = sourceserial;
1807	j->header.serialset = true;
1808	if (j->state == JOURNAL_STATE_WRITE) {
1809		j->state = JOURNAL_STATE_INLINE;
1810	}
1811}
1812
1813bool
1814dns_journal_get_sourceserial(dns_journal_t *j, uint32_t *sourceserial) {
1815	REQUIRE(sourceserial != NULL);
1816
1817	if (!j->header.serialset) {
1818		return (false);
1819	}
1820	*sourceserial = j->header.sourceserial;
1821	return (true);
1822}
1823
1824/**************************************************************************/
1825/*
1826 * Iteration support.
1827 *
 * When serving an outgoing IXFR, we transmit a part of the journal starting
1829 * at the serial number in the IXFR request and ending at the serial
1830 * number that is current when the IXFR request arrives.  The ending
1831 * serial number is not necessarily at the end of the journal:
1832 * the journal may grow while the IXFR is in progress, but we stop
1833 * when we reach the serial number that was current when the IXFR started.
1834 */
1835
1836static isc_result_t
1837read_one_rr(dns_journal_t *j);
1838
1839/*
1840 * Make sure the buffer 'b' is has at least 'size' bytes
1841 * allocated, and clear it.
1842 *
1843 * Requires:
1844 *	Either b->base is NULL, or it points to b->length bytes of memory
1845 *	previously allocated by isc_mem_get().
1846 */
1847
1848static isc_result_t
1849size_buffer(isc_mem_t *mctx, isc_buffer_t *b, unsigned size) {
1850	if (b->length < size) {
1851		void *mem = isc_mem_get(mctx, size);
1852		if (mem == NULL) {
1853			return (ISC_R_NOMEMORY);
1854		}
1855		if (b->base != NULL) {
1856			isc_mem_put(mctx, b->base, b->length);
1857		}
1858		b->base = mem;
1859		b->length = size;
1860	}
1861	isc_buffer_clear(b);
1862	return (ISC_R_SUCCESS);
1863}
1864
1865isc_result_t
1866dns_journal_iter_init(dns_journal_t *j, uint32_t begin_serial,
1867		      uint32_t end_serial, size_t *xfrsizep) {
1868	isc_result_t result;
1869
1870	CHECK(journal_find(j, begin_serial, &j->it.bpos));
1871	INSIST(j->it.bpos.serial == begin_serial);
1872
1873	CHECK(journal_find(j, end_serial, &j->it.epos));
1874	INSIST(j->it.epos.serial == end_serial);
1875
1876	if (xfrsizep != NULL) {
1877		journal_pos_t pos = j->it.bpos;
1878		journal_xhdr_t xhdr;
1879		uint64_t size = 0;
1880		uint32_t count = 0;
1881
1882		/*
1883		 * We already know the beginning and ending serial
1884		 * numbers are in the journal. Scan through them,
1885		 * adding up sizes and RR counts so we can calculate
1886		 * the IXFR size.
1887		 */
1888		do {
1889			CHECK(journal_seek(j, pos.offset));
1890			CHECK(journal_read_xhdr(j, &xhdr));
1891
1892			if (j->header_ver1) {
1893				CHECK(maybe_fixup_xhdr(j, &xhdr, pos.serial,
1894						       pos.offset));
1895			}
1896
1897			/*
1898			 * Check that xhdr is consistent.
1899			 */
1900			if (xhdr.serial0 != pos.serial ||
1901			    isc_serial_le(xhdr.serial1, xhdr.serial0))
1902			{
1903				CHECK(ISC_R_UNEXPECTED);
1904			}
1905
1906			size += xhdr.size;
1907			count += xhdr.count;
1908
1909			result = journal_next(j, &pos);
1910			if (result == ISC_R_NOMORE) {
1911				result = ISC_R_SUCCESS;
1912			}
1913			CHECK(result);
1914		} while (pos.serial != end_serial);
1915
1916		/*
1917		 * For each RR, subtract the length of the RR header,
1918		 * as this would not be present in IXFR messages.
1919		 * (We don't need to worry about the transaction header
1920		 * because that was already excluded from xdr.size.)
1921		 */
1922		*xfrsizep = size - (count * sizeof(journal_rawrrhdr_t));
1923	}
1924
1925	result = ISC_R_SUCCESS;
1926failure:
1927	j->it.result = result;
1928	return (j->it.result);
1929}
1930
1931isc_result_t
1932dns_journal_first_rr(dns_journal_t *j) {
1933	isc_result_t result;
1934
1935	/*
1936	 * Seek to the beginning of the first transaction we are
1937	 * interested in.
1938	 */
1939	CHECK(journal_seek(j, j->it.bpos.offset));
1940	j->it.current_serial = j->it.bpos.serial;
1941
1942	j->it.xsize = 0; /* We have no transaction data yet... */
1943	j->it.xpos = 0;	 /* ...and haven't used any of it. */
1944
1945	return (read_one_rr(j));
1946
1947failure:
1948	return (result);
1949}
1950
1951static isc_result_t
1952read_one_rr(dns_journal_t *j) {
1953	isc_result_t result;
1954	dns_rdatatype_t rdtype;
1955	dns_rdataclass_t rdclass;
1956	unsigned int rdlen;
1957	uint32_t ttl;
1958	journal_xhdr_t xhdr;
1959	journal_rrhdr_t rrhdr;
1960	dns_journal_t save = *j;
1961
1962	if (j->offset > j->it.epos.offset) {
1963		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1964			      "%s: journal corrupt: possible integer overflow",
1965			      j->filename);
1966		return (ISC_R_UNEXPECTED);
1967	}
1968	if (j->offset == j->it.epos.offset) {
1969		return (ISC_R_NOMORE);
1970	}
1971	if (j->it.xpos == j->it.xsize) {
1972		/*
1973		 * We are at a transaction boundary.
1974		 * Read another transaction header.
1975		 */
1976		CHECK(journal_read_xhdr(j, &xhdr));
1977		if (xhdr.size == 0) {
1978			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1979				      "%s: journal corrupt: empty transaction",
1980				      j->filename);
1981			FAIL(ISC_R_UNEXPECTED);
1982		}
1983
1984		if (j->header_ver1) {
1985			CHECK(maybe_fixup_xhdr(j, &xhdr, j->it.current_serial,
1986					       save.offset));
1987		}
1988
1989		if (xhdr.serial0 != j->it.current_serial ||
1990		    isc_serial_le(xhdr.serial1, xhdr.serial0))
1991		{
1992			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1993				      "%s: journal file corrupt: "
1994				      "expected serial %u, got %u",
1995				      j->filename, j->it.current_serial,
1996				      xhdr.serial0);
1997			FAIL(ISC_R_UNEXPECTED);
1998		}
1999
2000		j->it.xsize = xhdr.size;
2001		j->it.xpos = 0;
2002	}
2003	/*
2004	 * Read an RR.
2005	 */
2006	CHECK(journal_read_rrhdr(j, &rrhdr));
2007	/*
2008	 * Perform a sanity check on the journal RR size.
2009	 * The smallest possible RR has a 1-byte owner name
2010	 * and a 10-byte header.  The largest possible
2011	 * RR has 65535 bytes of data, a header, and a maximum-
2012	 * size owner name, well below 70 k total.
2013	 */
2014	if (rrhdr.size < 1 + 10 || rrhdr.size > 70000) {
2015		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
2016			      "%s: journal corrupt: impossible RR size "
2017			      "(%d bytes)",
2018			      j->filename, rrhdr.size);
2019		FAIL(ISC_R_UNEXPECTED);
2020	}
2021
2022	CHECK(size_buffer(j->mctx, &j->it.source, rrhdr.size));
2023	CHECK(journal_read(j, j->it.source.base, rrhdr.size));
2024	isc_buffer_add(&j->it.source, rrhdr.size);
2025
2026	/*
2027	 * The target buffer is made the same size
2028	 * as the source buffer, with the assumption that when
2029	 * no compression in present, the output of dns_*_fromwire()
2030	 * is no larger than the input.
2031	 */
2032	CHECK(size_buffer(j->mctx, &j->it.target, rrhdr.size));
2033
2034	/*
2035	 * Parse the owner name.  We don't know where it
2036	 * ends yet, so we make the entire "remaining"
2037	 * part of the buffer "active".
2038	 */
2039	isc_buffer_setactive(&j->it.source,
2040			     j->it.source.used - j->it.source.current);
2041	CHECK(dns_name_fromwire(&j->it.name, &j->it.source, &j->it.dctx, 0,
2042				&j->it.target));
2043
2044	/*
2045	 * Check that the RR header is there, and parse it.
2046	 */
2047	if (isc_buffer_remaininglength(&j->it.source) < 10) {
2048		FAIL(DNS_R_FORMERR);
2049	}
2050
2051	rdtype = isc_buffer_getuint16(&j->it.source);
2052	rdclass = isc_buffer_getuint16(&j->it.source);
2053	ttl = isc_buffer_getuint32(&j->it.source);
2054	rdlen = isc_buffer_getuint16(&j->it.source);
2055
2056	if (rdlen > DNS_RDATA_MAXLENGTH) {
2057		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
2058			      "%s: journal corrupt: impossible rdlen "
2059			      "(%u bytes)",
2060			      j->filename, rdlen);
2061		FAIL(ISC_R_FAILURE);
2062	}
2063
2064	/*
2065	 * Parse the rdata.
2066	 */
2067	if (isc_buffer_remaininglength(&j->it.source) != rdlen) {
2068		FAIL(DNS_R_FORMERR);
2069	}
2070	isc_buffer_setactive(&j->it.source, rdlen);
2071	dns_rdata_reset(&j->it.rdata);
2072	CHECK(dns_rdata_fromwire(&j->it.rdata, rdclass, rdtype, &j->it.source,
2073				 &j->it.dctx, 0, &j->it.target));
2074	j->it.ttl = ttl;
2075
2076	j->it.xpos += sizeof(journal_rawrrhdr_t) + rrhdr.size;
2077	if (rdtype == dns_rdatatype_soa) {
2078		/* XXX could do additional consistency checks here */
2079		j->it.current_serial = dns_soa_getserial(&j->it.rdata);
2080	}
2081
2082	result = ISC_R_SUCCESS;
2083
2084failure:
2085	j->it.result = result;
2086	return (result);
2087}
2088
2089isc_result_t
2090dns_journal_next_rr(dns_journal_t *j) {
2091	j->it.result = read_one_rr(j);
2092	return (j->it.result);
2093}
2094
2095void
2096dns_journal_current_rr(dns_journal_t *j, dns_name_t **name, uint32_t *ttl,
2097		       dns_rdata_t **rdata) {
2098	REQUIRE(j->it.result == ISC_R_SUCCESS);
2099	*name = &j->it.name;
2100	*ttl = j->it.ttl;
2101	*rdata = &j->it.rdata;
2102}
2103
2104/**************************************************************************/
2105/*
2106 * Generating diffs from databases
2107 */
2108
2109/*
2110 * Construct a diff containing all the RRs at the current name of the
2111 * database iterator 'dbit' in database 'db', version 'ver'.
2112 * Set '*name' to the current name, and append the diff to 'diff'.
2113 * All new tuples will have the operation 'op'.
2114 *
2115 * Requires: 'name' must have buffer large enough to hold the name.
2116 * Typically, a dns_fixedname_t would be used.
2117 */
2118static isc_result_t
2119get_name_diff(dns_db_t *db, dns_dbversion_t *ver, isc_stdtime_t now,
2120	      dns_dbiterator_t *dbit, dns_name_t *name, dns_diffop_t op,
2121	      dns_diff_t *diff) {
2122	isc_result_t result;
2123	dns_dbnode_t *node = NULL;
2124	dns_rdatasetiter_t *rdsiter = NULL;
2125	dns_difftuple_t *tuple = NULL;
2126
2127	result = dns_dbiterator_current(dbit, &node, name);
2128	if (result != ISC_R_SUCCESS) {
2129		return (result);
2130	}
2131
2132	result = dns_db_allrdatasets(db, node, ver, 0, now, &rdsiter);
2133	if (result != ISC_R_SUCCESS) {
2134		goto cleanup_node;
2135	}
2136
2137	for (result = dns_rdatasetiter_first(rdsiter); result == ISC_R_SUCCESS;
2138	     result = dns_rdatasetiter_next(rdsiter))
2139	{
2140		dns_rdataset_t rdataset;
2141
2142		dns_rdataset_init(&rdataset);
2143		dns_rdatasetiter_current(rdsiter, &rdataset);
2144
2145		for (result = dns_rdataset_first(&rdataset);
2146		     result == ISC_R_SUCCESS;
2147		     result = dns_rdataset_next(&rdataset))
2148		{
2149			dns_rdata_t rdata = DNS_RDATA_INIT;
2150			dns_rdataset_current(&rdataset, &rdata);
2151			result = dns_difftuple_create(diff->mctx, op, name,
2152						      rdataset.ttl, &rdata,
2153						      &tuple);
2154			if (result != ISC_R_SUCCESS) {
2155				dns_rdataset_disassociate(&rdataset);
2156				goto cleanup_iterator;
2157			}
2158			dns_diff_append(diff, &tuple);
2159		}
2160		dns_rdataset_disassociate(&rdataset);
2161		if (result != ISC_R_NOMORE) {
2162			goto cleanup_iterator;
2163		}
2164	}
2165	if (result != ISC_R_NOMORE) {
2166		goto cleanup_iterator;
2167	}
2168
2169	result = ISC_R_SUCCESS;
2170
2171cleanup_iterator:
2172	dns_rdatasetiter_destroy(&rdsiter);
2173
2174cleanup_node:
2175	dns_db_detachnode(db, &node);
2176
2177	return (result);
2178}
2179
2180/*
2181 * Comparison function for use by dns_diff_subtract when sorting
2182 * the diffs to be subtracted.  The sort keys are the rdata type
2183 * and the rdata itself.  The owner name is ignored, because
2184 * it is known to be the same for all tuples.
2185 */
2186static int
2187rdata_order(const void *av, const void *bv) {
2188	dns_difftuple_t const *const *ap = av;
2189	dns_difftuple_t const *const *bp = bv;
2190	dns_difftuple_t const *a = *ap;
2191	dns_difftuple_t const *b = *bp;
2192	int r;
2193	r = (b->rdata.type - a->rdata.type);
2194	if (r != 0) {
2195		return (r);
2196	}
2197	r = dns_rdata_compare(&a->rdata, &b->rdata);
2198	return (r);
2199}
2200
2201static isc_result_t
2202dns_diff_subtract(dns_diff_t diff[2], dns_diff_t *r) {
2203	isc_result_t result;
2204	dns_difftuple_t *p[2];
2205	int i, t;
2206	bool append;
2207	dns_difftuplelist_t add, del;
2208
2209	CHECK(dns_diff_sort(&diff[0], rdata_order));
2210	CHECK(dns_diff_sort(&diff[1], rdata_order));
2211	ISC_LIST_INIT(add);
2212	ISC_LIST_INIT(del);
2213
2214	for (;;) {
2215		p[0] = ISC_LIST_HEAD(diff[0].tuples);
2216		p[1] = ISC_LIST_HEAD(diff[1].tuples);
2217		if (p[0] == NULL && p[1] == NULL) {
2218			break;
2219		}
2220
2221		for (i = 0; i < 2; i++) {
2222			if (p[!i] == NULL) {
2223				dns_difftuplelist_t *l = (i == 0) ? &add : &del;
2224				ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
2225				ISC_LIST_APPEND(*l, p[i], link);
2226				goto next;
2227			}
2228		}
2229		t = rdata_order(&p[0], &p[1]);
2230		if (t < 0) {
2231			ISC_LIST_UNLINK(diff[0].tuples, p[0], link);
2232			ISC_LIST_APPEND(add, p[0], link);
2233			goto next;
2234		}
2235		if (t > 0) {
2236			ISC_LIST_UNLINK(diff[1].tuples, p[1], link);
2237			ISC_LIST_APPEND(del, p[1], link);
2238			goto next;
2239		}
2240		INSIST(t == 0);
2241		/*
2242		 * Identical RRs in both databases; skip them both
2243		 * if the ttl differs.
2244		 */
2245		append = (p[0]->ttl != p[1]->ttl);
2246		for (i = 0; i < 2; i++) {
2247			ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
2248			if (append) {
2249				dns_difftuplelist_t *l = (i == 0) ? &add : &del;
2250				ISC_LIST_APPEND(*l, p[i], link);
2251			} else {
2252				dns_difftuple_free(&p[i]);
2253			}
2254		}
2255	next:;
2256	}
2257	ISC_LIST_APPENDLIST(r->tuples, del, link);
2258	ISC_LIST_APPENDLIST(r->tuples, add, link);
2259	result = ISC_R_SUCCESS;
2260failure:
2261	return (result);
2262}
2263
2264static isc_result_t
2265diff_namespace(dns_db_t *dba, dns_dbversion_t *dbvera, dns_db_t *dbb,
2266	       dns_dbversion_t *dbverb, unsigned int options,
2267	       dns_diff_t *resultdiff) {
2268	dns_db_t *db[2];
2269	dns_dbversion_t *ver[2];
2270	dns_dbiterator_t *dbit[2] = { NULL, NULL };
2271	bool have[2] = { false, false };
2272	dns_fixedname_t fixname[2];
2273	isc_result_t result, itresult[2];
2274	dns_diff_t diff[2];
2275	int i, t;
2276
2277	db[0] = dba, db[1] = dbb;
2278	ver[0] = dbvera, ver[1] = dbverb;
2279
2280	dns_diff_init(resultdiff->mctx, &diff[0]);
2281	dns_diff_init(resultdiff->mctx, &diff[1]);
2282
2283	dns_fixedname_init(&fixname[0]);
2284	dns_fixedname_init(&fixname[1]);
2285
2286	result = dns_db_createiterator(db[0], options, &dbit[0]);
2287	if (result != ISC_R_SUCCESS) {
2288		return (result);
2289	}
2290	result = dns_db_createiterator(db[1], options, &dbit[1]);
2291	if (result != ISC_R_SUCCESS) {
2292		goto cleanup_iterator;
2293	}
2294
2295	itresult[0] = dns_dbiterator_first(dbit[0]);
2296	itresult[1] = dns_dbiterator_first(dbit[1]);
2297
2298	for (;;) {
2299		for (i = 0; i < 2; i++) {
2300			if (!have[i] && itresult[i] == ISC_R_SUCCESS) {
2301				CHECK(get_name_diff(
2302					db[i], ver[i], 0, dbit[i],
2303					dns_fixedname_name(&fixname[i]),
2304					i == 0 ? DNS_DIFFOP_ADD
2305					       : DNS_DIFFOP_DEL,
2306					&diff[i]));
2307				itresult[i] = dns_dbiterator_next(dbit[i]);
2308				have[i] = true;
2309			}
2310		}
2311
2312		if (!have[0] && !have[1]) {
2313			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
2314			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2315			break;
2316		}
2317
2318		for (i = 0; i < 2; i++) {
2319			if (!have[!i]) {
2320				ISC_LIST_APPENDLIST(resultdiff->tuples,
2321						    diff[i].tuples, link);
2322				INSIST(ISC_LIST_EMPTY(diff[i].tuples));
2323				have[i] = false;
2324				goto next;
2325			}
2326		}
2327
2328		t = dns_name_compare(dns_fixedname_name(&fixname[0]),
2329				     dns_fixedname_name(&fixname[1]));
2330		if (t < 0) {
2331			ISC_LIST_APPENDLIST(resultdiff->tuples, diff[0].tuples,
2332					    link);
2333			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
2334			have[0] = false;
2335			continue;
2336		}
2337		if (t > 0) {
2338			ISC_LIST_APPENDLIST(resultdiff->tuples, diff[1].tuples,
2339					    link);
2340			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2341			have[1] = false;
2342			continue;
2343		}
2344		INSIST(t == 0);
2345		CHECK(dns_diff_subtract(diff, resultdiff));
2346		INSIST(ISC_LIST_EMPTY(diff[0].tuples));
2347		INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2348		have[0] = have[1] = false;
2349	next:;
2350	}
2351	if (itresult[0] != ISC_R_NOMORE) {
2352		FAIL(itresult[0]);
2353	}
2354	if (itresult[1] != ISC_R_NOMORE) {
2355		FAIL(itresult[1]);
2356	}
2357
2358	INSIST(ISC_LIST_EMPTY(diff[0].tuples));
2359	INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2360
2361failure:
2362	dns_dbiterator_destroy(&dbit[1]);
2363
2364cleanup_iterator:
2365	dns_dbiterator_destroy(&dbit[0]);
2366	dns_diff_clear(&diff[0]);
2367	dns_diff_clear(&diff[1]);
2368	return (result);
2369}
2370
2371/*
2372 * Compare the databases 'dba' and 'dbb' and generate a journal
2373 * entry containing the changes to make 'dba' from 'dbb' (note
2374 * the order).  This journal entry will consist of a single,
2375 * possibly very large transaction.
2376 */
2377isc_result_t
2378dns_db_diff(isc_mem_t *mctx, dns_db_t *dba, dns_dbversion_t *dbvera,
2379	    dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename) {
2380	isc_result_t result;
2381	dns_diff_t diff;
2382
2383	dns_diff_init(mctx, &diff);
2384
2385	result = dns_db_diffx(&diff, dba, dbvera, dbb, dbverb, filename);
2386
2387	dns_diff_clear(&diff);
2388
2389	return (result);
2390}
2391
2392isc_result_t
2393dns_db_diffx(dns_diff_t *diff, dns_db_t *dba, dns_dbversion_t *dbvera,
2394	     dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename) {
2395	isc_result_t result;
2396	dns_journal_t *journal = NULL;
2397
2398	if (filename != NULL) {
2399		result = dns_journal_open(diff->mctx, filename,
2400					  DNS_JOURNAL_CREATE, &journal);
2401		if (result != ISC_R_SUCCESS) {
2402			return (result);
2403		}
2404	}
2405
2406	CHECK(diff_namespace(dba, dbvera, dbb, dbverb, DNS_DB_NONSEC3, diff));
2407	CHECK(diff_namespace(dba, dbvera, dbb, dbverb, DNS_DB_NSEC3ONLY, diff));
2408
2409	if (journal != NULL) {
2410		if (ISC_LIST_EMPTY(diff->tuples)) {
2411			isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no changes");
2412		} else {
2413			CHECK(dns_journal_write_transaction(journal, diff));
2414		}
2415	}
2416
2417failure:
2418	if (journal != NULL) {
2419		dns_journal_destroy(&journal);
2420	}
2421	return (result);
2422}
2423
2424static uint32_t
2425rrcount(unsigned char *buf, unsigned int size) {
2426	isc_buffer_t b;
2427	uint32_t rrsize, count = 0;
2428
2429	isc_buffer_init(&b, buf, size);
2430	isc_buffer_add(&b, size);
2431	while (isc_buffer_remaininglength(&b) > 0) {
2432		rrsize = isc_buffer_getuint32(&b);
2433		INSIST(isc_buffer_remaininglength(&b) >= rrsize);
2434		isc_buffer_forward(&b, rrsize);
2435		count++;
2436	}
2437
2438	return (count);
2439}
2440
2441static bool
2442check_delta(unsigned char *buf, size_t size) {
2443	isc_buffer_t b;
2444	uint32_t rrsize;
2445
2446	isc_buffer_init(&b, buf, size);
2447	isc_buffer_add(&b, size);
2448	while (isc_buffer_remaininglength(&b) > 0) {
2449		if (isc_buffer_remaininglength(&b) < 4) {
2450			return (false);
2451		}
2452		rrsize = isc_buffer_getuint32(&b);
2453		/* "." + type + class + ttl + rdlen => 11U */
2454		if (rrsize < 11U || isc_buffer_remaininglength(&b) < rrsize) {
2455			return (false);
2456		}
2457		isc_buffer_forward(&b, rrsize);
2458	}
2459
2460	return (true);
2461}
2462
2463isc_result_t
2464dns_journal_compact(isc_mem_t *mctx, char *filename, uint32_t serial,
2465		    uint32_t flags, uint32_t target_size) {
2466	unsigned int i;
2467	journal_pos_t best_guess;
2468	journal_pos_t current_pos;
2469	dns_journal_t *j1 = NULL;
2470	dns_journal_t *j2 = NULL;
2471	journal_rawheader_t rawheader;
2472	unsigned int len;
2473	size_t namelen;
2474	unsigned char *buf = NULL;
2475	unsigned int size = 0;
2476	isc_result_t result;
2477	unsigned int indexend;
2478	char newname[PATH_MAX];
2479	char backup[PATH_MAX];
2480	bool is_backup = false;
2481	bool rewrite = false;
2482	bool downgrade = false;
2483
2484	REQUIRE(filename != NULL);
2485
2486	namelen = strlen(filename);
2487	if (namelen > 4U && strcmp(filename + namelen - 4, ".jnl") == 0) {
2488		namelen -= 4;
2489	}
2490
2491	result = snprintf(newname, sizeof(newname), "%.*s.jnw", (int)namelen,
2492			  filename);
2493	RUNTIME_CHECK(result < sizeof(newname));
2494
2495	result = snprintf(backup, sizeof(backup), "%.*s.jbk", (int)namelen,
2496			  filename);
2497	RUNTIME_CHECK(result < sizeof(backup));
2498
2499	result = journal_open(mctx, filename, false, false, false, &j1);
2500	if (result == ISC_R_NOTFOUND) {
2501		is_backup = true;
2502		result = journal_open(mctx, backup, false, false, false, &j1);
2503	}
2504	if (result != ISC_R_SUCCESS) {
2505		return (result);
2506	}
2507
2508	/*
2509	 * Always perform a re-write when processing a version 1 journal.
2510	 */
2511	rewrite = j1->header_ver1;
2512
2513	/*
2514	 * Check whether we need to rewrite the whole journal
2515	 * file (for example, to upversion it).
2516	 */
2517	if ((flags & DNS_JOURNAL_COMPACTALL) != 0) {
2518		if ((flags & DNS_JOURNAL_VERSION1) != 0) {
2519			downgrade = true;
2520		}
2521		rewrite = true;
2522		serial = dns_journal_first_serial(j1);
2523	} else if (JOURNAL_EMPTY(&j1->header)) {
2524		dns_journal_destroy(&j1);
2525		return (ISC_R_SUCCESS);
2526	}
2527
2528	if (DNS_SERIAL_GT(j1->header.begin.serial, serial) ||
2529	    DNS_SERIAL_GT(serial, j1->header.end.serial))
2530	{
2531		dns_journal_destroy(&j1);
2532		return (ISC_R_RANGE);
2533	}
2534
2535	/*
2536	 * Cope with very small target sizes.
2537	 */
2538	indexend = sizeof(journal_rawheader_t) +
2539		   j1->header.index_size * sizeof(journal_rawpos_t);
2540	if (target_size < DNS_JOURNAL_SIZE_MIN) {
2541		target_size = DNS_JOURNAL_SIZE_MIN;
2542	}
2543	if (target_size < indexend * 2) {
2544		target_size = target_size / 2 + indexend;
2545	}
2546
2547	/*
2548	 * See if there is any work to do.
2549	 */
2550	if (!rewrite && (uint32_t)j1->header.end.offset < target_size) {
2551		dns_journal_destroy(&j1);
2552		return (ISC_R_SUCCESS);
2553	}
2554
2555	CHECK(journal_open(mctx, newname, true, true, downgrade, &j2));
2556	CHECK(journal_seek(j2, indexend));
2557
2558	/*
2559	 * Remove overhead so space test below can succeed.
2560	 */
2561	if (target_size >= indexend) {
2562		target_size -= indexend;
2563	}
2564
2565	/*
2566	 * Find if we can create enough free space.
2567	 */
2568	best_guess = j1->header.begin;
2569	for (i = 0; i < j1->header.index_size; i++) {
2570		if (POS_VALID(j1->index[i]) &&
2571		    DNS_SERIAL_GE(serial, j1->index[i].serial) &&
2572		    ((uint32_t)(j1->header.end.offset - j1->index[i].offset) >=
2573		     target_size / 2) &&
2574		    j1->index[i].offset > best_guess.offset)
2575		{
2576			best_guess = j1->index[i];
2577		}
2578	}
2579
2580	current_pos = best_guess;
2581	while (current_pos.serial != serial) {
2582		CHECK(journal_next(j1, &current_pos));
2583		if (current_pos.serial == j1->header.end.serial) {
2584			break;
2585		}
2586
2587		if (DNS_SERIAL_GE(serial, current_pos.serial) &&
2588		    ((uint32_t)(j1->header.end.offset - current_pos.offset) >=
2589		     (target_size / 2)) &&
2590		    current_pos.offset > best_guess.offset)
2591		{
2592			best_guess = current_pos;
2593		} else {
2594			break;
2595		}
2596	}
2597
2598	INSIST(best_guess.serial != j1->header.end.serial);
2599	if (best_guess.serial != serial) {
2600		CHECK(journal_next(j1, &best_guess));
2601		serial = best_guess.serial;
2602	}
2603
2604	/*
2605	 * We should now be roughly half target_size provided
2606	 * we did not reach 'serial'.  If not we will just copy
2607	 * all uncommitted deltas regardless of the size.
2608	 */
2609	len = j1->header.end.offset - best_guess.offset;
2610	if (len != 0) {
2611		CHECK(journal_seek(j1, best_guess.offset));
2612
2613		/* Prepare new header */
2614		j2->header.begin.serial = best_guess.serial;
2615		j2->header.begin.offset = indexend;
2616		j2->header.sourceserial = j1->header.sourceserial;
2617		j2->header.serialset = j1->header.serialset;
2618		j2->header.end.serial = j1->header.end.serial;
2619
2620		/*
2621		 * Only use this method if we're rewriting the
2622		 * journal to fix outdated transaction headers;
2623		 * otherwise we'll copy the whole journal without
2624		 * parsing individual deltas below.
2625		 */
2626		while (rewrite && len > 0) {
2627			journal_xhdr_t xhdr;
2628			isc_offset_t offset = j1->offset;
2629			uint32_t count;
2630
2631			result = journal_read_xhdr(j1, &xhdr);
2632			if (rewrite && result == ISC_R_NOMORE) {
2633				break;
2634			}
2635			CHECK(result);
2636
2637			size = xhdr.size;
2638			if (size > len) {
2639				isc_log_write(JOURNAL_COMMON_LOGARGS,
2640					      ISC_LOG_ERROR,
2641					      "%s: journal file corrupt, "
2642					      "transaction too large",
2643					      j1->filename);
2644				CHECK(ISC_R_FAILURE);
2645			}
2646			buf = isc_mem_get(mctx, size);
2647			result = journal_read(j1, buf, size);
2648
2649			/*
2650			 * If we're repairing an outdated journal, the
2651			 * xhdr format may be wrong.
2652			 */
2653			if (rewrite && (result != ISC_R_SUCCESS ||
2654					!check_delta(buf, size)))
2655			{
2656				if (j1->xhdr_version == XHDR_VERSION2) {
2657					/* XHDR_VERSION2 -> XHDR_VERSION1 */
2658					j1->xhdr_version = XHDR_VERSION1;
2659					CHECK(journal_seek(j1, offset));
2660					CHECK(journal_read_xhdr(j1, &xhdr));
2661				} else if (j1->xhdr_version == XHDR_VERSION1) {
2662					/* XHDR_VERSION1 -> XHDR_VERSION2 */
2663					j1->xhdr_version = XHDR_VERSION2;
2664					CHECK(journal_seek(j1, offset));
2665					CHECK(journal_read_xhdr(j1, &xhdr));
2666				}
2667
2668				/* Check again */
2669				isc_mem_put(mctx, buf, size);
2670				size = xhdr.size;
2671				if (size > len) {
2672					isc_log_write(
2673						JOURNAL_COMMON_LOGARGS,
2674						ISC_LOG_ERROR,
2675						"%s: journal file corrupt, "
2676						"transaction too large",
2677						j1->filename);
2678					CHECK(ISC_R_FAILURE);
2679				}
2680				buf = isc_mem_get(mctx, size);
2681				CHECK(journal_read(j1, buf, size));
2682
2683				if (!check_delta(buf, size)) {
2684					CHECK(ISC_R_UNEXPECTED);
2685				}
2686			} else {
2687				CHECK(result);
2688			}
2689
2690			/*
2691			 * Recover from incorrectly written transaction header.
2692			 * The incorrect header was written as size, serial0,
2693			 * serial1, and 0.  XHDR_VERSION2 is expecting size,
2694			 * count, serial0, and serial1.
2695			 */
2696			if (j1->xhdr_version == XHDR_VERSION2 &&
2697			    xhdr.count == serial && xhdr.serial1 == 0U &&
2698			    isc_serial_gt(xhdr.serial0, xhdr.count))
2699			{
2700				xhdr.serial1 = xhdr.serial0;
2701				xhdr.serial0 = xhdr.count;
2702				xhdr.count = 0;
2703			}
2704
2705			/*
2706			 * Check that xhdr is consistent.
2707			 */
2708			if (xhdr.serial0 != serial ||
2709			    isc_serial_le(xhdr.serial1, xhdr.serial0))
2710			{
2711				CHECK(ISC_R_UNEXPECTED);
2712			}
2713
2714			/*
2715			 * Extract record count from the transaction.  This
2716			 * is needed when converting from XHDR_VERSION1 to
2717			 * XHDR_VERSION2, and when recovering from an
2718			 * incorrectly written XHDR_VERSION2.
2719			 */
2720			count = rrcount(buf, size);
2721			CHECK(journal_write_xhdr(j2, xhdr.size, count,
2722						 xhdr.serial0, xhdr.serial1));
2723			CHECK(journal_write(j2, buf, size));
2724
2725			j2->header.end.offset = j2->offset;
2726
2727			serial = xhdr.serial1;
2728
2729			len = j1->header.end.offset - j1->offset;
2730			isc_mem_put(mctx, buf, size);
2731		}
2732
2733		/*
2734		 * If we're not rewriting transaction headers, we can use
2735		 * this faster method instead.
2736		 */
2737		if (!rewrite) {
2738			size = ISC_MIN(64 * 1024, len);
2739			buf = isc_mem_get(mctx, size);
2740			for (i = 0; i < len; i += size) {
2741				unsigned int blob = ISC_MIN(size, len - i);
2742				CHECK(journal_read(j1, buf, blob));
2743				CHECK(journal_write(j2, buf, blob));
2744			}
2745
2746			j2->header.end.offset = indexend + len;
2747		}
2748
2749		CHECK(journal_fsync(j2));
2750
2751		/*
2752		 * Update the journal header.
2753		 */
2754		journal_header_encode(&j2->header, &rawheader);
2755		CHECK(journal_seek(j2, 0));
2756		CHECK(journal_write(j2, &rawheader, sizeof(rawheader)));
2757		CHECK(journal_fsync(j2));
2758
2759		/*
2760		 * Build new index.
2761		 */
2762		current_pos = j2->header.begin;
2763		while (current_pos.serial != j2->header.end.serial) {
2764			index_add(j2, &current_pos);
2765			CHECK(journal_next(j2, &current_pos));
2766		}
2767
2768		/*
2769		 * Write index.
2770		 */
2771		CHECK(index_to_disk(j2));
2772		CHECK(journal_fsync(j2));
2773
2774		indexend = j2->header.end.offset;
2775		POST(indexend);
2776	}
2777
2778	/*
2779	 * Close both journals before trying to rename files.
2780	 */
2781	dns_journal_destroy(&j1);
2782	dns_journal_destroy(&j2);
2783
2784	/*
2785	 * With a UFS file system this should just succeed and be atomic.
2786	 * Any IXFR outs will just continue and the old journal will be
2787	 * removed on final close.
2788	 *
2789	 * With MSDOS / NTFS we need to do a two stage rename, triggered
2790	 * by EEXIST.  (If any IXFR's are running in other threads, however,
2791	 * this will fail, and the journal will not be compacted.  But
2792	 * if so, hopefully they'll be finished by the next time we
2793	 * compact.)
2794	 */
2795	if (rename(newname, filename) == -1) {
2796		if (errno == EEXIST && !is_backup) {
2797			result = isc_file_remove(backup);
2798			if (result != ISC_R_SUCCESS &&
2799			    result != ISC_R_FILENOTFOUND)
2800			{
2801				goto failure;
2802			}
2803			if (rename(filename, backup) == -1) {
2804				goto maperrno;
2805			}
2806			if (rename(newname, filename) == -1) {
2807				goto maperrno;
2808			}
2809			(void)isc_file_remove(backup);
2810		} else {
2811		maperrno:
2812			result = ISC_R_FAILURE;
2813			goto failure;
2814		}
2815	}
2816
2817	result = ISC_R_SUCCESS;
2818
2819failure:
2820	(void)isc_file_remove(newname);
2821	if (buf != NULL) {
2822		isc_mem_put(mctx, buf, size);
2823	}
2824	if (j1 != NULL) {
2825		dns_journal_destroy(&j1);
2826	}
2827	if (j2 != NULL) {
2828		dns_journal_destroy(&j2);
2829	}
2830	return (result);
2831}
2832
2833static isc_result_t
2834index_to_disk(dns_journal_t *j) {
2835	isc_result_t result = ISC_R_SUCCESS;
2836
2837	if (j->header.index_size != 0) {
2838		unsigned int i;
2839		unsigned char *p;
2840		unsigned int rawbytes;
2841
2842		rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
2843
2844		p = j->rawindex;
2845		for (i = 0; i < j->header.index_size; i++) {
2846			encode_uint32(j->index[i].serial, p);
2847			p += 4;
2848			encode_uint32(j->index[i].offset, p);
2849			p += 4;
2850		}
2851		INSIST(p == j->rawindex + rawbytes);
2852
2853		CHECK(journal_seek(j, sizeof(journal_rawheader_t)));
2854		CHECK(journal_write(j, j->rawindex, rawbytes));
2855	}
2856failure:
2857	return (result);
2858}
2859