1/*	$NetBSD: journal.c,v 1.1 2024/02/18 20:57:32 christos Exp $	*/
2
3/*
4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5 *
6 * SPDX-License-Identifier: MPL-2.0
7 *
8 * This Source Code Form is subject to the terms of the Mozilla Public
9 * License, v. 2.0. If a copy of the MPL was not distributed with this
10 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11 *
12 * See the COPYRIGHT file distributed with this work for additional
13 * information regarding copyright ownership.
14 */
15
16#include <errno.h>
17#include <inttypes.h>
18#include <stdbool.h>
19#include <stdlib.h>
20#include <unistd.h>
21
22#include <isc/file.h>
23#include <isc/mem.h>
24#include <isc/print.h>
25#include <isc/serial.h>
26#include <isc/stdio.h>
27#include <isc/string.h>
28#include <isc/util.h>
29
30#include <dns/compress.h>
31#include <dns/db.h>
32#include <dns/dbiterator.h>
33#include <dns/diff.h>
34#include <dns/fixedname.h>
35#include <dns/journal.h>
36#include <dns/log.h>
37#include <dns/rdataset.h>
38#include <dns/rdatasetiter.h>
39#include <dns/result.h>
40#include <dns/soa.h>
41
42/*! \file
43 * \brief Journaling.
44 *
45 * A journal file consists of
46 *
47 *   \li A fixed-size header of type journal_rawheader_t.
48 *
49 *   \li The index.  This is an unordered array of index entries
50 *     of type journal_rawpos_t giving the locations
51 *     of some arbitrary subset of the journal's addressable
52 *     transactions.  The index entries are used as hints to
53 *     speed up the process of locating a transaction with a given
54 *     serial number.  Unused index entries have an "offset"
55 *     field of zero.  The size of the index can vary between
56 *     journal files, but does not change during the lifetime
57 *     of a file.  The size can be zero.
58 *
 *   \li The journal data.  This consists of one or more transactions.
60 *     Each transaction begins with a transaction header of type
61 *     journal_rawxhdr_t.  The transaction header is followed by a
62 *     sequence of RRs, similar in structure to an IXFR difference
63 *     sequence (RFC1995).  That is, the pre-transaction SOA,
64 *     zero or more other deleted RRs, the post-transaction SOA,
65 *     and zero or more other added RRs.  Unlike in IXFR, each RR
66 *     is prefixed with a 32-bit length.
67 *
68 *     The journal data part grows as new transactions are
69 *     appended to the file.  Only those transactions
70 *     whose serial number is current-(2^31-1) to current
71 *     are considered "addressable" and may be pointed
72 *     to from the header or index.  They may be preceded
73 *     by old transactions that are no longer addressable,
74 *     and they may be followed by transactions that were
75 *     appended to the journal but never committed by updating
76 *     the "end" position in the header.  The latter will
77 *     be overwritten when new transactions are added.
78 */
79
80/**************************************************************************/
81/*
82 * Miscellaneous utilities.
83 */
84
/*% Leading arguments shared by the isc_log_write() calls in this file. */
#define JOURNAL_COMMON_LOGARGS \
	dns_lctx, DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL

/*% JOURNAL_COMMON_LOGARGS followed by a debug log level of 'n'. */
#define JOURNAL_DEBUG_LOGARGS(n) JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(n)

/*%
 * Store 'code' in the local variable 'result' and jump to the local
 * label 'failure' unless it equals ISC_R_SUCCESS.
 *
 * It would be non-sensical (or at least obtuse) to use FAIL() with an
 * ISC_R_SUCCESS code, but the test is there to keep the Solaris compiler
 * from complaining about "end-of-loop code not reached".
 */
#define FAIL(code)                           \
	do {                                 \
		result = (code);             \
		if (result != ISC_R_SUCCESS) \
			goto failure;        \
	} while (0)

/*%
 * Evaluate 'op' once, store its value in the local variable 'result',
 * and jump to the local label 'failure' if it is not ISC_R_SUCCESS.
 */
#define CHECK(op)                            \
	do {                                 \
		result = (op);               \
		if (result != ISC_R_SUCCESS) \
			goto failure;        \
	} while (0)

/*%
 * Bit in the 'flags' byte of the on-disk journal header; it is decoded
 * into (and encoded from) the in-core 'serialset' field.
 */
#define JOURNAL_SERIALSET 0x01U

/* Forward declaration. */
static isc_result_t
index_to_disk(dns_journal_t *);
113
/*
 * Assemble a 32-bit unsigned integer from the four bytes at 'p',
 * most significant byte first.
 */
static uint32_t
decode_uint32(unsigned char *p) {
	uint32_t value = 0;
	unsigned int i;

	for (i = 0; i < 4; i++) {
		value = (value << 8) | (uint32_t)p[i];
	}
	return (value);
}
119
/*
 * Store 'val' into the four bytes at 'p', most significant byte first.
 */
static void
encode_uint32(uint32_t val, unsigned char *p) {
	unsigned int i;

	/* Fill from the last byte backwards, peeling off the low 8 bits. */
	for (i = 4; i > 0; i--) {
		p[i - 1] = (uint8_t)(val & 0xffU);
		val >>= 8;
	}
}
127
128isc_result_t
129dns_db_createsoatuple(dns_db_t *db, dns_dbversion_t *ver, isc_mem_t *mctx,
130		      dns_diffop_t op, dns_difftuple_t **tp) {
131	isc_result_t result;
132	dns_dbnode_t *node;
133	dns_rdataset_t rdataset;
134	dns_rdata_t rdata = DNS_RDATA_INIT;
135	dns_fixedname_t fixed;
136	dns_name_t *zonename;
137
138	zonename = dns_fixedname_initname(&fixed);
139	dns_name_copynf(dns_db_origin(db), zonename);
140
141	node = NULL;
142	result = dns_db_findnode(db, zonename, false, &node);
143	if (result != ISC_R_SUCCESS) {
144		goto nonode;
145	}
146
147	dns_rdataset_init(&rdataset);
148	result = dns_db_findrdataset(db, node, ver, dns_rdatatype_soa, 0,
149				     (isc_stdtime_t)0, &rdataset, NULL);
150	if (result != ISC_R_SUCCESS) {
151		goto freenode;
152	}
153
154	result = dns_rdataset_first(&rdataset);
155	if (result != ISC_R_SUCCESS) {
156		goto freenode;
157	}
158
159	dns_rdataset_current(&rdataset, &rdata);
160	dns_rdataset_getownercase(&rdataset, zonename);
161
162	result = dns_difftuple_create(mctx, op, zonename, rdataset.ttl, &rdata,
163				      tp);
164
165	dns_rdataset_disassociate(&rdataset);
166	dns_db_detachnode(db, &node);
167	return (result);
168
169freenode:
170	dns_db_detachnode(db, &node);
171nonode:
172	UNEXPECTED_ERROR(__FILE__, __LINE__, "missing SOA");
173	return (result);
174}
175
176/* Journaling */
177
/*%
 * On-disk representation of a "pointer" to a journal entry.
 * These are used in the journal header to locate the beginning
 * and end of the journal, and in the journal index to locate
 * other transactions.
 *
 * Both fields are 4-byte integers stored most significant byte first;
 * journal_pos_decode() and journal_pos_encode() convert between this
 * layout and journal_pos_t.
 */
typedef struct {
	unsigned char serial[4]; /*%< SOA serial before update. */
	/*
	 * XXXRTH  Should offset be 8 bytes?
	 * XXXDCL ... probably, since isc_offset_t is 8 bytes on many OSs.
	 * XXXAG  ... but we will not be able to seek >2G anyway on many
	 *            platforms as long as we are using fseek() rather
	 *            than lseek().
	 */
	unsigned char offset[4]; /*%< Offset from beginning of file. */
} journal_rawpos_t;
195
/*%
 * The header is of a fixed size, with some spare room for future
 * extensions.
 */
#define JOURNAL_HEADER_SIZE 64 /* Bytes. */

/*%
 * Transaction header layout versions.  Version 1 headers are read as
 * journal_rawxhdr_ver1_t (no record count); version 2 headers are read
 * as journal_rawxhdr_t.
 */
typedef enum {
	XHDR_VERSION1 = 1,
	XHDR_VERSION2 = 2,
} xhdr_version_t;
206
/*%
 * The on-disk representation of the journal header.
 * All numbers are stored in big-endian order.
 *
 * The union with 'pad' makes the object exactly JOURNAL_HEADER_SIZE
 * bytes; journal_header_encode() zeroes 'pad' before filling in 'h'.
 */
typedef union {
	struct {
		/*% File format version ID. */
		unsigned char format[16];
		/*% Position of the first addressable transaction */
		journal_rawpos_t begin;
		/*% Position of the next (yet nonexistent) transaction. */
		journal_rawpos_t end;
		/*% Number of index entries following the header. */
		unsigned char index_size[4];
		/*% Source serial number. */
		unsigned char sourceserial[4];
		/*% Flag bits; JOURNAL_SERIALSET maps to the in-core 'serialset'. */
		unsigned char flags;
	} h;
	/* Pad the header to a fixed size. */
	unsigned char pad[JOURNAL_HEADER_SIZE];
} journal_rawheader_t;
228
/*%
 * The on-disk representation of the transaction header, version 2.
 * There is one of these at the beginning of each transaction.
 * Each field is a 4-byte integer stored most significant byte first.
 */
typedef struct {
	unsigned char size[4];	  /*%< In bytes, excluding header. */
	unsigned char count[4];	  /*%< Number of records in transaction */
	unsigned char serial0[4]; /*%< SOA serial before update. */
	unsigned char serial1[4]; /*%< SOA serial after update. */
} journal_rawxhdr_t;

/*%
 * Old-style raw transaction header, version 1, used for backward
 * compatibility mode.  It has the same fields as journal_rawxhdr_t
 * except that there is no record count, so it is 12 bytes long.
 */
typedef struct {
	unsigned char size[4];	  /*%< In bytes, excluding header. */
	unsigned char serial0[4]; /*%< SOA serial before update. */
	unsigned char serial1[4]; /*%< SOA serial after update. */
} journal_rawxhdr_ver1_t;

/*%
 * The on-disk representation of the RR header.
 * There is one of these at the beginning of each RR.
 */
typedef struct {
	unsigned char size[4]; /*%< In bytes, excluding header. */
} journal_rawrrhdr_t;
257
/*%
 * The in-core representation of a journal position, i.e. the decoded
 * form of journal_rawpos_t (see journal_pos_decode()).
 */
typedef struct {
	uint32_t serial;     /*%< Decoded from journal_rawpos_t.serial. */
	isc_offset_t offset; /*%< Decoded from journal_rawpos_t.offset. */
} journal_pos_t;

/*% A position is considered valid when its offset is non-zero. */
#define POS_VALID(pos)	    ((pos).offset != 0)
/*% Mark a position as unused by zeroing both of its fields. */
#define POS_INVALIDATE(pos) ((pos).offset = 0, (pos).serial = 0)
268
/*%
 * The in-core representation of the journal header, i.e. the decoded
 * form of journal_rawheader_t (see journal_header_decode()).
 *
 * The field order matters: the static header templates in this file
 * are initialized positionally.
 */
typedef struct {
	unsigned char format[16]; /*%< File format version ID. */
	journal_pos_t begin;	  /*%< Decoded 'begin' position. */
	journal_pos_t end;	  /*%< Decoded 'end' position. */
	uint32_t index_size;	  /*%< Number of index entries. */
	uint32_t sourceserial;	  /*%< Source serial number. */
	bool serialset;		  /*%< JOURNAL_SERIALSET flag was set. */
} journal_header_t;
277
/*%
 * The in-core representation of the transaction header.
 * It is filled in by journal_read_xhdr(); 'count' is set to 0 when a
 * version 1 header (which has no count field) is read.
 */
typedef struct {
	uint32_t size;
	uint32_t count;
	uint32_t serial0;
	uint32_t serial1;
} journal_xhdr_t;

/*%
 * The in-core representation of the RR header.
 * It is filled in by journal_read_rrhdr().
 */
typedef struct {
	uint32_t size;
} journal_rrhdr_t;
294
295/*%
296 * Initial contents to store in the header of a newly created
297 * journal file.
298 *
299 * The header starts with the magic string ";BIND LOG V9.2\n"
300 * to identify the file as a BIND 9 journal file.  An ASCII
301 * identification string is used rather than a binary magic
302 * number to be consistent with BIND 8 (BIND 8 journal files
303 * are ASCII text files).
304 */
305
306static journal_header_t journal_header_ver1 = {
307	";BIND LOG V9\n", { 0, 0 }, { 0, 0 }, 0, 0, 0
308};
309static journal_header_t initial_journal_header = {
310	";BIND LOG V9.2\n", { 0, 0 }, { 0, 0 }, 0, 0, 0
311};
312
/*% True when the 'begin' and 'end' offsets of header 'h' are equal. */
#define JOURNAL_EMPTY(h) ((h)->begin.offset == (h)->end.offset)

/*%
 * State of an open journal.  journal_open() starts a journal in
 * JOURNAL_STATE_INVALID and, once the file has been read successfully,
 * moves it to JOURNAL_STATE_WRITE or JOURNAL_STATE_READ depending on
 * whether it was opened writable.
 */
typedef enum {
	JOURNAL_STATE_INVALID,
	JOURNAL_STATE_READ,
	JOURNAL_STATE_WRITE,
	JOURNAL_STATE_TRANSACTION,
	JOURNAL_STATE_INLINE
} journal_state_t;
322
/*%
 * In-core state of one open journal file.  Instances are allocated
 * and initialized by journal_open().
 */
struct dns_journal {
	unsigned int magic; /*%< JOUR */
	isc_mem_t *mctx;    /*%< Memory context */
	journal_state_t state;
	xhdr_version_t xhdr_version; /*%< Expected transaction header version */
	bool header_ver1;	     /*%< Transaction header compatibility
				      *   mode is allowed */
	bool recovered;		     /*%< A recoverable error was found
				      *   while reading the journal */
	char *filename;		     /*%< Journal file name */
	FILE *fp;		     /*%< File handle */
	isc_offset_t offset;	     /*%< Current file offset */
	journal_xhdr_t curxhdr;	     /*%< Current transaction header */
	journal_header_t header;     /*%< In-core journal header */
	unsigned char *rawindex;     /*%< In-core buffer for journal index
				      * in on-disk format */
	journal_pos_t *index;	     /*%< In-core journal index */

	/*% Current transaction state (when writing). */
	struct {
		unsigned int n_soa;   /*%< Number of SOAs seen */
		unsigned int n_rr;    /*%< Number of RRs to write */
		journal_pos_t pos[2]; /*%< Begin/end position */
	} x;

	/*% Iteration state (when reading). */
	struct {
		/* These define the part of the journal we iterate over. */
		journal_pos_t bpos; /*%< Position before first, */
		journal_pos_t cpos; /*%< before current, */
		journal_pos_t epos; /*%< and after last transaction */
		/* The rest is iterator state. */
		uint32_t current_serial; /*%< Current SOA serial */
		isc_buffer_t source;	 /*%< Data from disk */
		isc_buffer_t target;	 /*%< Data from _fromwire check */
		dns_decompress_t dctx;	 /*%< Dummy decompression ctx */
		dns_name_t name;	 /*%< Current domain name */
		dns_rdata_t rdata;	 /*%< Current rdata */
		uint32_t ttl;		 /*%< Current TTL */
		unsigned int xsize;	 /*%< Size of transaction data */
		unsigned int xpos;	 /*%< Current position in it */
		isc_result_t result;	 /*%< Result of last call */
	} it;
};

/*% Magic number stored in dns_journal.magic, and its validity check. */
#define DNS_JOURNAL_MAGIC    ISC_MAGIC('J', 'O', 'U', 'R')
#define DNS_JOURNAL_VALID(t) ISC_MAGIC_VALID(t, DNS_JOURNAL_MAGIC)
370
371static void
372journal_pos_decode(journal_rawpos_t *raw, journal_pos_t *cooked) {
373	cooked->serial = decode_uint32(raw->serial);
374	cooked->offset = decode_uint32(raw->offset);
375}
376
377static void
378journal_pos_encode(journal_rawpos_t *raw, journal_pos_t *cooked) {
379	encode_uint32(cooked->serial, raw->serial);
380	encode_uint32(cooked->offset, raw->offset);
381}
382
383static void
384journal_header_decode(journal_rawheader_t *raw, journal_header_t *cooked) {
385	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
386
387	memmove(cooked->format, raw->h.format, sizeof(cooked->format));
388	journal_pos_decode(&raw->h.begin, &cooked->begin);
389	journal_pos_decode(&raw->h.end, &cooked->end);
390	cooked->index_size = decode_uint32(raw->h.index_size);
391	cooked->sourceserial = decode_uint32(raw->h.sourceserial);
392	cooked->serialset = ((raw->h.flags & JOURNAL_SERIALSET) != 0);
393}
394
395static void
396journal_header_encode(journal_header_t *cooked, journal_rawheader_t *raw) {
397	unsigned char flags = 0;
398
399	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
400
401	memset(raw->pad, 0, sizeof(raw->pad));
402	memmove(raw->h.format, cooked->format, sizeof(raw->h.format));
403	journal_pos_encode(&raw->h.begin, &cooked->begin);
404	journal_pos_encode(&raw->h.end, &cooked->end);
405	encode_uint32(cooked->index_size, raw->h.index_size);
406	encode_uint32(cooked->sourceserial, raw->h.sourceserial);
407	if (cooked->serialset) {
408		flags |= JOURNAL_SERIALSET;
409	}
410	raw->h.flags = flags;
411}
412
413/*
414 * Journal file I/O subroutines, with error checking and reporting.
415 */
416static isc_result_t
417journal_seek(dns_journal_t *j, uint32_t offset) {
418	isc_result_t result;
419
420	result = isc_stdio_seek(j->fp, (off_t)offset, SEEK_SET);
421	if (result != ISC_R_SUCCESS) {
422		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
423			      "%s: seek: %s", j->filename,
424			      isc_result_totext(result));
425		return (ISC_R_UNEXPECTED);
426	}
427	j->offset = offset;
428	return (ISC_R_SUCCESS);
429}
430
431static isc_result_t
432journal_read(dns_journal_t *j, void *mem, size_t nbytes) {
433	isc_result_t result;
434
435	result = isc_stdio_read(mem, 1, nbytes, j->fp, NULL);
436	if (result != ISC_R_SUCCESS) {
437		if (result == ISC_R_EOF) {
438			return (ISC_R_NOMORE);
439		}
440		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
441			      "%s: read: %s", j->filename,
442			      isc_result_totext(result));
443		return (ISC_R_UNEXPECTED);
444	}
445	j->offset += (isc_offset_t)nbytes;
446	return (ISC_R_SUCCESS);
447}
448
449static isc_result_t
450journal_write(dns_journal_t *j, void *mem, size_t nbytes) {
451	isc_result_t result;
452
453	result = isc_stdio_write(mem, 1, nbytes, j->fp, NULL);
454	if (result != ISC_R_SUCCESS) {
455		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
456			      "%s: write: %s", j->filename,
457			      isc_result_totext(result));
458		return (ISC_R_UNEXPECTED);
459	}
460	j->offset += (isc_offset_t)nbytes;
461	return (ISC_R_SUCCESS);
462}
463
464static isc_result_t
465journal_fsync(dns_journal_t *j) {
466	isc_result_t result;
467
468	result = isc_stdio_flush(j->fp);
469	if (result != ISC_R_SUCCESS) {
470		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
471			      "%s: flush: %s", j->filename,
472			      isc_result_totext(result));
473		return (ISC_R_UNEXPECTED);
474	}
475	result = isc_stdio_sync(j->fp);
476	if (result != ISC_R_SUCCESS) {
477		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
478			      "%s: fsync: %s", j->filename,
479			      isc_result_totext(result));
480		return (ISC_R_UNEXPECTED);
481	}
482	return (ISC_R_SUCCESS);
483}
484
485/*
486 * Read/write a transaction header at the current file position.
487 */
488static isc_result_t
489journal_read_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr) {
490	isc_result_t result;
491
492	j->it.cpos.offset = j->offset;
493
494	switch (j->xhdr_version) {
495	case XHDR_VERSION1: {
496		journal_rawxhdr_ver1_t raw;
497		result = journal_read(j, &raw, sizeof(raw));
498		if (result != ISC_R_SUCCESS) {
499			return (result);
500		}
501		xhdr->size = decode_uint32(raw.size);
502		xhdr->count = 0;
503		xhdr->serial0 = decode_uint32(raw.serial0);
504		xhdr->serial1 = decode_uint32(raw.serial1);
505		j->curxhdr = *xhdr;
506		return (ISC_R_SUCCESS);
507	}
508
509	case XHDR_VERSION2: {
510		journal_rawxhdr_t raw;
511		result = journal_read(j, &raw, sizeof(raw));
512		if (result != ISC_R_SUCCESS) {
513			return (result);
514		}
515		xhdr->size = decode_uint32(raw.size);
516		xhdr->count = decode_uint32(raw.count);
517		xhdr->serial0 = decode_uint32(raw.serial0);
518		xhdr->serial1 = decode_uint32(raw.serial1);
519		j->curxhdr = *xhdr;
520		return (ISC_R_SUCCESS);
521	}
522
523	default:
524		return (ISC_R_NOTIMPLEMENTED);
525	}
526}
527
528static isc_result_t
529journal_write_xhdr(dns_journal_t *j, uint32_t size, uint32_t count,
530		   uint32_t serial0, uint32_t serial1) {
531	if (j->header_ver1) {
532		journal_rawxhdr_ver1_t raw;
533		encode_uint32(size, raw.size);
534		encode_uint32(serial0, raw.serial0);
535		encode_uint32(serial1, raw.serial1);
536		return (journal_write(j, &raw, sizeof(raw)));
537	} else {
538		journal_rawxhdr_t raw;
539		encode_uint32(size, raw.size);
540		encode_uint32(count, raw.count);
541		encode_uint32(serial0, raw.serial0);
542		encode_uint32(serial1, raw.serial1);
543		return (journal_write(j, &raw, sizeof(raw)));
544	}
545}
546
547/*
548 * Read an RR header at the current file position.
549 */
550
551static isc_result_t
552journal_read_rrhdr(dns_journal_t *j, journal_rrhdr_t *rrhdr) {
553	journal_rawrrhdr_t raw;
554	isc_result_t result;
555
556	result = journal_read(j, &raw, sizeof(raw));
557	if (result != ISC_R_SUCCESS) {
558		return (result);
559	}
560	rrhdr->size = decode_uint32(raw.size);
561	return (ISC_R_SUCCESS);
562}
563
564static isc_result_t
565journal_file_create(isc_mem_t *mctx, bool downgrade, const char *filename) {
566	FILE *fp = NULL;
567	isc_result_t result;
568	journal_header_t header;
569	journal_rawheader_t rawheader;
570	int index_size = 56; /* XXX configurable */
571	int size;
572	void *mem = NULL; /* Memory for temporary index image. */
573
574	INSIST(sizeof(journal_rawheader_t) == JOURNAL_HEADER_SIZE);
575
576	result = isc_stdio_open(filename, "wb", &fp);
577	if (result != ISC_R_SUCCESS) {
578		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
579			      "%s: create: %s", filename,
580			      isc_result_totext(result));
581		return (ISC_R_UNEXPECTED);
582	}
583
584	if (downgrade) {
585		header = journal_header_ver1;
586	} else {
587		header = initial_journal_header;
588	}
589	header.index_size = index_size;
590	journal_header_encode(&header, &rawheader);
591
592	size = sizeof(journal_rawheader_t) +
593	       index_size * sizeof(journal_rawpos_t);
594
595	mem = isc_mem_get(mctx, size);
596	memset(mem, 0, size);
597	memmove(mem, &rawheader, sizeof(rawheader));
598
599	result = isc_stdio_write(mem, 1, (size_t)size, fp, NULL);
600	if (result != ISC_R_SUCCESS) {
601		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
602			      "%s: write: %s", filename,
603			      isc_result_totext(result));
604		(void)isc_stdio_close(fp);
605		(void)isc_file_remove(filename);
606		isc_mem_put(mctx, mem, size);
607		return (ISC_R_UNEXPECTED);
608	}
609	isc_mem_put(mctx, mem, size);
610
611	result = isc_stdio_close(fp);
612	if (result != ISC_R_SUCCESS) {
613		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
614			      "%s: close: %s", filename,
615			      isc_result_totext(result));
616		(void)isc_file_remove(filename);
617		return (ISC_R_UNEXPECTED);
618	}
619
620	return (ISC_R_SUCCESS);
621}
622
623static isc_result_t
624journal_open(isc_mem_t *mctx, const char *filename, bool writable, bool create,
625	     bool downgrade, dns_journal_t **journalp) {
626	FILE *fp = NULL;
627	isc_result_t result;
628	journal_rawheader_t rawheader;
629	dns_journal_t *j;
630
631	REQUIRE(journalp != NULL && *journalp == NULL);
632
633	j = isc_mem_get(mctx, sizeof(*j));
634	*j = (dns_journal_t){ .state = JOURNAL_STATE_INVALID,
635			      .filename = isc_mem_strdup(mctx, filename),
636			      .xhdr_version = XHDR_VERSION2 };
637	isc_mem_attach(mctx, &j->mctx);
638
639	result = isc_stdio_open(j->filename, writable ? "rb+" : "rb", &fp);
640	if (result == ISC_R_FILENOTFOUND) {
641		if (create) {
642			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(1),
643				      "journal file %s does not exist, "
644				      "creating it",
645				      j->filename);
646			CHECK(journal_file_create(mctx, downgrade, filename));
647			/*
648			 * Retry.
649			 */
650			result = isc_stdio_open(j->filename, "rb+", &fp);
651		} else {
652			FAIL(ISC_R_NOTFOUND);
653		}
654	}
655	if (result != ISC_R_SUCCESS) {
656		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
657			      "%s: open: %s", j->filename,
658			      isc_result_totext(result));
659		FAIL(ISC_R_UNEXPECTED);
660	}
661
662	j->fp = fp;
663
664	/*
665	 * Set magic early so that seek/read can succeed.
666	 */
667	j->magic = DNS_JOURNAL_MAGIC;
668
669	CHECK(journal_seek(j, 0));
670	CHECK(journal_read(j, &rawheader, sizeof(rawheader)));
671
672	if (memcmp(rawheader.h.format, journal_header_ver1.format,
673		   sizeof(journal_header_ver1.format)) == 0)
674	{
675		/*
676		 * The file header says it's the old format, but it
677		 * still might have the new xhdr format because we
678		 * forgot to change the format string when we introduced
679		 * the new xhdr.  When we first try to read it, we assume
680		 * it uses the new xhdr format. If that fails, we'll be
681		 * called a second time with compat set to true, in which
682		 * case we can lower xhdr_version to 1 if we find a
683		 * corrupt transaction.
684		 */
685		j->header_ver1 = true;
686	} else if (memcmp(rawheader.h.format, initial_journal_header.format,
687			  sizeof(initial_journal_header.format)) == 0)
688	{
689		/*
690		 * File header says this is format version 2; all
691		 * transactions have to match.
692		 */
693		j->header_ver1 = false;
694	} else {
695		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
696			      "%s: journal format not recognized", j->filename);
697		FAIL(ISC_R_UNEXPECTED);
698	}
699	journal_header_decode(&rawheader, &j->header);
700
701	/*
702	 * If there is an index, read the raw index into a dynamically
703	 * allocated buffer and then convert it into a cooked index.
704	 */
705	if (j->header.index_size != 0) {
706		unsigned int i;
707		unsigned int rawbytes;
708		unsigned char *p;
709
710		rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
711		j->rawindex = isc_mem_get(mctx, rawbytes);
712
713		CHECK(journal_read(j, j->rawindex, rawbytes));
714
715		j->index = isc_mem_get(mctx, j->header.index_size *
716						     sizeof(journal_pos_t));
717
718		p = j->rawindex;
719		for (i = 0; i < j->header.index_size; i++) {
720			j->index[i].serial = decode_uint32(p);
721			p += 4;
722			j->index[i].offset = decode_uint32(p);
723			p += 4;
724		}
725		INSIST(p == j->rawindex + rawbytes);
726	}
727	j->offset = -1; /* Invalid, must seek explicitly. */
728
729	/*
730	 * Initialize the iterator.
731	 */
732	dns_name_init(&j->it.name, NULL);
733	dns_rdata_init(&j->it.rdata);
734
735	/*
736	 * Set up empty initial buffers for unchecked and checked
737	 * wire format RR data.  They will be reallocated
738	 * later.
739	 */
740	isc_buffer_init(&j->it.source, NULL, 0);
741	isc_buffer_init(&j->it.target, NULL, 0);
742	dns_decompress_init(&j->it.dctx, -1, DNS_DECOMPRESS_NONE);
743
744	j->state = writable ? JOURNAL_STATE_WRITE : JOURNAL_STATE_READ;
745
746	*journalp = j;
747	return (ISC_R_SUCCESS);
748
749failure:
750	j->magic = 0;
751	if (j->rawindex != NULL) {
752		isc_mem_put(j->mctx, j->rawindex,
753			    j->header.index_size * sizeof(journal_rawpos_t));
754	}
755	if (j->index != NULL) {
756		isc_mem_put(j->mctx, j->index,
757			    j->header.index_size * sizeof(journal_pos_t));
758	}
759	isc_mem_free(j->mctx, j->filename);
760	if (j->fp != NULL) {
761		(void)isc_stdio_close(j->fp);
762	}
763	isc_mem_putanddetach(&j->mctx, j, sizeof(*j));
764	return (result);
765}
766
767isc_result_t
768dns_journal_open(isc_mem_t *mctx, const char *filename, unsigned int mode,
769		 dns_journal_t **journalp) {
770	isc_result_t result;
771	size_t namelen;
772	char backup[1024];
773	bool writable, create;
774
775	create = ((mode & DNS_JOURNAL_CREATE) != 0);
776	writable = ((mode & (DNS_JOURNAL_WRITE | DNS_JOURNAL_CREATE)) != 0);
777
778	result = journal_open(mctx, filename, writable, create, false,
779			      journalp);
780	if (result == ISC_R_NOTFOUND) {
781		namelen = strlen(filename);
782		if (namelen > 4U && strcmp(filename + namelen - 4, ".jnl") == 0)
783		{
784			namelen -= 4;
785		}
786
787		result = snprintf(backup, sizeof(backup), "%.*s.jbk",
788				  (int)namelen, filename);
789		if (result >= sizeof(backup)) {
790			return (ISC_R_NOSPACE);
791		}
792		result = journal_open(mctx, backup, writable, writable, false,
793				      journalp);
794	}
795	return (result);
796}
797
798/*
799 * A comparison function defining the sorting order for
800 * entries in the IXFR-style journal file.
801 *
802 * The IXFR format requires that deletions are sorted before
803 * additions, and within either one, SOA records are sorted
804 * before others.
805 *
806 * Also sort the non-SOA records by type as a courtesy to the
807 * server receiving the IXFR - it may help reduce the amount of
808 * rdataset merging it has to do.
809 */
810static int
811ixfr_order(const void *av, const void *bv) {
812	dns_difftuple_t const *const *ap = av;
813	dns_difftuple_t const *const *bp = bv;
814	dns_difftuple_t const *a = *ap;
815	dns_difftuple_t const *b = *bp;
816	int r;
817	int bop = 0, aop = 0;
818
819	switch (a->op) {
820	case DNS_DIFFOP_DEL:
821	case DNS_DIFFOP_DELRESIGN:
822		aop = 1;
823		break;
824	case DNS_DIFFOP_ADD:
825	case DNS_DIFFOP_ADDRESIGN:
826		aop = 0;
827		break;
828	default:
829		UNREACHABLE();
830	}
831
832	switch (b->op) {
833	case DNS_DIFFOP_DEL:
834	case DNS_DIFFOP_DELRESIGN:
835		bop = 1;
836		break;
837	case DNS_DIFFOP_ADD:
838	case DNS_DIFFOP_ADDRESIGN:
839		bop = 0;
840		break;
841	default:
842		UNREACHABLE();
843	}
844
845	r = bop - aop;
846	if (r != 0) {
847		return (r);
848	}
849
850	r = (b->rdata.type == dns_rdatatype_soa) -
851	    (a->rdata.type == dns_rdatatype_soa);
852	if (r != 0) {
853		return (r);
854	}
855
856	r = (a->rdata.type - b->rdata.type);
857	return (r);
858}
859
/*
 * Repair a transaction header that appears to have been decoded with
 * the wrong layout.  journal_next() calls this only for journals whose
 * file header carries the version 1 format string (j->header_ver1).
 *
 * '*xhdr' is the header just read, 'serial' is the serial number the
 * transaction is expected to start at, and 'offset' is the file offset
 * of the transaction header.  The function may switch j->xhdr_version,
 * re-read or rearrange '*xhdr', and sets j->recovered whenever it made
 * such an adjustment.
 *
 * Returns ISC_R_SUCCESS, or the result of the first journal_seek(),
 * journal_read() or journal_read_xhdr() call that failed.
 */
static isc_result_t
maybe_fixup_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr, uint32_t serial,
		 isc_offset_t offset) {
	isc_result_t result = ISC_R_SUCCESS;

	/*
	 * Handle mixture of version 1 and version 2
	 * transaction headers in a version 1 journal.
	 */
	if ((xhdr->serial0 != serial ||
	     isc_serial_le(xhdr->serial1, xhdr->serial0)))
	{
		/*
		 * The header fails the serial checks as decoded.  If the
		 * expected serial shows up one field away from where it
		 * should be, the other layout was used when writing:
		 * switch versions and read the header again from 'offset'.
		 */
		if (j->xhdr_version == XHDR_VERSION1 && xhdr->serial1 == serial)
		{
			isc_log_write(
				JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(3),
				"%s: XHDR_VERSION1 -> XHDR_VERSION2 at %u",
				j->filename, serial);
			j->xhdr_version = XHDR_VERSION2;
			CHECK(journal_seek(j, offset));
			CHECK(journal_read_xhdr(j, xhdr));
			j->recovered = true;
		} else if (j->xhdr_version == XHDR_VERSION2 &&
			   xhdr->count == serial)
		{
			isc_log_write(
				JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(3),
				"%s: XHDR_VERSION2 -> XHDR_VERSION1 at %u",
				j->filename, serial);
			j->xhdr_version = XHDR_VERSION1;
			CHECK(journal_seek(j, offset));
			CHECK(journal_read_xhdr(j, xhdr));
			j->recovered = true;
		}
	}

	/*
	 * Handle <size, serial0, serial1, 0> transaction header.
	 */
	if (j->xhdr_version == XHDR_VERSION1) {
		uint32_t value;

		/*
		 * Peek at the 4 bytes that follow the 12-byte version 1
		 * header.  The value is only compared against zero, so
		 * its byte order does not matter.
		 */
		CHECK(journal_read(j, &value, sizeof(value)));
		if (value != 0L) {
			/* Not a zero word: undo the peek. */
			CHECK(journal_seek(j, offset + 12));
		} else {
			/*
			 * A zero word follows: leave it consumed and
			 * expect version 2 headers from here on.
			 */
			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(3),
				      "%s: XHDR_VERSION1 count zero at %u",
				      j->filename, serial);
			j->xhdr_version = XHDR_VERSION2;
			j->recovered = true;
		}
	} else if (j->xhdr_version == XHDR_VERSION2 && xhdr->count == serial &&
		   xhdr->serial1 == 0U &&
		   isc_serial_gt(xhdr->serial0, xhdr->count))
	{
		/*
		 * The same 16 bytes were decoded with the version 2
		 * layout, so every field after 'size' is shifted by one.
		 * Move the values to where they belong instead of
		 * re-reading.
		 */
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(3),
			      "%s: XHDR_VERSION2 count zero at %u", j->filename,
			      serial);
		xhdr->serial1 = xhdr->serial0;
		xhdr->serial0 = xhdr->count;
		xhdr->count = 0;
		j->recovered = true;
	}

failure:
	return (result);
}
928
929/*
930 * Advance '*pos' to the next journal transaction.
931 *
932 * Requires:
933 *	*pos refers to a valid journal transaction.
934 *
935 * Ensures:
936 *	When ISC_R_SUCCESS is returned,
937 *	*pos refers to the next journal transaction.
938 *
939 * Returns one of:
940 *
941 *    ISC_R_SUCCESS
942 *    ISC_R_NOMORE 	*pos pointed at the last transaction
943 *    Other results due to file errors are possible.
944 */
945static isc_result_t
946journal_next(dns_journal_t *j, journal_pos_t *pos) {
947	isc_result_t result;
948	journal_xhdr_t xhdr;
949	size_t hdrsize;
950
951	REQUIRE(DNS_JOURNAL_VALID(j));
952
953	result = journal_seek(j, pos->offset);
954	if (result != ISC_R_SUCCESS) {
955		return (result);
956	}
957
958	if (pos->serial == j->header.end.serial) {
959		return (ISC_R_NOMORE);
960	}
961
962	/*
963	 * Read the header of the current transaction.
964	 * This will return ISC_R_NOMORE if we are at EOF.
965	 */
966	result = journal_read_xhdr(j, &xhdr);
967	if (result != ISC_R_SUCCESS) {
968		return (result);
969	}
970
971	if (j->header_ver1) {
972		CHECK(maybe_fixup_xhdr(j, &xhdr, pos->serial, pos->offset));
973	}
974
975	/*
976	 * Check serial number consistency.
977	 */
978	if (xhdr.serial0 != pos->serial ||
979	    isc_serial_le(xhdr.serial1, xhdr.serial0))
980	{
981		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
982			      "%s: journal file corrupt: "
983			      "expected serial %u, got %u",
984			      j->filename, pos->serial, xhdr.serial0);
985		return (ISC_R_UNEXPECTED);
986	}
987
988	/*
989	 * Check for offset wraparound.
990	 */
991	hdrsize = (j->xhdr_version == XHDR_VERSION2)
992			  ? sizeof(journal_rawxhdr_t)
993			  : sizeof(journal_rawxhdr_ver1_t);
994
995	if ((isc_offset_t)(pos->offset + hdrsize + xhdr.size) < pos->offset) {
996		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
997			      "%s: offset too large", j->filename);
998		return (ISC_R_UNEXPECTED);
999	}
1000
1001	pos->offset += hdrsize + xhdr.size;
1002	pos->serial = xhdr.serial1;
1003	return (ISC_R_SUCCESS);
1004
1005failure:
1006	return (result);
1007}
1008
1009/*
1010 * If the index of the journal 'j' contains an entry "better"
1011 * than '*best_guess', replace '*best_guess' with it.
1012 *
1013 * "Better" means having a serial number closer to 'serial'
1014 * but not greater than 'serial'.
1015 */
1016static void
1017index_find(dns_journal_t *j, uint32_t serial, journal_pos_t *best_guess) {
1018	unsigned int i;
1019	if (j->index == NULL) {
1020		return;
1021	}
1022	for (i = 0; i < j->header.index_size; i++) {
1023		if (POS_VALID(j->index[i]) &&
1024		    DNS_SERIAL_GE(serial, j->index[i].serial) &&
1025		    DNS_SERIAL_GT(j->index[i].serial, best_guess->serial))
1026		{
1027			*best_guess = j->index[i];
1028		}
1029	}
1030}
1031
1032/*
1033 * Add a new index entry.  If there is no room, make room by removing
1034 * the odd-numbered entries and compacting the others into the first
1035 * half of the index.  This decimates old index entries exponentially
1036 * over time, so that the index always contains a much larger fraction
1037 * of recent serial numbers than of old ones.  This is deliberate -
1038 * most index searches are for outgoing IXFR, and IXFR tends to request
1039 * recent versions more often than old ones.
1040 */
1041static void
1042index_add(dns_journal_t *j, journal_pos_t *pos) {
1043	unsigned int i;
1044
1045	if (j->index == NULL) {
1046		return;
1047	}
1048
1049	/*
1050	 * Search for a vacant position.
1051	 */
1052	for (i = 0; i < j->header.index_size; i++) {
1053		if (!POS_VALID(j->index[i])) {
1054			break;
1055		}
1056	}
1057	if (i == j->header.index_size) {
1058		unsigned int k = 0;
1059		/*
1060		 * Found no vacant position.  Make some room.
1061		 */
1062		for (i = 0; i < j->header.index_size; i += 2) {
1063			j->index[k++] = j->index[i];
1064		}
1065		i = k; /* 'i' identifies the first vacant position. */
1066		while (k < j->header.index_size) {
1067			POS_INVALIDATE(j->index[k]);
1068			k++;
1069		}
1070	}
1071	INSIST(i < j->header.index_size);
1072	INSIST(!POS_VALID(j->index[i]));
1073
1074	/*
1075	 * Store the new index entry.
1076	 */
1077	j->index[i] = *pos;
1078}
1079
1080/*
1081 * Invalidate any existing index entries that could become
1082 * ambiguous when a new transaction with number 'serial' is added.
1083 */
1084static void
1085index_invalidate(dns_journal_t *j, uint32_t serial) {
1086	unsigned int i;
1087	if (j->index == NULL) {
1088		return;
1089	}
1090	for (i = 0; i < j->header.index_size; i++) {
1091		if (!DNS_SERIAL_GT(serial, j->index[i].serial)) {
1092			POS_INVALIDATE(j->index[i]);
1093		}
1094	}
1095}
1096
1097/*
1098 * Try to find a transaction with initial serial number 'serial'
1099 * in the journal 'j'.
1100 *
1101 * If found, store its position at '*pos' and return ISC_R_SUCCESS.
1102 *
1103 * If 'serial' is current (= the ending serial number of the
1104 * last transaction in the journal), set '*pos' to
1105 * the position immediately following the last transaction and
1106 * return ISC_R_SUCCESS.
1107 *
1108 * If 'serial' is within the range of addressable serial numbers
1109 * covered by the journal but that particular serial number is missing
1110 * (from the journal, not just from the index), return ISC_R_NOTFOUND.
1111 *
1112 * If 'serial' is outside the range of addressable serial numbers
1113 * covered by the journal, return ISC_R_RANGE.
1114 *
1115 */
1116static isc_result_t
1117journal_find(dns_journal_t *j, uint32_t serial, journal_pos_t *pos) {
1118	isc_result_t result;
1119	journal_pos_t current_pos;
1120
1121	REQUIRE(DNS_JOURNAL_VALID(j));
1122
1123	if (DNS_SERIAL_GT(j->header.begin.serial, serial)) {
1124		return (ISC_R_RANGE);
1125	}
1126	if (DNS_SERIAL_GT(serial, j->header.end.serial)) {
1127		return (ISC_R_RANGE);
1128	}
1129	if (serial == j->header.end.serial) {
1130		*pos = j->header.end;
1131		return (ISC_R_SUCCESS);
1132	}
1133
1134	current_pos = j->header.begin;
1135	index_find(j, serial, &current_pos);
1136
1137	while (current_pos.serial != serial) {
1138		if (DNS_SERIAL_GT(current_pos.serial, serial)) {
1139			return (ISC_R_NOTFOUND);
1140		}
1141		result = journal_next(j, &current_pos);
1142		if (result != ISC_R_SUCCESS) {
1143			return (result);
1144		}
1145	}
1146	*pos = current_pos;
1147	return (ISC_R_SUCCESS);
1148}
1149
1150isc_result_t
1151dns_journal_begin_transaction(dns_journal_t *j) {
1152	uint32_t offset;
1153	isc_result_t result;
1154
1155	REQUIRE(DNS_JOURNAL_VALID(j));
1156	REQUIRE(j->state == JOURNAL_STATE_WRITE ||
1157		j->state == JOURNAL_STATE_INLINE);
1158
1159	/*
1160	 * Find the file offset where the new transaction should
1161	 * be written, and seek there.
1162	 */
1163	if (JOURNAL_EMPTY(&j->header)) {
1164		offset = sizeof(journal_rawheader_t) +
1165			 j->header.index_size * sizeof(journal_rawpos_t);
1166	} else {
1167		offset = j->header.end.offset;
1168	}
1169	j->x.pos[0].offset = offset;
1170	j->x.pos[1].offset = offset; /* Initial value, will be incremented. */
1171	j->x.n_soa = 0;
1172
1173	CHECK(journal_seek(j, offset));
1174
1175	/*
1176	 * Write a dummy transaction header of all zeroes to reserve
1177	 * space.  It will be filled in when the transaction is
1178	 * finished.
1179	 */
1180	CHECK(journal_write_xhdr(j, 0, 0, 0, 0));
1181	j->x.pos[1].offset = j->offset;
1182
1183	j->state = JOURNAL_STATE_TRANSACTION;
1184	result = ISC_R_SUCCESS;
1185failure:
1186	return (result);
1187}
1188
1189isc_result_t
1190dns_journal_writediff(dns_journal_t *j, dns_diff_t *diff) {
1191	dns_difftuple_t *t;
1192	isc_buffer_t buffer;
1193	void *mem = NULL;
1194	uint64_t size = 0;
1195	uint32_t rrcount = 0;
1196	isc_result_t result;
1197	isc_region_t used;
1198
1199	REQUIRE(DNS_DIFF_VALID(diff));
1200	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
1201
1202	isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "writing to journal");
1203	(void)dns_diff_print(diff, NULL);
1204
1205	/*
1206	 * Pass 1: determine the buffer size needed, and
1207	 * keep track of SOA serial numbers.
1208	 */
1209	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
1210	     t = ISC_LIST_NEXT(t, link))
1211	{
1212		if (t->rdata.type == dns_rdatatype_soa) {
1213			if (j->x.n_soa < 2) {
1214				j->x.pos[j->x.n_soa].serial =
1215					dns_soa_getserial(&t->rdata);
1216			}
1217			j->x.n_soa++;
1218		}
1219		size += sizeof(journal_rawrrhdr_t);
1220		size += t->name.length; /* XXX should have access macro? */
1221		size += 10;
1222		size += t->rdata.length;
1223	}
1224
1225	if (size >= DNS_JOURNAL_SIZE_MAX) {
1226		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1227			      "dns_journal_writediff: %s: journal entry "
1228			      "too big to be stored: %" PRIu64 " bytes",
1229			      j->filename, size);
1230		return (ISC_R_NOSPACE);
1231	}
1232
1233	mem = isc_mem_get(j->mctx, size);
1234
1235	isc_buffer_init(&buffer, mem, size);
1236
1237	/*
1238	 * Pass 2.  Write RRs to buffer.
1239	 */
1240	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
1241	     t = ISC_LIST_NEXT(t, link))
1242	{
1243		/*
1244		 * Write the RR header.
1245		 */
1246		isc_buffer_putuint32(&buffer,
1247				     t->name.length + 10 + t->rdata.length);
1248		/*
1249		 * Write the owner name, RR header, and RR data.
1250		 */
1251		isc_buffer_putmem(&buffer, t->name.ndata, t->name.length);
1252		isc_buffer_putuint16(&buffer, t->rdata.type);
1253		isc_buffer_putuint16(&buffer, t->rdata.rdclass);
1254		isc_buffer_putuint32(&buffer, t->ttl);
1255		INSIST(t->rdata.length < 65536);
1256		isc_buffer_putuint16(&buffer, (uint16_t)t->rdata.length);
1257		INSIST(isc_buffer_availablelength(&buffer) >= t->rdata.length);
1258		isc_buffer_putmem(&buffer, t->rdata.data, t->rdata.length);
1259
1260		rrcount++;
1261	}
1262
1263	isc_buffer_usedregion(&buffer, &used);
1264	INSIST(used.length == size);
1265
1266	j->x.pos[1].offset += used.length;
1267	j->x.n_rr = rrcount;
1268
1269	/*
1270	 * Write the buffer contents to the journal file.
1271	 */
1272	CHECK(journal_write(j, used.base, used.length));
1273
1274	result = ISC_R_SUCCESS;
1275
1276failure:
1277	if (mem != NULL) {
1278		isc_mem_put(j->mctx, mem, size);
1279	}
1280	return (result);
1281}
1282
1283isc_result_t
1284dns_journal_commit(dns_journal_t *j) {
1285	isc_result_t result;
1286	journal_rawheader_t rawheader;
1287	uint64_t total;
1288
1289	REQUIRE(DNS_JOURNAL_VALID(j));
1290	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION ||
1291		j->state == JOURNAL_STATE_INLINE);
1292
1293	/*
1294	 * Just write out a updated header.
1295	 */
1296	if (j->state == JOURNAL_STATE_INLINE) {
1297		CHECK(journal_fsync(j));
1298		journal_header_encode(&j->header, &rawheader);
1299		CHECK(journal_seek(j, 0));
1300		CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
1301		CHECK(journal_fsync(j));
1302		j->state = JOURNAL_STATE_WRITE;
1303		return (ISC_R_SUCCESS);
1304	}
1305
1306	/*
1307	 * Perform some basic consistency checks.
1308	 */
1309	if (j->x.n_soa != 2) {
1310		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1311			      "%s: malformed transaction: %d SOAs", j->filename,
1312			      j->x.n_soa);
1313		return (ISC_R_UNEXPECTED);
1314	}
1315	if (!DNS_SERIAL_GT(j->x.pos[1].serial, j->x.pos[0].serial)) {
1316		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1317			      "%s: malformed transaction: serial number "
1318			      "did not increase",
1319			      j->filename);
1320		return (ISC_R_UNEXPECTED);
1321	}
1322	if (!JOURNAL_EMPTY(&j->header)) {
1323		if (j->x.pos[0].serial != j->header.end.serial) {
1324			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1325				      "malformed transaction: "
1326				      "%s last serial %u != "
1327				      "transaction first serial %u",
1328				      j->filename, j->header.end.serial,
1329				      j->x.pos[0].serial);
1330			return (ISC_R_UNEXPECTED);
1331		}
1332	}
1333
1334	/*
1335	 * We currently don't support huge journal entries.
1336	 */
1337	total = j->x.pos[1].offset - j->x.pos[0].offset;
1338	if (total >= DNS_JOURNAL_SIZE_MAX) {
1339		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1340			      "transaction too big to be stored in journal: "
1341			      "%" PRIu64 "b (max is %" PRIu64 "b)",
1342			      total, (uint64_t)DNS_JOURNAL_SIZE_MAX);
1343		return (ISC_R_UNEXPECTED);
1344	}
1345
1346	/*
1347	 * Some old journal entries may become non-addressable
1348	 * when we increment the current serial number.  Purge them
1349	 * by stepping header.begin forward to the first addressable
1350	 * transaction.  Also purge them from the index.
1351	 */
1352	if (!JOURNAL_EMPTY(&j->header)) {
1353		while (!DNS_SERIAL_GT(j->x.pos[1].serial,
1354				      j->header.begin.serial))
1355		{
1356			CHECK(journal_next(j, &j->header.begin));
1357		}
1358		index_invalidate(j, j->x.pos[1].serial);
1359	}
1360#ifdef notyet
1361	if (DNS_SERIAL_GT(last_dumped_serial, j->x.pos[1].serial)) {
1362		force_dump(...);
1363	}
1364#endif /* ifdef notyet */
1365
1366	/*
1367	 * Commit the transaction data to stable storage.
1368	 */
1369	CHECK(journal_fsync(j));
1370
1371	if (j->state == JOURNAL_STATE_TRANSACTION) {
1372		isc_offset_t offset;
1373		offset = (j->x.pos[1].offset - j->x.pos[0].offset) -
1374			 (j->header_ver1 ? sizeof(journal_rawxhdr_ver1_t)
1375					 : sizeof(journal_rawxhdr_t));
1376		/*
1377		 * Update the transaction header.
1378		 */
1379		CHECK(journal_seek(j, j->x.pos[0].offset));
1380		CHECK(journal_write_xhdr(j, offset, j->x.n_rr,
1381					 j->x.pos[0].serial,
1382					 j->x.pos[1].serial));
1383	}
1384
1385	/*
1386	 * Update the journal header.
1387	 */
1388	if (JOURNAL_EMPTY(&j->header)) {
1389		j->header.begin = j->x.pos[0];
1390	}
1391	j->header.end = j->x.pos[1];
1392	journal_header_encode(&j->header, &rawheader);
1393	CHECK(journal_seek(j, 0));
1394	CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
1395
1396	/*
1397	 * Update the index.
1398	 */
1399	index_add(j, &j->x.pos[0]);
1400
1401	/*
1402	 * Convert the index into on-disk format and write
1403	 * it to disk.
1404	 */
1405	CHECK(index_to_disk(j));
1406
1407	/*
1408	 * Commit the header to stable storage.
1409	 */
1410	CHECK(journal_fsync(j));
1411
1412	/*
1413	 * We no longer have a transaction open.
1414	 */
1415	j->state = JOURNAL_STATE_WRITE;
1416
1417	result = ISC_R_SUCCESS;
1418
1419failure:
1420	return (result);
1421}
1422
1423isc_result_t
1424dns_journal_write_transaction(dns_journal_t *j, dns_diff_t *diff) {
1425	isc_result_t result;
1426
1427	CHECK(dns_diff_sort(diff, ixfr_order));
1428	CHECK(dns_journal_begin_transaction(j));
1429	CHECK(dns_journal_writediff(j, diff));
1430	CHECK(dns_journal_commit(j));
1431	result = ISC_R_SUCCESS;
1432failure:
1433	return (result);
1434}
1435
1436void
1437dns_journal_destroy(dns_journal_t **journalp) {
1438	dns_journal_t *j = NULL;
1439
1440	REQUIRE(journalp != NULL);
1441	REQUIRE(DNS_JOURNAL_VALID(*journalp));
1442
1443	j = *journalp;
1444	*journalp = NULL;
1445
1446	j->it.result = ISC_R_FAILURE;
1447	dns_name_invalidate(&j->it.name);
1448	dns_decompress_invalidate(&j->it.dctx);
1449	if (j->rawindex != NULL) {
1450		isc_mem_put(j->mctx, j->rawindex,
1451			    j->header.index_size * sizeof(journal_rawpos_t));
1452	}
1453	if (j->index != NULL) {
1454		isc_mem_put(j->mctx, j->index,
1455			    j->header.index_size * sizeof(journal_pos_t));
1456	}
1457	if (j->it.target.base != NULL) {
1458		isc_mem_put(j->mctx, j->it.target.base, j->it.target.length);
1459	}
1460	if (j->it.source.base != NULL) {
1461		isc_mem_put(j->mctx, j->it.source.base, j->it.source.length);
1462	}
1463	if (j->filename != NULL) {
1464		isc_mem_free(j->mctx, j->filename);
1465	}
1466	if (j->fp != NULL) {
1467		(void)isc_stdio_close(j->fp);
1468	}
1469	j->magic = 0;
1470	isc_mem_putanddetach(&j->mctx, j, sizeof(*j));
1471}
1472
1473/*
1474 * Roll the open journal 'j' into the database 'db'.
1475 * A new database version will be created.
1476 */
1477
1478/* XXX Share code with incoming IXFR? */
1479
1480isc_result_t
1481dns_journal_rollforward(dns_journal_t *j, dns_db_t *db, unsigned int options) {
1482	isc_buffer_t source; /* Transaction data from disk */
1483	isc_buffer_t target; /* Ditto after _fromwire check */
1484	uint32_t db_serial;  /* Database SOA serial */
1485	uint32_t end_serial; /* Last journal SOA serial */
1486	isc_result_t result;
1487	dns_dbversion_t *ver = NULL;
1488	journal_pos_t pos;
1489	dns_diff_t diff;
1490	unsigned int n_soa = 0;
1491	unsigned int n_put = 0;
1492	dns_diffop_t op;
1493
1494	REQUIRE(DNS_JOURNAL_VALID(j));
1495	REQUIRE(DNS_DB_VALID(db));
1496
1497	dns_diff_init(j->mctx, &diff);
1498
1499	/*
1500	 * Set up empty initial buffers for unchecked and checked
1501	 * wire format transaction data.  They will be reallocated
1502	 * later.
1503	 */
1504	isc_buffer_init(&source, NULL, 0);
1505	isc_buffer_init(&target, NULL, 0);
1506
1507	/*
1508	 * Create the new database version.
1509	 */
1510	CHECK(dns_db_newversion(db, &ver));
1511
1512	/*
1513	 * Get the current database SOA serial number.
1514	 */
1515	CHECK(dns_db_getsoaserial(db, ver, &db_serial));
1516
1517	/*
1518	 * Locate a journal entry for the current database serial.
1519	 */
1520	CHECK(journal_find(j, db_serial, &pos));
1521
1522	end_serial = dns_journal_last_serial(j);
1523
1524	/*
1525	 * If we're reading a version 1 file, scan all the transactions
1526	 * to see if the journal needs rewriting: if any outdated
1527	 * transaction headers are found, j->recovered will be set.
1528	 */
1529	if (j->header_ver1) {
1530		uint32_t start_serial = dns_journal_first_serial(j);
1531
1532		CHECK(dns_journal_iter_init(j, start_serial, db_serial, NULL));
1533		for (result = dns_journal_first_rr(j); result == ISC_R_SUCCESS;
1534		     result = dns_journal_next_rr(j))
1535		{
1536			continue;
1537		}
1538	}
1539
1540	if (db_serial == end_serial) {
1541		CHECK(DNS_R_UPTODATE);
1542	}
1543
1544	CHECK(dns_journal_iter_init(j, db_serial, end_serial, NULL));
1545	for (result = dns_journal_first_rr(j); result == ISC_R_SUCCESS;
1546	     result = dns_journal_next_rr(j))
1547	{
1548		dns_name_t *name = NULL;
1549		dns_rdata_t *rdata = NULL;
1550		dns_difftuple_t *tuple = NULL;
1551		uint32_t ttl;
1552
1553		dns_journal_current_rr(j, &name, &ttl, &rdata);
1554
1555		if (rdata->type == dns_rdatatype_soa) {
1556			n_soa++;
1557			if (n_soa == 2) {
1558				db_serial = j->it.current_serial;
1559			}
1560		}
1561
1562		if (n_soa == 3) {
1563			n_soa = 1;
1564		}
1565		if (n_soa == 0) {
1566			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1567				      "%s: journal file corrupt: missing "
1568				      "initial SOA",
1569				      j->filename);
1570			FAIL(ISC_R_UNEXPECTED);
1571		}
1572		if ((options & DNS_JOURNALOPT_RESIGN) != 0) {
1573			op = (n_soa == 1) ? DNS_DIFFOP_DELRESIGN
1574					  : DNS_DIFFOP_ADDRESIGN;
1575		} else {
1576			op = (n_soa == 1) ? DNS_DIFFOP_DEL : DNS_DIFFOP_ADD;
1577		}
1578
1579		CHECK(dns_difftuple_create(diff.mctx, op, name, ttl, rdata,
1580					   &tuple));
1581		dns_diff_append(&diff, &tuple);
1582
1583		if (++n_put > 100) {
1584			isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1585				      "%s: applying diff to database (%u)",
1586				      j->filename, db_serial);
1587			(void)dns_diff_print(&diff, NULL);
1588			CHECK(dns_diff_apply(&diff, db, ver));
1589			dns_diff_clear(&diff);
1590			n_put = 0;
1591		}
1592	}
1593	if (result == ISC_R_NOMORE) {
1594		result = ISC_R_SUCCESS;
1595	}
1596	CHECK(result);
1597
1598	if (n_put != 0) {
1599		isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1600			      "%s: applying final diff to database (%u)",
1601			      j->filename, db_serial);
1602		(void)dns_diff_print(&diff, NULL);
1603		CHECK(dns_diff_apply(&diff, db, ver));
1604		dns_diff_clear(&diff);
1605	}
1606
1607failure:
1608	if (ver != NULL) {
1609		dns_db_closeversion(db, &ver,
1610				    result == ISC_R_SUCCESS ? true : false);
1611	}
1612
1613	if (source.base != NULL) {
1614		isc_mem_put(j->mctx, source.base, source.length);
1615	}
1616	if (target.base != NULL) {
1617		isc_mem_put(j->mctx, target.base, target.length);
1618	}
1619
1620	dns_diff_clear(&diff);
1621
1622	INSIST(ver == NULL);
1623
1624	return (result);
1625}
1626
1627isc_result_t
1628dns_journal_print(isc_mem_t *mctx, uint32_t flags, const char *filename,
1629		  FILE *file) {
1630	dns_journal_t *j = NULL;
1631	isc_buffer_t source;   /* Transaction data from disk */
1632	isc_buffer_t target;   /* Ditto after _fromwire check */
1633	uint32_t start_serial; /* Database SOA serial */
1634	uint32_t end_serial;   /* Last journal SOA serial */
1635	isc_result_t result;
1636	dns_diff_t diff;
1637	unsigned int n_soa = 0;
1638	unsigned int n_put = 0;
1639	bool printxhdr = ((flags & DNS_JOURNAL_PRINTXHDR) != 0);
1640
1641	REQUIRE(filename != NULL);
1642
1643	result = dns_journal_open(mctx, filename, DNS_JOURNAL_READ, &j);
1644	if (result == ISC_R_NOTFOUND) {
1645		isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file");
1646		return (DNS_R_NOJOURNAL);
1647	} else if (result != ISC_R_SUCCESS) {
1648		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1649			      "journal open failure: %s: %s",
1650			      isc_result_totext(result), filename);
1651		return (result);
1652	}
1653
1654	if (printxhdr) {
1655		fprintf(file, "Journal format = %sHeader version = %d\n",
1656			j->header.format + 1, j->header_ver1 ? 1 : 2);
1657		fprintf(file, "Start serial = %u\n", j->header.begin.serial);
1658		fprintf(file, "End serial = %u\n", j->header.end.serial);
1659		fprintf(file, "Index (size = %u):\n", j->header.index_size);
1660		for (uint32_t i = 0; i < j->header.index_size; i++) {
1661			if (j->index[i].offset == 0) {
1662				fputc('\n', file);
1663				break;
1664			}
1665			fprintf(file, "%lld", (long long)j->index[i].offset);
1666			fputc((i + 1) % 8 == 0 ? '\n' : ' ', file);
1667		}
1668	}
1669	if (j->header.serialset) {
1670		fprintf(file, "Source serial = %u\n", j->header.sourceserial);
1671	}
1672	dns_diff_init(j->mctx, &diff);
1673
1674	/*
1675	 * Set up empty initial buffers for unchecked and checked
1676	 * wire format transaction data.  They will be reallocated
1677	 * later.
1678	 */
1679	isc_buffer_init(&source, NULL, 0);
1680	isc_buffer_init(&target, NULL, 0);
1681
1682	start_serial = dns_journal_first_serial(j);
1683	end_serial = dns_journal_last_serial(j);
1684
1685	CHECK(dns_journal_iter_init(j, start_serial, end_serial, NULL));
1686
1687	for (result = dns_journal_first_rr(j); result == ISC_R_SUCCESS;
1688	     result = dns_journal_next_rr(j))
1689	{
1690		dns_name_t *name = NULL;
1691		dns_rdata_t *rdata = NULL;
1692		dns_difftuple_t *tuple = NULL;
1693		static uint32_t i = 0;
1694		bool print = false;
1695		uint32_t ttl;
1696
1697		dns_journal_current_rr(j, &name, &ttl, &rdata);
1698
1699		if (rdata->type == dns_rdatatype_soa) {
1700			n_soa++;
1701			if (n_soa == 3) {
1702				n_soa = 1;
1703			}
1704			if (n_soa == 1) {
1705				print = printxhdr;
1706			}
1707		}
1708		if (n_soa == 0) {
1709			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1710				      "%s: journal file corrupt: missing "
1711				      "initial SOA",
1712				      j->filename);
1713			FAIL(ISC_R_UNEXPECTED);
1714		}
1715
1716		if (print) {
1717			fprintf(file,
1718				"Transaction: version %d offset %lld size %u "
1719				"rrcount %u start %u end %u\n",
1720				j->xhdr_version, (long long)j->it.cpos.offset,
1721				j->curxhdr.size, j->curxhdr.count,
1722				j->curxhdr.serial0, j->curxhdr.serial1);
1723			if (j->it.cpos.offset > j->index[i].offset) {
1724				fprintf(file,
1725					"ERROR: Offset mismatch, "
1726					"expected %lld\n",
1727					(long long)j->index[i].offset);
1728			} else if (j->it.cpos.offset == j->index[i].offset) {
1729				i++;
1730			}
1731		}
1732		CHECK(dns_difftuple_create(
1733			diff.mctx, n_soa == 1 ? DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1734			name, ttl, rdata, &tuple));
1735		dns_diff_append(&diff, &tuple);
1736
1737		if (++n_put > 100 || printxhdr) {
1738			result = dns_diff_print(&diff, file);
1739			dns_diff_clear(&diff);
1740			n_put = 0;
1741			if (result != ISC_R_SUCCESS) {
1742				break;
1743			}
1744		}
1745	}
1746	if (result == ISC_R_NOMORE) {
1747		result = ISC_R_SUCCESS;
1748	}
1749	CHECK(result);
1750
1751	if (n_put != 0) {
1752		result = dns_diff_print(&diff, file);
1753		dns_diff_clear(&diff);
1754	}
1755	goto cleanup;
1756
1757failure:
1758	isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1759		      "%s: cannot print: journal file corrupt", j->filename);
1760
1761cleanup:
1762	if (source.base != NULL) {
1763		isc_mem_put(j->mctx, source.base, source.length);
1764	}
1765	if (target.base != NULL) {
1766		isc_mem_put(j->mctx, target.base, target.length);
1767	}
1768
1769	dns_diff_clear(&diff);
1770	dns_journal_destroy(&j);
1771
1772	return (result);
1773}
1774
1775/**************************************************************************/
1776/*
1777 * Miscellaneous accessors.
1778 */
1779bool
1780dns_journal_empty(dns_journal_t *j) {
1781	return (JOURNAL_EMPTY(&j->header));
1782}
1783
1784bool
1785dns_journal_recovered(dns_journal_t *j) {
1786	return (j->recovered);
1787}
1788
1789uint32_t
1790dns_journal_first_serial(dns_journal_t *j) {
1791	return (j->header.begin.serial);
1792}
1793
1794uint32_t
1795dns_journal_last_serial(dns_journal_t *j) {
1796	return (j->header.end.serial);
1797}
1798
1799void
1800dns_journal_set_sourceserial(dns_journal_t *j, uint32_t sourceserial) {
1801	REQUIRE(j->state == JOURNAL_STATE_WRITE ||
1802		j->state == JOURNAL_STATE_INLINE ||
1803		j->state == JOURNAL_STATE_TRANSACTION);
1804
1805	j->header.sourceserial = sourceserial;
1806	j->header.serialset = true;
1807	if (j->state == JOURNAL_STATE_WRITE) {
1808		j->state = JOURNAL_STATE_INLINE;
1809	}
1810}
1811
1812bool
1813dns_journal_get_sourceserial(dns_journal_t *j, uint32_t *sourceserial) {
1814	REQUIRE(sourceserial != NULL);
1815
1816	if (!j->header.serialset) {
1817		return (false);
1818	}
1819	*sourceserial = j->header.sourceserial;
1820	return (true);
1821}
1822
1823/**************************************************************************/
1824/*
1825 * Iteration support.
1826 *
 * When serving an outgoing IXFR, we transmit a part of the journal starting
1828 * at the serial number in the IXFR request and ending at the serial
1829 * number that is current when the IXFR request arrives.  The ending
1830 * serial number is not necessarily at the end of the journal:
1831 * the journal may grow while the IXFR is in progress, but we stop
1832 * when we reach the serial number that was current when the IXFR started.
1833 */
1834
1835static isc_result_t
1836read_one_rr(dns_journal_t *j);
1837
1838/*
1839 * Make sure the buffer 'b' is has at least 'size' bytes
1840 * allocated, and clear it.
1841 *
1842 * Requires:
1843 *	Either b->base is NULL, or it points to b->length bytes of memory
1844 *	previously allocated by isc_mem_get().
1845 */
1846
1847static isc_result_t
1848size_buffer(isc_mem_t *mctx, isc_buffer_t *b, unsigned size) {
1849	if (b->length < size) {
1850		void *mem = isc_mem_get(mctx, size);
1851		if (mem == NULL) {
1852			return (ISC_R_NOMEMORY);
1853		}
1854		if (b->base != NULL) {
1855			isc_mem_put(mctx, b->base, b->length);
1856		}
1857		b->base = mem;
1858		b->length = size;
1859	}
1860	isc_buffer_clear(b);
1861	return (ISC_R_SUCCESS);
1862}
1863
1864isc_result_t
1865dns_journal_iter_init(dns_journal_t *j, uint32_t begin_serial,
1866		      uint32_t end_serial, size_t *xfrsizep) {
1867	isc_result_t result;
1868
1869	CHECK(journal_find(j, begin_serial, &j->it.bpos));
1870	INSIST(j->it.bpos.serial == begin_serial);
1871
1872	CHECK(journal_find(j, end_serial, &j->it.epos));
1873	INSIST(j->it.epos.serial == end_serial);
1874
1875	if (xfrsizep != NULL) {
1876		journal_pos_t pos = j->it.bpos;
1877		journal_xhdr_t xhdr;
1878		uint64_t size = 0;
1879		uint32_t count = 0;
1880
1881		/*
1882		 * We already know the beginning and ending serial
1883		 * numbers are in the journal. Scan through them,
1884		 * adding up sizes and RR counts so we can calculate
1885		 * the IXFR size.
1886		 */
1887		do {
1888			CHECK(journal_seek(j, pos.offset));
1889			CHECK(journal_read_xhdr(j, &xhdr));
1890
1891			if (j->header_ver1) {
1892				CHECK(maybe_fixup_xhdr(j, &xhdr, pos.serial,
1893						       pos.offset));
1894			}
1895
1896			/*
1897			 * Check that xhdr is consistent.
1898			 */
1899			if (xhdr.serial0 != pos.serial ||
1900			    isc_serial_le(xhdr.serial1, xhdr.serial0))
1901			{
1902				CHECK(ISC_R_UNEXPECTED);
1903			}
1904
1905			size += xhdr.size;
1906			count += xhdr.count;
1907
1908			result = journal_next(j, &pos);
1909			if (result == ISC_R_NOMORE) {
1910				result = ISC_R_SUCCESS;
1911			}
1912			CHECK(result);
1913		} while (pos.serial != end_serial);
1914
1915		/*
1916		 * For each RR, subtract the length of the RR header,
1917		 * as this would not be present in IXFR messages.
1918		 * (We don't need to worry about the transaction header
1919		 * because that was already excluded from xdr.size.)
1920		 */
1921		*xfrsizep = size - (count * sizeof(journal_rawrrhdr_t));
1922	}
1923
1924	result = ISC_R_SUCCESS;
1925failure:
1926	j->it.result = result;
1927	return (j->it.result);
1928}
1929
1930isc_result_t
1931dns_journal_first_rr(dns_journal_t *j) {
1932	isc_result_t result;
1933
1934	/*
1935	 * Seek to the beginning of the first transaction we are
1936	 * interested in.
1937	 */
1938	CHECK(journal_seek(j, j->it.bpos.offset));
1939	j->it.current_serial = j->it.bpos.serial;
1940
1941	j->it.xsize = 0; /* We have no transaction data yet... */
1942	j->it.xpos = 0;	 /* ...and haven't used any of it. */
1943
1944	return (read_one_rr(j));
1945
1946failure:
1947	return (result);
1948}
1949
1950static isc_result_t
1951read_one_rr(dns_journal_t *j) {
1952	isc_result_t result;
1953	dns_rdatatype_t rdtype;
1954	dns_rdataclass_t rdclass;
1955	unsigned int rdlen;
1956	uint32_t ttl;
1957	journal_xhdr_t xhdr;
1958	journal_rrhdr_t rrhdr;
1959	dns_journal_t save = *j;
1960
1961	if (j->offset > j->it.epos.offset) {
1962		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1963			      "%s: journal corrupt: possible integer overflow",
1964			      j->filename);
1965		return (ISC_R_UNEXPECTED);
1966	}
1967	if (j->offset == j->it.epos.offset) {
1968		return (ISC_R_NOMORE);
1969	}
1970	if (j->it.xpos == j->it.xsize) {
1971		/*
1972		 * We are at a transaction boundary.
1973		 * Read another transaction header.
1974		 */
1975		CHECK(journal_read_xhdr(j, &xhdr));
1976		if (xhdr.size == 0) {
1977			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1978				      "%s: journal corrupt: empty transaction",
1979				      j->filename);
1980			FAIL(ISC_R_UNEXPECTED);
1981		}
1982
1983		if (j->header_ver1) {
1984			CHECK(maybe_fixup_xhdr(j, &xhdr, j->it.current_serial,
1985					       save.offset));
1986		}
1987
1988		if (xhdr.serial0 != j->it.current_serial ||
1989		    isc_serial_le(xhdr.serial1, xhdr.serial0))
1990		{
1991			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1992				      "%s: journal file corrupt: "
1993				      "expected serial %u, got %u",
1994				      j->filename, j->it.current_serial,
1995				      xhdr.serial0);
1996			FAIL(ISC_R_UNEXPECTED);
1997		}
1998
1999		j->it.xsize = xhdr.size;
2000		j->it.xpos = 0;
2001	}
2002	/*
2003	 * Read an RR.
2004	 */
2005	CHECK(journal_read_rrhdr(j, &rrhdr));
2006	/*
2007	 * Perform a sanity check on the journal RR size.
2008	 * The smallest possible RR has a 1-byte owner name
2009	 * and a 10-byte header.  The largest possible
2010	 * RR has 65535 bytes of data, a header, and a maximum-
2011	 * size owner name, well below 70 k total.
2012	 */
2013	if (rrhdr.size < 1 + 10 || rrhdr.size > 70000) {
2014		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
2015			      "%s: journal corrupt: impossible RR size "
2016			      "(%d bytes)",
2017			      j->filename, rrhdr.size);
2018		FAIL(ISC_R_UNEXPECTED);
2019	}
2020
2021	CHECK(size_buffer(j->mctx, &j->it.source, rrhdr.size));
2022	CHECK(journal_read(j, j->it.source.base, rrhdr.size));
2023	isc_buffer_add(&j->it.source, rrhdr.size);
2024
2025	/*
2026	 * The target buffer is made the same size
2027	 * as the source buffer, with the assumption that when
2028	 * no compression in present, the output of dns_*_fromwire()
2029	 * is no larger than the input.
2030	 */
2031	CHECK(size_buffer(j->mctx, &j->it.target, rrhdr.size));
2032
2033	/*
2034	 * Parse the owner name.  We don't know where it
2035	 * ends yet, so we make the entire "remaining"
2036	 * part of the buffer "active".
2037	 */
2038	isc_buffer_setactive(&j->it.source,
2039			     j->it.source.used - j->it.source.current);
2040	CHECK(dns_name_fromwire(&j->it.name, &j->it.source, &j->it.dctx, 0,
2041				&j->it.target));
2042
2043	/*
2044	 * Check that the RR header is there, and parse it.
2045	 */
2046	if (isc_buffer_remaininglength(&j->it.source) < 10) {
2047		FAIL(DNS_R_FORMERR);
2048	}
2049
2050	rdtype = isc_buffer_getuint16(&j->it.source);
2051	rdclass = isc_buffer_getuint16(&j->it.source);
2052	ttl = isc_buffer_getuint32(&j->it.source);
2053	rdlen = isc_buffer_getuint16(&j->it.source);
2054
2055	if (rdlen > DNS_RDATA_MAXLENGTH) {
2056		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
2057			      "%s: journal corrupt: impossible rdlen "
2058			      "(%u bytes)",
2059			      j->filename, rdlen);
2060		FAIL(ISC_R_FAILURE);
2061	}
2062
2063	/*
2064	 * Parse the rdata.
2065	 */
2066	if (isc_buffer_remaininglength(&j->it.source) != rdlen) {
2067		FAIL(DNS_R_FORMERR);
2068	}
2069	isc_buffer_setactive(&j->it.source, rdlen);
2070	dns_rdata_reset(&j->it.rdata);
2071	CHECK(dns_rdata_fromwire(&j->it.rdata, rdclass, rdtype, &j->it.source,
2072				 &j->it.dctx, 0, &j->it.target));
2073	j->it.ttl = ttl;
2074
2075	j->it.xpos += sizeof(journal_rawrrhdr_t) + rrhdr.size;
2076	if (rdtype == dns_rdatatype_soa) {
2077		/* XXX could do additional consistency checks here */
2078		j->it.current_serial = dns_soa_getserial(&j->it.rdata);
2079	}
2080
2081	result = ISC_R_SUCCESS;
2082
2083failure:
2084	j->it.result = result;
2085	return (result);
2086}
2087
2088isc_result_t
2089dns_journal_next_rr(dns_journal_t *j) {
2090	j->it.result = read_one_rr(j);
2091	return (j->it.result);
2092}
2093
2094void
2095dns_journal_current_rr(dns_journal_t *j, dns_name_t **name, uint32_t *ttl,
2096		       dns_rdata_t **rdata) {
2097	REQUIRE(j->it.result == ISC_R_SUCCESS);
2098	*name = &j->it.name;
2099	*ttl = j->it.ttl;
2100	*rdata = &j->it.rdata;
2101}
2102
2103/**************************************************************************/
2104/*
2105 * Generating diffs from databases
2106 */
2107
2108/*
2109 * Construct a diff containing all the RRs at the current name of the
2110 * database iterator 'dbit' in database 'db', version 'ver'.
2111 * Set '*name' to the current name, and append the diff to 'diff'.
2112 * All new tuples will have the operation 'op'.
2113 *
2114 * Requires: 'name' must have buffer large enough to hold the name.
2115 * Typically, a dns_fixedname_t would be used.
2116 */
2117static isc_result_t
2118get_name_diff(dns_db_t *db, dns_dbversion_t *ver, isc_stdtime_t now,
2119	      dns_dbiterator_t *dbit, dns_name_t *name, dns_diffop_t op,
2120	      dns_diff_t *diff) {
2121	isc_result_t result;
2122	dns_dbnode_t *node = NULL;
2123	dns_rdatasetiter_t *rdsiter = NULL;
2124	dns_difftuple_t *tuple = NULL;
2125
2126	result = dns_dbiterator_current(dbit, &node, name);
2127	if (result != ISC_R_SUCCESS) {
2128		return (result);
2129	}
2130
2131	result = dns_db_allrdatasets(db, node, ver, 0, now, &rdsiter);
2132	if (result != ISC_R_SUCCESS) {
2133		goto cleanup_node;
2134	}
2135
2136	for (result = dns_rdatasetiter_first(rdsiter); result == ISC_R_SUCCESS;
2137	     result = dns_rdatasetiter_next(rdsiter))
2138	{
2139		dns_rdataset_t rdataset;
2140
2141		dns_rdataset_init(&rdataset);
2142		dns_rdatasetiter_current(rdsiter, &rdataset);
2143
2144		for (result = dns_rdataset_first(&rdataset);
2145		     result == ISC_R_SUCCESS;
2146		     result = dns_rdataset_next(&rdataset))
2147		{
2148			dns_rdata_t rdata = DNS_RDATA_INIT;
2149			dns_rdataset_current(&rdataset, &rdata);
2150			result = dns_difftuple_create(diff->mctx, op, name,
2151						      rdataset.ttl, &rdata,
2152						      &tuple);
2153			if (result != ISC_R_SUCCESS) {
2154				dns_rdataset_disassociate(&rdataset);
2155				goto cleanup_iterator;
2156			}
2157			dns_diff_append(diff, &tuple);
2158		}
2159		dns_rdataset_disassociate(&rdataset);
2160		if (result != ISC_R_NOMORE) {
2161			goto cleanup_iterator;
2162		}
2163	}
2164	if (result != ISC_R_NOMORE) {
2165		goto cleanup_iterator;
2166	}
2167
2168	result = ISC_R_SUCCESS;
2169
2170cleanup_iterator:
2171	dns_rdatasetiter_destroy(&rdsiter);
2172
2173cleanup_node:
2174	dns_db_detachnode(db, &node);
2175
2176	return (result);
2177}
2178
2179/*
2180 * Comparison function for use by dns_diff_subtract when sorting
2181 * the diffs to be subtracted.  The sort keys are the rdata type
2182 * and the rdata itself.  The owner name is ignored, because
2183 * it is known to be the same for all tuples.
2184 */
2185static int
2186rdata_order(const void *av, const void *bv) {
2187	dns_difftuple_t const *const *ap = av;
2188	dns_difftuple_t const *const *bp = bv;
2189	dns_difftuple_t const *a = *ap;
2190	dns_difftuple_t const *b = *bp;
2191	int r;
2192	r = (b->rdata.type - a->rdata.type);
2193	if (r != 0) {
2194		return (r);
2195	}
2196	r = dns_rdata_compare(&a->rdata, &b->rdata);
2197	return (r);
2198}
2199
2200static isc_result_t
2201dns_diff_subtract(dns_diff_t diff[2], dns_diff_t *r) {
2202	isc_result_t result;
2203	dns_difftuple_t *p[2];
2204	int i, t;
2205	bool append;
2206	dns_difftuplelist_t add, del;
2207
2208	CHECK(dns_diff_sort(&diff[0], rdata_order));
2209	CHECK(dns_diff_sort(&diff[1], rdata_order));
2210	ISC_LIST_INIT(add);
2211	ISC_LIST_INIT(del);
2212
2213	for (;;) {
2214		p[0] = ISC_LIST_HEAD(diff[0].tuples);
2215		p[1] = ISC_LIST_HEAD(diff[1].tuples);
2216		if (p[0] == NULL && p[1] == NULL) {
2217			break;
2218		}
2219
2220		for (i = 0; i < 2; i++) {
2221			if (p[!i] == NULL) {
2222				dns_difftuplelist_t *l = (i == 0) ? &add : &del;
2223				ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
2224				ISC_LIST_APPEND(*l, p[i], link);
2225				goto next;
2226			}
2227		}
2228		t = rdata_order(&p[0], &p[1]);
2229		if (t < 0) {
2230			ISC_LIST_UNLINK(diff[0].tuples, p[0], link);
2231			ISC_LIST_APPEND(add, p[0], link);
2232			goto next;
2233		}
2234		if (t > 0) {
2235			ISC_LIST_UNLINK(diff[1].tuples, p[1], link);
2236			ISC_LIST_APPEND(del, p[1], link);
2237			goto next;
2238		}
2239		INSIST(t == 0);
2240		/*
2241		 * Identical RRs in both databases; skip them both
2242		 * if the ttl differs.
2243		 */
2244		append = (p[0]->ttl != p[1]->ttl);
2245		for (i = 0; i < 2; i++) {
2246			ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
2247			if (append) {
2248				dns_difftuplelist_t *l = (i == 0) ? &add : &del;
2249				ISC_LIST_APPEND(*l, p[i], link);
2250			} else {
2251				dns_difftuple_free(&p[i]);
2252			}
2253		}
2254	next:;
2255	}
2256	ISC_LIST_APPENDLIST(r->tuples, del, link);
2257	ISC_LIST_APPENDLIST(r->tuples, add, link);
2258	result = ISC_R_SUCCESS;
2259failure:
2260	return (result);
2261}
2262
2263static isc_result_t
2264diff_namespace(dns_db_t *dba, dns_dbversion_t *dbvera, dns_db_t *dbb,
2265	       dns_dbversion_t *dbverb, unsigned int options,
2266	       dns_diff_t *resultdiff) {
2267	dns_db_t *db[2];
2268	dns_dbversion_t *ver[2];
2269	dns_dbiterator_t *dbit[2] = { NULL, NULL };
2270	bool have[2] = { false, false };
2271	dns_fixedname_t fixname[2];
2272	isc_result_t result, itresult[2];
2273	dns_diff_t diff[2];
2274	int i, t;
2275
2276	db[0] = dba, db[1] = dbb;
2277	ver[0] = dbvera, ver[1] = dbverb;
2278
2279	dns_diff_init(resultdiff->mctx, &diff[0]);
2280	dns_diff_init(resultdiff->mctx, &diff[1]);
2281
2282	dns_fixedname_init(&fixname[0]);
2283	dns_fixedname_init(&fixname[1]);
2284
2285	result = dns_db_createiterator(db[0], options, &dbit[0]);
2286	if (result != ISC_R_SUCCESS) {
2287		return (result);
2288	}
2289	result = dns_db_createiterator(db[1], options, &dbit[1]);
2290	if (result != ISC_R_SUCCESS) {
2291		goto cleanup_iterator;
2292	}
2293
2294	itresult[0] = dns_dbiterator_first(dbit[0]);
2295	itresult[1] = dns_dbiterator_first(dbit[1]);
2296
2297	for (;;) {
2298		for (i = 0; i < 2; i++) {
2299			if (!have[i] && itresult[i] == ISC_R_SUCCESS) {
2300				CHECK(get_name_diff(
2301					db[i], ver[i], 0, dbit[i],
2302					dns_fixedname_name(&fixname[i]),
2303					i == 0 ? DNS_DIFFOP_ADD
2304					       : DNS_DIFFOP_DEL,
2305					&diff[i]));
2306				itresult[i] = dns_dbiterator_next(dbit[i]);
2307				have[i] = true;
2308			}
2309		}
2310
2311		if (!have[0] && !have[1]) {
2312			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
2313			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2314			break;
2315		}
2316
2317		for (i = 0; i < 2; i++) {
2318			if (!have[!i]) {
2319				ISC_LIST_APPENDLIST(resultdiff->tuples,
2320						    diff[i].tuples, link);
2321				INSIST(ISC_LIST_EMPTY(diff[i].tuples));
2322				have[i] = false;
2323				goto next;
2324			}
2325		}
2326
2327		t = dns_name_compare(dns_fixedname_name(&fixname[0]),
2328				     dns_fixedname_name(&fixname[1]));
2329		if (t < 0) {
2330			ISC_LIST_APPENDLIST(resultdiff->tuples, diff[0].tuples,
2331					    link);
2332			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
2333			have[0] = false;
2334			continue;
2335		}
2336		if (t > 0) {
2337			ISC_LIST_APPENDLIST(resultdiff->tuples, diff[1].tuples,
2338					    link);
2339			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2340			have[1] = false;
2341			continue;
2342		}
2343		INSIST(t == 0);
2344		CHECK(dns_diff_subtract(diff, resultdiff));
2345		INSIST(ISC_LIST_EMPTY(diff[0].tuples));
2346		INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2347		have[0] = have[1] = false;
2348	next:;
2349	}
2350	if (itresult[0] != ISC_R_NOMORE) {
2351		FAIL(itresult[0]);
2352	}
2353	if (itresult[1] != ISC_R_NOMORE) {
2354		FAIL(itresult[1]);
2355	}
2356
2357	INSIST(ISC_LIST_EMPTY(diff[0].tuples));
2358	INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2359
2360failure:
2361	dns_dbiterator_destroy(&dbit[1]);
2362
2363cleanup_iterator:
2364	dns_dbiterator_destroy(&dbit[0]);
2365	dns_diff_clear(&diff[0]);
2366	dns_diff_clear(&diff[1]);
2367	return (result);
2368}
2369
2370/*
2371 * Compare the databases 'dba' and 'dbb' and generate a journal
2372 * entry containing the changes to make 'dba' from 'dbb' (note
2373 * the order).  This journal entry will consist of a single,
2374 * possibly very large transaction.
2375 */
2376isc_result_t
2377dns_db_diff(isc_mem_t *mctx, dns_db_t *dba, dns_dbversion_t *dbvera,
2378	    dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename) {
2379	isc_result_t result;
2380	dns_diff_t diff;
2381
2382	dns_diff_init(mctx, &diff);
2383
2384	result = dns_db_diffx(&diff, dba, dbvera, dbb, dbverb, filename);
2385
2386	dns_diff_clear(&diff);
2387
2388	return (result);
2389}
2390
2391isc_result_t
2392dns_db_diffx(dns_diff_t *diff, dns_db_t *dba, dns_dbversion_t *dbvera,
2393	     dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename) {
2394	isc_result_t result;
2395	dns_journal_t *journal = NULL;
2396
2397	if (filename != NULL) {
2398		result = dns_journal_open(diff->mctx, filename,
2399					  DNS_JOURNAL_CREATE, &journal);
2400		if (result != ISC_R_SUCCESS) {
2401			return (result);
2402		}
2403	}
2404
2405	CHECK(diff_namespace(dba, dbvera, dbb, dbverb, DNS_DB_NONSEC3, diff));
2406	CHECK(diff_namespace(dba, dbvera, dbb, dbverb, DNS_DB_NSEC3ONLY, diff));
2407
2408	if (journal != NULL) {
2409		if (ISC_LIST_EMPTY(diff->tuples)) {
2410			isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no changes");
2411		} else {
2412			CHECK(dns_journal_write_transaction(journal, diff));
2413		}
2414	}
2415
2416failure:
2417	if (journal != NULL) {
2418		dns_journal_destroy(&journal);
2419	}
2420	return (result);
2421}
2422
2423static uint32_t
2424rrcount(unsigned char *buf, unsigned int size) {
2425	isc_buffer_t b;
2426	uint32_t rrsize, count = 0;
2427
2428	isc_buffer_init(&b, buf, size);
2429	isc_buffer_add(&b, size);
2430	while (isc_buffer_remaininglength(&b) > 0) {
2431		rrsize = isc_buffer_getuint32(&b);
2432		INSIST(isc_buffer_remaininglength(&b) >= rrsize);
2433		isc_buffer_forward(&b, rrsize);
2434		count++;
2435	}
2436
2437	return (count);
2438}
2439
2440static bool
2441check_delta(unsigned char *buf, size_t size) {
2442	isc_buffer_t b;
2443	uint32_t rrsize;
2444
2445	isc_buffer_init(&b, buf, size);
2446	isc_buffer_add(&b, size);
2447	while (isc_buffer_remaininglength(&b) > 0) {
2448		if (isc_buffer_remaininglength(&b) < 4) {
2449			return (false);
2450		}
2451		rrsize = isc_buffer_getuint32(&b);
2452		/* "." + type + class + ttl + rdlen => 11U */
2453		if (rrsize < 11U || isc_buffer_remaininglength(&b) < rrsize) {
2454			return (false);
2455		}
2456		isc_buffer_forward(&b, rrsize);
2457	}
2458
2459	return (true);
2460}
2461
2462isc_result_t
2463dns_journal_compact(isc_mem_t *mctx, char *filename, uint32_t serial,
2464		    uint32_t flags, uint32_t target_size) {
2465	unsigned int i;
2466	journal_pos_t best_guess;
2467	journal_pos_t current_pos;
2468	dns_journal_t *j1 = NULL;
2469	dns_journal_t *j2 = NULL;
2470	journal_rawheader_t rawheader;
2471	unsigned int len;
2472	size_t namelen;
2473	unsigned char *buf = NULL;
2474	unsigned int size = 0;
2475	isc_result_t result;
2476	unsigned int indexend;
2477	char newname[PATH_MAX];
2478	char backup[PATH_MAX];
2479	bool is_backup = false;
2480	bool rewrite = false;
2481	bool downgrade = false;
2482
2483	REQUIRE(filename != NULL);
2484
2485	namelen = strlen(filename);
2486	if (namelen > 4U && strcmp(filename + namelen - 4, ".jnl") == 0) {
2487		namelen -= 4;
2488	}
2489
2490	result = snprintf(newname, sizeof(newname), "%.*s.jnw", (int)namelen,
2491			  filename);
2492	RUNTIME_CHECK(result < sizeof(newname));
2493
2494	result = snprintf(backup, sizeof(backup), "%.*s.jbk", (int)namelen,
2495			  filename);
2496	RUNTIME_CHECK(result < sizeof(backup));
2497
2498	result = journal_open(mctx, filename, false, false, false, &j1);
2499	if (result == ISC_R_NOTFOUND) {
2500		is_backup = true;
2501		result = journal_open(mctx, backup, false, false, false, &j1);
2502	}
2503	if (result != ISC_R_SUCCESS) {
2504		return (result);
2505	}
2506
2507	/*
2508	 * Always perform a re-write when processing a version 1 journal.
2509	 */
2510	rewrite = j1->header_ver1;
2511
2512	/*
2513	 * Check whether we need to rewrite the whole journal
2514	 * file (for example, to upversion it).
2515	 */
2516	if ((flags & DNS_JOURNAL_COMPACTALL) != 0) {
2517		if ((flags & DNS_JOURNAL_VERSION1) != 0) {
2518			downgrade = true;
2519		}
2520		rewrite = true;
2521		serial = dns_journal_first_serial(j1);
2522	} else if (JOURNAL_EMPTY(&j1->header)) {
2523		dns_journal_destroy(&j1);
2524		return (ISC_R_SUCCESS);
2525	}
2526
2527	if (DNS_SERIAL_GT(j1->header.begin.serial, serial) ||
2528	    DNS_SERIAL_GT(serial, j1->header.end.serial))
2529	{
2530		dns_journal_destroy(&j1);
2531		return (ISC_R_RANGE);
2532	}
2533
2534	/*
2535	 * Cope with very small target sizes.
2536	 */
2537	indexend = sizeof(journal_rawheader_t) +
2538		   j1->header.index_size * sizeof(journal_rawpos_t);
2539	if (target_size < DNS_JOURNAL_SIZE_MIN) {
2540		target_size = DNS_JOURNAL_SIZE_MIN;
2541	}
2542	if (target_size < indexend * 2) {
2543		target_size = target_size / 2 + indexend;
2544	}
2545
2546	/*
2547	 * See if there is any work to do.
2548	 */
2549	if (!rewrite && (uint32_t)j1->header.end.offset < target_size) {
2550		dns_journal_destroy(&j1);
2551		return (ISC_R_SUCCESS);
2552	}
2553
2554	CHECK(journal_open(mctx, newname, true, true, downgrade, &j2));
2555	CHECK(journal_seek(j2, indexend));
2556
2557	/*
2558	 * Remove overhead so space test below can succeed.
2559	 */
2560	if (target_size >= indexend) {
2561		target_size -= indexend;
2562	}
2563
2564	/*
2565	 * Find if we can create enough free space.
2566	 */
2567	best_guess = j1->header.begin;
2568	for (i = 0; i < j1->header.index_size; i++) {
2569		if (POS_VALID(j1->index[i]) &&
2570		    DNS_SERIAL_GE(serial, j1->index[i].serial) &&
2571		    ((uint32_t)(j1->header.end.offset - j1->index[i].offset) >=
2572		     target_size / 2) &&
2573		    j1->index[i].offset > best_guess.offset)
2574		{
2575			best_guess = j1->index[i];
2576		}
2577	}
2578
2579	current_pos = best_guess;
2580	while (current_pos.serial != serial) {
2581		CHECK(journal_next(j1, &current_pos));
2582		if (current_pos.serial == j1->header.end.serial) {
2583			break;
2584		}
2585
2586		if (DNS_SERIAL_GE(serial, current_pos.serial) &&
2587		    ((uint32_t)(j1->header.end.offset - current_pos.offset) >=
2588		     (target_size / 2)) &&
2589		    current_pos.offset > best_guess.offset)
2590		{
2591			best_guess = current_pos;
2592		} else {
2593			break;
2594		}
2595	}
2596
2597	INSIST(best_guess.serial != j1->header.end.serial);
2598	if (best_guess.serial != serial) {
2599		CHECK(journal_next(j1, &best_guess));
2600		serial = best_guess.serial;
2601	}
2602
2603	/*
2604	 * We should now be roughly half target_size provided
2605	 * we did not reach 'serial'.  If not we will just copy
2606	 * all uncommitted deltas regardless of the size.
2607	 */
2608	len = j1->header.end.offset - best_guess.offset;
2609	if (len != 0) {
2610		CHECK(journal_seek(j1, best_guess.offset));
2611
2612		/* Prepare new header */
2613		j2->header.begin.serial = best_guess.serial;
2614		j2->header.begin.offset = indexend;
2615		j2->header.sourceserial = j1->header.sourceserial;
2616		j2->header.serialset = j1->header.serialset;
2617		j2->header.end.serial = j1->header.end.serial;
2618
2619		/*
2620		 * Only use this method if we're rewriting the
2621		 * journal to fix outdated transaction headers;
2622		 * otherwise we'll copy the whole journal without
2623		 * parsing individual deltas below.
2624		 */
2625		while (rewrite && len > 0) {
2626			journal_xhdr_t xhdr;
2627			isc_offset_t offset = j1->offset;
2628			uint32_t count;
2629
2630			result = journal_read_xhdr(j1, &xhdr);
2631			if (rewrite && result == ISC_R_NOMORE) {
2632				break;
2633			}
2634			CHECK(result);
2635
2636			size = xhdr.size;
2637			if (size > len) {
2638				isc_log_write(JOURNAL_COMMON_LOGARGS,
2639					      ISC_LOG_ERROR,
2640					      "%s: journal file corrupt, "
2641					      "transaction too large",
2642					      j1->filename);
2643				CHECK(ISC_R_FAILURE);
2644			}
2645			buf = isc_mem_get(mctx, size);
2646			result = journal_read(j1, buf, size);
2647
2648			/*
2649			 * If we're repairing an outdated journal, the
2650			 * xhdr format may be wrong.
2651			 */
2652			if (rewrite && (result != ISC_R_SUCCESS ||
2653					!check_delta(buf, size)))
2654			{
2655				if (j1->xhdr_version == XHDR_VERSION2) {
2656					/* XHDR_VERSION2 -> XHDR_VERSION1 */
2657					j1->xhdr_version = XHDR_VERSION1;
2658					CHECK(journal_seek(j1, offset));
2659					CHECK(journal_read_xhdr(j1, &xhdr));
2660				} else if (j1->xhdr_version == XHDR_VERSION1) {
2661					/* XHDR_VERSION1 -> XHDR_VERSION2 */
2662					j1->xhdr_version = XHDR_VERSION2;
2663					CHECK(journal_seek(j1, offset));
2664					CHECK(journal_read_xhdr(j1, &xhdr));
2665				}
2666
2667				/* Check again */
2668				isc_mem_put(mctx, buf, size);
2669				size = xhdr.size;
2670				if (size > len) {
2671					isc_log_write(
2672						JOURNAL_COMMON_LOGARGS,
2673						ISC_LOG_ERROR,
2674						"%s: journal file corrupt, "
2675						"transaction too large",
2676						j1->filename);
2677					CHECK(ISC_R_FAILURE);
2678				}
2679				buf = isc_mem_get(mctx, size);
2680				CHECK(journal_read(j1, buf, size));
2681
2682				if (!check_delta(buf, size)) {
2683					CHECK(ISC_R_UNEXPECTED);
2684				}
2685			} else {
2686				CHECK(result);
2687			}
2688
2689			/*
2690			 * Recover from incorrectly written transaction header.
2691			 * The incorrect header was written as size, serial0,
2692			 * serial1, and 0.  XHDR_VERSION2 is expecting size,
2693			 * count, serial0, and serial1.
2694			 */
2695			if (j1->xhdr_version == XHDR_VERSION2 &&
2696			    xhdr.count == serial && xhdr.serial1 == 0U &&
2697			    isc_serial_gt(xhdr.serial0, xhdr.count))
2698			{
2699				xhdr.serial1 = xhdr.serial0;
2700				xhdr.serial0 = xhdr.count;
2701				xhdr.count = 0;
2702			}
2703
2704			/*
2705			 * Check that xhdr is consistent.
2706			 */
2707			if (xhdr.serial0 != serial ||
2708			    isc_serial_le(xhdr.serial1, xhdr.serial0))
2709			{
2710				CHECK(ISC_R_UNEXPECTED);
2711			}
2712
2713			/*
2714			 * Extract record count from the transaction.  This
2715			 * is needed when converting from XHDR_VERSION1 to
2716			 * XHDR_VERSION2, and when recovering from an
2717			 * incorrectly written XHDR_VERSION2.
2718			 */
2719			count = rrcount(buf, size);
2720			CHECK(journal_write_xhdr(j2, xhdr.size, count,
2721						 xhdr.serial0, xhdr.serial1));
2722			CHECK(journal_write(j2, buf, size));
2723
2724			j2->header.end.offset = j2->offset;
2725
2726			serial = xhdr.serial1;
2727
2728			len = j1->header.end.offset - j1->offset;
2729			isc_mem_put(mctx, buf, size);
2730		}
2731
2732		/*
2733		 * If we're not rewriting transaction headers, we can use
2734		 * this faster method instead.
2735		 */
2736		if (!rewrite) {
2737			size = ISC_MIN(64 * 1024, len);
2738			buf = isc_mem_get(mctx, size);
2739			for (i = 0; i < len; i += size) {
2740				unsigned int blob = ISC_MIN(size, len - i);
2741				CHECK(journal_read(j1, buf, blob));
2742				CHECK(journal_write(j2, buf, blob));
2743			}
2744
2745			j2->header.end.offset = indexend + len;
2746		}
2747
2748		CHECK(journal_fsync(j2));
2749
2750		/*
2751		 * Update the journal header.
2752		 */
2753		journal_header_encode(&j2->header, &rawheader);
2754		CHECK(journal_seek(j2, 0));
2755		CHECK(journal_write(j2, &rawheader, sizeof(rawheader)));
2756		CHECK(journal_fsync(j2));
2757
2758		/*
2759		 * Build new index.
2760		 */
2761		current_pos = j2->header.begin;
2762		while (current_pos.serial != j2->header.end.serial) {
2763			index_add(j2, &current_pos);
2764			CHECK(journal_next(j2, &current_pos));
2765		}
2766
2767		/*
2768		 * Write index.
2769		 */
2770		CHECK(index_to_disk(j2));
2771		CHECK(journal_fsync(j2));
2772
2773		indexend = j2->header.end.offset;
2774		POST(indexend);
2775	}
2776
2777	/*
2778	 * Close both journals before trying to rename files (this is
2779	 * necessary on WIN32).
2780	 */
2781	dns_journal_destroy(&j1);
2782	dns_journal_destroy(&j2);
2783
2784	/*
2785	 * With a UFS file system this should just succeed and be atomic.
2786	 * Any IXFR outs will just continue and the old journal will be
2787	 * removed on final close.
2788	 *
2789	 * With MSDOS / NTFS we need to do a two stage rename, triggered
2790	 * by EEXIST.  (If any IXFR's are running in other threads, however,
2791	 * this will fail, and the journal will not be compacted.  But
2792	 * if so, hopefully they'll be finished by the next time we
2793	 * compact.)
2794	 */
2795	if (rename(newname, filename) == -1) {
2796		if (errno == EEXIST && !is_backup) {
2797			result = isc_file_remove(backup);
2798			if (result != ISC_R_SUCCESS &&
2799			    result != ISC_R_FILENOTFOUND)
2800			{
2801				goto failure;
2802			}
2803			if (rename(filename, backup) == -1) {
2804				goto maperrno;
2805			}
2806			if (rename(newname, filename) == -1) {
2807				goto maperrno;
2808			}
2809			(void)isc_file_remove(backup);
2810		} else {
2811		maperrno:
2812			result = ISC_R_FAILURE;
2813			goto failure;
2814		}
2815	}
2816
2817	result = ISC_R_SUCCESS;
2818
2819failure:
2820	(void)isc_file_remove(newname);
2821	if (buf != NULL) {
2822		isc_mem_put(mctx, buf, size);
2823	}
2824	if (j1 != NULL) {
2825		dns_journal_destroy(&j1);
2826	}
2827	if (j2 != NULL) {
2828		dns_journal_destroy(&j2);
2829	}
2830	return (result);
2831}
2832
2833static isc_result_t
2834index_to_disk(dns_journal_t *j) {
2835	isc_result_t result = ISC_R_SUCCESS;
2836
2837	if (j->header.index_size != 0) {
2838		unsigned int i;
2839		unsigned char *p;
2840		unsigned int rawbytes;
2841
2842		rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
2843
2844		p = j->rawindex;
2845		for (i = 0; i < j->header.index_size; i++) {
2846			encode_uint32(j->index[i].serial, p);
2847			p += 4;
2848			encode_uint32(j->index[i].offset, p);
2849			p += 4;
2850		}
2851		INSIST(p == j->rawindex + rawbytes);
2852
2853		CHECK(journal_seek(j, sizeof(journal_rawheader_t)));
2854		CHECK(journal_write(j, j->rawindex, rawbytes));
2855	}
2856failure:
2857	return (result);
2858}
2859