journal.c revision 170222
1/*
2 * Copyright (C) 2004, 2005  Internet Systems Consortium, Inc. ("ISC")
3 * Copyright (C) 1999-2002  Internet Software Consortium.
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15 * PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/* $Id: journal.c,v 1.86.18.8 2005/11/03 23:02:23 marka Exp $ */
19
20#include <config.h>
21
22#include <stdlib.h>
23#include <unistd.h>
24
25#include <isc/file.h>
26#include <isc/mem.h>
27#include <isc/stdio.h>
28#include <isc/string.h>
29#include <isc/util.h>
30
31#include <dns/compress.h>
32#include <dns/db.h>
33#include <dns/dbiterator.h>
34#include <dns/diff.h>
35#include <dns/fixedname.h>
36#include <dns/journal.h>
37#include <dns/log.h>
38#include <dns/rdataset.h>
39#include <dns/rdatasetiter.h>
40#include <dns/result.h>
41#include <dns/soa.h>
42
43/*! \file
44 * \brief Journalling.
45 *
46 * A journal file consists of
47 *
48 *   \li A fixed-size header of type journal_rawheader_t.
49 *
50 *   \li The index.  This is an unordered array of index entries
51 *     of type journal_rawpos_t giving the locations
52 *     of some arbitrary subset of the journal's addressable
53 *     transactions.  The index entries are used as hints to
54 *     speed up the process of locating a transaction with a given
55 *     serial number.  Unused index entries have an "offset"
56 *     field of zero.  The size of the index can vary between
57 *     journal files, but does not change during the lifetime
58 *     of a file.  The size can be zero.
59 *
60 *   \li The journal data.  This  consists of one or more transactions.
61 *     Each transaction begins with a transaction header of type
62 *     journal_rawxhdr_t.  The transaction header is followed by a
63 *     sequence of RRs, similar in structure to an IXFR difference
64 *     sequence (RFC1995).  That is, the pre-transaction SOA,
65 *     zero or more other deleted RRs, the post-transaction SOA,
66 *     and zero or more other added RRs.  Unlike in IXFR, each RR
67 *     is prefixed with a 32-bit length.
68 *
69 *     The journal data part grows as new transactions are
70 *     appended to the file.  Only those transactions
71 *     whose serial number is current-(2^31-1) to current
72 *     are considered "addressable" and may be pointed
73 *     to from the header or index.  They may be preceded
74 *     by old transactions that are no longer addressable,
75 *     and they may be followed by transactions that were
76 *     appended to the journal but never committed by updating
77 *     the "end" position in the header.  The latter will
78 *     be overwritten when new transactions are added.
79 */
/*%
 * When true, accept IXFR difference sequences where the
 * SOA serial number does not change (BIND 8 sends such
 * sequences).  dns_journal_commit() consults this flag when it
 * checks that a transaction's ending serial is not older than its
 * starting serial.
 */
static isc_boolean_t bind8_compat = ISC_TRUE; /* XXX config */
86
87/**************************************************************************/
88/*
89 * Miscellaneous utilities.
90 */
91
/*%
 * Leading arguments shared by the isc_log_write() calls in this file:
 * the log context, the log category and the log module.
 */
#define JOURNAL_COMMON_LOGARGS \
	dns_lctx, DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL

/*%
 * JOURNAL_COMMON_LOGARGS followed by a debug log level of 'n'.
 */
#define JOURNAL_DEBUG_LOGARGS(n) \
	JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(n)
97
/*%
 * FAIL(code): store 'code' in the local variable 'result' and jump to
 * the 'failure' label of the enclosing function.
 *
 * It would be nonsensical (or at least obtuse) to use FAIL() with an
 * ISC_R_SUCCESS code, but the test is there to keep the Solaris compiler
 * from complaining about "end-of-loop code not reached".
 *
 * The enclosing function must declare 'isc_result_t result' and
 * provide a 'failure:' label.
 */
#define FAIL(code) \
	do { result = (code);					\
		if (result != ISC_R_SUCCESS) goto failure;	\
	} while (0)

/*%
 * CHECK(op): evaluate 'op' once, store its value in the local variable
 * 'result', and jump to the 'failure' label of the enclosing function
 * unless the value is ISC_R_SUCCESS.  Same requirements as FAIL().
 */
#define CHECK(op) \
	do { result = (op); 					\
		if (result != ISC_R_SUCCESS) goto failure; 	\
	} while (0)
112
/* Forward declaration; the function is defined further down in this file. */
static isc_result_t index_to_disk(dns_journal_t *);
114
115static inline isc_uint32_t
116decode_uint32(unsigned char *p) {
117	return ((p[0] << 24) +
118		(p[1] << 16) +
119		(p[2] <<  8) +
120		(p[3] <<  0));
121}
122
123static inline void
124encode_uint32(isc_uint32_t val, unsigned char *p) {
125	p[0] = (isc_uint8_t)(val >> 24);
126	p[1] = (isc_uint8_t)(val >> 16);
127	p[2] = (isc_uint8_t)(val >>  8);
128	p[3] = (isc_uint8_t)(val >>  0);
129}
130
131isc_result_t
132dns_db_createsoatuple(dns_db_t *db, dns_dbversion_t *ver, isc_mem_t *mctx,
133		      dns_diffop_t op, dns_difftuple_t **tp)
134{
135	isc_result_t result;
136	dns_dbnode_t *node;
137	dns_rdataset_t rdataset;
138	dns_rdata_t rdata = DNS_RDATA_INIT;
139	dns_name_t *zonename;
140
141	zonename = dns_db_origin(db);
142
143	node = NULL;
144	result = dns_db_findnode(db, zonename, ISC_FALSE, &node);
145	if (result != ISC_R_SUCCESS)
146		goto nonode;
147
148	dns_rdataset_init(&rdataset);
149	result = dns_db_findrdataset(db, node, ver, dns_rdatatype_soa, 0,
150				     (isc_stdtime_t)0, &rdataset, NULL);
151 	if (result != ISC_R_SUCCESS)
152		goto freenode;
153
154	result = dns_rdataset_first(&rdataset);
155 	if (result != ISC_R_SUCCESS)
156		goto freenode;
157
158	dns_rdataset_current(&rdataset, &rdata);
159
160	result = dns_difftuple_create(mctx, op, zonename, rdataset.ttl,
161				      &rdata, tp);
162
163	dns_rdataset_disassociate(&rdataset);
164	dns_db_detachnode(db, &node);
165	return (ISC_R_SUCCESS);
166
167 freenode:
168	dns_db_detachnode(db, &node);
169 nonode:
170	UNEXPECTED_ERROR(__FILE__, __LINE__, "missing SOA");
171	return (result);
172}
173
174/* Journalling */
175
/*%
 * On-disk representation of a "pointer" to a journal entry.
 * These are used in the journal header to locate the beginning
 * and end of the journal, and in the journal index to locate
 * other transactions.
 *
 * Both fields are 32-bit big-endian integers; see
 * journal_pos_decode() and journal_pos_encode().
 */
typedef struct {
	unsigned char	serial[4];  /*%< SOA serial before update. */
	/*
	 * XXXRTH  Should offset be 8 bytes?
	 * XXXDCL ... probably, since isc_offset_t is 8 bytes on many OSs.
	 * XXXAG  ... but we will not be able to seek >2G anyway on many
	 *            platforms as long as we are using fseek() rather
	 *            than lseek().
	 */
	unsigned char	offset[4];  /*%< Offset from beginning of file. */
} journal_rawpos_t;
193
194
/*%
 * The header is of a fixed size, with some spare room for future
 * extensions.  journal_file_create() asserts that
 * sizeof(journal_rawheader_t) equals this value.
 */
#define JOURNAL_HEADER_SIZE 64 /* Bytes. */
200
/*%
 * The on-disk representation of the journal header.
 * All numbers are stored in big-endian order.
 *
 * The union with 'pad' forces the type to occupy at least
 * JOURNAL_HEADER_SIZE bytes; the fields in 'h' use the leading part
 * and the remainder is spare room.
 */
typedef union {
	struct {
		/*% File format version ID. */
		unsigned char 		format[16];
		/*% Position of the first addressable transaction */
		journal_rawpos_t 	begin;
		/*% Position of the next (yet nonexistent) transaction. */
		journal_rawpos_t 	end;
		/*% Number of index entries following the header. */
		unsigned char 		index_size[4];
	} h;
	/* Pad the header to a fixed size. */
	unsigned char pad[JOURNAL_HEADER_SIZE];
} journal_rawheader_t;
219
/*%
 * The on-disk representation of the transaction header.
 * There is one of these at the beginning of each transaction.
 * Each field is a 32-bit big-endian integer (see journal_read_xhdr()
 * and journal_write_xhdr()).
 */
typedef struct {
	unsigned char	size[4]; 	/*%< In bytes, excluding header. */
	unsigned char	serial0[4];	/*%< SOA serial before update. */
	unsigned char	serial1[4];	/*%< SOA serial after update. */
} journal_rawxhdr_t;
229
/*%
 * The on-disk representation of the RR header.
 * There is one of these at the beginning of each RR.
 * The size is a 32-bit big-endian integer (see journal_read_rrhdr()).
 */
typedef struct {
	unsigned char	size[4]; 	/*%< In bytes, excluding header. */
} journal_rawrrhdr_t;
237
/*%
 * The in-core representation of a journal position, i.e. the decoded
 * form of journal_rawpos_t: a serial number and the file offset of
 * the transaction it refers to.
 */
typedef struct {
	isc_uint32_t	serial;
	isc_offset_t	offset;
} journal_pos_t;

/*% A position is considered valid (in use) when its offset is nonzero. */
#define POS_VALID(pos) 		((pos).offset != 0)
/*% Mark a position as unused by zeroing both its offset and serial. */
#define POS_INVALIDATE(pos) 	((pos).offset = 0, (pos).serial = 0)
248
/*%
 * The in-core representation of the journal header, i.e. the decoded
 * form of journal_rawheader_t (see journal_header_decode() and
 * journal_header_encode()).
 */
typedef struct {
	unsigned char 	format[16];	/*%< File format version ID. */
	journal_pos_t 	begin;		/*%< First addressable transaction. */
	journal_pos_t 	end;		/*%< Next (yet nonexistent) transaction. */
	isc_uint32_t	index_size;	/*%< Number of index entries. */
} journal_header_t;
255
/*%
 * The in-core representation of the transaction header, i.e. the
 * decoded form of journal_rawxhdr_t.
 */

typedef struct {
	isc_uint32_t	size;		/*%< In bytes, excluding header. */
	isc_uint32_t	serial0;	/*%< SOA serial before update. */
	isc_uint32_t	serial1;	/*%< SOA serial after update. */
} journal_xhdr_t;
265
/*%
 * The in-core representation of the RR header, i.e. the decoded form
 * of journal_rawrrhdr_t.
 */
typedef struct {
	isc_uint32_t	size;		/*%< In bytes, excluding header. */
} journal_rrhdr_t;
272
273
/*%
 * Initial contents to store in the header of a newly created
 * journal file.
 *
 * The header starts with the magic string ";BIND LOG V9\n"
 * to identify the file as a BIND 9 journal file.  An ASCII
 * identification string is used rather than a binary magic
 * number to be consistent with BIND 8 (BIND 8 journal files
 * are ASCII text files).
 *
 * The string is shorter than the 16-byte 'format' field, so the
 * remaining bytes are zero.  journal_open() compares all 16 bytes
 * against the header read from disk.
 */

static journal_header_t
initial_journal_header = { ";BIND LOG V9\n", { 0, 0 }, { 0, 0 }, 0 };

/*% A journal holds no transactions when its begin and end offsets match. */
#define JOURNAL_EMPTY(h) ((h)->begin.offset == (h)->end.offset)
289
/*%
 * State of an open journal.
 *
 * journal_open() starts in JOURNAL_STATE_INVALID and, on success,
 * switches to JOURNAL_STATE_WRITE or JOURNAL_STATE_READ depending on
 * its 'write' argument.  dns_journal_begin_transaction() requires
 * JOURNAL_STATE_WRITE and moves to JOURNAL_STATE_TRANSACTION, which
 * dns_journal_writediff() and dns_journal_commit() require.
 */
typedef enum {
	JOURNAL_STATE_INVALID,
	JOURNAL_STATE_READ,
	JOURNAL_STATE_WRITE,
	JOURNAL_STATE_TRANSACTION
} journal_state_t;
296
/*%
 * An open journal file together with its decoded header, its index
 * and the state needed while writing a transaction or iterating over
 * stored transactions.
 *
 * Note that journal_open() stores the 'filename' pointer it is given
 * without copying the string.
 */
struct dns_journal {
	unsigned int		magic;		/*%< JOUR */
	isc_mem_t		*mctx;		/*%< Memory context */
	journal_state_t		state;
	const char 		*filename;	/*%< Journal file name */
	FILE *			fp;		/*%< File handle */
	isc_offset_t		offset;		/*%< Current file offset */
	journal_header_t 	header;		/*%< In-core journal header */
	unsigned char		*rawindex;	/*%< In-core buffer for journal index in on-disk format */
	journal_pos_t		*index;		/*%< In-core journal index */

	/*% Current transaction state (when writing). */
	struct {
		unsigned int	n_soa;		/*%< Number of SOAs seen */
		journal_pos_t	pos[2];		/*%< Begin/end position */
	} x;

	/*% Iteration state (when reading). */
	struct {
		/* These define the part of the journal we iterate over. */
		journal_pos_t bpos;		/*%< Position before first, */
		journal_pos_t epos;		/*%< and after last transaction */
		/* The rest is iterator state. */
		isc_uint32_t current_serial;	/*%< Current SOA serial */
		isc_buffer_t source;		/*%< Data from disk */
		isc_buffer_t target;		/*%< Data from _fromwire check */
		dns_decompress_t dctx;		/*%< Dummy decompression ctx */
		dns_name_t name;		/*%< Current domain name */
		dns_rdata_t rdata;		/*%< Current rdata */
		isc_uint32_t ttl;		/*%< Current TTL */
		unsigned int xsize;		/*%< Size of transaction data */
		unsigned int xpos;		/*%< Current position in it */
		isc_result_t result;		/*%< Result of last call */
	} it;
};

/*% Magic number stored in dns_journal.magic while the journal is usable. */
#define DNS_JOURNAL_MAGIC	ISC_MAGIC('J', 'O', 'U', 'R')
/*% True if 't' points to a journal carrying DNS_JOURNAL_MAGIC. */
#define DNS_JOURNAL_VALID(t)	ISC_MAGIC_VALID(t, DNS_JOURNAL_MAGIC)
335
336static void
337journal_pos_decode(journal_rawpos_t *raw, journal_pos_t *cooked) {
338	cooked->serial = decode_uint32(raw->serial);
339	cooked->offset = decode_uint32(raw->offset);
340}
341
342static void
343journal_pos_encode(journal_rawpos_t *raw, journal_pos_t *cooked) {
344	encode_uint32(cooked->serial, raw->serial);
345	encode_uint32(cooked->offset, raw->offset);
346}
347
348static void
349journal_header_decode(journal_rawheader_t *raw, journal_header_t *cooked) {
350	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
351	memcpy(cooked->format, raw->h.format, sizeof(cooked->format));
352	journal_pos_decode(&raw->h.begin, &cooked->begin);
353	journal_pos_decode(&raw->h.end, &cooked->end);
354	cooked->index_size = decode_uint32(raw->h.index_size);
355}
356
357static void
358journal_header_encode(journal_header_t *cooked, journal_rawheader_t *raw) {
359	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
360	memset(raw->pad, 0, sizeof(raw->pad));
361	memcpy(raw->h.format, cooked->format, sizeof(raw->h.format));
362	journal_pos_encode(&raw->h.begin, &cooked->begin);
363	journal_pos_encode(&raw->h.end, &cooked->end);
364	encode_uint32(cooked->index_size, raw->h.index_size);
365}
366
367/*
368 * Journal file I/O subroutines, with error checking and reporting.
369 */
370static isc_result_t
371journal_seek(dns_journal_t *j, isc_uint32_t offset) {
372	isc_result_t result;
373	result = isc_stdio_seek(j->fp, (long)offset, SEEK_SET);
374	if (result != ISC_R_SUCCESS) {
375		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
376			      "%s: seek: %s", j->filename,
377			      isc_result_totext(result));
378		return (ISC_R_UNEXPECTED);
379	}
380	j->offset = offset;
381	return (ISC_R_SUCCESS);
382}
383
384static isc_result_t
385journal_read(dns_journal_t *j, void *mem, size_t nbytes) {
386	isc_result_t result;
387
388	result = isc_stdio_read(mem, 1, nbytes, j->fp, NULL);
389	if (result != ISC_R_SUCCESS) {
390		if (result == ISC_R_EOF)
391			return (ISC_R_NOMORE);
392		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
393			      "%s: read: %s",
394			      j->filename, isc_result_totext(result));
395		return (ISC_R_UNEXPECTED);
396	}
397	j->offset += nbytes;
398	return (ISC_R_SUCCESS);
399}
400
401static isc_result_t
402journal_write(dns_journal_t *j, void *mem, size_t nbytes) {
403	isc_result_t result;
404
405	result = isc_stdio_write(mem, 1, nbytes, j->fp, NULL);
406	if (result != ISC_R_SUCCESS) {
407		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
408			      "%s: write: %s",
409			      j->filename, isc_result_totext(result));
410		return (ISC_R_UNEXPECTED);
411	}
412	j->offset += nbytes;
413	return (ISC_R_SUCCESS);
414}
415
416static isc_result_t
417journal_fsync(dns_journal_t *j) {
418	isc_result_t result;
419	result = isc_stdio_flush(j->fp);
420	if (result != ISC_R_SUCCESS) {
421		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
422			      "%s: flush: %s",
423			      j->filename, isc_result_totext(result));
424		return (ISC_R_UNEXPECTED);
425	}
426	result = isc_stdio_sync(j->fp);
427	if (result != ISC_R_SUCCESS) {
428		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
429			      "%s: fsync: %s",
430			      j->filename, isc_result_totext(result));
431		return (ISC_R_UNEXPECTED);
432	}
433	return (ISC_R_SUCCESS);
434}
435
436/*
437 * Read/write a transaction header at the current file position.
438 */
439
440static isc_result_t
441journal_read_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr) {
442	journal_rawxhdr_t raw;
443	isc_result_t result;
444	result = journal_read(j, &raw, sizeof(raw));
445	if (result != ISC_R_SUCCESS)
446		return (result);
447	xhdr->size = decode_uint32(raw.size);
448	xhdr->serial0 = decode_uint32(raw.serial0);
449	xhdr->serial1 = decode_uint32(raw.serial1);
450	return (ISC_R_SUCCESS);
451}
452
453static isc_result_t
454journal_write_xhdr(dns_journal_t *j, isc_uint32_t size,
455		   isc_uint32_t serial0, isc_uint32_t serial1)
456{
457	journal_rawxhdr_t raw;
458	encode_uint32(size, raw.size);
459	encode_uint32(serial0, raw.serial0);
460	encode_uint32(serial1, raw.serial1);
461	return (journal_write(j, &raw, sizeof(raw)));
462}
463
464
465/*
466 * Read an RR header at the current file position.
467 */
468
469static isc_result_t
470journal_read_rrhdr(dns_journal_t *j, journal_rrhdr_t *rrhdr) {
471	journal_rawrrhdr_t raw;
472	isc_result_t result;
473	result = journal_read(j, &raw, sizeof(raw));
474	if (result != ISC_R_SUCCESS)
475		return (result);
476	rrhdr->size = decode_uint32(raw.size);
477	return (ISC_R_SUCCESS);
478}
479
480static isc_result_t
481journal_file_create(isc_mem_t *mctx, const char *filename) {
482	FILE *fp = NULL;
483	isc_result_t result;
484	journal_header_t header;
485	journal_rawheader_t rawheader;
486	int index_size = 56; /* XXX configurable */
487	int size;
488	void *mem; /* Memory for temporary index image. */
489
490	INSIST(sizeof(journal_rawheader_t) == JOURNAL_HEADER_SIZE);
491
492	result = isc_stdio_open(filename, "wb", &fp);
493	if (result != ISC_R_SUCCESS) {
494		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
495			      "%s: create: %s",
496			      filename, isc_result_totext(result));
497		return (ISC_R_UNEXPECTED);
498	}
499
500	header = initial_journal_header;
501	header.index_size = index_size;
502	journal_header_encode(&header, &rawheader);
503
504	size = sizeof(journal_rawheader_t) +
505		index_size * sizeof(journal_rawpos_t);
506
507	mem = isc_mem_get(mctx, size);
508	if (mem == NULL) {
509		(void)isc_stdio_close(fp);
510		(void)isc_file_remove(filename);
511		return (ISC_R_NOMEMORY);
512	}
513	memset(mem, 0, size);
514	memcpy(mem, &rawheader, sizeof(rawheader));
515
516	result = isc_stdio_write(mem, 1, (size_t) size, fp, NULL);
517	if (result != ISC_R_SUCCESS) {
518		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
519				 "%s: write: %s",
520				 filename, isc_result_totext(result));
521		(void)isc_stdio_close(fp);
522		(void)isc_file_remove(filename);
523		isc_mem_put(mctx, mem, size);
524		return (ISC_R_UNEXPECTED);
525	}
526	isc_mem_put(mctx, mem, size);
527
528	result = isc_stdio_close(fp);
529	if (result != ISC_R_SUCCESS) {
530		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
531				 "%s: close: %s",
532				 filename, isc_result_totext(result));
533		(void)isc_file_remove(filename);
534		return (ISC_R_UNEXPECTED);
535	}
536
537	return (ISC_R_SUCCESS);
538}
539
540static isc_result_t
541journal_open(isc_mem_t *mctx, const char *filename, isc_boolean_t write,
542	     isc_boolean_t create, dns_journal_t **journalp) {
543	FILE *fp = NULL;
544	isc_result_t result;
545	journal_rawheader_t rawheader;
546	dns_journal_t *j;
547
548	INSIST(journalp != NULL && *journalp == NULL);
549	j = isc_mem_get(mctx, sizeof(*j));
550	if (j == NULL)
551		return (ISC_R_NOMEMORY);
552
553	j->mctx = mctx;
554	j->state = JOURNAL_STATE_INVALID;
555	j->fp = NULL;
556	j->filename = filename;
557	j->index = NULL;
558	j->rawindex = NULL;
559
560	result = isc_stdio_open(j->filename, write ? "rb+" : "rb", &fp);
561
562	if (result == ISC_R_FILENOTFOUND) {
563		if (create) {
564			isc_log_write(JOURNAL_COMMON_LOGARGS,
565				      ISC_LOG_INFO,
566				      "journal file %s does not exist, "
567				      "creating it",
568				      j->filename);
569			CHECK(journal_file_create(mctx, filename));
570			/*
571			 * Retry.
572			 */
573			result = isc_stdio_open(j->filename, "rb+", &fp);
574		} else {
575			FAIL(ISC_R_NOTFOUND);
576		}
577	}
578	if (result != ISC_R_SUCCESS) {
579		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
580			      "%s: open: %s",
581			      j->filename, isc_result_totext(result));
582		FAIL(ISC_R_UNEXPECTED);
583	}
584
585	j->fp = fp;
586
587	/*
588	 * Set magic early so that seek/read can succeed.
589	 */
590	j->magic = DNS_JOURNAL_MAGIC;
591
592	CHECK(journal_seek(j, 0));
593	CHECK(journal_read(j, &rawheader, sizeof(rawheader)));
594
595	if (memcmp(rawheader.h.format, initial_journal_header.format,
596		   sizeof(initial_journal_header.format)) != 0) {
597		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
598				 "%s: journal format not recognized",
599				 j->filename);
600		FAIL(ISC_R_UNEXPECTED);
601	}
602	journal_header_decode(&rawheader, &j->header);
603
604	/*
605	 * If there is an index, read the raw index into a dynamically
606	 * allocated buffer and then convert it into a cooked index.
607	 */
608	if (j->header.index_size != 0) {
609		unsigned int i;
610		unsigned int rawbytes;
611		unsigned char *p;
612
613		rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
614		j->rawindex = isc_mem_get(mctx, rawbytes);
615		if (j->rawindex == NULL)
616			FAIL(ISC_R_NOMEMORY);
617
618		CHECK(journal_read(j, j->rawindex, rawbytes));
619
620		j->index = isc_mem_get(mctx, j->header.index_size *
621				       sizeof(journal_pos_t));
622		if (j->index == NULL)
623			FAIL(ISC_R_NOMEMORY);
624
625		p = j->rawindex;
626		for (i = 0; i < j->header.index_size; i++) {
627			j->index[i].serial = decode_uint32(p);
628			p += 4;
629			j->index[i].offset = decode_uint32(p);
630			p += 4;
631		}
632		INSIST(p == j->rawindex + rawbytes);
633	}
634	j->offset = -1; /* Invalid, must seek explicitly. */
635
636	/*
637	 * Initialize the iterator.
638	 */
639	dns_name_init(&j->it.name, NULL);
640	dns_rdata_init(&j->it.rdata);
641
642	/*
643	 * Set up empty initial buffers for uncheched and checked
644	 * wire format RR data.  They will be reallocated
645	 * later.
646	 */
647	isc_buffer_init(&j->it.source, NULL, 0);
648	isc_buffer_init(&j->it.target, NULL, 0);
649	dns_decompress_init(&j->it.dctx, -1, DNS_DECOMPRESS_NONE);
650
651	j->state =
652		write ? JOURNAL_STATE_WRITE : JOURNAL_STATE_READ;
653
654	*journalp = j;
655	return (ISC_R_SUCCESS);
656
657 failure:
658	j->magic = 0;
659	if (j->index != NULL) {
660		isc_mem_put(j->mctx, j->index, j->header.index_size *
661			    sizeof(journal_rawpos_t));
662		j->index = NULL;
663	}
664	if (j->fp != NULL)
665		(void)isc_stdio_close(j->fp);
666	isc_mem_put(j->mctx, j, sizeof(*j));
667	return (result);
668}
669
670isc_result_t
671dns_journal_open(isc_mem_t *mctx, const char *filename, isc_boolean_t write,
672		 dns_journal_t **journalp) {
673	return (journal_open(mctx, filename, write, write, journalp));
674}
675
676/*
677 * A comparison function defining the sorting order for
678 * entries in the IXFR-style journal file.
679 *
680 * The IXFR format requires that deletions are sorted before
681 * additions, and within either one, SOA records are sorted
682 * before others.
683 *
684 * Also sort the non-SOA records by type as a courtesy to the
685 * server receiving the IXFR - it may help reduce the amount of
686 * rdataset merging it has to do.
687 */
688static int
689ixfr_order(const void *av, const void *bv) {
690	dns_difftuple_t const * const *ap = av;
691	dns_difftuple_t const * const *bp = bv;
692	dns_difftuple_t const *a = *ap;
693	dns_difftuple_t const *b = *bp;
694	int r;
695
696	r = (b->op == DNS_DIFFOP_DEL) - (a->op == DNS_DIFFOP_DEL);
697	if (r != 0)
698		return (r);
699
700	r = (b->rdata.type == dns_rdatatype_soa) -
701		(a->rdata.type == dns_rdatatype_soa);
702	if (r != 0)
703		return (r);
704
705	r = (a->rdata.type - b->rdata.type);
706	return (r);
707}
708
709/*
710 * Advance '*pos' to the next journal transaction.
711 *
712 * Requires:
713 *	*pos refers to a valid journal transaction.
714 *
715 * Ensures:
716 *	When ISC_R_SUCCESS is returned,
717 *	*pos refers to the next journal transaction.
718 *
719 * Returns one of:
720 *
721 *    ISC_R_SUCCESS
722 *    ISC_R_NOMORE 	*pos pointed at the last transaction
723 *    Other results due to file errors are possible.
724 */
725static isc_result_t
726journal_next(dns_journal_t *j, journal_pos_t *pos) {
727	isc_result_t result;
728	journal_xhdr_t xhdr;
729	REQUIRE(DNS_JOURNAL_VALID(j));
730
731	result = journal_seek(j, pos->offset);
732	if (result != ISC_R_SUCCESS)
733		return (result);
734
735	if (pos->serial == j->header.end.serial)
736		return (ISC_R_NOMORE);
737	/*
738	 * Read the header of the current transaction.
739	 * This will return ISC_R_NOMORE if we are at EOF.
740	 */
741	result = journal_read_xhdr(j, &xhdr);
742	if (result != ISC_R_SUCCESS)
743		return (result);
744
745	/*
746	 * Check serial number consistency.
747	 */
748	if (xhdr.serial0 != pos->serial) {
749		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
750			      "%s: journal file corrupt: "
751			      "expected serial %u, got %u",
752			      j->filename, pos->serial, xhdr.serial0);
753		return (ISC_R_UNEXPECTED);
754	}
755
756	/*
757	 * Check for offset wraparound.
758	 */
759	if ((isc_offset_t)(pos->offset + sizeof(journal_rawxhdr_t) + xhdr.size)
760	    < pos->offset) {
761		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
762			      "%s: offset too large", j->filename);
763		return (ISC_R_UNEXPECTED);
764	}
765
766	pos->offset += sizeof(journal_rawxhdr_t) + xhdr.size;
767	pos->serial = xhdr.serial1;
768	return (ISC_R_SUCCESS);
769}
770
771/*
772 * If the index of the journal 'j' contains an entry "better"
773 * than '*best_guess', replace '*best_guess' with it.
774 *
775 * "Better" means having a serial number closer to 'serial'
776 * but not greater than 'serial'.
777 */
778static void
779index_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *best_guess) {
780	unsigned int i;
781	if (j->index == NULL)
782		return;
783	for (i = 0; i < j->header.index_size; i++) {
784		if (POS_VALID(j->index[i]) &&
785		    DNS_SERIAL_GE(serial, j->index[i].serial) &&
786		    DNS_SERIAL_GT(j->index[i].serial, best_guess->serial))
787			*best_guess = j->index[i];
788	}
789}
790
791/*
792 * Add a new index entry.  If there is no room, make room by removing
793 * the odd-numbered entries and compacting the others into the first
794 * half of the index.  This decimates old index entries exponentially
795 * over time, so that the index always contains a much larger fraction
796 * of recent serial numbers than of old ones.  This is deliberate -
797 * most index searches are for outgoing IXFR, and IXFR tends to request
798 * recent versions more often than old ones.
799 */
800static void
801index_add(dns_journal_t *j, journal_pos_t *pos) {
802	unsigned int i;
803	if (j->index == NULL)
804		return;
805	/*
806	 * Search for a vacant position.
807	 */
808	for (i = 0; i < j->header.index_size; i++) {
809		if (! POS_VALID(j->index[i]))
810			break;
811	}
812	if (i == j->header.index_size) {
813		unsigned int k = 0;
814		/*
815		 * Found no vacant position.  Make some room.
816		 */
817		for (i = 0; i < j->header.index_size; i += 2) {
818			j->index[k++] = j->index[i];
819		}
820		i = k; /* 'i' identifies the first vacant position. */
821		while (k < j->header.index_size) {
822			POS_INVALIDATE(j->index[k]);
823			k++;
824		}
825	}
826	INSIST(i < j->header.index_size);
827	INSIST(! POS_VALID(j->index[i]));
828
829	/*
830	 * Store the new index entry.
831	 */
832	j->index[i] = *pos;
833}
834
835/*
836 * Invalidate any existing index entries that could become
837 * ambiguous when a new transaction with number 'serial' is added.
838 */
839static void
840index_invalidate(dns_journal_t *j, isc_uint32_t serial) {
841	unsigned int i;
842	if (j->index == NULL)
843		return;
844	for (i = 0; i < j->header.index_size; i++) {
845		if (! DNS_SERIAL_GT(serial, j->index[i].serial))
846			POS_INVALIDATE(j->index[i]);
847	}
848}
849
850/*
851 * Try to find a transaction with initial serial number 'serial'
852 * in the journal 'j'.
853 *
854 * If found, store its position at '*pos' and return ISC_R_SUCCESS.
855 *
856 * If 'serial' is current (= the ending serial number of the
857 * last transaction in the journal), set '*pos' to
858 * the position immediately following the last transaction and
859 * return ISC_R_SUCCESS.
860 *
861 * If 'serial' is within the range of addressable serial numbers
862 * covered by the journal but that particular serial number is missing
863 * (from the journal, not just from the index), return ISC_R_NOTFOUND.
864 *
865 * If 'serial' is outside the range of addressable serial numbers
866 * covered by the journal, return ISC_R_RANGE.
867 *
868 */
869static isc_result_t
870journal_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *pos) {
871	isc_result_t result;
872	journal_pos_t current_pos;
873	REQUIRE(DNS_JOURNAL_VALID(j));
874
875	if (DNS_SERIAL_GT(j->header.begin.serial, serial))
876		return (ISC_R_RANGE);
877	if (DNS_SERIAL_GT(serial, j->header.end.serial))
878		return (ISC_R_RANGE);
879	if (serial == j->header.end.serial) {
880		*pos = j->header.end;
881		return (ISC_R_SUCCESS);
882	}
883
884	current_pos = j->header.begin;
885	index_find(j, serial, &current_pos);
886
887	while (current_pos.serial != serial) {
888		if (DNS_SERIAL_GT(current_pos.serial, serial))
889			return (ISC_R_NOTFOUND);
890		result = journal_next(j, &current_pos);
891		if (result != ISC_R_SUCCESS)
892			return (result);
893	}
894	*pos = current_pos;
895	return (ISC_R_SUCCESS);
896}
897
898isc_result_t
899dns_journal_begin_transaction(dns_journal_t *j) {
900	isc_uint32_t offset;
901	isc_result_t result;
902	journal_rawxhdr_t hdr;
903
904	REQUIRE(DNS_JOURNAL_VALID(j));
905	REQUIRE(j->state == JOURNAL_STATE_WRITE);
906
907	/*
908	 * Find the file offset where the new transaction should
909	 * be written, and seek there.
910	 */
911	if (JOURNAL_EMPTY(&j->header)) {
912		offset = sizeof(journal_rawheader_t) +
913			j->header.index_size * sizeof(journal_rawpos_t);
914	} else {
915		offset = j->header.end.offset;
916	}
917	j->x.pos[0].offset = offset;
918	j->x.pos[1].offset = offset; /* Initial value, will be incremented. */
919	j->x.n_soa = 0;
920
921	CHECK(journal_seek(j, offset));
922
923	/*
924	 * Write a dummy transaction header of all zeroes to reserve
925	 * space.  It will be filled in when the transaction is
926	 * finished.
927	 */
928	memset(&hdr, 0, sizeof(hdr));
929	CHECK(journal_write(j, &hdr, sizeof(hdr)));
930	j->x.pos[1].offset = j->offset;
931
932	j->state = JOURNAL_STATE_TRANSACTION;
933	result = ISC_R_SUCCESS;
934 failure:
935	return (result);
936}
937
938isc_result_t
939dns_journal_writediff(dns_journal_t *j, dns_diff_t *diff) {
940	dns_difftuple_t *t;
941	isc_buffer_t buffer;
942	void *mem = NULL;
943	unsigned int size;
944	isc_result_t result;
945	isc_region_t used;
946
947	REQUIRE(DNS_DIFF_VALID(diff));
948	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
949
950	isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "writing to journal");
951	(void)dns_diff_print(diff, NULL);
952
953	/*
954	 * Pass 1: determine the buffer size needed, and
955	 * keep track of SOA serial numbers.
956	 */
957	size = 0;
958	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
959	     t = ISC_LIST_NEXT(t, link))
960	{
961		if (t->rdata.type == dns_rdatatype_soa) {
962			if (j->x.n_soa < 2)
963				j->x.pos[j->x.n_soa].serial =
964					dns_soa_getserial(&t->rdata);
965			j->x.n_soa++;
966		}
967		size += sizeof(journal_rawrrhdr_t);
968		size += t->name.length; /* XXX should have access macro? */
969		size += 10;
970		size += t->rdata.length;
971	}
972
973	mem = isc_mem_get(j->mctx, size);
974	if (mem == NULL)
975		return (ISC_R_NOMEMORY);
976
977	isc_buffer_init(&buffer, mem, size);
978
979	/*
980	 * Pass 2.  Write RRs to buffer.
981	 */
982	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
983	     t = ISC_LIST_NEXT(t, link))
984	{
985		/*
986		 * Write the RR header.
987		 */
988		isc_buffer_putuint32(&buffer, t->name.length + 10 +
989				     t->rdata.length);
990		/*
991		 * Write the owner name, RR header, and RR data.
992		 */
993		isc_buffer_putmem(&buffer, t->name.ndata, t->name.length);
994		isc_buffer_putuint16(&buffer, t->rdata.type);
995		isc_buffer_putuint16(&buffer, t->rdata.rdclass);
996		isc_buffer_putuint32(&buffer, t->ttl);
997		INSIST(t->rdata.length < 65536);
998		isc_buffer_putuint16(&buffer, (isc_uint16_t)t->rdata.length);
999		INSIST(isc_buffer_availablelength(&buffer) >= t->rdata.length);
1000		isc_buffer_putmem(&buffer, t->rdata.data, t->rdata.length);
1001	}
1002
1003	isc_buffer_usedregion(&buffer, &used);
1004	INSIST(used.length == size);
1005
1006	j->x.pos[1].offset += used.length;
1007
1008	/*
1009	 * Write the buffer contents to the journal file.
1010	 */
1011	CHECK(journal_write(j, used.base, used.length));
1012
1013	result = ISC_R_SUCCESS;
1014
1015 failure:
1016	if (mem != NULL)
1017		isc_mem_put(j->mctx, mem, size);
1018	return (result);
1019
1020}
1021
1022isc_result_t
1023dns_journal_commit(dns_journal_t *j) {
1024	isc_result_t result;
1025	journal_rawheader_t rawheader;
1026
1027	REQUIRE(DNS_JOURNAL_VALID(j));
1028	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
1029
1030	/*
1031	 * Perform some basic consistency checks.
1032	 */
1033	if (j->x.n_soa != 2) {
1034		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1035			      "%s: malformed transaction: %d SOAs",
1036			      j->filename, j->x.n_soa);
1037		return (ISC_R_UNEXPECTED);
1038	}
1039	if (! (DNS_SERIAL_GT(j->x.pos[1].serial, j->x.pos[0].serial) ||
1040	       (bind8_compat &&
1041		j->x.pos[1].serial == j->x.pos[0].serial)))
1042	{
1043		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1044			      "%s: malformed transaction: serial number "
1045			      "would decrease", j->filename);
1046		return (ISC_R_UNEXPECTED);
1047	}
1048	if (! JOURNAL_EMPTY(&j->header)) {
1049		if (j->x.pos[0].serial != j->header.end.serial) {
1050			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1051					 "malformed transaction: "
1052					 "%s last serial %u != "
1053					 "transaction first serial %u",
1054					 j->filename,
1055					 j->header.end.serial,
1056					 j->x.pos[0].serial);
1057			return (ISC_R_UNEXPECTED);
1058		}
1059	}
1060
1061	/*
1062	 * Some old journal entries may become non-addressable
1063	 * when we increment the current serial number.  Purge them
1064	 * by stepping header.begin forward to the first addressable
1065	 * transaction.  Also purge them from the index.
1066	 */
1067	if (! JOURNAL_EMPTY(&j->header)) {
1068		while (! DNS_SERIAL_GT(j->x.pos[1].serial,
1069				       j->header.begin.serial)) {
1070			CHECK(journal_next(j, &j->header.begin));
1071		}
1072		index_invalidate(j, j->x.pos[1].serial);
1073	}
1074#ifdef notyet
1075	if (DNS_SERIAL_GT(last_dumped_serial, j->x.pos[1].serial)) {
1076		force_dump(...);
1077	}
1078#endif
1079
1080	/*
1081	 * Commit the transaction data to stable storage.
1082	 */
1083	CHECK(journal_fsync(j));
1084
1085	/*
1086	 * Update the transaction header.
1087	 */
1088	CHECK(journal_seek(j, j->x.pos[0].offset));
1089	CHECK(journal_write_xhdr(j, (j->x.pos[1].offset - j->x.pos[0].offset) -
1090				 sizeof(journal_rawxhdr_t),
1091				 j->x.pos[0].serial, j->x.pos[1].serial));
1092
1093	/*
1094	 * Update the journal header.
1095	 */
1096	if (JOURNAL_EMPTY(&j->header)) {
1097		j->header.begin = j->x.pos[0];
1098	}
1099	j->header.end = j->x.pos[1];
1100	journal_header_encode(&j->header, &rawheader);
1101	CHECK(journal_seek(j, 0));
1102	CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
1103
1104	/*
1105	 * Update the index.
1106	 */
1107	index_add(j, &j->x.pos[0]);
1108
1109	/*
1110	 * Convert the index into on-disk format and write
1111	 * it to disk.
1112	 */
1113	CHECK(index_to_disk(j));
1114
1115	/*
1116	 * Commit the header to stable storage.
1117	 */
1118	CHECK(journal_fsync(j));
1119
1120	/*
1121	 * We no longer have a transaction open.
1122	 */
1123	j->state = JOURNAL_STATE_WRITE;
1124
1125	result = ISC_R_SUCCESS;
1126
1127 failure:
1128	return (result);
1129}
1130
1131isc_result_t
1132dns_journal_write_transaction(dns_journal_t *j, dns_diff_t *diff) {
1133	isc_result_t result;
1134	CHECK(dns_diff_sort(diff, ixfr_order));
1135	CHECK(dns_journal_begin_transaction(j));
1136	CHECK(dns_journal_writediff(j, diff));
1137	CHECK(dns_journal_commit(j));
1138	result = ISC_R_SUCCESS;
1139 failure:
1140	return (result);
1141}
1142
1143void
1144dns_journal_destroy(dns_journal_t **journalp) {
1145	dns_journal_t *j = *journalp;
1146	REQUIRE(DNS_JOURNAL_VALID(j));
1147
1148	j->it.result = ISC_R_FAILURE;
1149	dns_name_invalidate(&j->it.name);
1150	dns_decompress_invalidate(&j->it.dctx);
1151	if (j->rawindex != NULL)
1152		isc_mem_put(j->mctx, j->rawindex, j->header.index_size *
1153			    sizeof(journal_rawpos_t));
1154	if (j->index != NULL)
1155		isc_mem_put(j->mctx, j->index, j->header.index_size *
1156			    sizeof(journal_pos_t));
1157	if (j->it.target.base != NULL)
1158		isc_mem_put(j->mctx, j->it.target.base, j->it.target.length);
1159	if (j->it.source.base != NULL)
1160		isc_mem_put(j->mctx, j->it.source.base, j->it.source.length);
1161
1162	if (j->fp != NULL)
1163		(void)isc_stdio_close(j->fp);
1164	j->magic = 0;
1165	isc_mem_put(j->mctx, j, sizeof(*j));
1166	*journalp = NULL;
1167}
1168
1169/*
1170 * Roll the open journal 'j' into the database 'db'.
1171 * A new database version will be created.
1172 */
1173
1174/* XXX Share code with incoming IXFR? */
1175
1176static isc_result_t
1177roll_forward(dns_journal_t *j, dns_db_t *db) {
1178	isc_buffer_t source;		/* Transaction data from disk */
1179	isc_buffer_t target;		/* Ditto after _fromwire check */
1180	isc_uint32_t db_serial;		/* Database SOA serial */
1181	isc_uint32_t end_serial;	/* Last journal SOA serial */
1182	isc_result_t result;
1183	dns_dbversion_t *ver = NULL;
1184	journal_pos_t pos;
1185	dns_diff_t diff;
1186	unsigned int n_soa = 0;
1187	unsigned int n_put = 0;
1188
1189	REQUIRE(DNS_JOURNAL_VALID(j));
1190	REQUIRE(DNS_DB_VALID(db));
1191
1192	dns_diff_init(j->mctx, &diff);
1193
1194	/*
1195	 * Set up empty initial buffers for uncheched and checked
1196	 * wire format transaction data.  They will be reallocated
1197	 * later.
1198	 */
1199	isc_buffer_init(&source, NULL, 0);
1200	isc_buffer_init(&target, NULL, 0);
1201
1202	/*
1203	 * Create the new database version.
1204	 */
1205	CHECK(dns_db_newversion(db, &ver));
1206
1207	/*
1208	 * Get the current database SOA serial number.
1209	 */
1210	CHECK(dns_db_getsoaserial(db, ver, &db_serial));
1211
1212	/*
1213	 * Locate a journal entry for the current database serial.
1214	 */
1215	CHECK(journal_find(j, db_serial, &pos));
1216	/*
1217	 * XXX do more drastic things, like marking zone stale,
1218	 * if this fails?
1219	 */
1220	/*
1221	 * XXXRTH  The zone code should probably mark the zone as bad and
1222	 *         scream loudly into the log if this is a dynamic update
1223	 *	   log reply that failed.
1224	 */
1225
1226	end_serial = dns_journal_last_serial(j);
1227	if (db_serial == end_serial)
1228		CHECK(DNS_R_UPTODATE);
1229
1230	CHECK(dns_journal_iter_init(j, db_serial, end_serial));
1231
1232	for (result = dns_journal_first_rr(j);
1233	     result == ISC_R_SUCCESS;
1234	     result = dns_journal_next_rr(j))
1235	{
1236		dns_name_t *name;
1237		isc_uint32_t ttl;
1238		dns_rdata_t *rdata;
1239		dns_difftuple_t *tuple = NULL;
1240
1241		name = NULL;
1242		rdata = NULL;
1243		dns_journal_current_rr(j, &name, &ttl, &rdata);
1244
1245		if (rdata->type == dns_rdatatype_soa) {
1246			n_soa++;
1247			if (n_soa == 2)
1248				db_serial = j->it.current_serial;
1249		}
1250
1251		if (n_soa == 3)
1252			n_soa = 1;
1253		if (n_soa == 0) {
1254			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1255					 "%s: journal file corrupt: missing "
1256					 "initial SOA", j->filename);
1257			FAIL(ISC_R_UNEXPECTED);
1258		}
1259		CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ?
1260					   DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1261					   name, ttl, rdata, &tuple));
1262		dns_diff_append(&diff, &tuple);
1263
1264		if (++n_put > 100)  {
1265			isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1266				      "%s: applying diff to database (%u)",
1267				      j->filename, db_serial);
1268			(void)dns_diff_print(&diff, NULL);
1269			CHECK(dns_diff_apply(&diff, db, ver));
1270			dns_diff_clear(&diff);
1271			n_put = 0;
1272		}
1273	}
1274	if (result == ISC_R_NOMORE)
1275		result = ISC_R_SUCCESS;
1276	CHECK(result);
1277
1278	if (n_put != 0) {
1279		isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1280			      "%s: applying final diff to database (%u)",
1281			      j->filename, db_serial);
1282		(void)dns_diff_print(&diff, NULL);
1283		CHECK(dns_diff_apply(&diff, db, ver));
1284		dns_diff_clear(&diff);
1285	}
1286
1287 failure:
1288	if (ver != NULL)
1289		dns_db_closeversion(db, &ver, result == ISC_R_SUCCESS ?
1290				    ISC_TRUE : ISC_FALSE);
1291
1292	if (source.base != NULL)
1293		isc_mem_put(j->mctx, source.base, source.length);
1294	if (target.base != NULL)
1295		isc_mem_put(j->mctx, target.base, target.length);
1296
1297	dns_diff_clear(&diff);
1298
1299	return (result);
1300}
1301
1302isc_result_t
1303dns_journal_rollforward(isc_mem_t *mctx, dns_db_t *db, const char *filename) {
1304	dns_journal_t *j;
1305	isc_result_t result;
1306
1307	REQUIRE(DNS_DB_VALID(db));
1308	REQUIRE(filename != NULL);
1309
1310	j = NULL;
1311	result = dns_journal_open(mctx, filename, ISC_FALSE, &j);
1312	if (result == ISC_R_NOTFOUND) {
1313		isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1314			      "no journal file, but that's OK");
1315		return (DNS_R_NOJOURNAL);
1316	}
1317	if (result != ISC_R_SUCCESS)
1318		return (result);
1319	if (JOURNAL_EMPTY(&j->header))
1320		result = DNS_R_UPTODATE;
1321	else
1322		result = roll_forward(j, db);
1323
1324	dns_journal_destroy(&j);
1325
1326	return (result);
1327}
1328
1329isc_result_t
1330dns_journal_print(isc_mem_t *mctx, const char *filename, FILE *file) {
1331	dns_journal_t *j;
1332	isc_buffer_t source;		/* Transaction data from disk */
1333	isc_buffer_t target;		/* Ditto after _fromwire check */
1334	isc_uint32_t start_serial;		/* Database SOA serial */
1335	isc_uint32_t end_serial;	/* Last journal SOA serial */
1336	isc_result_t result;
1337	dns_diff_t diff;
1338	unsigned int n_soa = 0;
1339	unsigned int n_put = 0;
1340
1341	REQUIRE(filename != NULL);
1342
1343	j = NULL;
1344	result = dns_journal_open(mctx, filename, ISC_FALSE, &j);
1345	if (result == ISC_R_NOTFOUND) {
1346		isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file");
1347		return (DNS_R_NOJOURNAL);
1348	}
1349
1350	if (result != ISC_R_SUCCESS) {
1351		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1352			      "journal open failure: %s: %s",
1353			      isc_result_totext(result), j->filename);
1354		return (result);
1355	}
1356
1357	dns_diff_init(j->mctx, &diff);
1358
1359	/*
1360	 * Set up empty initial buffers for uncheched and checked
1361	 * wire format transaction data.  They will be reallocated
1362	 * later.
1363	 */
1364	isc_buffer_init(&source, NULL, 0);
1365	isc_buffer_init(&target, NULL, 0);
1366
1367	start_serial = dns_journal_first_serial(j);
1368	end_serial = dns_journal_last_serial(j);
1369
1370	CHECK(dns_journal_iter_init(j, start_serial, end_serial));
1371
1372	for (result = dns_journal_first_rr(j);
1373	     result == ISC_R_SUCCESS;
1374	     result = dns_journal_next_rr(j))
1375	{
1376		dns_name_t *name;
1377		isc_uint32_t ttl;
1378		dns_rdata_t *rdata;
1379		dns_difftuple_t *tuple = NULL;
1380
1381		name = NULL;
1382		rdata = NULL;
1383		dns_journal_current_rr(j, &name, &ttl, &rdata);
1384
1385		if (rdata->type == dns_rdatatype_soa)
1386			n_soa++;
1387
1388		if (n_soa == 3)
1389			n_soa = 1;
1390		if (n_soa == 0) {
1391		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1392					 "%s: journal file corrupt: missing "
1393					 "initial SOA", j->filename);
1394			FAIL(ISC_R_UNEXPECTED);
1395		}
1396		CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ?
1397					   DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1398					   name, ttl, rdata, &tuple));
1399		dns_diff_append(&diff, &tuple);
1400
1401		if (++n_put > 100)  {
1402			result = dns_diff_print(&diff, file);
1403			dns_diff_clear(&diff);
1404			n_put = 0;
1405			if (result != ISC_R_SUCCESS)
1406				break;
1407		}
1408	}
1409	if (result == ISC_R_NOMORE)
1410		result = ISC_R_SUCCESS;
1411	CHECK(result);
1412
1413	if (n_put != 0) {
1414		result = dns_diff_print(&diff, file);
1415		dns_diff_clear(&diff);
1416	}
1417	goto cleanup;
1418
1419 failure:
1420	isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1421		      "%s: cannot print: journal file corrupt", j->filename);
1422
1423 cleanup:
1424	if (source.base != NULL)
1425		isc_mem_put(j->mctx, source.base, source.length);
1426	if (target.base != NULL)
1427		isc_mem_put(j->mctx, target.base, target.length);
1428
1429	dns_diff_clear(&diff);
1430	dns_journal_destroy(&j);
1431
1432	return (result);
1433}
1434
1435/**************************************************************************/
1436/*
1437 * Miscellaneous accessors.
1438 */
1439isc_uint32_t dns_journal_first_serial(dns_journal_t *j) {
1440	return (j->header.begin.serial);
1441}
1442
1443isc_uint32_t dns_journal_last_serial(dns_journal_t *j) {
1444	return (j->header.end.serial);
1445}
1446
1447/**************************************************************************/
1448/*
1449 * Iteration support.
1450 *
1451 * When serving an outgoing IXFR, we transmit a part of the journal starting
1452 * at the serial number in the IXFR request and ending at the serial
1453 * number that is current when the IXFR request arrives.  The ending
1454 * serial number is not necessarily at the end of the journal:
1455 * the journal may grow while the IXFR is in progress, but we stop
1456 * when we reach the serial number that was current when the IXFR started.
1457 */
1458
1459static isc_result_t read_one_rr(dns_journal_t *j);
1460
1461/*
1462 * Make sure the buffer 'b' is has at least 'size' bytes
1463 * allocated, and clear it.
1464 *
1465 * Requires:
1466 *	Either b->base is NULL, or it points to b->length bytes of memory
1467 *	previously allocated by isc_mem_get().
1468 */
1469
1470static isc_result_t
1471size_buffer(isc_mem_t *mctx, isc_buffer_t *b, unsigned size) {
1472	if (b->length < size) {
1473		void *mem = isc_mem_get(mctx, size);
1474		if (mem == NULL)
1475			return (ISC_R_NOMEMORY);
1476		if (b->base != NULL)
1477			isc_mem_put(mctx, b->base, b->length);
1478		b->base = mem;
1479		b->length = size;
1480	}
1481	isc_buffer_clear(b);
1482	return (ISC_R_SUCCESS);
1483}
1484
1485isc_result_t
1486dns_journal_iter_init(dns_journal_t *j,
1487		      isc_uint32_t begin_serial, isc_uint32_t end_serial)
1488{
1489	isc_result_t result;
1490
1491	CHECK(journal_find(j, begin_serial, &j->it.bpos));
1492	INSIST(j->it.bpos.serial == begin_serial);
1493
1494	CHECK(journal_find(j, end_serial, &j->it.epos));
1495	INSIST(j->it.epos.serial == end_serial);
1496
1497	result = ISC_R_SUCCESS;
1498 failure:
1499	j->it.result = result;
1500	return (j->it.result);
1501}
1502
1503
1504isc_result_t
1505dns_journal_first_rr(dns_journal_t *j) {
1506	isc_result_t result;
1507
1508	/*
1509	 * Seek to the beginning of the first transaction we are
1510	 * interested in.
1511	 */
1512	CHECK(journal_seek(j, j->it.bpos.offset));
1513	j->it.current_serial = j->it.bpos.serial;
1514
1515	j->it.xsize = 0;  /* We have no transaction data yet... */
1516	j->it.xpos = 0;	  /* ...and haven't used any of it. */
1517
1518	return (read_one_rr(j));
1519
1520 failure:
1521	return (result);
1522}
1523
1524static isc_result_t
1525read_one_rr(dns_journal_t *j) {
1526	isc_result_t result;
1527
1528	dns_rdatatype_t rdtype;
1529	dns_rdataclass_t rdclass;
1530	unsigned int rdlen;
1531	isc_uint32_t ttl;
1532	journal_xhdr_t xhdr;
1533	journal_rrhdr_t rrhdr;
1534
1535	INSIST(j->offset <= j->it.epos.offset);
1536	if (j->offset == j->it.epos.offset)
1537		return (ISC_R_NOMORE);
1538	if (j->it.xpos == j->it.xsize) {
1539		/*
1540		 * We are at a transaction boundary.
1541		 * Read another transaction header.
1542		 */
1543		CHECK(journal_read_xhdr(j, &xhdr));
1544		if (xhdr.size == 0) {
1545			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1546				      "%s: journal corrupt: empty transaction",
1547				      j->filename);
1548			FAIL(ISC_R_UNEXPECTED);
1549		}
1550		if (xhdr.serial0 != j->it.current_serial) {
1551			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1552					 "%s: journal file corrupt: "
1553					 "expected serial %u, got %u",
1554					 j->filename,
1555					 j->it.current_serial, xhdr.serial0);
1556			FAIL(ISC_R_UNEXPECTED);
1557		}
1558		j->it.xsize = xhdr.size;
1559		j->it.xpos = 0;
1560	}
1561	/*
1562	 * Read an RR.
1563	 */
1564	CHECK(journal_read_rrhdr(j, &rrhdr));
1565	/*
1566	 * Perform a sanity check on the journal RR size.
1567	 * The smallest possible RR has a 1-byte owner name
1568	 * and a 10-byte header.  The largest possible
1569	 * RR has 65535 bytes of data, a header, and a maximum-
1570	 * size owner name, well below 70 k total.
1571	 */
1572	if (rrhdr.size < 1+10 || rrhdr.size > 70000) {
1573		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1574				 "%s: journal corrupt: impossible RR size "
1575				 "(%d bytes)", j->filename, rrhdr.size);
1576		FAIL(ISC_R_UNEXPECTED);
1577	}
1578
1579	CHECK(size_buffer(j->mctx, &j->it.source, rrhdr.size));
1580	CHECK(journal_read(j, j->it.source.base, rrhdr.size));
1581	isc_buffer_add(&j->it.source, rrhdr.size);
1582
1583	/*
1584	 * The target buffer is made the same size
1585	 * as the source buffer, with the assumption that when
1586	 * no compression in present, the output of dns_*_fromwire()
1587	 * is no larger than the input.
1588	 */
1589	CHECK(size_buffer(j->mctx, &j->it.target, rrhdr.size));
1590
1591	/*
1592	 * Parse the owner name.  We don't know where it
1593	 * ends yet, so we make the entire "remaining"
1594	 * part of the buffer "active".
1595	 */
1596	isc_buffer_setactive(&j->it.source,
1597			     j->it.source.used - j->it.source.current);
1598	CHECK(dns_name_fromwire(&j->it.name, &j->it.source,
1599				&j->it.dctx, 0, &j->it.target));
1600
1601	/*
1602	 * Check that the RR header is there, and parse it.
1603	 */
1604	if (isc_buffer_remaininglength(&j->it.source) < 10)
1605		FAIL(DNS_R_FORMERR);
1606
1607	rdtype = isc_buffer_getuint16(&j->it.source);
1608	rdclass = isc_buffer_getuint16(&j->it.source);
1609	ttl = isc_buffer_getuint32(&j->it.source);
1610	rdlen = isc_buffer_getuint16(&j->it.source);
1611
1612	/*
1613	 * Parse the rdata.
1614	 */
1615	isc_buffer_setactive(&j->it.source, rdlen);
1616	dns_rdata_reset(&j->it.rdata);
1617	CHECK(dns_rdata_fromwire(&j->it.rdata, rdclass,
1618				 rdtype, &j->it.source, &j->it.dctx,
1619				 0, &j->it.target));
1620	j->it.ttl = ttl;
1621
1622	j->it.xpos += sizeof(journal_rawrrhdr_t) + rrhdr.size;
1623	if (rdtype == dns_rdatatype_soa) {
1624		/* XXX could do additional consistency checks here */
1625		j->it.current_serial = dns_soa_getserial(&j->it.rdata);
1626	}
1627
1628	result = ISC_R_SUCCESS;
1629
1630 failure:
1631	j->it.result = result;
1632	return (result);
1633}
1634
1635isc_result_t
1636dns_journal_next_rr(dns_journal_t *j) {
1637	j->it.result = read_one_rr(j);
1638	return (j->it.result);
1639}
1640
1641void
1642dns_journal_current_rr(dns_journal_t *j, dns_name_t **name, isc_uint32_t *ttl,
1643		   dns_rdata_t **rdata)
1644{
1645	REQUIRE(j->it.result == ISC_R_SUCCESS);
1646	*name = &j->it.name;
1647	*ttl = j->it.ttl;
1648	*rdata = &j->it.rdata;
1649}
1650
1651/**************************************************************************/
1652/*
1653 * Generating diffs from databases
1654 */
1655
1656/*
1657 * Construct a diff containing all the RRs at the current name of the
1658 * database iterator 'dbit' in database 'db', version 'ver'.
1659 * Set '*name' to the current name, and append the diff to 'diff'.
1660 * All new tuples will have the operation 'op'.
1661 *
1662 * Requires: 'name' must have buffer large enough to hold the name.
1663 * Typically, a dns_fixedname_t would be used.
1664 */
1665static isc_result_t
1666get_name_diff(dns_db_t *db, dns_dbversion_t *ver, isc_stdtime_t now,
1667	      dns_dbiterator_t *dbit, dns_name_t *name, dns_diffop_t op,
1668	      dns_diff_t *diff)
1669{
1670	isc_result_t result;
1671	dns_dbnode_t *node = NULL;
1672	dns_rdatasetiter_t *rdsiter = NULL;
1673	dns_difftuple_t *tuple = NULL;
1674
1675	result = dns_dbiterator_current(dbit, &node, name);
1676	if (result != ISC_R_SUCCESS)
1677		return (result);
1678
1679	result = dns_db_allrdatasets(db, node, ver, now, &rdsiter);
1680	if (result != ISC_R_SUCCESS)
1681		goto cleanup_node;
1682
1683	for (result = dns_rdatasetiter_first(rdsiter);
1684	     result == ISC_R_SUCCESS;
1685	     result = dns_rdatasetiter_next(rdsiter))
1686	{
1687		dns_rdataset_t rdataset;
1688
1689		dns_rdataset_init(&rdataset);
1690		dns_rdatasetiter_current(rdsiter, &rdataset);
1691
1692		for (result = dns_rdataset_first(&rdataset);
1693		     result == ISC_R_SUCCESS;
1694		     result = dns_rdataset_next(&rdataset))
1695		{
1696			dns_rdata_t rdata = DNS_RDATA_INIT;
1697			dns_rdataset_current(&rdataset, &rdata);
1698			result = dns_difftuple_create(diff->mctx, op, name,
1699						      rdataset.ttl, &rdata,
1700						      &tuple);
1701			if (result != ISC_R_SUCCESS) {
1702				dns_rdataset_disassociate(&rdataset);
1703				goto cleanup_iterator;
1704			}
1705			dns_diff_append(diff, &tuple);
1706		}
1707		dns_rdataset_disassociate(&rdataset);
1708		if (result != ISC_R_NOMORE)
1709			goto cleanup_iterator;
1710	}
1711	if (result != ISC_R_NOMORE)
1712		goto cleanup_iterator;
1713
1714	result = ISC_R_SUCCESS;
1715
1716 cleanup_iterator:
1717	dns_rdatasetiter_destroy(&rdsiter);
1718
1719 cleanup_node:
1720	dns_db_detachnode(db, &node);
1721
1722	return (result);
1723}
1724
1725/*
1726 * Comparison function for use by dns_diff_subtract when sorting
1727 * the diffs to be subtracted.  The sort keys are the rdata type
1728 * and the rdata itself.  The owner name is ignored, because
1729 * it is known to be the same for all tuples.
1730 */
1731static int
1732rdata_order(const void *av, const void *bv) {
1733	dns_difftuple_t const * const *ap = av;
1734	dns_difftuple_t const * const *bp = bv;
1735	dns_difftuple_t const *a = *ap;
1736	dns_difftuple_t const *b = *bp;
1737	int r;
1738	r = (b->rdata.type - a->rdata.type);
1739	if (r != 0)
1740		return (r);
1741	r = dns_rdata_compare(&a->rdata, &b->rdata);
1742	return (r);
1743}
1744
1745static isc_result_t
1746dns_diff_subtract(dns_diff_t diff[2], dns_diff_t *r) {
1747	isc_result_t result;
1748	dns_difftuple_t *p[2];
1749	int i, t;
1750	isc_boolean_t append;
1751
1752	CHECK(dns_diff_sort(&diff[0], rdata_order));
1753	CHECK(dns_diff_sort(&diff[1], rdata_order));
1754
1755	for (;;) {
1756		p[0] = ISC_LIST_HEAD(diff[0].tuples);
1757		p[1] = ISC_LIST_HEAD(diff[1].tuples);
1758		if (p[0] == NULL && p[1] == NULL)
1759			break;
1760
1761		for (i = 0; i < 2; i++)
1762			if (p[!i] == NULL) {
1763				ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1764				ISC_LIST_APPEND(r->tuples, p[i], link);
1765				goto next;
1766			}
1767		t = rdata_order(&p[0], &p[1]);
1768		if (t < 0) {
1769			ISC_LIST_UNLINK(diff[0].tuples, p[0], link);
1770			ISC_LIST_APPEND(r->tuples, p[0], link);
1771			goto next;
1772		}
1773		if (t > 0) {
1774			ISC_LIST_UNLINK(diff[1].tuples, p[1], link);
1775			ISC_LIST_APPEND(r->tuples, p[1], link);
1776			goto next;
1777		}
1778		INSIST(t == 0);
1779		/*
1780		 * Identical RRs in both databases; skip them both
1781		 * if the ttl differs.
1782		 */
1783		append = ISC_TF(p[0]->ttl != p[1]->ttl);
1784		for (i = 0; i < 2; i++) {
1785			ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1786			if (append) {
1787				ISC_LIST_APPEND(r->tuples, p[i], link);
1788			} else {
1789				dns_difftuple_free(&p[i]);
1790			}
1791		}
1792	next: ;
1793	}
1794	result = ISC_R_SUCCESS;
1795 failure:
1796	return (result);
1797}
1798
1799/*
1800 * Compare the databases 'dba' and 'dbb' and generate a journal
1801 * entry containing the changes to make 'dba' from 'dbb' (note
1802 * the order).  This journal entry will consist of a single,
1803 * possibly very large transaction.
1804 */
1805
1806isc_result_t
1807dns_db_diff(isc_mem_t *mctx,
1808	    dns_db_t *dba, dns_dbversion_t *dbvera,
1809	    dns_db_t *dbb, dns_dbversion_t *dbverb,
1810	    const char *journal_filename)
1811{
1812	dns_db_t *db[2];
1813	dns_dbversion_t *ver[2];
1814	dns_dbiterator_t *dbit[2] = { NULL, NULL };
1815	isc_boolean_t have[2] = { ISC_FALSE, ISC_FALSE };
1816	dns_fixedname_t fixname[2];
1817	isc_result_t result, itresult[2];
1818	dns_diff_t diff[2], resultdiff;
1819	int i, t;
1820	dns_journal_t *journal = NULL;
1821
1822	db[0] = dba, db[1] = dbb;
1823	ver[0] = dbvera, ver[1] = dbverb;
1824
1825	dns_diff_init(mctx, &diff[0]);
1826	dns_diff_init(mctx, &diff[1]);
1827	dns_diff_init(mctx, &resultdiff);
1828
1829	dns_fixedname_init(&fixname[0]);
1830	dns_fixedname_init(&fixname[1]);
1831
1832	result = dns_journal_open(mctx, journal_filename, ISC_TRUE, &journal);
1833	if (result != ISC_R_SUCCESS)
1834		return (result);
1835
1836	result = dns_db_createiterator(db[0], ISC_FALSE, &dbit[0]);
1837	if (result != ISC_R_SUCCESS)
1838		goto cleanup_journal;
1839	result = dns_db_createiterator(db[1], ISC_FALSE, &dbit[1]);
1840	if (result != ISC_R_SUCCESS)
1841		goto cleanup_interator0;
1842
1843	itresult[0] = dns_dbiterator_first(dbit[0]);
1844	itresult[1] = dns_dbiterator_first(dbit[1]);
1845
1846	for (;;) {
1847		for (i = 0; i < 2; i++) {
1848			if (! have[i] && itresult[i] == ISC_R_SUCCESS) {
1849				CHECK(get_name_diff(db[i], ver[i], 0, dbit[i],
1850					    dns_fixedname_name(&fixname[i]),
1851					    i == 0 ?
1852					    DNS_DIFFOP_ADD :
1853					    DNS_DIFFOP_DEL,
1854					    &diff[i]));
1855				itresult[i] = dns_dbiterator_next(dbit[i]);
1856				have[i] = ISC_TRUE;
1857			}
1858		}
1859
1860		if (! have[0] && ! have[1]) {
1861			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1862			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1863			break;
1864		}
1865
1866		for (i = 0; i < 2; i++) {
1867			if (! have[!i]) {
1868				ISC_LIST_APPENDLIST(resultdiff.tuples,
1869						    diff[i].tuples, link);
1870				INSIST(ISC_LIST_EMPTY(diff[i].tuples));
1871				have[i] = ISC_FALSE;
1872				goto next;
1873			}
1874		}
1875
1876		t = dns_name_compare(dns_fixedname_name(&fixname[0]),
1877				     dns_fixedname_name(&fixname[1]));
1878		if (t < 0) {
1879			ISC_LIST_APPENDLIST(resultdiff.tuples,
1880					    diff[0].tuples, link);
1881			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1882			have[0] = ISC_FALSE;
1883			continue;
1884		}
1885		if (t > 0) {
1886			ISC_LIST_APPENDLIST(resultdiff.tuples,
1887					    diff[1].tuples, link);
1888			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1889			have[1] = ISC_FALSE;
1890			continue;
1891		}
1892		INSIST(t == 0);
1893		CHECK(dns_diff_subtract(diff, &resultdiff));
1894		INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1895		INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1896		have[0] = have[1] = ISC_FALSE;
1897	next: ;
1898	}
1899	if (itresult[0] != ISC_R_NOMORE)
1900		FAIL(itresult[0]);
1901	if (itresult[1] != ISC_R_NOMORE)
1902		FAIL(itresult[1]);
1903
1904	if (ISC_LIST_EMPTY(resultdiff.tuples)) {
1905		isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no changes");
1906	} else {
1907		CHECK(dns_journal_write_transaction(journal, &resultdiff));
1908	}
1909	INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1910	INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1911
1912 failure:
1913	dns_diff_clear(&resultdiff);
1914	dns_dbiterator_destroy(&dbit[1]);
1915 cleanup_interator0:
1916	dns_dbiterator_destroy(&dbit[0]);
1917 cleanup_journal:
1918	dns_journal_destroy(&journal);
1919	return (result);
1920}
1921
1922isc_result_t
1923dns_journal_compact(isc_mem_t *mctx, char *filename, isc_uint32_t serial,
1924		    isc_uint32_t target_size)
1925{
1926	unsigned int i;
1927	journal_pos_t best_guess;
1928	journal_pos_t current_pos;
1929	dns_journal_t *j = NULL;
1930	journal_rawheader_t rawheader;
1931	unsigned int copy_length;
1932	unsigned int len;
1933	char *buf = NULL;
1934	unsigned int size = 0;
1935	isc_result_t result;
1936	unsigned int indexend;
1937
1938	CHECK(journal_open(mctx, filename, ISC_TRUE, ISC_FALSE, &j));
1939
1940	if (JOURNAL_EMPTY(&j->header)) {
1941		dns_journal_destroy(&j);
1942		return (ISC_R_SUCCESS);
1943	}
1944
1945	if (DNS_SERIAL_GT(j->header.begin.serial, serial) ||
1946	    DNS_SERIAL_GT(serial, j->header.end.serial)) {
1947		dns_journal_destroy(&j);
1948		return (ISC_R_RANGE);
1949	}
1950
1951	/*
1952	 * Cope with very small target sizes.
1953	 */
1954	indexend = sizeof(journal_rawheader_t) +
1955		   j->header.index_size * sizeof(journal_rawpos_t);
1956	if (target_size < indexend * 2)
1957		target_size = target_size/2 + indexend;
1958
1959	/*
1960	 * See if there is any work to do.
1961	 */
1962	if ((isc_uint32_t) j->header.end.offset < target_size) {
1963		dns_journal_destroy(&j);
1964		return (ISC_R_SUCCESS);
1965	}
1966
1967	/*
1968	 * Remove overhead so space test below can succeed.
1969	 */
1970	if (target_size >= indexend)
1971		target_size -= indexend;
1972
1973	/*
1974	 * Find if we can create enough free space.
1975	 */
1976	best_guess = j->header.begin;
1977	for (i = 0; i < j->header.index_size; i++) {
1978		if (POS_VALID(j->index[i]) &&
1979		    DNS_SERIAL_GE(serial, j->index[i].serial) &&
1980		    ((isc_uint32_t)(j->header.end.offset - j->index[i].offset)
1981		     >= target_size / 2) &&
1982		    j->index[i].offset > best_guess.offset)
1983			best_guess = j->index[i];
1984	}
1985
1986	current_pos = best_guess;
1987	while (current_pos.serial != serial) {
1988		CHECK(journal_next(j, &current_pos));
1989		if (current_pos.serial == j->header.end.serial)
1990			break;
1991
1992		if (DNS_SERIAL_GE(serial, current_pos.serial) &&
1993		   ((isc_uint32_t)(j->header.end.offset - current_pos.offset)
1994		     >= (target_size / 2)) &&
1995		    current_pos.offset > best_guess.offset)
1996			best_guess = current_pos;
1997		else
1998			break;
1999	}
2000
2001	INSIST(best_guess.serial != j->header.end.serial);
2002	if (best_guess.serial != serial)
2003		CHECK(journal_next(j, &best_guess));
2004
2005	/*
2006	 * Enough space to proceed?
2007	 */
2008	if ((isc_uint32_t) (j->header.end.offset - best_guess.offset) >
2009	     (isc_uint32_t) (best_guess.offset - indexend)) {
2010		dns_journal_destroy(&j);
2011		return (ISC_R_NOSPACE);
2012	}
2013
2014	copy_length = j->header.end.offset - best_guess.offset;
2015
2016	/*
2017	 * Invalidate entire index, will be rebuilt at end.
2018	 */
2019	for (i = 0; i < j->header.index_size; i++) {
2020		if (POS_VALID(j->index[i]))
2021			POS_INVALIDATE(j->index[i]);
2022	}
2023
2024	/*
2025	 * Convert the index into on-disk format and write
2026	 * it to disk.
2027	 */
2028	CHECK(index_to_disk(j));
2029	CHECK(journal_fsync(j));
2030
2031	/*
2032	 * Update the journal header.
2033	 */
2034	if (copy_length == 0) {
2035		j->header.begin.serial = 0;
2036		j->header.end.serial = 0;
2037		j->header.begin.offset = 0;
2038		j->header.end.offset = 0;
2039	} else {
2040		j->header.begin = best_guess;
2041	}
2042	journal_header_encode(&j->header, &rawheader);
2043	CHECK(journal_seek(j, 0));
2044	CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
2045	CHECK(journal_fsync(j));
2046
2047	if (copy_length != 0) {
2048		/*
2049		 * Copy best_guess to end into space just freed.
2050		 */
2051		size = 64*1024;
2052		if (copy_length < size)
2053			size = copy_length;
2054		buf = isc_mem_get(mctx, size);
2055		if (buf == NULL) {
2056			result = ISC_R_NOMEMORY;
2057			goto failure;
2058		}
2059
2060		for (i = 0; i < copy_length; i += size) {
2061			len = (copy_length - i) > size ? size :
2062							 (copy_length - i);
2063			CHECK(journal_seek(j, best_guess.offset + i));
2064			CHECK(journal_read(j, buf, len));
2065			CHECK(journal_seek(j, indexend + i));
2066			CHECK(journal_write(j, buf, len));
2067		}
2068
2069		CHECK(journal_fsync(j));
2070
2071		/*
2072		 * Compute new header.
2073		 */
2074		j->header.begin.offset = indexend;
2075		j->header.end.offset = indexend + copy_length;
2076		/*
2077		 * Update the journal header.
2078		 */
2079		journal_header_encode(&j->header, &rawheader);
2080		CHECK(journal_seek(j, 0));
2081		CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
2082		CHECK(journal_fsync(j));
2083
2084		/*
2085		 * Build new index.
2086		 */
2087		current_pos = j->header.begin;
2088		while (current_pos.serial != j->header.end.serial) {
2089			index_add(j, &current_pos);
2090			CHECK(journal_next(j, &current_pos));
2091		}
2092
2093		/*
2094		 * Write index.
2095		 */
2096		CHECK(index_to_disk(j));
2097		CHECK(journal_fsync(j));
2098
2099		indexend = j->header.end.offset;
2100	}
2101	dns_journal_destroy(&j);
2102	(void)isc_file_truncate(filename, (isc_offset_t)indexend);
2103	result = ISC_R_SUCCESS;
2104
2105 failure:
2106	if (buf != NULL)
2107		isc_mem_put(mctx, buf, size);
2108	if (j != NULL)
2109		dns_journal_destroy(&j);
2110	return (result);
2111}
2112
2113static isc_result_t
2114index_to_disk(dns_journal_t *j) {
2115	isc_result_t result = ISC_R_SUCCESS;
2116
2117	if (j->header.index_size != 0) {
2118		unsigned int i;
2119		unsigned char *p;
2120		unsigned int rawbytes;
2121
2122		rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
2123
2124		p = j->rawindex;
2125		for (i = 0; i < j->header.index_size; i++) {
2126			encode_uint32(j->index[i].serial, p);
2127			p += 4;
2128			encode_uint32(j->index[i].offset, p);
2129			p += 4;
2130		}
2131		INSIST(p == j->rawindex + rawbytes);
2132
2133		CHECK(journal_seek(j, sizeof(journal_rawheader_t)));
2134		CHECK(journal_write(j, j->rawindex, rawbytes));
2135	}
2136failure:
2137	return (result);
2138}
2139