journal.c revision 135446
1/*
2 * Copyright (C) 2004  Internet Systems Consortium, Inc. ("ISC")
3 * Copyright (C) 1999-2002  Internet Software Consortium.
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15 * PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/* $Id: journal.c,v 1.77.2.1.10.8 2004/05/14 05:27:47 marka Exp $ */
19
20#include <config.h>
21
22#include <stdlib.h>
23
24#include <isc/file.h>
25#include <isc/mem.h>
26#include <isc/stdio.h>
27#include <isc/string.h>
28#include <isc/util.h>
29
30#include <dns/compress.h>
31#include <dns/db.h>
32#include <dns/dbiterator.h>
33#include <dns/diff.h>
34#include <dns/fixedname.h>
35#include <dns/journal.h>
36#include <dns/log.h>
37#include <dns/rdataset.h>
38#include <dns/rdatasetiter.h>
39#include <dns/result.h>
40#include <dns/soa.h>
41
42/*
43 * When true, accept IXFR difference sequences where the
44 * SOA serial number does not change (BIND 8 sends such
45 * sequences).
46 */
47static isc_boolean_t bind8_compat = ISC_TRUE; /* XXX config */
48
49/**************************************************************************/
50/*
51 * Miscellaneous utilities.
52 */
53
54#define JOURNAL_COMMON_LOGARGS \
55	dns_lctx, DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL
56
57#define JOURNAL_DEBUG_LOGARGS(n) \
58	JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(n)
59
60/*
61 * It would be non-sensical (or at least obtuse) to use FAIL() with an
62 * ISC_R_SUCCESS code, but the test is there to keep the Solaris compiler
63 * from complaining about "end-of-loop code not reached".
64 */
65#define FAIL(code) \
66	do { result = (code);					\
67		if (result != ISC_R_SUCCESS) goto failure;	\
68	} while (0)
69
70#define CHECK(op) \
71     	do { result = (op); 					\
72		if (result != ISC_R_SUCCESS) goto failure; 	\
73	} while (0)
74
75static isc_result_t index_to_disk(dns_journal_t *);
76
77static inline isc_uint32_t
78decode_uint32(unsigned char *p) {
79	return ((p[0] << 24) +
80		(p[1] << 16) +
81		(p[2] <<  8) +
82		(p[3] <<  0));
83}
84
85static inline void
86encode_uint32(isc_uint32_t val, unsigned char *p) {
87	p[0] = (isc_uint8_t)(val >> 24);
88	p[1] = (isc_uint8_t)(val >> 16);
89	p[2] = (isc_uint8_t)(val >>  8);
90	p[3] = (isc_uint8_t)(val >>  0);
91}
92
93isc_result_t
94dns_db_createsoatuple(dns_db_t *db, dns_dbversion_t *ver, isc_mem_t *mctx,
95		      dns_diffop_t op, dns_difftuple_t **tp)
96{
97	isc_result_t result;
98	dns_dbnode_t *node;
99	dns_rdataset_t rdataset;
100	dns_rdata_t rdata = DNS_RDATA_INIT;
101	dns_name_t *zonename;
102
103	zonename = dns_db_origin(db);
104
105	node = NULL;
106	result = dns_db_findnode(db, zonename, ISC_FALSE, &node);
107	if (result != ISC_R_SUCCESS)
108		goto nonode;
109
110	dns_rdataset_init(&rdataset);
111	result = dns_db_findrdataset(db, node, ver, dns_rdatatype_soa, 0,
112				     (isc_stdtime_t)0, &rdataset, NULL);
113 	if (result != ISC_R_SUCCESS)
114		goto freenode;
115
116	result = dns_rdataset_first(&rdataset);
117 	if (result != ISC_R_SUCCESS)
118		goto freenode;
119
120	dns_rdataset_current(&rdataset, &rdata);
121
122	result = dns_difftuple_create(mctx, op, zonename, rdataset.ttl,
123				      &rdata, tp);
124
125	dns_rdataset_disassociate(&rdataset);
126	dns_db_detachnode(db, &node);
127	return (ISC_R_SUCCESS);
128
129 freenode:
130	dns_db_detachnode(db, &node);
131 nonode:
132	UNEXPECTED_ERROR(__FILE__, __LINE__, "missing SOA");
133	return (result);
134}
135
136/**************************************************************************/
137/*
138 * Journalling.
139 */
140
141/*
142 * A journal file consists of
143 *
144 *   - A fixed-size header of type journal_rawheader_t.
145 *
146 *   - The index.  This is an unordered array of index entries
147 *     of type journal_rawpos_t giving the locations
148 *     of some arbitrary subset of the journal's addressable
149 *     transactions.  The index entries are used as hints to
150 *     speed up the process of locating a transaction with a given
151 *     serial number.  Unused index entries have an "offset"
152 *     field of zero.  The size of the index can vary between
153 *     journal files, but does not change during the lifetime
154 *     of a file.  The size can be zero.
155 *
 *   - The journal data.  This consists of one or more transactions.
157 *     Each transaction begins with a transaction header of type
158 *     journal_rawxhdr_t.  The transaction header is followed by a
159 *     sequence of RRs, similar in structure to an IXFR difference
160 *     sequence (RFC1995).  That is, the pre-transaction SOA,
161 *     zero or more other deleted RRs, the post-transaction SOA,
162 *     and zero or more other added RRs.  Unlike in IXFR, each RR
163 *     is prefixed with a 32-bit length.
164 *
165 *     The journal data part grows as new transactions are
166 *     appended to the file.  Only those transactions
167 *     whose serial number is current-(2^31-1) to current
168 *     are considered "addressable" and may be pointed
169 *     to from the header or index.  They may be preceded
170 *     by old transactions that are no longer addressable,
171 *     and they may be followed by transactions that were
172 *     appended to the journal but never committed by updating
173 *     the "end" position in the header.  The latter will
174 *     be overwritten when new transactions are added.
175 */
176
/*
 * A "pointer" to a journal entry as it is stored on disk.  The journal
 * header uses these to locate the beginning and end of the journal,
 * and the journal index uses them to locate other transactions.
 */
typedef struct {
	/* SOA serial before update. */
	unsigned char	serial[4];

	/*
	 * Offset from beginning of file.
	 *
	 * XXXRTH  Should offset be 8 bytes?
	 * XXXDCL ... probably, since isc_offset_t is 8 bytes on many OSs.
	 * XXXAG  ... but we will not be able to seek >2G anyway on many
	 *            platforms as long as we are using fseek() rather
	 *            than lseek().
	 */
	unsigned char	offset[4];
} journal_rawpos_t;
194
195/*
196 * The on-disk representation of the journal header.
197 * All numbers are stored in big-endian order.
198 */
199
200/*
201 * The header is of a fixed size, with some spare room for future
202 * extensions.
203 */
204#define JOURNAL_HEADER_SIZE 64 /* Bytes. */
205
206typedef union {
207	struct {
208		/* File format version ID. */
209		unsigned char 		format[16];
210		/* Position of the first addressable transaction */
211		journal_rawpos_t 	begin;
212		/* Position of the next (yet nonexistent) transaction. */
213		journal_rawpos_t 	end;
214		/* Number of index entries following the header. */
215		unsigned char 		index_size[4];
216	} h;
217	/* Pad the header to a fixed size. */
218	unsigned char pad[JOURNAL_HEADER_SIZE];
219} journal_rawheader_t;
220
/*
 * On-disk transaction header.  Every transaction starts with one.
 */
typedef struct {
	/* Transaction size in bytes, excluding this header. */
	unsigned char	size[4];
	/* SOA serial before update. */
	unsigned char	serial0[4];
	/* SOA serial after update. */
	unsigned char	serial1[4];
} journal_rawxhdr_t;

/*
 * On-disk RR header.  Every RR starts with one.
 */
typedef struct {
	/* RR size in bytes, excluding this header. */
	unsigned char	size[4];
} journal_rawrrhdr_t;
238
239/*
240 * The in-core representation of the journal header.
241 */
242typedef struct {
243	isc_uint32_t	serial;
244	isc_offset_t	offset;
245} journal_pos_t;
246
247#define POS_VALID(pos) 		((pos).offset != 0)
248#define POS_INVALIDATE(pos) 	((pos).offset = 0, (pos).serial = 0)
249
250typedef struct {
251	unsigned char 	format[16];
252	journal_pos_t 	begin;
253	journal_pos_t 	end;
254	isc_uint32_t	index_size;
255} journal_header_t;
256
257/*
258 * The in-core representation of the transaction header.
259 */
260
261typedef struct {
262	isc_uint32_t	size;
263	isc_uint32_t	serial0;
264	isc_uint32_t	serial1;
265} journal_xhdr_t;
266
267/*
268 * The in-core representation of the RR header.
269 */
270typedef struct {
271	isc_uint32_t	size;
272} journal_rrhdr_t;
273
274
275/*
276 * Initial contents to store in the header of a newly created
277 * journal file.
278 *
279 * The header starts with the magic string ";BIND LOG V9\n"
280 * to identify the file as a BIND 9 journal file.  An ASCII
281 * identification string is used rather than a binary magic
282 * number to be consistent with BIND 8 (BIND 8 journal files
283 * are ASCII text files).
284 */
285
286static journal_header_t
287initial_journal_header = { ";BIND LOG V9\n", { 0, 0 }, { 0, 0 }, 0 };
288
289#define JOURNAL_EMPTY(h) ((h)->begin.offset == (h)->end.offset)
290
/*
 * State of an open journal.
 */
typedef enum {
	JOURNAL_STATE_INVALID,		/* Set while journal_open() runs */
	JOURNAL_STATE_READ,		/* Opened without write access */
	JOURNAL_STATE_WRITE,		/* Opened with write access */
	JOURNAL_STATE_TRANSACTION	/* Between begin_transaction
					   and commit */
} journal_state_t;
297
298struct dns_journal {
299	unsigned int		magic;		/* JOUR */
300	isc_mem_t		*mctx;		/* Memory context */
301	journal_state_t		state;
302	const char 		*filename;	/* Journal file name */
303	FILE *			fp;		/* File handle */
304	isc_offset_t		offset;		/* Current file offset */
305	journal_header_t 	header;		/* In-core journal header */
306	unsigned char		*rawindex;	/* In-core buffer for journal
307						   index in on-disk format */
308	journal_pos_t		*index;		/* In-core journal index */
309
310	/* Current transaction state (when writing). */
311	struct {
312		unsigned int	n_soa;		/* Number of SOAs seen */
313		journal_pos_t	pos[2];		/* Begin/end position */
314	} x;
315
316	/* Iteration state (when reading). */
317	struct {
318		/* These define the part of the journal we iterate over. */
319		journal_pos_t bpos;		/* Position before first, */
320		journal_pos_t epos;		/* and after last
321						   transaction */
322		/* The rest is iterator state. */
323		isc_uint32_t current_serial;	/* Current SOA serial */
324		isc_buffer_t source;		/* Data from disk */
325		isc_buffer_t target;		/* Data from _fromwire check */
326		dns_decompress_t dctx;		/* Dummy decompression ctx */
327		dns_name_t name;		/* Current domain name */
328		dns_rdata_t rdata;		/* Current rdata */
329		isc_uint32_t ttl;		/* Current TTL */
330		unsigned int xsize;		/* Size of transaction data */
331		unsigned int xpos;		/* Current position in it */
332		isc_result_t result;		/* Result of last call */
333	} it;
334};
335
336#define DNS_JOURNAL_MAGIC	ISC_MAGIC('J', 'O', 'U', 'R')
337#define DNS_JOURNAL_VALID(t)	ISC_MAGIC_VALID(t, DNS_JOURNAL_MAGIC)
338
339static void
340journal_pos_decode(journal_rawpos_t *raw, journal_pos_t *cooked) {
341	cooked->serial = decode_uint32(raw->serial);
342	cooked->offset = decode_uint32(raw->offset);
343}
344
345static void
346journal_pos_encode(journal_rawpos_t *raw, journal_pos_t *cooked) {
347	encode_uint32(cooked->serial, raw->serial);
348	encode_uint32(cooked->offset, raw->offset);
349}
350
351static void
352journal_header_decode(journal_rawheader_t *raw, journal_header_t *cooked) {
353	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
354	memcpy(cooked->format, raw->h.format, sizeof(cooked->format));
355	journal_pos_decode(&raw->h.begin, &cooked->begin);
356	journal_pos_decode(&raw->h.end, &cooked->end);
357	cooked->index_size = decode_uint32(raw->h.index_size);
358}
359
360static void
361journal_header_encode(journal_header_t *cooked, journal_rawheader_t *raw) {
362	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
363	memset(raw->pad, 0, sizeof(raw->pad));
364	memcpy(raw->h.format, cooked->format, sizeof(raw->h.format));
365	journal_pos_encode(&raw->h.begin, &cooked->begin);
366	journal_pos_encode(&raw->h.end, &cooked->end);
367	encode_uint32(cooked->index_size, raw->h.index_size);
368}
369
370/*
371 * Journal file I/O subroutines, with error checking and reporting.
372 */
373static isc_result_t
374journal_seek(dns_journal_t *j, isc_uint32_t offset) {
375	isc_result_t result;
376	result = isc_stdio_seek(j->fp, (long)offset, SEEK_SET);
377	if (result != ISC_R_SUCCESS) {
378		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
379			      "%s: seek: %s", j->filename,
380			      isc_result_totext(result));
381		return (ISC_R_UNEXPECTED);
382	}
383	j->offset = offset;
384	return (ISC_R_SUCCESS);
385}
386
387static isc_result_t
388journal_read(dns_journal_t *j, void *mem, size_t nbytes) {
389	isc_result_t result;
390
391	result = isc_stdio_read(mem, 1, nbytes, j->fp, NULL);
392	if (result != ISC_R_SUCCESS) {
393		if (result == ISC_R_EOF)
394			return (ISC_R_NOMORE);
395		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
396			      "%s: read: %s",
397			      j->filename, isc_result_totext(result));
398		return (ISC_R_UNEXPECTED);
399	}
400	j->offset += nbytes;
401	return (ISC_R_SUCCESS);
402}
403
404static isc_result_t
405journal_write(dns_journal_t *j, void *mem, size_t nbytes) {
406	isc_result_t result;
407
408	result = isc_stdio_write(mem, 1, nbytes, j->fp, NULL);
409	if (result != ISC_R_SUCCESS) {
410		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
411			      "%s: write: %s",
412			      j->filename, isc_result_totext(result));
413		return (ISC_R_UNEXPECTED);
414	}
415	j->offset += nbytes;
416	return (ISC_R_SUCCESS);
417}
418
419static isc_result_t
420journal_fsync(dns_journal_t *j) {
421	isc_result_t result;
422	result = isc_stdio_flush(j->fp);
423	if (result != ISC_R_SUCCESS) {
424		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
425			      "%s: flush: %s",
426			      j->filename, isc_result_totext(result));
427		return (ISC_R_UNEXPECTED);
428	}
429	result = isc_stdio_sync(j->fp);
430	if (result != ISC_R_SUCCESS) {
431		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
432			      "%s: fsync: %s",
433			      j->filename, isc_result_totext(result));
434		return (ISC_R_UNEXPECTED);
435	}
436	return (ISC_R_SUCCESS);
437}
438
439/*
440 * Read/write a transaction header at the current file position.
441 */
442
443static isc_result_t
444journal_read_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr) {
445	journal_rawxhdr_t raw;
446	isc_result_t result;
447	result = journal_read(j, &raw, sizeof(raw));
448	if (result != ISC_R_SUCCESS)
449		return (result);
450	xhdr->size = decode_uint32(raw.size);
451	xhdr->serial0 = decode_uint32(raw.serial0);
452	xhdr->serial1 = decode_uint32(raw.serial1);
453	return (ISC_R_SUCCESS);
454}
455
456static isc_result_t
457journal_write_xhdr(dns_journal_t *j, isc_uint32_t size,
458		   isc_uint32_t serial0, isc_uint32_t serial1)
459{
460	journal_rawxhdr_t raw;
461	encode_uint32(size, raw.size);
462	encode_uint32(serial0, raw.serial0);
463	encode_uint32(serial1, raw.serial1);
464	return (journal_write(j, &raw, sizeof(raw)));
465}
466
467
468/*
469 * Read an RR header at the current file position.
470 */
471
472static isc_result_t
473journal_read_rrhdr(dns_journal_t *j, journal_rrhdr_t *rrhdr) {
474	journal_rawrrhdr_t raw;
475	isc_result_t result;
476	result = journal_read(j, &raw, sizeof(raw));
477	if (result != ISC_R_SUCCESS)
478		return (result);
479	rrhdr->size = decode_uint32(raw.size);
480	return (ISC_R_SUCCESS);
481}
482
483static isc_result_t
484journal_file_create(isc_mem_t *mctx, const char *filename) {
485	FILE *fp = NULL;
486	isc_result_t result;
487	journal_header_t header;
488	journal_rawheader_t rawheader;
489	int index_size = 56; /* XXX configurable */
490	int size;
491	void *mem; /* Memory for temporary index image. */
492
493	INSIST(sizeof(journal_rawheader_t) == JOURNAL_HEADER_SIZE);
494
495	result = isc_stdio_open(filename, "wb", &fp);
496	if (result != ISC_R_SUCCESS) {
497		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
498			      "%s: create: %s",
499			      filename, isc_result_totext(result));
500		return (ISC_R_UNEXPECTED);
501	}
502
503	header = initial_journal_header;
504	header.index_size = index_size;
505	journal_header_encode(&header, &rawheader);
506
507	size = sizeof(journal_rawheader_t) +
508		index_size * sizeof(journal_rawpos_t);
509
510	mem = isc_mem_get(mctx, size);
511	if (mem == NULL) {
512		(void)isc_stdio_close(fp);
513		(void)isc_file_remove(filename);
514		return (ISC_R_NOMEMORY);
515	}
516	memset(mem, 0, size);
517	memcpy(mem, &rawheader, sizeof(rawheader));
518
519	result = isc_stdio_write(mem, 1, (size_t) size, fp, NULL);
520	if (result != ISC_R_SUCCESS) {
521		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
522				 "%s: write: %s",
523				 filename, isc_result_totext(result));
524		(void)isc_stdio_close(fp);
525		(void)isc_file_remove(filename);
526		isc_mem_put(mctx, mem, size);
527		return (ISC_R_UNEXPECTED);
528	}
529	isc_mem_put(mctx, mem, size);
530
531	result = isc_stdio_close(fp);
532	if (result != ISC_R_SUCCESS) {
533		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
534				 "%s: close: %s",
535				 filename, isc_result_totext(result));
536		(void)isc_file_remove(filename);
537		return (ISC_R_UNEXPECTED);
538	}
539
540	return (ISC_R_SUCCESS);
541}
542
543static isc_result_t
544journal_open(isc_mem_t *mctx, const char *filename, isc_boolean_t write,
545	     isc_boolean_t create, dns_journal_t **journalp) {
546	FILE *fp = NULL;
547	isc_result_t result;
548	journal_rawheader_t rawheader;
549	dns_journal_t *j;
550
551	INSIST(journalp != NULL && *journalp == NULL);
552	j = isc_mem_get(mctx, sizeof(*j));
553	if (j == NULL)
554		return (ISC_R_NOMEMORY);
555
556	j->mctx = mctx;
557	j->state = JOURNAL_STATE_INVALID;
558	j->fp = NULL;
559	j->filename = filename;
560	j->index = NULL;
561	j->rawindex = NULL;
562
563	result = isc_stdio_open(j->filename, write ? "rb+" : "rb", &fp);
564
565	if (result == ISC_R_FILENOTFOUND) {
566		if (create) {
567			isc_log_write(JOURNAL_COMMON_LOGARGS,
568				      ISC_LOG_INFO,
569				      "journal file %s does not exist, "
570				      "creating it",
571				      j->filename);
572			CHECK(journal_file_create(mctx, filename));
573			/*
574			 * Retry.
575			 */
576			result = isc_stdio_open(j->filename, "rb+", &fp);
577		} else {
578			FAIL(ISC_R_NOTFOUND);
579		}
580	}
581	if (result != ISC_R_SUCCESS) {
582		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
583			      "%s: open: %s",
584			      j->filename, isc_result_totext(result));
585		FAIL(ISC_R_UNEXPECTED);
586	}
587
588	j->fp = fp;
589
590	/*
591	 * Set magic early so that seek/read can succeed.
592	 */
593	j->magic = DNS_JOURNAL_MAGIC;
594
595	CHECK(journal_seek(j, 0));
596	CHECK(journal_read(j, &rawheader, sizeof(rawheader)));
597
598	if (memcmp(rawheader.h.format, initial_journal_header.format,
599		   sizeof(initial_journal_header.format)) != 0) {
600		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
601				 "%s: journal format not recognized",
602				 j->filename);
603		FAIL(ISC_R_UNEXPECTED);
604	}
605	journal_header_decode(&rawheader, &j->header);
606
607	/*
608	 * If there is an index, read the raw index into a dynamically
609	 * allocated buffer and then convert it into a cooked index.
610	 */
611	if (j->header.index_size != 0) {
612		unsigned int i;
613		unsigned int rawbytes;
614		unsigned char *p;
615
616		rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
617		j->rawindex = isc_mem_get(mctx, rawbytes);
618		if (j->rawindex == NULL)
619			FAIL(ISC_R_NOMEMORY);
620
621		CHECK(journal_read(j, j->rawindex, rawbytes));
622
623		j->index = isc_mem_get(mctx, j->header.index_size *
624				       sizeof(journal_pos_t));
625		if (j->index == NULL)
626			FAIL(ISC_R_NOMEMORY);
627
628		p = j->rawindex;
629		for (i = 0; i < j->header.index_size; i++) {
630			j->index[i].serial = decode_uint32(p);
631			p += 4;
632			j->index[i].offset = decode_uint32(p);
633			p += 4;
634		}
635		INSIST(p == j->rawindex + rawbytes);
636	}
637	j->offset = -1; /* Invalid, must seek explicitly. */
638
639	/*
640	 * Initialize the iterator.
641	 */
642	dns_name_init(&j->it.name, NULL);
643	dns_rdata_init(&j->it.rdata);
644
645	/*
646	 * Set up empty initial buffers for uncheched and checked
647	 * wire format RR data.  They will be reallocated
648	 * later.
649	 */
650	isc_buffer_init(&j->it.source, NULL, 0);
651	isc_buffer_init(&j->it.target, NULL, 0);
652	dns_decompress_init(&j->it.dctx, -1, DNS_DECOMPRESS_NONE);
653
654	j->state =
655		write ? JOURNAL_STATE_WRITE : JOURNAL_STATE_READ;
656
657	*journalp = j;
658	return (ISC_R_SUCCESS);
659
660 failure:
661	j->magic = 0;
662	if (j->index != NULL) {
663		isc_mem_put(j->mctx, j->index, j->header.index_size *
664			    sizeof(journal_rawpos_t));
665		j->index = NULL;
666	}
667	if (j->fp != NULL)
668		(void)isc_stdio_close(j->fp);
669	isc_mem_put(j->mctx, j, sizeof(*j));
670	return (result);
671}
672
673isc_result_t
674dns_journal_open(isc_mem_t *mctx, const char *filename, isc_boolean_t write,
675		 dns_journal_t **journalp) {
676	return (journal_open(mctx, filename, write, write, journalp));
677}
678
679/*
680 * A comparison function defining the sorting order for
681 * entries in the IXFR-style journal file.
682 *
683 * The IXFR format requires that deletions are sorted before
684 * additions, and within either one, SOA records are sorted
685 * before others.
686 *
687 * Also sort the non-SOA records by type as a courtesy to the
688 * server receiving the IXFR - it may help reduce the amount of
689 * rdataset merging it has to do.
690 */
691static int
692ixfr_order(const void *av, const void *bv) {
693	dns_difftuple_t const * const *ap = av;
694	dns_difftuple_t const * const *bp = bv;
695	dns_difftuple_t const *a = *ap;
696	dns_difftuple_t const *b = *bp;
697	int r;
698
699	r = (b->op == DNS_DIFFOP_DEL) - (a->op == DNS_DIFFOP_DEL);
700	if (r != 0)
701		return (r);
702
703	r = (b->rdata.type == dns_rdatatype_soa) -
704		(a->rdata.type == dns_rdatatype_soa);
705	if (r != 0)
706		return (r);
707
708	r = (a->rdata.type - b->rdata.type);
709	return (r);
710}
711
712/*
713 * Advance '*pos' to the next journal transaction.
714 *
715 * Requires:
716 *	*pos refers to a valid journal transaction.
717 *
718 * Ensures:
719 *	When ISC_R_SUCCESS is returned,
720 *	*pos refers to the next journal transaction.
721 *
722 * Returns one of:
723 *
724 *    ISC_R_SUCCESS
725 *    ISC_R_NOMORE 	*pos pointed at the last transaction
726 *    Other results due to file errors are possible.
727 */
728static isc_result_t
729journal_next(dns_journal_t *j, journal_pos_t *pos) {
730	isc_result_t result;
731	journal_xhdr_t xhdr;
732	REQUIRE(DNS_JOURNAL_VALID(j));
733
734	result = journal_seek(j, pos->offset);
735	if (result != ISC_R_SUCCESS)
736		return (result);
737
738	if (pos->serial == j->header.end.serial)
739		return (ISC_R_NOMORE);
740	/*
741	 * Read the header of the current transaction.
742	 * This will return ISC_R_NOMORE if we are at EOF.
743	 */
744	result = journal_read_xhdr(j, &xhdr);
745	if (result != ISC_R_SUCCESS)
746		return (result);
747
748	/*
749	 * Check serial number consistency.
750	 */
751	if (xhdr.serial0 != pos->serial) {
752		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
753			      "%s: journal file corrupt: "
754			      "expected serial %u, got %u",
755			      j->filename, pos->serial, xhdr.serial0);
756		return (ISC_R_UNEXPECTED);
757	}
758
759	/*
760	 * Check for offset wraparound.
761	 */
762	if ((isc_offset_t)(pos->offset + sizeof(journal_rawxhdr_t) + xhdr.size)
763	    < pos->offset) {
764		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
765			      "%s: offset too large", j->filename);
766		return (ISC_R_UNEXPECTED);
767	}
768
769	pos->offset += sizeof(journal_rawxhdr_t) + xhdr.size;
770	pos->serial = xhdr.serial1;
771	return (ISC_R_SUCCESS);
772}
773
774/*
775 * If the index of the journal 'j' contains an entry "better"
776 * than '*best_guess', replace '*best_guess' with it.
777 *
778 * "Better" means having a serial number closer to 'serial'
779 * but not greater than 'serial'.
780 */
781static void
782index_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *best_guess) {
783	unsigned int i;
784	if (j->index == NULL)
785		return;
786	for (i = 0; i < j->header.index_size; i++) {
787		if (POS_VALID(j->index[i]) &&
788		    DNS_SERIAL_GE(serial, j->index[i].serial) &&
789		    DNS_SERIAL_GT(j->index[i].serial, best_guess->serial))
790			*best_guess = j->index[i];
791	}
792}
793
794/*
795 * Add a new index entry.  If there is no room, make room by removing
796 * the odd-numbered entries and compacting the others into the first
797 * half of the index.  This decimates old index entries exponentially
798 * over time, so that the index always contains a much larger fraction
799 * of recent serial numbers than of old ones.  This is deliberate -
800 * most index searches are for outgoing IXFR, and IXFR tends to request
801 * recent versions more often than old ones.
802 */
803static void
804index_add(dns_journal_t *j, journal_pos_t *pos) {
805	unsigned int i;
806	if (j->index == NULL)
807		return;
808	/*
809	 * Search for a vacant position.
810	 */
811	for (i = 0; i < j->header.index_size; i++) {
812		if (! POS_VALID(j->index[i]))
813			break;
814	}
815	if (i == j->header.index_size) {
816		unsigned int k = 0;
817		/*
818		 * Found no vacant position.  Make some room.
819		 */
820		for (i = 0; i < j->header.index_size; i += 2) {
821			j->index[k++] = j->index[i];
822		}
823		i = k; /* 'i' identifies the first vacant position. */
824		while (k < j->header.index_size) {
825			POS_INVALIDATE(j->index[k]);
826			k++;
827		}
828	}
829	INSIST(i < j->header.index_size);
830	INSIST(! POS_VALID(j->index[i]));
831
832	/*
833	 * Store the new index entry.
834	 */
835	j->index[i] = *pos;
836}
837
838/*
839 * Invalidate any existing index entries that could become
840 * ambiguous when a new transaction with number 'serial' is added.
841 */
842static void
843index_invalidate(dns_journal_t *j, isc_uint32_t serial) {
844	unsigned int i;
845	if (j->index == NULL)
846		return;
847	for (i = 0; i < j->header.index_size; i++) {
848		if (! DNS_SERIAL_GT(serial, j->index[i].serial))
849			POS_INVALIDATE(j->index[i]);
850	}
851}
852
853/*
854 * Try to find a transaction with initial serial number 'serial'
855 * in the journal 'j'.
856 *
857 * If found, store its position at '*pos' and return ISC_R_SUCCESS.
858 *
859 * If 'serial' is current (= the ending serial number of the
860 * last transaction in the journal), set '*pos' to
861 * the position immediately following the last transaction and
862 * return ISC_R_SUCCESS.
863 *
864 * If 'serial' is within the range of addressable serial numbers
865 * covered by the journal but that particular serial number is missing
866 * (from the journal, not just from the index), return ISC_R_NOTFOUND.
867 *
868 * If 'serial' is outside the range of addressable serial numbers
869 * covered by the journal, return ISC_R_RANGE.
870 *
871 */
872static isc_result_t
873journal_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *pos) {
874	isc_result_t result;
875	journal_pos_t current_pos;
876	REQUIRE(DNS_JOURNAL_VALID(j));
877
878	if (DNS_SERIAL_GT(j->header.begin.serial, serial))
879		return (ISC_R_RANGE);
880	if (DNS_SERIAL_GT(serial, j->header.end.serial))
881		return (ISC_R_RANGE);
882	if (serial == j->header.end.serial) {
883		*pos = j->header.end;
884		return (ISC_R_SUCCESS);
885	}
886
887	current_pos = j->header.begin;
888	index_find(j, serial, &current_pos);
889
890	while (current_pos.serial != serial) {
891		if (DNS_SERIAL_GT(current_pos.serial, serial))
892			return (ISC_R_NOTFOUND);
893		result = journal_next(j, &current_pos);
894		if (result != ISC_R_SUCCESS)
895			return (result);
896	}
897	*pos = current_pos;
898	return (ISC_R_SUCCESS);
899}
900
901isc_result_t
902dns_journal_begin_transaction(dns_journal_t *j) {
903	isc_uint32_t offset;
904	isc_result_t result;
905	journal_rawxhdr_t hdr;
906
907	REQUIRE(DNS_JOURNAL_VALID(j));
908	REQUIRE(j->state == JOURNAL_STATE_WRITE);
909
910	/*
911	 * Find the file offset where the new transaction should
912	 * be written, and seek there.
913	 */
914	if (JOURNAL_EMPTY(&j->header)) {
915		offset = sizeof(journal_rawheader_t) +
916			j->header.index_size * sizeof(journal_rawpos_t);
917	} else {
918		offset = j->header.end.offset;
919	}
920	j->x.pos[0].offset = offset;
921	j->x.pos[1].offset = offset; /* Initial value, will be incremented. */
922	j->x.n_soa = 0;
923
924	CHECK(journal_seek(j, offset));
925
926	/*
927	 * Write a dummy transaction header of all zeroes to reserve
928	 * space.  It will be filled in when the transaction is
929	 * finished.
930	 */
931	memset(&hdr, 0, sizeof(hdr));
932	CHECK(journal_write(j, &hdr, sizeof(hdr)));
933	j->x.pos[1].offset = j->offset;
934
935	j->state = JOURNAL_STATE_TRANSACTION;
936	result = ISC_R_SUCCESS;
937 failure:
938	return (result);
939}
940
941isc_result_t
942dns_journal_writediff(dns_journal_t *j, dns_diff_t *diff) {
943	dns_difftuple_t *t;
944	isc_buffer_t buffer;
945	void *mem = NULL;
946	unsigned int size;
947	isc_result_t result;
948	isc_region_t used;
949
950	REQUIRE(DNS_DIFF_VALID(diff));
951	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
952
953	isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "writing to journal");
954	(void)dns_diff_print(diff, NULL);
955
956	/*
957	 * Pass 1: determine the buffer size needed, and
958	 * keep track of SOA serial numbers.
959	 */
960	size = 0;
961	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
962	     t = ISC_LIST_NEXT(t, link))
963	{
964		if (t->rdata.type == dns_rdatatype_soa) {
965			if (j->x.n_soa < 2)
966				j->x.pos[j->x.n_soa].serial =
967					dns_soa_getserial(&t->rdata);
968			j->x.n_soa++;
969		}
970		size += sizeof(journal_rawrrhdr_t);
971		size += t->name.length; /* XXX should have access macro? */
972		size += 10;
973		size += t->rdata.length;
974	}
975
976	mem = isc_mem_get(j->mctx, size);
977	if (mem == NULL)
978		return (ISC_R_NOMEMORY);
979
980	isc_buffer_init(&buffer, mem, size);
981
982	/*
983	 * Pass 2.  Write RRs to buffer.
984	 */
985	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
986	     t = ISC_LIST_NEXT(t, link))
987	{
988		/*
989		 * Write the RR header.
990		 */
991		isc_buffer_putuint32(&buffer, t->name.length + 10 +
992				     t->rdata.length);
993		/*
994		 * Write the owner name, RR header, and RR data.
995		 */
996		isc_buffer_putmem(&buffer, t->name.ndata, t->name.length);
997		isc_buffer_putuint16(&buffer, t->rdata.type);
998		isc_buffer_putuint16(&buffer, t->rdata.rdclass);
999		isc_buffer_putuint32(&buffer, t->ttl);
1000		INSIST(t->rdata.length < 65536);
1001		isc_buffer_putuint16(&buffer, (isc_uint16_t)t->rdata.length);
1002		INSIST(isc_buffer_availablelength(&buffer) >= t->rdata.length);
1003		isc_buffer_putmem(&buffer, t->rdata.data, t->rdata.length);
1004	}
1005
1006	isc_buffer_usedregion(&buffer, &used);
1007	INSIST(used.length == size);
1008
1009	j->x.pos[1].offset += used.length;
1010
1011	/*
1012	 * Write the buffer contents to the journal file.
1013	 */
1014	CHECK(journal_write(j, used.base, used.length));
1015
1016	result = ISC_R_SUCCESS;
1017
1018 failure:
1019	if (mem != NULL)
1020		isc_mem_put(j->mctx, mem, size);
1021	return (result);
1022
1023}
1024
1025isc_result_t
1026dns_journal_commit(dns_journal_t *j) {
1027	isc_result_t result;
1028	journal_rawheader_t rawheader;
1029
1030	REQUIRE(DNS_JOURNAL_VALID(j));
1031	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
1032
1033	/*
1034	 * Perform some basic consistency checks.
1035	 */
1036	if (j->x.n_soa != 2) {
1037		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1038			      "malformed transaction: %d SOAs",
1039			      j->x.n_soa);
1040		return (ISC_R_UNEXPECTED);
1041	}
1042	if (! (DNS_SERIAL_GT(j->x.pos[1].serial, j->x.pos[0].serial) ||
1043	       (bind8_compat &&
1044		j->x.pos[1].serial == j->x.pos[0].serial)))
1045	{
1046		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1047			      "malformed transaction: serial number "
1048			      "would decrease");
1049		return (ISC_R_UNEXPECTED);
1050	}
1051	if (! JOURNAL_EMPTY(&j->header)) {
1052		if (j->x.pos[0].serial != j->header.end.serial) {
1053			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1054					 "malformed transaction: "
1055					 "%s last serial %u != "
1056					 "transaction first serial %u",
1057					 j->filename,
1058					 j->header.end.serial,
1059					 j->x.pos[0].serial);
1060			return (ISC_R_UNEXPECTED);
1061		}
1062	}
1063
1064	/*
1065	 * Some old journal entries may become non-addressable
1066	 * when we increment the current serial number.  Purge them
1067	 * by stepping header.begin forward to the first addressable
1068	 * transaction.  Also purge them from the index.
1069	 */
1070	if (! JOURNAL_EMPTY(&j->header)) {
1071		while (! DNS_SERIAL_GT(j->x.pos[1].serial,
1072				       j->header.begin.serial)) {
1073			CHECK(journal_next(j, &j->header.begin));
1074		}
1075		index_invalidate(j, j->x.pos[1].serial);
1076	}
1077#ifdef notyet
1078	if (DNS_SERIAL_GT(last_dumped_serial, j->x.pos[1].serial)) {
1079		force_dump(...);
1080	}
1081#endif
1082
1083	/*
1084	 * Commit the transaction data to stable storage.
1085	 */
1086	CHECK(journal_fsync(j));
1087
1088	/*
1089	 * Update the transaction header.
1090	 */
1091	CHECK(journal_seek(j, j->x.pos[0].offset));
1092	CHECK(journal_write_xhdr(j, (j->x.pos[1].offset - j->x.pos[0].offset) -
1093				 sizeof(journal_rawxhdr_t),
1094				 j->x.pos[0].serial, j->x.pos[1].serial));
1095
1096	/*
1097	 * Update the journal header.
1098	 */
1099	if (JOURNAL_EMPTY(&j->header)) {
1100		j->header.begin = j->x.pos[0];
1101	}
1102	j->header.end = j->x.pos[1];
1103	journal_header_encode(&j->header, &rawheader);
1104	CHECK(journal_seek(j, 0));
1105	CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
1106
1107	/*
1108	 * Update the index.
1109	 */
1110	index_add(j, &j->x.pos[0]);
1111
1112	/*
1113	 * Convert the index into on-disk format and write
1114	 * it to disk.
1115	 */
1116	CHECK(index_to_disk(j));
1117
1118	/*
1119	 * Commit the header to stable storage.
1120	 */
1121	CHECK(journal_fsync(j));
1122
1123	/*
1124	 * We no longer have a transaction open.
1125	 */
1126	j->state = JOURNAL_STATE_WRITE;
1127
1128	result = ISC_R_SUCCESS;
1129
1130 failure:
1131	return (result);
1132}
1133
1134isc_result_t
1135dns_journal_write_transaction(dns_journal_t *j, dns_diff_t *diff) {
1136	isc_result_t result;
1137	CHECK(dns_diff_sort(diff, ixfr_order));
1138	CHECK(dns_journal_begin_transaction(j));
1139	CHECK(dns_journal_writediff(j, diff));
1140	CHECK(dns_journal_commit(j));
1141	result = ISC_R_SUCCESS;
1142 failure:
1143	return (result);
1144}
1145
1146void
1147dns_journal_destroy(dns_journal_t **journalp) {
1148	dns_journal_t *j = *journalp;
1149	REQUIRE(DNS_JOURNAL_VALID(j));
1150
1151	j->it.result = ISC_R_FAILURE;
1152	dns_name_invalidate(&j->it.name);
1153	dns_decompress_invalidate(&j->it.dctx);
1154	if (j->rawindex != NULL)
1155		isc_mem_put(j->mctx, j->rawindex, j->header.index_size *
1156			    sizeof(journal_rawpos_t));
1157	if (j->index != NULL)
1158		isc_mem_put(j->mctx, j->index, j->header.index_size *
1159			    sizeof(journal_pos_t));
1160	if (j->it.target.base != NULL)
1161		isc_mem_put(j->mctx, j->it.target.base, j->it.target.length);
1162	if (j->it.source.base != NULL)
1163		isc_mem_put(j->mctx, j->it.source.base, j->it.source.length);
1164
1165	if (j->fp != NULL)
1166		(void)isc_stdio_close(j->fp);
1167	j->magic = 0;
1168	isc_mem_put(j->mctx, j, sizeof(*j));
1169	*journalp = NULL;
1170}
1171
1172/*
1173 * Roll the open journal 'j' into the database 'db'.
1174 * A new database version will be created.
1175 */
1176
1177/* XXX Share code with incoming IXFR? */
1178
1179static isc_result_t
1180roll_forward(dns_journal_t *j, dns_db_t *db) {
1181	isc_buffer_t source;		/* Transaction data from disk */
1182	isc_buffer_t target;		/* Ditto after _fromwire check */
1183	isc_uint32_t db_serial;		/* Database SOA serial */
1184	isc_uint32_t end_serial;	/* Last journal SOA serial */
1185	isc_result_t result;
1186	dns_dbversion_t *ver = NULL;
1187	journal_pos_t pos;
1188	dns_diff_t diff;
1189	unsigned int n_soa = 0;
1190	unsigned int n_put = 0;
1191
1192	REQUIRE(DNS_JOURNAL_VALID(j));
1193	REQUIRE(DNS_DB_VALID(db));
1194
1195	dns_diff_init(j->mctx, &diff);
1196
1197	/*
1198	 * Set up empty initial buffers for uncheched and checked
1199	 * wire format transaction data.  They will be reallocated
1200	 * later.
1201	 */
1202	isc_buffer_init(&source, NULL, 0);
1203	isc_buffer_init(&target, NULL, 0);
1204
1205	/*
1206	 * Create the new database version.
1207	 */
1208	CHECK(dns_db_newversion(db, &ver));
1209
1210	/*
1211	 * Get the current database SOA serial number.
1212	 */
1213	CHECK(dns_db_getsoaserial(db, ver, &db_serial));
1214
1215	/*
1216	 * Locate a journal entry for the current database serial.
1217	 */
1218	CHECK(journal_find(j, db_serial, &pos));
1219	/*
1220	 * XXX do more drastic things, like marking zone stale,
1221	 * if this fails?
1222	 */
1223	/*
1224	 * XXXRTH  The zone code should probably mark the zone as bad and
1225	 *         scream loudly into the log if this is a dynamic update
1226	 *	   log reply that failed.
1227	 */
1228
1229	end_serial = dns_journal_last_serial(j);
1230	if (db_serial == end_serial)
1231		CHECK(DNS_R_UPTODATE);
1232
1233	CHECK(dns_journal_iter_init(j, db_serial, end_serial));
1234
1235	for (result = dns_journal_first_rr(j);
1236	     result == ISC_R_SUCCESS;
1237	     result = dns_journal_next_rr(j))
1238	{
1239		dns_name_t *name;
1240		isc_uint32_t ttl;
1241		dns_rdata_t *rdata;
1242		dns_difftuple_t *tuple = NULL;
1243
1244		name = NULL;
1245		rdata = NULL;
1246		dns_journal_current_rr(j, &name, &ttl, &rdata);
1247
1248		if (rdata->type == dns_rdatatype_soa) {
1249			n_soa++;
1250			if (n_soa == 2)
1251				db_serial = j->it.current_serial;
1252		}
1253
1254		if (n_soa == 3)
1255			n_soa = 1;
1256		if (n_soa == 0) {
1257			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1258					 "%s: journal file corrupt: missing "
1259					 "initial SOA", j->filename);
1260			FAIL(ISC_R_UNEXPECTED);
1261		}
1262		CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ?
1263					   DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1264					   name, ttl, rdata, &tuple));
1265		dns_diff_append(&diff, &tuple);
1266
1267		if (++n_put > 100)  {
1268			isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1269				      "applying diff to database (%u)",
1270				      db_serial);
1271			(void)dns_diff_print(&diff, NULL);
1272			CHECK(dns_diff_apply(&diff, db, ver));
1273			dns_diff_clear(&diff);
1274			n_put = 0;
1275		}
1276	}
1277	if (result == ISC_R_NOMORE)
1278		result = ISC_R_SUCCESS;
1279	CHECK(result);
1280
1281	if (n_put != 0) {
1282		isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1283			      "applying final diff to database (%u)",
1284			      db_serial);
1285		(void)dns_diff_print(&diff, NULL);
1286		CHECK(dns_diff_apply(&diff, db, ver));
1287		dns_diff_clear(&diff);
1288	}
1289
1290 failure:
1291	if (ver != NULL)
1292		dns_db_closeversion(db, &ver, result == ISC_R_SUCCESS ?
1293				    ISC_TRUE : ISC_FALSE);
1294
1295	if (source.base != NULL)
1296		isc_mem_put(j->mctx, source.base, source.length);
1297	if (target.base != NULL)
1298		isc_mem_put(j->mctx, target.base, target.length);
1299
1300	dns_diff_clear(&diff);
1301
1302	return (result);
1303}
1304
1305isc_result_t
1306dns_journal_rollforward(isc_mem_t *mctx, dns_db_t *db, const char *filename) {
1307	dns_journal_t *j;
1308	isc_result_t result;
1309
1310	REQUIRE(DNS_DB_VALID(db));
1311	REQUIRE(filename != NULL);
1312
1313	j = NULL;
1314	result = dns_journal_open(mctx, filename, ISC_FALSE, &j);
1315	if (result == ISC_R_NOTFOUND) {
1316		isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1317			      "no journal file, but that's OK");
1318		return (DNS_R_NOJOURNAL);
1319	}
1320	if (result != ISC_R_SUCCESS)
1321		return (result);
1322	if (JOURNAL_EMPTY(&j->header))
1323		result = DNS_R_UPTODATE;
1324	else
1325		result = roll_forward(j, db);
1326
1327	dns_journal_destroy(&j);
1328
1329	return (result);
1330}
1331
1332isc_result_t
1333dns_journal_print(isc_mem_t *mctx, const char *filename, FILE *file) {
1334	dns_journal_t *j;
1335	isc_buffer_t source;		/* Transaction data from disk */
1336	isc_buffer_t target;		/* Ditto after _fromwire check */
1337	isc_uint32_t start_serial;		/* Database SOA serial */
1338	isc_uint32_t end_serial;	/* Last journal SOA serial */
1339	isc_result_t result;
1340	dns_diff_t diff;
1341	unsigned int n_soa = 0;
1342	unsigned int n_put = 0;
1343
1344	REQUIRE(filename != NULL);
1345
1346	j = NULL;
1347	result = dns_journal_open(mctx, filename, ISC_FALSE, &j);
1348	if (result == ISC_R_NOTFOUND) {
1349		isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file");
1350		return (DNS_R_NOJOURNAL);
1351	}
1352
1353	if (result != ISC_R_SUCCESS) {
1354		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1355			      "journal open failure");
1356		return (result);
1357	}
1358
1359	dns_diff_init(j->mctx, &diff);
1360
1361	/*
1362	 * Set up empty initial buffers for uncheched and checked
1363	 * wire format transaction data.  They will be reallocated
1364	 * later.
1365	 */
1366	isc_buffer_init(&source, NULL, 0);
1367	isc_buffer_init(&target, NULL, 0);
1368
1369	start_serial = dns_journal_first_serial(j);
1370	end_serial = dns_journal_last_serial(j);
1371
1372	CHECK(dns_journal_iter_init(j, start_serial, end_serial));
1373
1374	for (result = dns_journal_first_rr(j);
1375	     result == ISC_R_SUCCESS;
1376	     result = dns_journal_next_rr(j))
1377	{
1378		dns_name_t *name;
1379		isc_uint32_t ttl;
1380		dns_rdata_t *rdata;
1381		dns_difftuple_t *tuple = NULL;
1382
1383		name = NULL;
1384		rdata = NULL;
1385		dns_journal_current_rr(j, &name, &ttl, &rdata);
1386
1387		if (rdata->type == dns_rdatatype_soa)
1388			n_soa++;
1389
1390		if (n_soa == 3)
1391			n_soa = 1;
1392		if (n_soa == 0) {
1393		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1394					 "%s: journal file corrupt: missing "
1395					 "initial SOA", j->filename);
1396			FAIL(ISC_R_UNEXPECTED);
1397		}
1398		CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ?
1399					   DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1400					   name, ttl, rdata, &tuple));
1401		dns_diff_append(&diff, &tuple);
1402
1403		if (++n_put > 100)  {
1404			result = dns_diff_print(&diff, file);
1405			dns_diff_clear(&diff);
1406			n_put = 0;
1407			if (result != ISC_R_SUCCESS)
1408				break;
1409		}
1410	}
1411	if (result == ISC_R_NOMORE)
1412		result = ISC_R_SUCCESS;
1413	CHECK(result);
1414
1415	if (n_put != 0) {
1416		result = dns_diff_print(&diff, file);
1417		dns_diff_clear(&diff);
1418	}
1419	goto cleanup;
1420
1421 failure:
1422	isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1423		      "%s: cannot print: journal file corrupt", j->filename);
1424
1425 cleanup:
1426	if (source.base != NULL)
1427		isc_mem_put(j->mctx, source.base, source.length);
1428	if (target.base != NULL)
1429		isc_mem_put(j->mctx, target.base, target.length);
1430
1431	dns_diff_clear(&diff);
1432	dns_journal_destroy(&j);
1433
1434	return (result);
1435}
1436
1437/**************************************************************************/
1438/*
1439 * Miscellaneous accessors.
1440 */
1441isc_uint32_t dns_journal_first_serial(dns_journal_t *j) {
1442	return (j->header.begin.serial);
1443}
1444
1445isc_uint32_t dns_journal_last_serial(dns_journal_t *j) {
1446	return (j->header.end.serial);
1447}
1448
1449/**************************************************************************/
1450/*
1451 * Iteration support.
1452 *
 * When serving an outgoing IXFR, we transmit a part of the journal starting
1454 * at the serial number in the IXFR request and ending at the serial
1455 * number that is current when the IXFR request arrives.  The ending
1456 * serial number is not necessarily at the end of the journal:
1457 * the journal may grow while the IXFR is in progress, but we stop
1458 * when we reach the serial number that was current when the IXFR started.
1459 */
1460
1461static isc_result_t read_one_rr(dns_journal_t *j);
1462
1463/*
1464 * Make sure the buffer 'b' is has at least 'size' bytes
1465 * allocated, and clear it.
1466 *
1467 * Requires:
1468 *	Either b->base is NULL, or it points to b->length bytes of memory
1469 *	previously allocated by isc_mem_get().
1470 */
1471
1472static isc_result_t
1473size_buffer(isc_mem_t *mctx, isc_buffer_t *b, unsigned size) {
1474	if (b->length < size) {
1475		void *mem = isc_mem_get(mctx, size);
1476		if (mem == NULL)
1477			return (ISC_R_NOMEMORY);
1478		if (b->base != NULL)
1479			isc_mem_put(mctx, b->base, b->length);
1480		b->base = mem;
1481		b->length = size;
1482	}
1483	isc_buffer_clear(b);
1484	return (ISC_R_SUCCESS);
1485}
1486
1487isc_result_t
1488dns_journal_iter_init(dns_journal_t *j,
1489		      isc_uint32_t begin_serial, isc_uint32_t end_serial)
1490{
1491	isc_result_t result;
1492
1493	CHECK(journal_find(j, begin_serial, &j->it.bpos));
1494	INSIST(j->it.bpos.serial == begin_serial);
1495
1496	CHECK(journal_find(j, end_serial, &j->it.epos));
1497	INSIST(j->it.epos.serial == end_serial);
1498
1499	result = ISC_R_SUCCESS;
1500 failure:
1501	j->it.result = result;
1502	return (j->it.result);
1503}
1504
1505
1506isc_result_t
1507dns_journal_first_rr(dns_journal_t *j) {
1508	isc_result_t result;
1509
1510	/*
1511	 * Seek to the beginning of the first transaction we are
1512	 * interested in.
1513	 */
1514	CHECK(journal_seek(j, j->it.bpos.offset));
1515	j->it.current_serial = j->it.bpos.serial;
1516
1517	j->it.xsize = 0;  /* We have no transaction data yet... */
1518	j->it.xpos = 0;	  /* ...and haven't used any of it. */
1519
1520	return (read_one_rr(j));
1521
1522 failure:
1523	return (result);
1524}
1525
1526static isc_result_t
1527read_one_rr(dns_journal_t *j) {
1528	isc_result_t result;
1529
1530	dns_rdatatype_t rdtype;
1531	dns_rdataclass_t rdclass;
1532	unsigned int rdlen;
1533	isc_uint32_t ttl;
1534	journal_xhdr_t xhdr;
1535	journal_rrhdr_t rrhdr;
1536
1537	INSIST(j->offset <= j->it.epos.offset);
1538	if (j->offset == j->it.epos.offset)
1539		return (ISC_R_NOMORE);
1540	if (j->it.xpos == j->it.xsize) {
1541		/*
1542		 * We are at a transaction boundary.
1543		 * Read another transaction header.
1544		 */
1545		CHECK(journal_read_xhdr(j, &xhdr));
1546		if (xhdr.size == 0) {
1547			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1548				      "journal corrupt: empty transaction");
1549			FAIL(ISC_R_UNEXPECTED);
1550		}
1551		if (xhdr.serial0 != j->it.current_serial) {
1552			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1553					 "%s: journal file corrupt: "
1554					 "expected serial %u, got %u",
1555					 j->filename,
1556					 j->it.current_serial, xhdr.serial0);
1557			FAIL(ISC_R_UNEXPECTED);
1558		}
1559		j->it.xsize = xhdr.size;
1560		j->it.xpos = 0;
1561	}
1562	/*
1563	 * Read an RR.
1564	 */
1565	result = journal_read_rrhdr(j, &rrhdr);
1566	/*
1567	 * Perform a sanity check on the journal RR size.
1568	 * The smallest possible RR has a 1-byte owner name
1569	 * and a 10-byte header.  The largest possible
1570	 * RR has 65535 bytes of data, a header, and a maximum-
1571	 * size owner name, well below 70 k total.
1572	 */
1573	if (rrhdr.size < 1+10 || rrhdr.size > 70000) {
1574		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1575				 "%s: journal corrupt: impossible RR size "
1576				 "(%d bytes)", j->filename, rrhdr.size);
1577		FAIL(ISC_R_UNEXPECTED);
1578	}
1579
1580	CHECK(size_buffer(j->mctx, &j->it.source, rrhdr.size));
1581	CHECK(journal_read(j, j->it.source.base, rrhdr.size));
1582	isc_buffer_add(&j->it.source, rrhdr.size);
1583
1584	/*
1585	 * The target buffer is made the same size
1586	 * as the source buffer, with the assumption that when
1587	 * no compression in present, the output of dns_*_fromwire()
1588	 * is no larger than the input.
1589	 */
1590	CHECK(size_buffer(j->mctx, &j->it.target, rrhdr.size));
1591
1592	/*
1593	 * Parse the owner name.  We don't know where it
1594	 * ends yet, so we make the entire "remaining"
1595	 * part of the buffer "active".
1596	 */
1597	isc_buffer_setactive(&j->it.source,
1598			     j->it.source.used - j->it.source.current);
1599	CHECK(dns_name_fromwire(&j->it.name, &j->it.source,
1600				&j->it.dctx, 0, &j->it.target));
1601
1602	/*
1603	 * Check that the RR header is there, and parse it.
1604	 */
1605	if (isc_buffer_remaininglength(&j->it.source) < 10)
1606		FAIL(DNS_R_FORMERR);
1607
1608	rdtype = isc_buffer_getuint16(&j->it.source);
1609	rdclass = isc_buffer_getuint16(&j->it.source);
1610	ttl = isc_buffer_getuint32(&j->it.source);
1611	rdlen = isc_buffer_getuint16(&j->it.source);
1612
1613	/*
1614	 * Parse the rdata.
1615	 */
1616	isc_buffer_setactive(&j->it.source, rdlen);
1617	dns_rdata_reset(&j->it.rdata);
1618	CHECK(dns_rdata_fromwire(&j->it.rdata, rdclass,
1619				 rdtype, &j->it.source, &j->it.dctx,
1620				 0, &j->it.target));
1621	j->it.ttl = ttl;
1622
1623	j->it.xpos += sizeof(journal_rawrrhdr_t) + rrhdr.size;
1624	if (rdtype == dns_rdatatype_soa) {
1625		/* XXX could do additional consistency checks here */
1626		j->it.current_serial = dns_soa_getserial(&j->it.rdata);
1627	}
1628
1629	result = ISC_R_SUCCESS;
1630
1631 failure:
1632	j->it.result = result;
1633	return (result);
1634}
1635
1636isc_result_t
1637dns_journal_next_rr(dns_journal_t *j) {
1638	j->it.result = read_one_rr(j);
1639	return (j->it.result);
1640}
1641
1642void
1643dns_journal_current_rr(dns_journal_t *j, dns_name_t **name, isc_uint32_t *ttl,
1644		   dns_rdata_t **rdata)
1645{
1646	REQUIRE(j->it.result == ISC_R_SUCCESS);
1647	*name = &j->it.name;
1648	*ttl = j->it.ttl;
1649	*rdata = &j->it.rdata;
1650}
1651
1652/**************************************************************************/
1653/*
1654 * Generating diffs from databases
1655 */
1656
1657/*
1658 * Construct a diff containing all the RRs at the current name of the
1659 * database iterator 'dbit' in database 'db', version 'ver'.
1660 * Set '*name' to the current name, and append the diff to 'diff'.
1661 * All new tuples will have the operation 'op'.
1662 *
1663 * Requires: 'name' must have buffer large enough to hold the name.
1664 * Typically, a dns_fixedname_t would be used.
1665 */
1666static isc_result_t
1667get_name_diff(dns_db_t *db, dns_dbversion_t *ver, isc_stdtime_t now,
1668	      dns_dbiterator_t *dbit, dns_name_t *name, dns_diffop_t op,
1669	      dns_diff_t *diff)
1670{
1671	isc_result_t result;
1672	dns_dbnode_t *node = NULL;
1673	dns_rdatasetiter_t *rdsiter = NULL;
1674	dns_difftuple_t *tuple = NULL;
1675
1676	result = dns_dbiterator_current(dbit, &node, name);
1677	if (result != ISC_R_SUCCESS)
1678		return (result);
1679
1680	result = dns_db_allrdatasets(db, node, ver, now, &rdsiter);
1681	if (result != ISC_R_SUCCESS)
1682		goto cleanup_node;
1683
1684	for (result = dns_rdatasetiter_first(rdsiter);
1685	     result == ISC_R_SUCCESS;
1686	     result = dns_rdatasetiter_next(rdsiter))
1687	{
1688		dns_rdataset_t rdataset;
1689
1690		dns_rdataset_init(&rdataset);
1691		dns_rdatasetiter_current(rdsiter, &rdataset);
1692
1693		for (result = dns_rdataset_first(&rdataset);
1694		     result == ISC_R_SUCCESS;
1695		     result = dns_rdataset_next(&rdataset))
1696		{
1697			dns_rdata_t rdata = DNS_RDATA_INIT;
1698			dns_rdataset_current(&rdataset, &rdata);
1699			result = dns_difftuple_create(diff->mctx, op, name,
1700						      rdataset.ttl, &rdata,
1701						      &tuple);
1702			if (result != ISC_R_SUCCESS) {
1703				dns_rdataset_disassociate(&rdataset);
1704				goto cleanup_iterator;
1705			}
1706			dns_diff_append(diff, &tuple);
1707		}
1708		dns_rdataset_disassociate(&rdataset);
1709		if (result != ISC_R_NOMORE)
1710			goto cleanup_iterator;
1711	}
1712	if (result != ISC_R_NOMORE)
1713		goto cleanup_iterator;
1714
1715	result = ISC_R_SUCCESS;
1716
1717 cleanup_iterator:
1718	dns_rdatasetiter_destroy(&rdsiter);
1719
1720 cleanup_node:
1721	dns_db_detachnode(db, &node);
1722
1723	return (result);
1724}
1725
1726/*
1727 * Comparison function for use by dns_diff_subtract when sorting
1728 * the diffs to be subtracted.  The sort keys are the rdata type
1729 * and the rdata itself.  The owner name is ignored, because
1730 * it is known to be the same for all tuples.
1731 */
1732static int
1733rdata_order(const void *av, const void *bv) {
1734	dns_difftuple_t const * const *ap = av;
1735	dns_difftuple_t const * const *bp = bv;
1736	dns_difftuple_t const *a = *ap;
1737	dns_difftuple_t const *b = *bp;
1738	int r;
1739	r = (b->rdata.type - a->rdata.type);
1740	if (r != 0)
1741		return (r);
1742	r = dns_rdata_compare(&a->rdata, &b->rdata);
1743	return (r);
1744}
1745
1746static isc_result_t
1747dns_diff_subtract(dns_diff_t diff[2], dns_diff_t *r) {
1748	isc_result_t result;
1749	dns_difftuple_t *p[2];
1750	int i, t;
1751	CHECK(dns_diff_sort(&diff[0], rdata_order));
1752	CHECK(dns_diff_sort(&diff[1], rdata_order));
1753
1754	for (;;) {
1755		p[0] = ISC_LIST_HEAD(diff[0].tuples);
1756		p[1] = ISC_LIST_HEAD(diff[1].tuples);
1757		if (p[0] == NULL && p[1] == NULL)
1758			break;
1759
1760		for (i = 0; i < 2; i++)
1761			if (p[!i] == NULL) {
1762				ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1763				ISC_LIST_APPEND(r->tuples, p[i], link);
1764				goto next;
1765			}
1766		t = rdata_order(&p[0], &p[1]);
1767		if (t < 0) {
1768			ISC_LIST_UNLINK(diff[0].tuples, p[0], link);
1769			ISC_LIST_APPEND(r->tuples, p[0], link);
1770			goto next;
1771		}
1772		if (t > 0) {
1773			ISC_LIST_UNLINK(diff[1].tuples, p[1], link);
1774			ISC_LIST_APPEND(r->tuples, p[1], link);
1775			goto next;
1776		}
1777		INSIST(t == 0);
1778		/*
1779		 * Identical RRs in both databases; skip them both.
1780		 */
1781		for (i = 0; i < 2; i++) {
1782			ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1783			dns_difftuple_free(&p[i]);
1784		}
1785	next: ;
1786	}
1787	result = ISC_R_SUCCESS;
1788 failure:
1789	return (result);
1790}
1791
1792/*
1793 * Compare the databases 'dba' and 'dbb' and generate a journal
1794 * entry containing the changes to make 'dba' from 'dbb' (note
1795 * the order).  This journal entry will consist of a single,
1796 * possibly very large transaction.
1797 */
1798
1799isc_result_t
1800dns_db_diff(isc_mem_t *mctx,
1801	    dns_db_t *dba, dns_dbversion_t *dbvera,
1802	    dns_db_t *dbb, dns_dbversion_t *dbverb,
1803	    const char *journal_filename)
1804{
1805	dns_db_t *db[2];
1806	dns_dbversion_t *ver[2];
1807	dns_dbiterator_t *dbit[2] = { NULL, NULL };
1808	isc_boolean_t have[2] = { ISC_FALSE, ISC_FALSE };
1809	dns_fixedname_t fixname[2];
1810	isc_result_t result, itresult[2];
1811	dns_diff_t diff[2], resultdiff;
1812	int i, t;
1813	dns_journal_t *journal = NULL;
1814
1815	db[0] = dba, db[1] = dbb;
1816	ver[0] = dbvera, ver[1] = dbverb;
1817
1818	dns_diff_init(mctx, &diff[0]);
1819	dns_diff_init(mctx, &diff[1]);
1820	dns_diff_init(mctx, &resultdiff);
1821
1822	dns_fixedname_init(&fixname[0]);
1823	dns_fixedname_init(&fixname[1]);
1824
1825	result = dns_journal_open(mctx, journal_filename, ISC_TRUE, &journal);
1826	if (result != ISC_R_SUCCESS)
1827		return (result);
1828
1829	result = dns_db_createiterator(db[0], ISC_FALSE, &dbit[0]);
1830	if (result != ISC_R_SUCCESS)
1831		goto cleanup_journal;
1832	result = dns_db_createiterator(db[1], ISC_FALSE, &dbit[1]);
1833	if (result != ISC_R_SUCCESS)
1834		goto cleanup_interator0;
1835
1836	itresult[0] = dns_dbiterator_first(dbit[0]);
1837	itresult[1] = dns_dbiterator_first(dbit[1]);
1838
1839	for (;;) {
1840		for (i = 0; i < 2; i++) {
1841			if (! have[i] && itresult[i] == ISC_R_SUCCESS) {
1842				CHECK(get_name_diff(db[i], ver[i], 0, dbit[i],
1843					    dns_fixedname_name(&fixname[i]),
1844					    i == 0 ?
1845					    DNS_DIFFOP_ADD :
1846					    DNS_DIFFOP_DEL,
1847					    &diff[i]));
1848				itresult[i] = dns_dbiterator_next(dbit[i]);
1849				have[i] = ISC_TRUE;
1850			}
1851		}
1852
1853		if (! have[0] && ! have[1]) {
1854			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1855			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1856			break;
1857		}
1858
1859		for (i = 0; i < 2; i++) {
1860			if (! have[!i]) {
1861				ISC_LIST_APPENDLIST(resultdiff.tuples,
1862						    diff[i].tuples, link);
1863				INSIST(ISC_LIST_EMPTY(diff[i].tuples));
1864				have[i] = ISC_FALSE;
1865				goto next;
1866			}
1867		}
1868
1869		t = dns_name_compare(dns_fixedname_name(&fixname[0]),
1870				     dns_fixedname_name(&fixname[1]));
1871		if (t < 0) {
1872			ISC_LIST_APPENDLIST(resultdiff.tuples,
1873					    diff[0].tuples, link);
1874			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1875			have[0] = ISC_FALSE;
1876			continue;
1877		}
1878		if (t > 0) {
1879			ISC_LIST_APPENDLIST(resultdiff.tuples,
1880					    diff[1].tuples, link);
1881			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1882			have[1] = ISC_FALSE;
1883			continue;
1884		}
1885		INSIST(t == 0);
1886		CHECK(dns_diff_subtract(diff, &resultdiff));
1887		INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1888		INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1889		have[0] = have[1] = ISC_FALSE;
1890	next: ;
1891	}
1892	if (itresult[0] != ISC_R_NOMORE)
1893		FAIL(itresult[0]);
1894	if (itresult[1] != ISC_R_NOMORE)
1895		FAIL(itresult[1]);
1896
1897	if (ISC_LIST_EMPTY(resultdiff.tuples)) {
1898		isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no changes");
1899	} else {
1900		CHECK(dns_journal_write_transaction(journal, &resultdiff));
1901	}
1902	INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1903	INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1904
1905 failure:
1906	dns_diff_clear(&resultdiff);
1907	dns_dbiterator_destroy(&dbit[1]);
1908 cleanup_interator0:
1909	dns_dbiterator_destroy(&dbit[0]);
1910 cleanup_journal:
1911	dns_journal_destroy(&journal);
1912	return (result);
1913}
1914
1915isc_result_t
1916dns_journal_compact(isc_mem_t *mctx, char *filename, isc_uint32_t serial,
1917		    isc_uint32_t target_size)
1918{
1919	unsigned int i;
1920	journal_pos_t best_guess;
1921	journal_pos_t current_pos;
1922	dns_journal_t *j = NULL;
1923	journal_rawheader_t rawheader;
1924	unsigned int copy_length;
1925	unsigned int len;
1926	char *buf = NULL;
1927	unsigned int size = 0;
1928	isc_result_t result;
1929	unsigned int indexend;
1930
1931	CHECK(journal_open(mctx, filename, ISC_TRUE, ISC_FALSE, &j));
1932
1933	if (JOURNAL_EMPTY(&j->header)) {
1934		dns_journal_destroy(&j);
1935		return (ISC_R_SUCCESS);
1936	}
1937
1938	if (DNS_SERIAL_GT(j->header.begin.serial, serial) ||
1939	    DNS_SERIAL_GT(serial, j->header.end.serial)) {
1940		dns_journal_destroy(&j);
1941		return (ISC_R_RANGE);
1942	}
1943
1944	/*
1945	 * Cope with very small target sizes.
1946	 */
1947	indexend = sizeof(journal_rawheader_t) +
1948		   j->header.index_size * sizeof(journal_rawpos_t);
1949	if (target_size < indexend * 2)
1950		target_size = target_size/2 + indexend;
1951
1952	/*
1953	 * See if there is any work to do.
1954	 */
1955	if ((isc_uint32_t) j->header.end.offset < target_size) {
1956		dns_journal_destroy(&j);
1957		return (ISC_R_SUCCESS);
1958	}
1959
1960	/*
1961	 * Remove overhead so space test below can succeed.
1962	 */
1963	if (target_size >= indexend)
1964		target_size -= indexend;
1965
1966	/*
1967	 * Find if we can create enough free space.
1968	 */
1969	best_guess = j->header.begin;
1970	for (i = 0; i < j->header.index_size; i++) {
1971		if (POS_VALID(j->index[i]) &&
1972		    DNS_SERIAL_GE(serial, j->index[i].serial) &&
1973		    ((isc_uint32_t)(j->header.end.offset - j->index[i].offset)
1974		     >= target_size / 2) &&
1975		    j->index[i].offset > best_guess.offset)
1976			best_guess = j->index[i];
1977	}
1978
1979	current_pos = best_guess;
1980	while (current_pos.serial != serial) {
1981		CHECK(journal_next(j, &current_pos));
1982		if (current_pos.serial == j->header.end.serial)
1983			break;
1984
1985		if (DNS_SERIAL_GE(serial, current_pos.serial) &&
1986		   ((isc_uint32_t)(j->header.end.offset - current_pos.offset)
1987		     >= (target_size / 2)) &&
1988		    current_pos.offset > best_guess.offset)
1989			best_guess = current_pos;
1990		else
1991			break;
1992	}
1993
1994	INSIST(best_guess.serial != j->header.end.serial);
1995	if (best_guess.serial != serial)
1996		CHECK(journal_next(j, &best_guess));
1997
1998	/*
1999	 * Enough space to proceed?
2000	 */
2001	if ((isc_uint32_t) (j->header.end.offset - best_guess.offset) >
2002	     (isc_uint32_t) (best_guess.offset - indexend)) {
2003		dns_journal_destroy(&j);
2004		return (ISC_R_NOSPACE);
2005	}
2006
2007	copy_length = j->header.end.offset - best_guess.offset;
2008
2009	/*
2010	 * Invalidate entire index, will be rebuilt at end.
2011	 */
2012	for (i = 0; i < j->header.index_size; i++) {
2013		if (POS_VALID(j->index[i]))
2014			POS_INVALIDATE(j->index[i]);
2015	}
2016
2017	/*
2018	 * Convert the index into on-disk format and write
2019	 * it to disk.
2020	 */
2021	CHECK(index_to_disk(j));
2022	CHECK(journal_fsync(j));
2023
2024	/*
2025	 * Update the journal header.
2026	 */
2027	if (copy_length == 0) {
2028		j->header.begin.serial = 0;
2029		j->header.end.serial = 0;
2030		j->header.begin.offset = 0;
2031		j->header.end.offset = 0;
2032	} else {
2033		j->header.begin = best_guess;
2034	}
2035	journal_header_encode(&j->header, &rawheader);
2036	CHECK(journal_seek(j, 0));
2037	CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
2038	CHECK(journal_fsync(j));
2039
2040	if (copy_length != 0) {
2041		/*
2042		 * Copy best_guess to end into space just freed.
2043		 */
2044		size = 64*1024;
2045		if (copy_length < size)
2046			size = copy_length;
2047		buf = isc_mem_get(mctx, size);
2048		if (buf == NULL) {
2049			result = ISC_R_NOMEMORY;
2050			goto failure;
2051		}
2052
2053		for (i = 0; i < copy_length; i += size) {
2054			len = (copy_length - i) > size ? size :
2055							 (copy_length - i);
2056			CHECK(journal_seek(j, best_guess.offset + i));
2057			CHECK(journal_read(j, buf, len));
2058			CHECK(journal_seek(j, indexend + i));
2059			CHECK(journal_write(j, buf, len));
2060		}
2061
2062		CHECK(journal_fsync(j));
2063
2064		/*
2065		 * Compute new header.
2066		 */
2067		j->header.begin.offset = indexend;
2068		j->header.end.offset = indexend + copy_length;
2069		/*
2070		 * Update the journal header.
2071		 */
2072		journal_header_encode(&j->header, &rawheader);
2073		CHECK(journal_seek(j, 0));
2074		CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
2075		CHECK(journal_fsync(j));
2076
2077		/*
2078		 * Build new index.
2079		 */
2080		current_pos = j->header.begin;
2081		while (current_pos.serial != j->header.end.serial) {
2082			index_add(j, &current_pos);
2083			CHECK(journal_next(j, &current_pos));
2084		}
2085
2086		/*
2087		 * Write index.
2088		 */
2089		CHECK(index_to_disk(j));
2090		CHECK(journal_fsync(j));
2091
2092		indexend = j->header.end.offset;
2093	}
2094	dns_journal_destroy(&j);
2095	(void)isc_file_truncate(filename, (isc_offset_t)indexend);
2096	result = ISC_R_SUCCESS;
2097
2098 failure:
2099	if (buf != NULL)
2100		isc_mem_put(mctx, buf, size);
2101	if (j != NULL)
2102		dns_journal_destroy(&j);
2103	return (result);
2104}
2105
2106static isc_result_t
2107index_to_disk(dns_journal_t *j) {
2108	isc_result_t result = ISC_R_SUCCESS;
2109
2110	if (j->header.index_size != 0) {
2111		unsigned int i;
2112		unsigned char *p;
2113		unsigned int rawbytes;
2114
2115		rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
2116
2117		p = j->rawindex;
2118		for (i = 0; i < j->header.index_size; i++) {
2119			encode_uint32(j->index[i].serial, p);
2120			p += 4;
2121			encode_uint32(j->index[i].offset, p);
2122			p += 4;
2123		}
2124		INSIST(p == j->rawindex + rawbytes);
2125
2126		CHECK(journal_seek(j, sizeof(journal_rawheader_t)));
2127		CHECK(journal_write(j, j->rawindex, rawbytes));
2128	}
2129failure:
2130	return (result);
2131}
2132