journal.c revision 153816
1/*
2 * Copyright (C) 2004, 2005  Internet Systems Consortium, Inc. ("ISC")
3 * Copyright (C) 1999-2002  Internet Software Consortium.
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15 * PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/* $Id: journal.c,v 1.77.2.1.10.13 2005/11/03 23:08:41 marka Exp $ */
19
20#include <config.h>
21
22#include <stdlib.h>
23#include <unistd.h>
24
25#include <isc/file.h>
26#include <isc/mem.h>
27#include <isc/stdio.h>
28#include <isc/string.h>
29#include <isc/util.h>
30
31#include <dns/compress.h>
32#include <dns/db.h>
33#include <dns/dbiterator.h>
34#include <dns/diff.h>
35#include <dns/fixedname.h>
36#include <dns/journal.h>
37#include <dns/log.h>
38#include <dns/rdataset.h>
39#include <dns/rdatasetiter.h>
40#include <dns/result.h>
41#include <dns/soa.h>
42
/*
 * When true, accept IXFR difference sequences where the
 * SOA serial number does not change (BIND 8 sends such
 * sequences).
 *
 * Within this file the flag is consulted by dns_journal_commit(),
 * which then tolerates a transaction whose ending serial equals its
 * starting serial.  The value is fixed at compile time; the XXX
 * marks it as something that ought to become configurable.
 */
static isc_boolean_t bind8_compat = ISC_TRUE; /* XXX config */
49
50/**************************************************************************/
51/*
52 * Miscellaneous utilities.
53 */
54
/*
 * Leading arguments for the isc_log_write() calls made from this
 * file: dns_lctx followed by the category and module identifiers
 * used for journal messages.  The caller appends the log level, the
 * format string and the format arguments.
 */
#define JOURNAL_COMMON_LOGARGS \
	dns_lctx, DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL

/*
 * As JOURNAL_COMMON_LOGARGS, but with the level already supplied
 * as ISC_LOG_DEBUG(n).
 */
#define JOURNAL_DEBUG_LOGARGS(n) \
	JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(n)
60
/*
 * Error-propagation helpers.  Each one stores its argument in the
 * local variable 'result' and jumps to the local label 'failure'
 * unless the value is ISC_R_SUCCESS, so a function using them must
 * provide both 'result' and 'failure'.
 *
 * FAIL() is intended for raising a known error code.  Handing it
 * ISC_R_SUCCESS would be non-sensical (or at least obtuse), but the
 * test is kept because it stops the Solaris compiler from
 * complaining about "end-of-loop code not reached".
 *
 * CHECK() is intended for wrapping an operation that may fail.
 */
#define FAIL(code)						\
	do {							\
		result = (code);				\
		if (result != ISC_R_SUCCESS)			\
			goto failure;				\
	} while (0)

#define CHECK(op)						\
	do {							\
		result = (op);					\
		if (result != ISC_R_SUCCESS)			\
			goto failure;				\
	} while (0)
75
/*
 * Forward declaration.  index_to_disk() is static, so its definition
 * lives elsewhere in this file; it is declared here ahead of it.
 */
static isc_result_t index_to_disk(dns_journal_t *);
77
78static inline isc_uint32_t
79decode_uint32(unsigned char *p) {
80	return ((p[0] << 24) +
81		(p[1] << 16) +
82		(p[2] <<  8) +
83		(p[3] <<  0));
84}
85
86static inline void
87encode_uint32(isc_uint32_t val, unsigned char *p) {
88	p[0] = (isc_uint8_t)(val >> 24);
89	p[1] = (isc_uint8_t)(val >> 16);
90	p[2] = (isc_uint8_t)(val >>  8);
91	p[3] = (isc_uint8_t)(val >>  0);
92}
93
94isc_result_t
95dns_db_createsoatuple(dns_db_t *db, dns_dbversion_t *ver, isc_mem_t *mctx,
96		      dns_diffop_t op, dns_difftuple_t **tp)
97{
98	isc_result_t result;
99	dns_dbnode_t *node;
100	dns_rdataset_t rdataset;
101	dns_rdata_t rdata = DNS_RDATA_INIT;
102	dns_name_t *zonename;
103
104	zonename = dns_db_origin(db);
105
106	node = NULL;
107	result = dns_db_findnode(db, zonename, ISC_FALSE, &node);
108	if (result != ISC_R_SUCCESS)
109		goto nonode;
110
111	dns_rdataset_init(&rdataset);
112	result = dns_db_findrdataset(db, node, ver, dns_rdatatype_soa, 0,
113				     (isc_stdtime_t)0, &rdataset, NULL);
114 	if (result != ISC_R_SUCCESS)
115		goto freenode;
116
117	result = dns_rdataset_first(&rdataset);
118 	if (result != ISC_R_SUCCESS)
119		goto freenode;
120
121	dns_rdataset_current(&rdataset, &rdata);
122
123	result = dns_difftuple_create(mctx, op, zonename, rdataset.ttl,
124				      &rdata, tp);
125
126	dns_rdataset_disassociate(&rdataset);
127	dns_db_detachnode(db, &node);
128	return (ISC_R_SUCCESS);
129
130 freenode:
131	dns_db_detachnode(db, &node);
132 nonode:
133	UNEXPECTED_ERROR(__FILE__, __LINE__, "missing SOA");
134	return (result);
135}
136
137/**************************************************************************/
138/*
139 * Journalling.
140 */
141
142/*
143 * A journal file consists of
144 *
145 *   - A fixed-size header of type journal_rawheader_t.
146 *
147 *   - The index.  This is an unordered array of index entries
148 *     of type journal_rawpos_t giving the locations
149 *     of some arbitrary subset of the journal's addressable
150 *     transactions.  The index entries are used as hints to
151 *     speed up the process of locating a transaction with a given
152 *     serial number.  Unused index entries have an "offset"
153 *     field of zero.  The size of the index can vary between
154 *     journal files, but does not change during the lifetime
155 *     of a file.  The size can be zero.
156 *
157 *   - The journal data.  This  consists of one or more transactions.
158 *     Each transaction begins with a transaction header of type
159 *     journal_rawxhdr_t.  The transaction header is followed by a
160 *     sequence of RRs, similar in structure to an IXFR difference
161 *     sequence (RFC1995).  That is, the pre-transaction SOA,
162 *     zero or more other deleted RRs, the post-transaction SOA,
163 *     and zero or more other added RRs.  Unlike in IXFR, each RR
164 *     is prefixed with a 32-bit length.
165 *
166 *     The journal data part grows as new transactions are
167 *     appended to the file.  Only those transactions
168 *     whose serial number is current-(2^31-1) to current
169 *     are considered "addressable" and may be pointed
170 *     to from the header or index.  They may be preceded
171 *     by old transactions that are no longer addressable,
172 *     and they may be followed by transactions that were
173 *     appended to the journal but never committed by updating
174 *     the "end" position in the header.  The latter will
175 *     be overwritten when new transactions are added.
176 */
177
/*
 * On-disk representation of a "pointer" to a journal entry.
 * These are used in the journal header to locate the beginning
 * and end of the journal, and in the journal index to locate
 * other transactions.
 *
 * Both fields are 4-byte big-endian integers.  journal_pos_decode()
 * and journal_pos_encode() convert between this layout and the
 * in-core journal_pos_t.
 */
typedef struct {
	unsigned char	serial[4];  /* SOA serial before update. */
	/*
	 * XXXRTH  Should offset be 8 bytes?
	 * XXXDCL ... probably, since isc_offset_t is 8 bytes on many OSs.
	 * XXXAG  ... but we will not be able to seek >2G anyway on many
	 *            platforms as long as we are using fseek() rather
	 *            than lseek().
	 */
	unsigned char	offset[4];  /* Offset from beginning of file. */
} journal_rawpos_t;
195
/*
 * The on-disk representation of the journal header.
 * All numbers are stored in big-endian order.
 */

/*
 * The header is of a fixed size, with some spare room for future
 * extensions.
 */
#define JOURNAL_HEADER_SIZE 64 /* Bytes. */

/*
 * The union overlays the meaningful fields ('h') with a byte array of
 * JOURNAL_HEADER_SIZE bytes.  journal_file_create() INSISTs that
 * sizeof(journal_rawheader_t) equals JOURNAL_HEADER_SIZE, and
 * journal_header_encode() zeroes 'pad' before filling in 'h'.
 */
typedef union {
	struct {
		/* File format version ID. */
		unsigned char 		format[16];
		/* Position of the first addressable transaction */
		journal_rawpos_t 	begin;
		/* Position of the next (yet nonexistent) transaction. */
		journal_rawpos_t 	end;
		/* Number of index entries following the header. */
		unsigned char 		index_size[4];
	} h;
	/* Pad the header to a fixed size. */
	unsigned char pad[JOURNAL_HEADER_SIZE];
} journal_rawheader_t;
221
/*
 * The on-disk representation of the transaction header.
 * There is one of these at the beginning of each transaction.
 *
 * All three fields are 4-byte big-endian integers; they are read by
 * journal_read_xhdr() and written by journal_write_xhdr().
 */
typedef struct {
	unsigned char	size[4]; 	/* In bytes, excluding header. */
	unsigned char	serial0[4];	/* SOA serial before update. */
	unsigned char	serial1[4];	/* SOA serial after update. */
} journal_rawxhdr_t;
231
/*
 * The on-disk representation of the RR header.
 * There is one of these at the beginning of each RR.
 *
 * It consists solely of the 4-byte big-endian length of the RR
 * that follows; journal_read_rrhdr() decodes it.
 */
typedef struct {
	unsigned char	size[4]; 	/* In bytes, excluding header. */
} journal_rawrrhdr_t;
239
/*
 * The in-core representation of a journal position (a serial number
 * and the file offset that goes with it).  This is the decoded
 * counterpart of journal_rawpos_t.
 */
typedef struct {
	isc_uint32_t	serial;
	isc_offset_t	offset;
} journal_pos_t;

/*
 * A position counts as valid when its offset is nonzero.
 * POS_INVALIDATE() clears both the offset and the serial.
 */
#define POS_VALID(pos) 		((pos).offset != 0)
#define POS_INVALIDATE(pos) 	((pos).offset = 0, (pos).serial = 0)
250
/*
 * The in-core representation of the journal header: the decoded
 * counterpart of journal_rawheader_t.  See journal_header_decode()
 * and journal_header_encode() for the conversions.
 */
typedef struct {
	unsigned char 	format[16];
	journal_pos_t 	begin;
	journal_pos_t 	end;
	isc_uint32_t	index_size;
} journal_header_t;
257
/*
 * The in-core representation of the transaction header: the decoded
 * counterpart of journal_rawxhdr_t, filled in by journal_read_xhdr().
 */

typedef struct {
	isc_uint32_t	size;
	isc_uint32_t	serial0;
	isc_uint32_t	serial1;
} journal_xhdr_t;
267
/*
 * The in-core representation of the RR header: the decoded
 * counterpart of journal_rawrrhdr_t, filled in by journal_read_rrhdr().
 */
typedef struct {
	isc_uint32_t	size;
} journal_rrhdr_t;
274
275
/*
 * Initial contents to store in the header of a newly created
 * journal file.
 *
 * The header starts with the magic string ";BIND LOG V9\n"
 * to identify the file as a BIND 9 journal file.  An ASCII
 * identification string is used rather than a binary magic
 * number to be consistent with BIND 8 (BIND 8 journal files
 * are ASCII text files).
 *
 * The begin and end positions and the index size start out as zero.
 * journal_file_create() copies this template and sets the index size
 * before writing it out; journal_open() compares the format field of
 * an existing file against the one stored here.
 */

static journal_header_t
initial_journal_header = { ";BIND LOG V9\n", { 0, 0 }, { 0, 0 }, 0 };
289
/*
 * True when the in-core header 'h' (a pointer to journal_header_t)
 * has identical begin and end offsets, i.e. holds no transactions.
 */
#define JOURNAL_EMPTY(h) ((h)->begin.offset == (h)->end.offset)
291
/*
 * The state of an open journal object.
 */
typedef enum {
	/* Set while journal_open() is still constructing the object. */
	JOURNAL_STATE_INVALID,
	/* Set by journal_open() for a journal opened without 'write'. */
	JOURNAL_STATE_READ,
	/* Set by journal_open() for a journal opened with 'write'. */
	JOURNAL_STATE_WRITE,
	/* Set by dns_journal_begin_transaction(). */
	JOURNAL_STATE_TRANSACTION
} journal_state_t;
298
/*
 * An open journal file.
 *
 * Objects are built by journal_open().  The 'x' member tracks the
 * transaction currently being written; the 'it' member holds the
 * state used when iterating over the journal for reading.
 */
struct dns_journal {
	unsigned int		magic;		/* JOUR */
	isc_mem_t		*mctx;		/* Memory context */
	journal_state_t		state;
	const char 		*filename;	/* Journal file name */
	FILE *			fp;		/* File handle */
	isc_offset_t		offset;		/* Current file offset */
	journal_header_t 	header;		/* In-core journal header */
	unsigned char		*rawindex;	/* In-core buffer for journal
						   index in on-disk format */
	journal_pos_t		*index;		/* In-core journal index */

	/* Current transaction state (when writing). */
	struct {
		unsigned int	n_soa;		/* Number of SOAs seen */
		journal_pos_t	pos[2];		/* Begin/end position */
	} x;

	/* Iteration state (when reading). */
	struct {
		/* These define the part of the journal we iterate over. */
		journal_pos_t bpos;		/* Position before first, */
		journal_pos_t epos;		/* and after last
						   transaction */
		/* The rest is iterator state. */
		isc_uint32_t current_serial;	/* Current SOA serial */
		isc_buffer_t source;		/* Data from disk */
		isc_buffer_t target;		/* Data from _fromwire check */
		dns_decompress_t dctx;		/* Dummy decompression ctx */
		dns_name_t name;		/* Current domain name */
		dns_rdata_t rdata;		/* Current rdata */
		isc_uint32_t ttl;		/* Current TTL */
		unsigned int xsize;		/* Size of transaction data */
		unsigned int xpos;		/* Current position in it */
		isc_result_t result;		/* Result of last call */
	} it;
};
336
/*
 * Magic number kept in dns_journal.magic, and the validity test
 * built on it.  journal_open() stores the magic once the file has
 * been opened and resets it to zero on its failure path.
 */
#define DNS_JOURNAL_MAGIC	ISC_MAGIC('J', 'O', 'U', 'R')
#define DNS_JOURNAL_VALID(t)	ISC_MAGIC_VALID(t, DNS_JOURNAL_MAGIC)
339
340static void
341journal_pos_decode(journal_rawpos_t *raw, journal_pos_t *cooked) {
342	cooked->serial = decode_uint32(raw->serial);
343	cooked->offset = decode_uint32(raw->offset);
344}
345
346static void
347journal_pos_encode(journal_rawpos_t *raw, journal_pos_t *cooked) {
348	encode_uint32(cooked->serial, raw->serial);
349	encode_uint32(cooked->offset, raw->offset);
350}
351
352static void
353journal_header_decode(journal_rawheader_t *raw, journal_header_t *cooked) {
354	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
355	memcpy(cooked->format, raw->h.format, sizeof(cooked->format));
356	journal_pos_decode(&raw->h.begin, &cooked->begin);
357	journal_pos_decode(&raw->h.end, &cooked->end);
358	cooked->index_size = decode_uint32(raw->h.index_size);
359}
360
361static void
362journal_header_encode(journal_header_t *cooked, journal_rawheader_t *raw) {
363	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
364	memset(raw->pad, 0, sizeof(raw->pad));
365	memcpy(raw->h.format, cooked->format, sizeof(raw->h.format));
366	journal_pos_encode(&raw->h.begin, &cooked->begin);
367	journal_pos_encode(&raw->h.end, &cooked->end);
368	encode_uint32(cooked->index_size, raw->h.index_size);
369}
370
371/*
372 * Journal file I/O subroutines, with error checking and reporting.
373 */
374static isc_result_t
375journal_seek(dns_journal_t *j, isc_uint32_t offset) {
376	isc_result_t result;
377	result = isc_stdio_seek(j->fp, (long)offset, SEEK_SET);
378	if (result != ISC_R_SUCCESS) {
379		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
380			      "%s: seek: %s", j->filename,
381			      isc_result_totext(result));
382		return (ISC_R_UNEXPECTED);
383	}
384	j->offset = offset;
385	return (ISC_R_SUCCESS);
386}
387
388static isc_result_t
389journal_read(dns_journal_t *j, void *mem, size_t nbytes) {
390	isc_result_t result;
391
392	result = isc_stdio_read(mem, 1, nbytes, j->fp, NULL);
393	if (result != ISC_R_SUCCESS) {
394		if (result == ISC_R_EOF)
395			return (ISC_R_NOMORE);
396		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
397			      "%s: read: %s",
398			      j->filename, isc_result_totext(result));
399		return (ISC_R_UNEXPECTED);
400	}
401	j->offset += nbytes;
402	return (ISC_R_SUCCESS);
403}
404
405static isc_result_t
406journal_write(dns_journal_t *j, void *mem, size_t nbytes) {
407	isc_result_t result;
408
409	result = isc_stdio_write(mem, 1, nbytes, j->fp, NULL);
410	if (result != ISC_R_SUCCESS) {
411		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
412			      "%s: write: %s",
413			      j->filename, isc_result_totext(result));
414		return (ISC_R_UNEXPECTED);
415	}
416	j->offset += nbytes;
417	return (ISC_R_SUCCESS);
418}
419
420static isc_result_t
421journal_fsync(dns_journal_t *j) {
422	isc_result_t result;
423	result = isc_stdio_flush(j->fp);
424	if (result != ISC_R_SUCCESS) {
425		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
426			      "%s: flush: %s",
427			      j->filename, isc_result_totext(result));
428		return (ISC_R_UNEXPECTED);
429	}
430	result = isc_stdio_sync(j->fp);
431	if (result != ISC_R_SUCCESS) {
432		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
433			      "%s: fsync: %s",
434			      j->filename, isc_result_totext(result));
435		return (ISC_R_UNEXPECTED);
436	}
437	return (ISC_R_SUCCESS);
438}
439
440/*
441 * Read/write a transaction header at the current file position.
442 */
443
444static isc_result_t
445journal_read_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr) {
446	journal_rawxhdr_t raw;
447	isc_result_t result;
448	result = journal_read(j, &raw, sizeof(raw));
449	if (result != ISC_R_SUCCESS)
450		return (result);
451	xhdr->size = decode_uint32(raw.size);
452	xhdr->serial0 = decode_uint32(raw.serial0);
453	xhdr->serial1 = decode_uint32(raw.serial1);
454	return (ISC_R_SUCCESS);
455}
456
457static isc_result_t
458journal_write_xhdr(dns_journal_t *j, isc_uint32_t size,
459		   isc_uint32_t serial0, isc_uint32_t serial1)
460{
461	journal_rawxhdr_t raw;
462	encode_uint32(size, raw.size);
463	encode_uint32(serial0, raw.serial0);
464	encode_uint32(serial1, raw.serial1);
465	return (journal_write(j, &raw, sizeof(raw)));
466}
467
468
469/*
470 * Read an RR header at the current file position.
471 */
472
473static isc_result_t
474journal_read_rrhdr(dns_journal_t *j, journal_rrhdr_t *rrhdr) {
475	journal_rawrrhdr_t raw;
476	isc_result_t result;
477	result = journal_read(j, &raw, sizeof(raw));
478	if (result != ISC_R_SUCCESS)
479		return (result);
480	rrhdr->size = decode_uint32(raw.size);
481	return (ISC_R_SUCCESS);
482}
483
484static isc_result_t
485journal_file_create(isc_mem_t *mctx, const char *filename) {
486	FILE *fp = NULL;
487	isc_result_t result;
488	journal_header_t header;
489	journal_rawheader_t rawheader;
490	int index_size = 56; /* XXX configurable */
491	int size;
492	void *mem; /* Memory for temporary index image. */
493
494	INSIST(sizeof(journal_rawheader_t) == JOURNAL_HEADER_SIZE);
495
496	result = isc_stdio_open(filename, "wb", &fp);
497	if (result != ISC_R_SUCCESS) {
498		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
499			      "%s: create: %s",
500			      filename, isc_result_totext(result));
501		return (ISC_R_UNEXPECTED);
502	}
503
504	header = initial_journal_header;
505	header.index_size = index_size;
506	journal_header_encode(&header, &rawheader);
507
508	size = sizeof(journal_rawheader_t) +
509		index_size * sizeof(journal_rawpos_t);
510
511	mem = isc_mem_get(mctx, size);
512	if (mem == NULL) {
513		(void)isc_stdio_close(fp);
514		(void)isc_file_remove(filename);
515		return (ISC_R_NOMEMORY);
516	}
517	memset(mem, 0, size);
518	memcpy(mem, &rawheader, sizeof(rawheader));
519
520	result = isc_stdio_write(mem, 1, (size_t) size, fp, NULL);
521	if (result != ISC_R_SUCCESS) {
522		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
523				 "%s: write: %s",
524				 filename, isc_result_totext(result));
525		(void)isc_stdio_close(fp);
526		(void)isc_file_remove(filename);
527		isc_mem_put(mctx, mem, size);
528		return (ISC_R_UNEXPECTED);
529	}
530	isc_mem_put(mctx, mem, size);
531
532	result = isc_stdio_close(fp);
533	if (result != ISC_R_SUCCESS) {
534		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
535				 "%s: close: %s",
536				 filename, isc_result_totext(result));
537		(void)isc_file_remove(filename);
538		return (ISC_R_UNEXPECTED);
539	}
540
541	return (ISC_R_SUCCESS);
542}
543
544static isc_result_t
545journal_open(isc_mem_t *mctx, const char *filename, isc_boolean_t write,
546	     isc_boolean_t create, dns_journal_t **journalp) {
547	FILE *fp = NULL;
548	isc_result_t result;
549	journal_rawheader_t rawheader;
550	dns_journal_t *j;
551
552	INSIST(journalp != NULL && *journalp == NULL);
553	j = isc_mem_get(mctx, sizeof(*j));
554	if (j == NULL)
555		return (ISC_R_NOMEMORY);
556
557	j->mctx = mctx;
558	j->state = JOURNAL_STATE_INVALID;
559	j->fp = NULL;
560	j->filename = filename;
561	j->index = NULL;
562	j->rawindex = NULL;
563
564	result = isc_stdio_open(j->filename, write ? "rb+" : "rb", &fp);
565
566	if (result == ISC_R_FILENOTFOUND) {
567		if (create) {
568			isc_log_write(JOURNAL_COMMON_LOGARGS,
569				      ISC_LOG_INFO,
570				      "journal file %s does not exist, "
571				      "creating it",
572				      j->filename);
573			CHECK(journal_file_create(mctx, filename));
574			/*
575			 * Retry.
576			 */
577			result = isc_stdio_open(j->filename, "rb+", &fp);
578		} else {
579			FAIL(ISC_R_NOTFOUND);
580		}
581	}
582	if (result != ISC_R_SUCCESS) {
583		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
584			      "%s: open: %s",
585			      j->filename, isc_result_totext(result));
586		FAIL(ISC_R_UNEXPECTED);
587	}
588
589	j->fp = fp;
590
591	/*
592	 * Set magic early so that seek/read can succeed.
593	 */
594	j->magic = DNS_JOURNAL_MAGIC;
595
596	CHECK(journal_seek(j, 0));
597	CHECK(journal_read(j, &rawheader, sizeof(rawheader)));
598
599	if (memcmp(rawheader.h.format, initial_journal_header.format,
600		   sizeof(initial_journal_header.format)) != 0) {
601		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
602				 "%s: journal format not recognized",
603				 j->filename);
604		FAIL(ISC_R_UNEXPECTED);
605	}
606	journal_header_decode(&rawheader, &j->header);
607
608	/*
609	 * If there is an index, read the raw index into a dynamically
610	 * allocated buffer and then convert it into a cooked index.
611	 */
612	if (j->header.index_size != 0) {
613		unsigned int i;
614		unsigned int rawbytes;
615		unsigned char *p;
616
617		rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
618		j->rawindex = isc_mem_get(mctx, rawbytes);
619		if (j->rawindex == NULL)
620			FAIL(ISC_R_NOMEMORY);
621
622		CHECK(journal_read(j, j->rawindex, rawbytes));
623
624		j->index = isc_mem_get(mctx, j->header.index_size *
625				       sizeof(journal_pos_t));
626		if (j->index == NULL)
627			FAIL(ISC_R_NOMEMORY);
628
629		p = j->rawindex;
630		for (i = 0; i < j->header.index_size; i++) {
631			j->index[i].serial = decode_uint32(p);
632			p += 4;
633			j->index[i].offset = decode_uint32(p);
634			p += 4;
635		}
636		INSIST(p == j->rawindex + rawbytes);
637	}
638	j->offset = -1; /* Invalid, must seek explicitly. */
639
640	/*
641	 * Initialize the iterator.
642	 */
643	dns_name_init(&j->it.name, NULL);
644	dns_rdata_init(&j->it.rdata);
645
646	/*
647	 * Set up empty initial buffers for uncheched and checked
648	 * wire format RR data.  They will be reallocated
649	 * later.
650	 */
651	isc_buffer_init(&j->it.source, NULL, 0);
652	isc_buffer_init(&j->it.target, NULL, 0);
653	dns_decompress_init(&j->it.dctx, -1, DNS_DECOMPRESS_NONE);
654
655	j->state =
656		write ? JOURNAL_STATE_WRITE : JOURNAL_STATE_READ;
657
658	*journalp = j;
659	return (ISC_R_SUCCESS);
660
661 failure:
662	j->magic = 0;
663	if (j->index != NULL) {
664		isc_mem_put(j->mctx, j->index, j->header.index_size *
665			    sizeof(journal_rawpos_t));
666		j->index = NULL;
667	}
668	if (j->fp != NULL)
669		(void)isc_stdio_close(j->fp);
670	isc_mem_put(j->mctx, j, sizeof(*j));
671	return (result);
672}
673
674isc_result_t
675dns_journal_open(isc_mem_t *mctx, const char *filename, isc_boolean_t write,
676		 dns_journal_t **journalp) {
677	return (journal_open(mctx, filename, write, write, journalp));
678}
679
680/*
681 * A comparison function defining the sorting order for
682 * entries in the IXFR-style journal file.
683 *
684 * The IXFR format requires that deletions are sorted before
685 * additions, and within either one, SOA records are sorted
686 * before others.
687 *
688 * Also sort the non-SOA records by type as a courtesy to the
689 * server receiving the IXFR - it may help reduce the amount of
690 * rdataset merging it has to do.
691 */
692static int
693ixfr_order(const void *av, const void *bv) {
694	dns_difftuple_t const * const *ap = av;
695	dns_difftuple_t const * const *bp = bv;
696	dns_difftuple_t const *a = *ap;
697	dns_difftuple_t const *b = *bp;
698	int r;
699
700	r = (b->op == DNS_DIFFOP_DEL) - (a->op == DNS_DIFFOP_DEL);
701	if (r != 0)
702		return (r);
703
704	r = (b->rdata.type == dns_rdatatype_soa) -
705		(a->rdata.type == dns_rdatatype_soa);
706	if (r != 0)
707		return (r);
708
709	r = (a->rdata.type - b->rdata.type);
710	return (r);
711}
712
713/*
714 * Advance '*pos' to the next journal transaction.
715 *
716 * Requires:
717 *	*pos refers to a valid journal transaction.
718 *
719 * Ensures:
720 *	When ISC_R_SUCCESS is returned,
721 *	*pos refers to the next journal transaction.
722 *
723 * Returns one of:
724 *
725 *    ISC_R_SUCCESS
726 *    ISC_R_NOMORE 	*pos pointed at the last transaction
727 *    Other results due to file errors are possible.
728 */
729static isc_result_t
730journal_next(dns_journal_t *j, journal_pos_t *pos) {
731	isc_result_t result;
732	journal_xhdr_t xhdr;
733	REQUIRE(DNS_JOURNAL_VALID(j));
734
735	result = journal_seek(j, pos->offset);
736	if (result != ISC_R_SUCCESS)
737		return (result);
738
739	if (pos->serial == j->header.end.serial)
740		return (ISC_R_NOMORE);
741	/*
742	 * Read the header of the current transaction.
743	 * This will return ISC_R_NOMORE if we are at EOF.
744	 */
745	result = journal_read_xhdr(j, &xhdr);
746	if (result != ISC_R_SUCCESS)
747		return (result);
748
749	/*
750	 * Check serial number consistency.
751	 */
752	if (xhdr.serial0 != pos->serial) {
753		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
754			      "%s: journal file corrupt: "
755			      "expected serial %u, got %u",
756			      j->filename, pos->serial, xhdr.serial0);
757		return (ISC_R_UNEXPECTED);
758	}
759
760	/*
761	 * Check for offset wraparound.
762	 */
763	if ((isc_offset_t)(pos->offset + sizeof(journal_rawxhdr_t) + xhdr.size)
764	    < pos->offset) {
765		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
766			      "%s: offset too large", j->filename);
767		return (ISC_R_UNEXPECTED);
768	}
769
770	pos->offset += sizeof(journal_rawxhdr_t) + xhdr.size;
771	pos->serial = xhdr.serial1;
772	return (ISC_R_SUCCESS);
773}
774
775/*
776 * If the index of the journal 'j' contains an entry "better"
777 * than '*best_guess', replace '*best_guess' with it.
778 *
779 * "Better" means having a serial number closer to 'serial'
780 * but not greater than 'serial'.
781 */
782static void
783index_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *best_guess) {
784	unsigned int i;
785	if (j->index == NULL)
786		return;
787	for (i = 0; i < j->header.index_size; i++) {
788		if (POS_VALID(j->index[i]) &&
789		    DNS_SERIAL_GE(serial, j->index[i].serial) &&
790		    DNS_SERIAL_GT(j->index[i].serial, best_guess->serial))
791			*best_guess = j->index[i];
792	}
793}
794
795/*
796 * Add a new index entry.  If there is no room, make room by removing
797 * the odd-numbered entries and compacting the others into the first
798 * half of the index.  This decimates old index entries exponentially
799 * over time, so that the index always contains a much larger fraction
800 * of recent serial numbers than of old ones.  This is deliberate -
801 * most index searches are for outgoing IXFR, and IXFR tends to request
802 * recent versions more often than old ones.
803 */
804static void
805index_add(dns_journal_t *j, journal_pos_t *pos) {
806	unsigned int i;
807	if (j->index == NULL)
808		return;
809	/*
810	 * Search for a vacant position.
811	 */
812	for (i = 0; i < j->header.index_size; i++) {
813		if (! POS_VALID(j->index[i]))
814			break;
815	}
816	if (i == j->header.index_size) {
817		unsigned int k = 0;
818		/*
819		 * Found no vacant position.  Make some room.
820		 */
821		for (i = 0; i < j->header.index_size; i += 2) {
822			j->index[k++] = j->index[i];
823		}
824		i = k; /* 'i' identifies the first vacant position. */
825		while (k < j->header.index_size) {
826			POS_INVALIDATE(j->index[k]);
827			k++;
828		}
829	}
830	INSIST(i < j->header.index_size);
831	INSIST(! POS_VALID(j->index[i]));
832
833	/*
834	 * Store the new index entry.
835	 */
836	j->index[i] = *pos;
837}
838
839/*
840 * Invalidate any existing index entries that could become
841 * ambiguous when a new transaction with number 'serial' is added.
842 */
843static void
844index_invalidate(dns_journal_t *j, isc_uint32_t serial) {
845	unsigned int i;
846	if (j->index == NULL)
847		return;
848	for (i = 0; i < j->header.index_size; i++) {
849		if (! DNS_SERIAL_GT(serial, j->index[i].serial))
850			POS_INVALIDATE(j->index[i]);
851	}
852}
853
854/*
855 * Try to find a transaction with initial serial number 'serial'
856 * in the journal 'j'.
857 *
858 * If found, store its position at '*pos' and return ISC_R_SUCCESS.
859 *
860 * If 'serial' is current (= the ending serial number of the
861 * last transaction in the journal), set '*pos' to
862 * the position immediately following the last transaction and
863 * return ISC_R_SUCCESS.
864 *
865 * If 'serial' is within the range of addressable serial numbers
866 * covered by the journal but that particular serial number is missing
867 * (from the journal, not just from the index), return ISC_R_NOTFOUND.
868 *
869 * If 'serial' is outside the range of addressable serial numbers
870 * covered by the journal, return ISC_R_RANGE.
871 *
872 */
873static isc_result_t
874journal_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *pos) {
875	isc_result_t result;
876	journal_pos_t current_pos;
877	REQUIRE(DNS_JOURNAL_VALID(j));
878
879	if (DNS_SERIAL_GT(j->header.begin.serial, serial))
880		return (ISC_R_RANGE);
881	if (DNS_SERIAL_GT(serial, j->header.end.serial))
882		return (ISC_R_RANGE);
883	if (serial == j->header.end.serial) {
884		*pos = j->header.end;
885		return (ISC_R_SUCCESS);
886	}
887
888	current_pos = j->header.begin;
889	index_find(j, serial, &current_pos);
890
891	while (current_pos.serial != serial) {
892		if (DNS_SERIAL_GT(current_pos.serial, serial))
893			return (ISC_R_NOTFOUND);
894		result = journal_next(j, &current_pos);
895		if (result != ISC_R_SUCCESS)
896			return (result);
897	}
898	*pos = current_pos;
899	return (ISC_R_SUCCESS);
900}
901
902isc_result_t
903dns_journal_begin_transaction(dns_journal_t *j) {
904	isc_uint32_t offset;
905	isc_result_t result;
906	journal_rawxhdr_t hdr;
907
908	REQUIRE(DNS_JOURNAL_VALID(j));
909	REQUIRE(j->state == JOURNAL_STATE_WRITE);
910
911	/*
912	 * Find the file offset where the new transaction should
913	 * be written, and seek there.
914	 */
915	if (JOURNAL_EMPTY(&j->header)) {
916		offset = sizeof(journal_rawheader_t) +
917			j->header.index_size * sizeof(journal_rawpos_t);
918	} else {
919		offset = j->header.end.offset;
920	}
921	j->x.pos[0].offset = offset;
922	j->x.pos[1].offset = offset; /* Initial value, will be incremented. */
923	j->x.n_soa = 0;
924
925	CHECK(journal_seek(j, offset));
926
927	/*
928	 * Write a dummy transaction header of all zeroes to reserve
929	 * space.  It will be filled in when the transaction is
930	 * finished.
931	 */
932	memset(&hdr, 0, sizeof(hdr));
933	CHECK(journal_write(j, &hdr, sizeof(hdr)));
934	j->x.pos[1].offset = j->offset;
935
936	j->state = JOURNAL_STATE_TRANSACTION;
937	result = ISC_R_SUCCESS;
938 failure:
939	return (result);
940}
941
942isc_result_t
943dns_journal_writediff(dns_journal_t *j, dns_diff_t *diff) {
944	dns_difftuple_t *t;
945	isc_buffer_t buffer;
946	void *mem = NULL;
947	unsigned int size;
948	isc_result_t result;
949	isc_region_t used;
950
951	REQUIRE(DNS_DIFF_VALID(diff));
952	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
953
954	isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "writing to journal");
955	(void)dns_diff_print(diff, NULL);
956
957	/*
958	 * Pass 1: determine the buffer size needed, and
959	 * keep track of SOA serial numbers.
960	 */
961	size = 0;
962	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
963	     t = ISC_LIST_NEXT(t, link))
964	{
965		if (t->rdata.type == dns_rdatatype_soa) {
966			if (j->x.n_soa < 2)
967				j->x.pos[j->x.n_soa].serial =
968					dns_soa_getserial(&t->rdata);
969			j->x.n_soa++;
970		}
971		size += sizeof(journal_rawrrhdr_t);
972		size += t->name.length; /* XXX should have access macro? */
973		size += 10;
974		size += t->rdata.length;
975	}
976
977	mem = isc_mem_get(j->mctx, size);
978	if (mem == NULL)
979		return (ISC_R_NOMEMORY);
980
981	isc_buffer_init(&buffer, mem, size);
982
983	/*
984	 * Pass 2.  Write RRs to buffer.
985	 */
986	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
987	     t = ISC_LIST_NEXT(t, link))
988	{
989		/*
990		 * Write the RR header.
991		 */
992		isc_buffer_putuint32(&buffer, t->name.length + 10 +
993				     t->rdata.length);
994		/*
995		 * Write the owner name, RR header, and RR data.
996		 */
997		isc_buffer_putmem(&buffer, t->name.ndata, t->name.length);
998		isc_buffer_putuint16(&buffer, t->rdata.type);
999		isc_buffer_putuint16(&buffer, t->rdata.rdclass);
1000		isc_buffer_putuint32(&buffer, t->ttl);
1001		INSIST(t->rdata.length < 65536);
1002		isc_buffer_putuint16(&buffer, (isc_uint16_t)t->rdata.length);
1003		INSIST(isc_buffer_availablelength(&buffer) >= t->rdata.length);
1004		isc_buffer_putmem(&buffer, t->rdata.data, t->rdata.length);
1005	}
1006
1007	isc_buffer_usedregion(&buffer, &used);
1008	INSIST(used.length == size);
1009
1010	j->x.pos[1].offset += used.length;
1011
1012	/*
1013	 * Write the buffer contents to the journal file.
1014	 */
1015	CHECK(journal_write(j, used.base, used.length));
1016
1017	result = ISC_R_SUCCESS;
1018
1019 failure:
1020	if (mem != NULL)
1021		isc_mem_put(j->mctx, mem, size);
1022	return (result);
1023
1024}
1025
1026isc_result_t
1027dns_journal_commit(dns_journal_t *j) {
1028	isc_result_t result;
1029	journal_rawheader_t rawheader;
1030
1031	REQUIRE(DNS_JOURNAL_VALID(j));
1032	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
1033
1034	/*
1035	 * Perform some basic consistency checks.
1036	 */
1037	if (j->x.n_soa != 2) {
1038		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1039			      "%s: malformed transaction: %d SOAs",
1040			      j->filename, j->x.n_soa);
1041		return (ISC_R_UNEXPECTED);
1042	}
1043	if (! (DNS_SERIAL_GT(j->x.pos[1].serial, j->x.pos[0].serial) ||
1044	       (bind8_compat &&
1045		j->x.pos[1].serial == j->x.pos[0].serial)))
1046	{
1047		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1048			      "%s: malformed transaction: serial number "
1049			      "would decrease", j->filename);
1050		return (ISC_R_UNEXPECTED);
1051	}
1052	if (! JOURNAL_EMPTY(&j->header)) {
1053		if (j->x.pos[0].serial != j->header.end.serial) {
1054			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1055					 "malformed transaction: "
1056					 "%s last serial %u != "
1057					 "transaction first serial %u",
1058					 j->filename,
1059					 j->header.end.serial,
1060					 j->x.pos[0].serial);
1061			return (ISC_R_UNEXPECTED);
1062		}
1063	}
1064
1065	/*
1066	 * Some old journal entries may become non-addressable
1067	 * when we increment the current serial number.  Purge them
1068	 * by stepping header.begin forward to the first addressable
1069	 * transaction.  Also purge them from the index.
1070	 */
1071	if (! JOURNAL_EMPTY(&j->header)) {
1072		while (! DNS_SERIAL_GT(j->x.pos[1].serial,
1073				       j->header.begin.serial)) {
1074			CHECK(journal_next(j, &j->header.begin));
1075		}
1076		index_invalidate(j, j->x.pos[1].serial);
1077	}
1078#ifdef notyet
1079	if (DNS_SERIAL_GT(last_dumped_serial, j->x.pos[1].serial)) {
1080		force_dump(...);
1081	}
1082#endif
1083
1084	/*
1085	 * Commit the transaction data to stable storage.
1086	 */
1087	CHECK(journal_fsync(j));
1088
1089	/*
1090	 * Update the transaction header.
1091	 */
1092	CHECK(journal_seek(j, j->x.pos[0].offset));
1093	CHECK(journal_write_xhdr(j, (j->x.pos[1].offset - j->x.pos[0].offset) -
1094				 sizeof(journal_rawxhdr_t),
1095				 j->x.pos[0].serial, j->x.pos[1].serial));
1096
1097	/*
1098	 * Update the journal header.
1099	 */
1100	if (JOURNAL_EMPTY(&j->header)) {
1101		j->header.begin = j->x.pos[0];
1102	}
1103	j->header.end = j->x.pos[1];
1104	journal_header_encode(&j->header, &rawheader);
1105	CHECK(journal_seek(j, 0));
1106	CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
1107
1108	/*
1109	 * Update the index.
1110	 */
1111	index_add(j, &j->x.pos[0]);
1112
1113	/*
1114	 * Convert the index into on-disk format and write
1115	 * it to disk.
1116	 */
1117	CHECK(index_to_disk(j));
1118
1119	/*
1120	 * Commit the header to stable storage.
1121	 */
1122	CHECK(journal_fsync(j));
1123
1124	/*
1125	 * We no longer have a transaction open.
1126	 */
1127	j->state = JOURNAL_STATE_WRITE;
1128
1129	result = ISC_R_SUCCESS;
1130
1131 failure:
1132	return (result);
1133}
1134
1135isc_result_t
1136dns_journal_write_transaction(dns_journal_t *j, dns_diff_t *diff) {
1137	isc_result_t result;
1138	CHECK(dns_diff_sort(diff, ixfr_order));
1139	CHECK(dns_journal_begin_transaction(j));
1140	CHECK(dns_journal_writediff(j, diff));
1141	CHECK(dns_journal_commit(j));
1142	result = ISC_R_SUCCESS;
1143 failure:
1144	return (result);
1145}
1146
1147void
1148dns_journal_destroy(dns_journal_t **journalp) {
1149	dns_journal_t *j = *journalp;
1150	REQUIRE(DNS_JOURNAL_VALID(j));
1151
1152	j->it.result = ISC_R_FAILURE;
1153	dns_name_invalidate(&j->it.name);
1154	dns_decompress_invalidate(&j->it.dctx);
1155	if (j->rawindex != NULL)
1156		isc_mem_put(j->mctx, j->rawindex, j->header.index_size *
1157			    sizeof(journal_rawpos_t));
1158	if (j->index != NULL)
1159		isc_mem_put(j->mctx, j->index, j->header.index_size *
1160			    sizeof(journal_pos_t));
1161	if (j->it.target.base != NULL)
1162		isc_mem_put(j->mctx, j->it.target.base, j->it.target.length);
1163	if (j->it.source.base != NULL)
1164		isc_mem_put(j->mctx, j->it.source.base, j->it.source.length);
1165
1166	if (j->fp != NULL)
1167		(void)isc_stdio_close(j->fp);
1168	j->magic = 0;
1169	isc_mem_put(j->mctx, j, sizeof(*j));
1170	*journalp = NULL;
1171}
1172
1173/*
1174 * Roll the open journal 'j' into the database 'db'.
1175 * A new database version will be created.
1176 */
1177
1178/* XXX Share code with incoming IXFR? */
1179
1180static isc_result_t
1181roll_forward(dns_journal_t *j, dns_db_t *db) {
1182	isc_buffer_t source;		/* Transaction data from disk */
1183	isc_buffer_t target;		/* Ditto after _fromwire check */
1184	isc_uint32_t db_serial;		/* Database SOA serial */
1185	isc_uint32_t end_serial;	/* Last journal SOA serial */
1186	isc_result_t result;
1187	dns_dbversion_t *ver = NULL;
1188	journal_pos_t pos;
1189	dns_diff_t diff;
1190	unsigned int n_soa = 0;
1191	unsigned int n_put = 0;
1192
1193	REQUIRE(DNS_JOURNAL_VALID(j));
1194	REQUIRE(DNS_DB_VALID(db));
1195
1196	dns_diff_init(j->mctx, &diff);
1197
1198	/*
1199	 * Set up empty initial buffers for uncheched and checked
1200	 * wire format transaction data.  They will be reallocated
1201	 * later.
1202	 */
1203	isc_buffer_init(&source, NULL, 0);
1204	isc_buffer_init(&target, NULL, 0);
1205
1206	/*
1207	 * Create the new database version.
1208	 */
1209	CHECK(dns_db_newversion(db, &ver));
1210
1211	/*
1212	 * Get the current database SOA serial number.
1213	 */
1214	CHECK(dns_db_getsoaserial(db, ver, &db_serial));
1215
1216	/*
1217	 * Locate a journal entry for the current database serial.
1218	 */
1219	CHECK(journal_find(j, db_serial, &pos));
1220	/*
1221	 * XXX do more drastic things, like marking zone stale,
1222	 * if this fails?
1223	 */
1224	/*
1225	 * XXXRTH  The zone code should probably mark the zone as bad and
1226	 *         scream loudly into the log if this is a dynamic update
1227	 *	   log reply that failed.
1228	 */
1229
1230	end_serial = dns_journal_last_serial(j);
1231	if (db_serial == end_serial)
1232		CHECK(DNS_R_UPTODATE);
1233
1234	CHECK(dns_journal_iter_init(j, db_serial, end_serial));
1235
1236	for (result = dns_journal_first_rr(j);
1237	     result == ISC_R_SUCCESS;
1238	     result = dns_journal_next_rr(j))
1239	{
1240		dns_name_t *name;
1241		isc_uint32_t ttl;
1242		dns_rdata_t *rdata;
1243		dns_difftuple_t *tuple = NULL;
1244
1245		name = NULL;
1246		rdata = NULL;
1247		dns_journal_current_rr(j, &name, &ttl, &rdata);
1248
1249		if (rdata->type == dns_rdatatype_soa) {
1250			n_soa++;
1251			if (n_soa == 2)
1252				db_serial = j->it.current_serial;
1253		}
1254
1255		if (n_soa == 3)
1256			n_soa = 1;
1257		if (n_soa == 0) {
1258			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1259					 "%s: journal file corrupt: missing "
1260					 "initial SOA", j->filename);
1261			FAIL(ISC_R_UNEXPECTED);
1262		}
1263		CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ?
1264					   DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1265					   name, ttl, rdata, &tuple));
1266		dns_diff_append(&diff, &tuple);
1267
1268		if (++n_put > 100)  {
1269			isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1270				      "%s: applying diff to database (%u)",
1271				      j->filename, db_serial);
1272			(void)dns_diff_print(&diff, NULL);
1273			CHECK(dns_diff_apply(&diff, db, ver));
1274			dns_diff_clear(&diff);
1275			n_put = 0;
1276		}
1277	}
1278	if (result == ISC_R_NOMORE)
1279		result = ISC_R_SUCCESS;
1280	CHECK(result);
1281
1282	if (n_put != 0) {
1283		isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1284			      "%s: applying final diff to database (%u)",
1285			      j->filename, db_serial);
1286		(void)dns_diff_print(&diff, NULL);
1287		CHECK(dns_diff_apply(&diff, db, ver));
1288		dns_diff_clear(&diff);
1289	}
1290
1291 failure:
1292	if (ver != NULL)
1293		dns_db_closeversion(db, &ver, result == ISC_R_SUCCESS ?
1294				    ISC_TRUE : ISC_FALSE);
1295
1296	if (source.base != NULL)
1297		isc_mem_put(j->mctx, source.base, source.length);
1298	if (target.base != NULL)
1299		isc_mem_put(j->mctx, target.base, target.length);
1300
1301	dns_diff_clear(&diff);
1302
1303	return (result);
1304}
1305
1306isc_result_t
1307dns_journal_rollforward(isc_mem_t *mctx, dns_db_t *db, const char *filename) {
1308	dns_journal_t *j;
1309	isc_result_t result;
1310
1311	REQUIRE(DNS_DB_VALID(db));
1312	REQUIRE(filename != NULL);
1313
1314	j = NULL;
1315	result = dns_journal_open(mctx, filename, ISC_FALSE, &j);
1316	if (result == ISC_R_NOTFOUND) {
1317		isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1318			      "no journal file, but that's OK");
1319		return (DNS_R_NOJOURNAL);
1320	}
1321	if (result != ISC_R_SUCCESS)
1322		return (result);
1323	if (JOURNAL_EMPTY(&j->header))
1324		result = DNS_R_UPTODATE;
1325	else
1326		result = roll_forward(j, db);
1327
1328	dns_journal_destroy(&j);
1329
1330	return (result);
1331}
1332
1333isc_result_t
1334dns_journal_print(isc_mem_t *mctx, const char *filename, FILE *file) {
1335	dns_journal_t *j;
1336	isc_buffer_t source;		/* Transaction data from disk */
1337	isc_buffer_t target;		/* Ditto after _fromwire check */
1338	isc_uint32_t start_serial;		/* Database SOA serial */
1339	isc_uint32_t end_serial;	/* Last journal SOA serial */
1340	isc_result_t result;
1341	dns_diff_t diff;
1342	unsigned int n_soa = 0;
1343	unsigned int n_put = 0;
1344
1345	REQUIRE(filename != NULL);
1346
1347	j = NULL;
1348	result = dns_journal_open(mctx, filename, ISC_FALSE, &j);
1349	if (result == ISC_R_NOTFOUND) {
1350		isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file");
1351		return (DNS_R_NOJOURNAL);
1352	}
1353
1354	if (result != ISC_R_SUCCESS) {
1355		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1356			      "journal open failure: %s: %s",
1357			      isc_result_totext(result), j->filename);
1358		return (result);
1359	}
1360
1361	dns_diff_init(j->mctx, &diff);
1362
1363	/*
1364	 * Set up empty initial buffers for uncheched and checked
1365	 * wire format transaction data.  They will be reallocated
1366	 * later.
1367	 */
1368	isc_buffer_init(&source, NULL, 0);
1369	isc_buffer_init(&target, NULL, 0);
1370
1371	start_serial = dns_journal_first_serial(j);
1372	end_serial = dns_journal_last_serial(j);
1373
1374	CHECK(dns_journal_iter_init(j, start_serial, end_serial));
1375
1376	for (result = dns_journal_first_rr(j);
1377	     result == ISC_R_SUCCESS;
1378	     result = dns_journal_next_rr(j))
1379	{
1380		dns_name_t *name;
1381		isc_uint32_t ttl;
1382		dns_rdata_t *rdata;
1383		dns_difftuple_t *tuple = NULL;
1384
1385		name = NULL;
1386		rdata = NULL;
1387		dns_journal_current_rr(j, &name, &ttl, &rdata);
1388
1389		if (rdata->type == dns_rdatatype_soa)
1390			n_soa++;
1391
1392		if (n_soa == 3)
1393			n_soa = 1;
1394		if (n_soa == 0) {
1395		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1396					 "%s: journal file corrupt: missing "
1397					 "initial SOA", j->filename);
1398			FAIL(ISC_R_UNEXPECTED);
1399		}
1400		CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ?
1401					   DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1402					   name, ttl, rdata, &tuple));
1403		dns_diff_append(&diff, &tuple);
1404
1405		if (++n_put > 100)  {
1406			result = dns_diff_print(&diff, file);
1407			dns_diff_clear(&diff);
1408			n_put = 0;
1409			if (result != ISC_R_SUCCESS)
1410				break;
1411		}
1412	}
1413	if (result == ISC_R_NOMORE)
1414		result = ISC_R_SUCCESS;
1415	CHECK(result);
1416
1417	if (n_put != 0) {
1418		result = dns_diff_print(&diff, file);
1419		dns_diff_clear(&diff);
1420	}
1421	goto cleanup;
1422
1423 failure:
1424	isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1425		      "%s: cannot print: journal file corrupt", j->filename);
1426
1427 cleanup:
1428	if (source.base != NULL)
1429		isc_mem_put(j->mctx, source.base, source.length);
1430	if (target.base != NULL)
1431		isc_mem_put(j->mctx, target.base, target.length);
1432
1433	dns_diff_clear(&diff);
1434	dns_journal_destroy(&j);
1435
1436	return (result);
1437}
1438
1439/**************************************************************************/
1440/*
1441 * Miscellaneous accessors.
1442 */
1443isc_uint32_t dns_journal_first_serial(dns_journal_t *j) {
1444	return (j->header.begin.serial);
1445}
1446
1447isc_uint32_t dns_journal_last_serial(dns_journal_t *j) {
1448	return (j->header.end.serial);
1449}
1450
/**************************************************************************/
/*
 * Iteration support.
 *
 * When serving an outgoing IXFR, we transmit a part of the journal starting
 * at the serial number in the IXFR request and ending at the serial
 * number that is current when the IXFR request arrives.  The ending
 * serial number is not necessarily at the end of the journal:
 * the journal may grow while the IXFR is in progress, but we stop
 * when we reach the serial number that was current when the IXFR started.
 */
1462
1463static isc_result_t read_one_rr(dns_journal_t *j);
1464
1465/*
1466 * Make sure the buffer 'b' is has at least 'size' bytes
1467 * allocated, and clear it.
1468 *
1469 * Requires:
1470 *	Either b->base is NULL, or it points to b->length bytes of memory
1471 *	previously allocated by isc_mem_get().
1472 */
1473
1474static isc_result_t
1475size_buffer(isc_mem_t *mctx, isc_buffer_t *b, unsigned size) {
1476	if (b->length < size) {
1477		void *mem = isc_mem_get(mctx, size);
1478		if (mem == NULL)
1479			return (ISC_R_NOMEMORY);
1480		if (b->base != NULL)
1481			isc_mem_put(mctx, b->base, b->length);
1482		b->base = mem;
1483		b->length = size;
1484	}
1485	isc_buffer_clear(b);
1486	return (ISC_R_SUCCESS);
1487}
1488
1489isc_result_t
1490dns_journal_iter_init(dns_journal_t *j,
1491		      isc_uint32_t begin_serial, isc_uint32_t end_serial)
1492{
1493	isc_result_t result;
1494
1495	CHECK(journal_find(j, begin_serial, &j->it.bpos));
1496	INSIST(j->it.bpos.serial == begin_serial);
1497
1498	CHECK(journal_find(j, end_serial, &j->it.epos));
1499	INSIST(j->it.epos.serial == end_serial);
1500
1501	result = ISC_R_SUCCESS;
1502 failure:
1503	j->it.result = result;
1504	return (j->it.result);
1505}
1506
1507
1508isc_result_t
1509dns_journal_first_rr(dns_journal_t *j) {
1510	isc_result_t result;
1511
1512	/*
1513	 * Seek to the beginning of the first transaction we are
1514	 * interested in.
1515	 */
1516	CHECK(journal_seek(j, j->it.bpos.offset));
1517	j->it.current_serial = j->it.bpos.serial;
1518
1519	j->it.xsize = 0;  /* We have no transaction data yet... */
1520	j->it.xpos = 0;	  /* ...and haven't used any of it. */
1521
1522	return (read_one_rr(j));
1523
1524 failure:
1525	return (result);
1526}
1527
1528static isc_result_t
1529read_one_rr(dns_journal_t *j) {
1530	isc_result_t result;
1531
1532	dns_rdatatype_t rdtype;
1533	dns_rdataclass_t rdclass;
1534	unsigned int rdlen;
1535	isc_uint32_t ttl;
1536	journal_xhdr_t xhdr;
1537	journal_rrhdr_t rrhdr;
1538
1539	INSIST(j->offset <= j->it.epos.offset);
1540	if (j->offset == j->it.epos.offset)
1541		return (ISC_R_NOMORE);
1542	if (j->it.xpos == j->it.xsize) {
1543		/*
1544		 * We are at a transaction boundary.
1545		 * Read another transaction header.
1546		 */
1547		CHECK(journal_read_xhdr(j, &xhdr));
1548		if (xhdr.size == 0) {
1549			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1550				      "%s: journal corrupt: empty transaction",
1551				      j->filename);
1552			FAIL(ISC_R_UNEXPECTED);
1553		}
1554		if (xhdr.serial0 != j->it.current_serial) {
1555			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1556					 "%s: journal file corrupt: "
1557					 "expected serial %u, got %u",
1558					 j->filename,
1559					 j->it.current_serial, xhdr.serial0);
1560			FAIL(ISC_R_UNEXPECTED);
1561		}
1562		j->it.xsize = xhdr.size;
1563		j->it.xpos = 0;
1564	}
1565	/*
1566	 * Read an RR.
1567	 */
1568	CHECK(journal_read_rrhdr(j, &rrhdr));
1569	/*
1570	 * Perform a sanity check on the journal RR size.
1571	 * The smallest possible RR has a 1-byte owner name
1572	 * and a 10-byte header.  The largest possible
1573	 * RR has 65535 bytes of data, a header, and a maximum-
1574	 * size owner name, well below 70 k total.
1575	 */
1576	if (rrhdr.size < 1+10 || rrhdr.size > 70000) {
1577		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1578				 "%s: journal corrupt: impossible RR size "
1579				 "(%d bytes)", j->filename, rrhdr.size);
1580		FAIL(ISC_R_UNEXPECTED);
1581	}
1582
1583	CHECK(size_buffer(j->mctx, &j->it.source, rrhdr.size));
1584	CHECK(journal_read(j, j->it.source.base, rrhdr.size));
1585	isc_buffer_add(&j->it.source, rrhdr.size);
1586
1587	/*
1588	 * The target buffer is made the same size
1589	 * as the source buffer, with the assumption that when
1590	 * no compression in present, the output of dns_*_fromwire()
1591	 * is no larger than the input.
1592	 */
1593	CHECK(size_buffer(j->mctx, &j->it.target, rrhdr.size));
1594
1595	/*
1596	 * Parse the owner name.  We don't know where it
1597	 * ends yet, so we make the entire "remaining"
1598	 * part of the buffer "active".
1599	 */
1600	isc_buffer_setactive(&j->it.source,
1601			     j->it.source.used - j->it.source.current);
1602	CHECK(dns_name_fromwire(&j->it.name, &j->it.source,
1603				&j->it.dctx, 0, &j->it.target));
1604
1605	/*
1606	 * Check that the RR header is there, and parse it.
1607	 */
1608	if (isc_buffer_remaininglength(&j->it.source) < 10)
1609		FAIL(DNS_R_FORMERR);
1610
1611	rdtype = isc_buffer_getuint16(&j->it.source);
1612	rdclass = isc_buffer_getuint16(&j->it.source);
1613	ttl = isc_buffer_getuint32(&j->it.source);
1614	rdlen = isc_buffer_getuint16(&j->it.source);
1615
1616	/*
1617	 * Parse the rdata.
1618	 */
1619	isc_buffer_setactive(&j->it.source, rdlen);
1620	dns_rdata_reset(&j->it.rdata);
1621	CHECK(dns_rdata_fromwire(&j->it.rdata, rdclass,
1622				 rdtype, &j->it.source, &j->it.dctx,
1623				 0, &j->it.target));
1624	j->it.ttl = ttl;
1625
1626	j->it.xpos += sizeof(journal_rawrrhdr_t) + rrhdr.size;
1627	if (rdtype == dns_rdatatype_soa) {
1628		/* XXX could do additional consistency checks here */
1629		j->it.current_serial = dns_soa_getserial(&j->it.rdata);
1630	}
1631
1632	result = ISC_R_SUCCESS;
1633
1634 failure:
1635	j->it.result = result;
1636	return (result);
1637}
1638
1639isc_result_t
1640dns_journal_next_rr(dns_journal_t *j) {
1641	j->it.result = read_one_rr(j);
1642	return (j->it.result);
1643}
1644
1645void
1646dns_journal_current_rr(dns_journal_t *j, dns_name_t **name, isc_uint32_t *ttl,
1647		   dns_rdata_t **rdata)
1648{
1649	REQUIRE(j->it.result == ISC_R_SUCCESS);
1650	*name = &j->it.name;
1651	*ttl = j->it.ttl;
1652	*rdata = &j->it.rdata;
1653}
1654
1655/**************************************************************************/
1656/*
1657 * Generating diffs from databases
1658 */
1659
1660/*
1661 * Construct a diff containing all the RRs at the current name of the
1662 * database iterator 'dbit' in database 'db', version 'ver'.
1663 * Set '*name' to the current name, and append the diff to 'diff'.
1664 * All new tuples will have the operation 'op'.
1665 *
1666 * Requires: 'name' must have buffer large enough to hold the name.
1667 * Typically, a dns_fixedname_t would be used.
1668 */
1669static isc_result_t
1670get_name_diff(dns_db_t *db, dns_dbversion_t *ver, isc_stdtime_t now,
1671	      dns_dbiterator_t *dbit, dns_name_t *name, dns_diffop_t op,
1672	      dns_diff_t *diff)
1673{
1674	isc_result_t result;
1675	dns_dbnode_t *node = NULL;
1676	dns_rdatasetiter_t *rdsiter = NULL;
1677	dns_difftuple_t *tuple = NULL;
1678
1679	result = dns_dbiterator_current(dbit, &node, name);
1680	if (result != ISC_R_SUCCESS)
1681		return (result);
1682
1683	result = dns_db_allrdatasets(db, node, ver, now, &rdsiter);
1684	if (result != ISC_R_SUCCESS)
1685		goto cleanup_node;
1686
1687	for (result = dns_rdatasetiter_first(rdsiter);
1688	     result == ISC_R_SUCCESS;
1689	     result = dns_rdatasetiter_next(rdsiter))
1690	{
1691		dns_rdataset_t rdataset;
1692
1693		dns_rdataset_init(&rdataset);
1694		dns_rdatasetiter_current(rdsiter, &rdataset);
1695
1696		for (result = dns_rdataset_first(&rdataset);
1697		     result == ISC_R_SUCCESS;
1698		     result = dns_rdataset_next(&rdataset))
1699		{
1700			dns_rdata_t rdata = DNS_RDATA_INIT;
1701			dns_rdataset_current(&rdataset, &rdata);
1702			result = dns_difftuple_create(diff->mctx, op, name,
1703						      rdataset.ttl, &rdata,
1704						      &tuple);
1705			if (result != ISC_R_SUCCESS) {
1706				dns_rdataset_disassociate(&rdataset);
1707				goto cleanup_iterator;
1708			}
1709			dns_diff_append(diff, &tuple);
1710		}
1711		dns_rdataset_disassociate(&rdataset);
1712		if (result != ISC_R_NOMORE)
1713			goto cleanup_iterator;
1714	}
1715	if (result != ISC_R_NOMORE)
1716		goto cleanup_iterator;
1717
1718	result = ISC_R_SUCCESS;
1719
1720 cleanup_iterator:
1721	dns_rdatasetiter_destroy(&rdsiter);
1722
1723 cleanup_node:
1724	dns_db_detachnode(db, &node);
1725
1726	return (result);
1727}
1728
1729/*
1730 * Comparison function for use by dns_diff_subtract when sorting
1731 * the diffs to be subtracted.  The sort keys are the rdata type
1732 * and the rdata itself.  The owner name is ignored, because
1733 * it is known to be the same for all tuples.
1734 */
1735static int
1736rdata_order(const void *av, const void *bv) {
1737	dns_difftuple_t const * const *ap = av;
1738	dns_difftuple_t const * const *bp = bv;
1739	dns_difftuple_t const *a = *ap;
1740	dns_difftuple_t const *b = *bp;
1741	int r;
1742	r = (b->rdata.type - a->rdata.type);
1743	if (r != 0)
1744		return (r);
1745	r = dns_rdata_compare(&a->rdata, &b->rdata);
1746	return (r);
1747}
1748
1749static isc_result_t
1750dns_diff_subtract(dns_diff_t diff[2], dns_diff_t *r) {
1751	isc_result_t result;
1752	dns_difftuple_t *p[2];
1753	int i, t;
1754	isc_boolean_t append;
1755
1756	CHECK(dns_diff_sort(&diff[0], rdata_order));
1757	CHECK(dns_diff_sort(&diff[1], rdata_order));
1758
1759	for (;;) {
1760		p[0] = ISC_LIST_HEAD(diff[0].tuples);
1761		p[1] = ISC_LIST_HEAD(diff[1].tuples);
1762		if (p[0] == NULL && p[1] == NULL)
1763			break;
1764
1765		for (i = 0; i < 2; i++)
1766			if (p[!i] == NULL) {
1767				ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1768				ISC_LIST_APPEND(r->tuples, p[i], link);
1769				goto next;
1770			}
1771		t = rdata_order(&p[0], &p[1]);
1772		if (t < 0) {
1773			ISC_LIST_UNLINK(diff[0].tuples, p[0], link);
1774			ISC_LIST_APPEND(r->tuples, p[0], link);
1775			goto next;
1776		}
1777		if (t > 0) {
1778			ISC_LIST_UNLINK(diff[1].tuples, p[1], link);
1779			ISC_LIST_APPEND(r->tuples, p[1], link);
1780			goto next;
1781		}
1782		INSIST(t == 0);
1783		/*
1784		 * Identical RRs in both databases; skip them both
1785		 * if the ttl differs.
1786		 */
1787		append = ISC_TF(p[0]->ttl != p[1]->ttl);
1788		for (i = 0; i < 2; i++) {
1789			ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1790			if (append) {
1791				ISC_LIST_APPEND(r->tuples, p[i], link);
1792			} else {
1793				dns_difftuple_free(&p[i]);
1794			}
1795		}
1796	next: ;
1797	}
1798	result = ISC_R_SUCCESS;
1799 failure:
1800	return (result);
1801}
1802
1803/*
1804 * Compare the databases 'dba' and 'dbb' and generate a journal
1805 * entry containing the changes to make 'dba' from 'dbb' (note
1806 * the order).  This journal entry will consist of a single,
1807 * possibly very large transaction.
1808 */
1809
1810isc_result_t
1811dns_db_diff(isc_mem_t *mctx,
1812	    dns_db_t *dba, dns_dbversion_t *dbvera,
1813	    dns_db_t *dbb, dns_dbversion_t *dbverb,
1814	    const char *journal_filename)
1815{
1816	dns_db_t *db[2];
1817	dns_dbversion_t *ver[2];
1818	dns_dbiterator_t *dbit[2] = { NULL, NULL };
1819	isc_boolean_t have[2] = { ISC_FALSE, ISC_FALSE };
1820	dns_fixedname_t fixname[2];
1821	isc_result_t result, itresult[2];
1822	dns_diff_t diff[2], resultdiff;
1823	int i, t;
1824	dns_journal_t *journal = NULL;
1825
1826	db[0] = dba, db[1] = dbb;
1827	ver[0] = dbvera, ver[1] = dbverb;
1828
1829	dns_diff_init(mctx, &diff[0]);
1830	dns_diff_init(mctx, &diff[1]);
1831	dns_diff_init(mctx, &resultdiff);
1832
1833	dns_fixedname_init(&fixname[0]);
1834	dns_fixedname_init(&fixname[1]);
1835
1836	result = dns_journal_open(mctx, journal_filename, ISC_TRUE, &journal);
1837	if (result != ISC_R_SUCCESS)
1838		return (result);
1839
1840	result = dns_db_createiterator(db[0], ISC_FALSE, &dbit[0]);
1841	if (result != ISC_R_SUCCESS)
1842		goto cleanup_journal;
1843	result = dns_db_createiterator(db[1], ISC_FALSE, &dbit[1]);
1844	if (result != ISC_R_SUCCESS)
1845		goto cleanup_interator0;
1846
1847	itresult[0] = dns_dbiterator_first(dbit[0]);
1848	itresult[1] = dns_dbiterator_first(dbit[1]);
1849
1850	for (;;) {
1851		for (i = 0; i < 2; i++) {
1852			if (! have[i] && itresult[i] == ISC_R_SUCCESS) {
1853				CHECK(get_name_diff(db[i], ver[i], 0, dbit[i],
1854					    dns_fixedname_name(&fixname[i]),
1855					    i == 0 ?
1856					    DNS_DIFFOP_ADD :
1857					    DNS_DIFFOP_DEL,
1858					    &diff[i]));
1859				itresult[i] = dns_dbiterator_next(dbit[i]);
1860				have[i] = ISC_TRUE;
1861			}
1862		}
1863
1864		if (! have[0] && ! have[1]) {
1865			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1866			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1867			break;
1868		}
1869
1870		for (i = 0; i < 2; i++) {
1871			if (! have[!i]) {
1872				ISC_LIST_APPENDLIST(resultdiff.tuples,
1873						    diff[i].tuples, link);
1874				INSIST(ISC_LIST_EMPTY(diff[i].tuples));
1875				have[i] = ISC_FALSE;
1876				goto next;
1877			}
1878		}
1879
1880		t = dns_name_compare(dns_fixedname_name(&fixname[0]),
1881				     dns_fixedname_name(&fixname[1]));
1882		if (t < 0) {
1883			ISC_LIST_APPENDLIST(resultdiff.tuples,
1884					    diff[0].tuples, link);
1885			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1886			have[0] = ISC_FALSE;
1887			continue;
1888		}
1889		if (t > 0) {
1890			ISC_LIST_APPENDLIST(resultdiff.tuples,
1891					    diff[1].tuples, link);
1892			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1893			have[1] = ISC_FALSE;
1894			continue;
1895		}
1896		INSIST(t == 0);
1897		CHECK(dns_diff_subtract(diff, &resultdiff));
1898		INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1899		INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1900		have[0] = have[1] = ISC_FALSE;
1901	next: ;
1902	}
1903	if (itresult[0] != ISC_R_NOMORE)
1904		FAIL(itresult[0]);
1905	if (itresult[1] != ISC_R_NOMORE)
1906		FAIL(itresult[1]);
1907
1908	if (ISC_LIST_EMPTY(resultdiff.tuples)) {
1909		isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no changes");
1910	} else {
1911		CHECK(dns_journal_write_transaction(journal, &resultdiff));
1912	}
1913	INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1914	INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1915
1916 failure:
1917	dns_diff_clear(&resultdiff);
1918	dns_dbiterator_destroy(&dbit[1]);
1919 cleanup_interator0:
1920	dns_dbiterator_destroy(&dbit[0]);
1921 cleanup_journal:
1922	dns_journal_destroy(&journal);
1923	return (result);
1924}
1925
1926isc_result_t
1927dns_journal_compact(isc_mem_t *mctx, char *filename, isc_uint32_t serial,
1928		    isc_uint32_t target_size)
1929{
1930	unsigned int i;
1931	journal_pos_t best_guess;
1932	journal_pos_t current_pos;
1933	dns_journal_t *j = NULL;
1934	journal_rawheader_t rawheader;
1935	unsigned int copy_length;
1936	unsigned int len;
1937	char *buf = NULL;
1938	unsigned int size = 0;
1939	isc_result_t result;
1940	unsigned int indexend;
1941
1942	CHECK(journal_open(mctx, filename, ISC_TRUE, ISC_FALSE, &j));
1943
1944	if (JOURNAL_EMPTY(&j->header)) {
1945		dns_journal_destroy(&j);
1946		return (ISC_R_SUCCESS);
1947	}
1948
1949	if (DNS_SERIAL_GT(j->header.begin.serial, serial) ||
1950	    DNS_SERIAL_GT(serial, j->header.end.serial)) {
1951		dns_journal_destroy(&j);
1952		return (ISC_R_RANGE);
1953	}
1954
1955	/*
1956	 * Cope with very small target sizes.
1957	 */
1958	indexend = sizeof(journal_rawheader_t) +
1959		   j->header.index_size * sizeof(journal_rawpos_t);
1960	if (target_size < indexend * 2)
1961		target_size = target_size/2 + indexend;
1962
1963	/*
1964	 * See if there is any work to do.
1965	 */
1966	if ((isc_uint32_t) j->header.end.offset < target_size) {
1967		dns_journal_destroy(&j);
1968		return (ISC_R_SUCCESS);
1969	}
1970
1971	/*
1972	 * Remove overhead so space test below can succeed.
1973	 */
1974	if (target_size >= indexend)
1975		target_size -= indexend;
1976
1977	/*
1978	 * Find if we can create enough free space.
1979	 */
1980	best_guess = j->header.begin;
1981	for (i = 0; i < j->header.index_size; i++) {
1982		if (POS_VALID(j->index[i]) &&
1983		    DNS_SERIAL_GE(serial, j->index[i].serial) &&
1984		    ((isc_uint32_t)(j->header.end.offset - j->index[i].offset)
1985		     >= target_size / 2) &&
1986		    j->index[i].offset > best_guess.offset)
1987			best_guess = j->index[i];
1988	}
1989
1990	current_pos = best_guess;
1991	while (current_pos.serial != serial) {
1992		CHECK(journal_next(j, &current_pos));
1993		if (current_pos.serial == j->header.end.serial)
1994			break;
1995
1996		if (DNS_SERIAL_GE(serial, current_pos.serial) &&
1997		   ((isc_uint32_t)(j->header.end.offset - current_pos.offset)
1998		     >= (target_size / 2)) &&
1999		    current_pos.offset > best_guess.offset)
2000			best_guess = current_pos;
2001		else
2002			break;
2003	}
2004
2005	INSIST(best_guess.serial != j->header.end.serial);
2006	if (best_guess.serial != serial)
2007		CHECK(journal_next(j, &best_guess));
2008
2009	/*
2010	 * Enough space to proceed?
2011	 */
2012	if ((isc_uint32_t) (j->header.end.offset - best_guess.offset) >
2013	     (isc_uint32_t) (best_guess.offset - indexend)) {
2014		dns_journal_destroy(&j);
2015		return (ISC_R_NOSPACE);
2016	}
2017
2018	copy_length = j->header.end.offset - best_guess.offset;
2019
2020	/*
2021	 * Invalidate entire index, will be rebuilt at end.
2022	 */
2023	for (i = 0; i < j->header.index_size; i++) {
2024		if (POS_VALID(j->index[i]))
2025			POS_INVALIDATE(j->index[i]);
2026	}
2027
2028	/*
2029	 * Convert the index into on-disk format and write
2030	 * it to disk.
2031	 */
2032	CHECK(index_to_disk(j));
2033	CHECK(journal_fsync(j));
2034
2035	/*
2036	 * Update the journal header.
2037	 */
2038	if (copy_length == 0) {
2039		j->header.begin.serial = 0;
2040		j->header.end.serial = 0;
2041		j->header.begin.offset = 0;
2042		j->header.end.offset = 0;
2043	} else {
2044		j->header.begin = best_guess;
2045	}
2046	journal_header_encode(&j->header, &rawheader);
2047	CHECK(journal_seek(j, 0));
2048	CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
2049	CHECK(journal_fsync(j));
2050
2051	if (copy_length != 0) {
2052		/*
2053		 * Copy best_guess to end into space just freed.
2054		 */
2055		size = 64*1024;
2056		if (copy_length < size)
2057			size = copy_length;
2058		buf = isc_mem_get(mctx, size);
2059		if (buf == NULL) {
2060			result = ISC_R_NOMEMORY;
2061			goto failure;
2062		}
2063
2064		for (i = 0; i < copy_length; i += size) {
2065			len = (copy_length - i) > size ? size :
2066							 (copy_length - i);
2067			CHECK(journal_seek(j, best_guess.offset + i));
2068			CHECK(journal_read(j, buf, len));
2069			CHECK(journal_seek(j, indexend + i));
2070			CHECK(journal_write(j, buf, len));
2071		}
2072
2073		CHECK(journal_fsync(j));
2074
2075		/*
2076		 * Compute new header.
2077		 */
2078		j->header.begin.offset = indexend;
2079		j->header.end.offset = indexend + copy_length;
2080		/*
2081		 * Update the journal header.
2082		 */
2083		journal_header_encode(&j->header, &rawheader);
2084		CHECK(journal_seek(j, 0));
2085		CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
2086		CHECK(journal_fsync(j));
2087
2088		/*
2089		 * Build new index.
2090		 */
2091		current_pos = j->header.begin;
2092		while (current_pos.serial != j->header.end.serial) {
2093			index_add(j, &current_pos);
2094			CHECK(journal_next(j, &current_pos));
2095		}
2096
2097		/*
2098		 * Write index.
2099		 */
2100		CHECK(index_to_disk(j));
2101		CHECK(journal_fsync(j));
2102
2103		indexend = j->header.end.offset;
2104	}
2105	dns_journal_destroy(&j);
2106	(void)isc_file_truncate(filename, (isc_offset_t)indexend);
2107	result = ISC_R_SUCCESS;
2108
2109 failure:
2110	if (buf != NULL)
2111		isc_mem_put(mctx, buf, size);
2112	if (j != NULL)
2113		dns_journal_destroy(&j);
2114	return (result);
2115}
2116
2117static isc_result_t
2118index_to_disk(dns_journal_t *j) {
2119	isc_result_t result = ISC_R_SUCCESS;
2120
2121	if (j->header.index_size != 0) {
2122		unsigned int i;
2123		unsigned char *p;
2124		unsigned int rawbytes;
2125
2126		rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
2127
2128		p = j->rawindex;
2129		for (i = 0; i < j->header.index_size; i++) {
2130			encode_uint32(j->index[i].serial, p);
2131			p += 4;
2132			encode_uint32(j->index[i].offset, p);
2133			p += 4;
2134		}
2135		INSIST(p == j->rawindex + rawbytes);
2136
2137		CHECK(journal_seek(j, sizeof(journal_rawheader_t)));
2138		CHECK(journal_write(j, j->rawindex, rawbytes));
2139	}
2140failure:
2141	return (result);
2142}
2143