journal.c revision 143731
1/*
2 * Copyright (C) 2004  Internet Systems Consortium, Inc. ("ISC")
3 * Copyright (C) 1999-2002  Internet Software Consortium.
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15 * PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/* $Id: journal.c,v 1.77.2.1.10.9 2004/09/16 04:57:02 marka Exp $ */
19
20#include <config.h>
21
22#include <stdlib.h>
23
24#include <isc/file.h>
25#include <isc/mem.h>
26#include <isc/stdio.h>
27#include <isc/string.h>
28#include <isc/util.h>
29
30#include <dns/compress.h>
31#include <dns/db.h>
32#include <dns/dbiterator.h>
33#include <dns/diff.h>
34#include <dns/fixedname.h>
35#include <dns/journal.h>
36#include <dns/log.h>
37#include <dns/rdataset.h>
38#include <dns/rdatasetiter.h>
39#include <dns/result.h>
40#include <dns/soa.h>
41
/*
 * When true, accept IXFR difference sequences where the
 * SOA serial number does not change (BIND 8 sends such
 * sequences).
 *
 * The flag is consulted when a transaction is committed: a
 * transaction whose starting and ending serial numbers are equal
 * is accepted only while it is set.  It is currently hard-wired on.
 */
static isc_boolean_t bind8_compat = ISC_TRUE; /* XXX config */
48
49/**************************************************************************/
50/*
51 * Miscellaneous utilities.
52 */
53
/*
 * The leading arguments (log context, category, module) shared by
 * every isc_log_write() call in this file.
 */
#define JOURNAL_COMMON_LOGARGS \
	dns_lctx, DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL

/*
 * As JOURNAL_COMMON_LOGARGS, followed by a debug log level of 'n'.
 */
#define JOURNAL_DEBUG_LOGARGS(n) \
	JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(n)

/*
 * Store 'code' in the local variable 'result' and jump to the local
 * label 'failure'.  Both must exist in the calling function.
 *
 * It would be non-sensical (or at least obtuse) to use FAIL() with an
 * ISC_R_SUCCESS code, but the test is there to keep the Solaris compiler
 * from complaining about "end-of-loop code not reached".
 */
#define FAIL(code) \
	do { result = (code);					\
		if (result != ISC_R_SUCCESS) goto failure;	\
	} while (0)

/*
 * Evaluate 'op', store its value in the local variable 'result', and
 * jump to the local label 'failure' unless it is ISC_R_SUCCESS.
 */
#define CHECK(op) \
     	do { result = (op); 					\
		if (result != ISC_R_SUCCESS) goto failure; 	\
	} while (0)

/* Forward declaration; the definition appears later in the file. */
static isc_result_t index_to_disk(dns_journal_t *);
76
/*
 * Decode the 32-bit big-endian integer stored in the four bytes
 * starting at 'p' and return it.
 *
 * Each byte is widened to isc_uint32_t before it is shifted.  Without
 * the cast the byte is promoted to (signed) int, and shifting a value
 * of 0x80 or more left by 24 bits overflows that int, which is
 * undefined behaviour.
 */
static inline isc_uint32_t
decode_uint32(unsigned char *p) {
	return (((isc_uint32_t)p[0] << 24) |
		((isc_uint32_t)p[1] << 16) |
		((isc_uint32_t)p[2] <<  8) |
		((isc_uint32_t)p[3] <<  0));
}
84
/*
 * Store 'val' in the four bytes starting at 'p', most significant
 * byte first (big-endian order).
 */
static inline void
encode_uint32(isc_uint32_t val, unsigned char *p) {
	unsigned int i;
	int shift = 24;

	for (i = 0; i < 4; i++) {
		p[i] = (isc_uint8_t)(val >> shift);
		shift -= 8;
	}
}
92
93isc_result_t
94dns_db_createsoatuple(dns_db_t *db, dns_dbversion_t *ver, isc_mem_t *mctx,
95		      dns_diffop_t op, dns_difftuple_t **tp)
96{
97	isc_result_t result;
98	dns_dbnode_t *node;
99	dns_rdataset_t rdataset;
100	dns_rdata_t rdata = DNS_RDATA_INIT;
101	dns_name_t *zonename;
102
103	zonename = dns_db_origin(db);
104
105	node = NULL;
106	result = dns_db_findnode(db, zonename, ISC_FALSE, &node);
107	if (result != ISC_R_SUCCESS)
108		goto nonode;
109
110	dns_rdataset_init(&rdataset);
111	result = dns_db_findrdataset(db, node, ver, dns_rdatatype_soa, 0,
112				     (isc_stdtime_t)0, &rdataset, NULL);
113 	if (result != ISC_R_SUCCESS)
114		goto freenode;
115
116	result = dns_rdataset_first(&rdataset);
117 	if (result != ISC_R_SUCCESS)
118		goto freenode;
119
120	dns_rdataset_current(&rdataset, &rdata);
121
122	result = dns_difftuple_create(mctx, op, zonename, rdataset.ttl,
123				      &rdata, tp);
124
125	dns_rdataset_disassociate(&rdataset);
126	dns_db_detachnode(db, &node);
127	return (ISC_R_SUCCESS);
128
129 freenode:
130	dns_db_detachnode(db, &node);
131 nonode:
132	UNEXPECTED_ERROR(__FILE__, __LINE__, "missing SOA");
133	return (result);
134}
135
136/**************************************************************************/
137/*
138 * Journalling.
139 */
140
141/*
142 * A journal file consists of
143 *
144 *   - A fixed-size header of type journal_rawheader_t.
145 *
146 *   - The index.  This is an unordered array of index entries
147 *     of type journal_rawpos_t giving the locations
148 *     of some arbitrary subset of the journal's addressable
149 *     transactions.  The index entries are used as hints to
150 *     speed up the process of locating a transaction with a given
151 *     serial number.  Unused index entries have an "offset"
152 *     field of zero.  The size of the index can vary between
153 *     journal files, but does not change during the lifetime
154 *     of a file.  The size can be zero.
155 *
156 *   - The journal data.  This  consists of one or more transactions.
157 *     Each transaction begins with a transaction header of type
158 *     journal_rawxhdr_t.  The transaction header is followed by a
159 *     sequence of RRs, similar in structure to an IXFR difference
160 *     sequence (RFC1995).  That is, the pre-transaction SOA,
161 *     zero or more other deleted RRs, the post-transaction SOA,
162 *     and zero or more other added RRs.  Unlike in IXFR, each RR
163 *     is prefixed with a 32-bit length.
164 *
165 *     The journal data part grows as new transactions are
166 *     appended to the file.  Only those transactions
167 *     whose serial number is current-(2^31-1) to current
168 *     are considered "addressable" and may be pointed
169 *     to from the header or index.  They may be preceded
170 *     by old transactions that are no longer addressable,
171 *     and they may be followed by transactions that were
172 *     appended to the journal but never committed by updating
173 *     the "end" position in the header.  The latter will
174 *     be overwritten when new transactions are added.
175 */
176
/*
 * On-disk representation of a "pointer" to a journal entry.
 * These are used in the journal header to locate the beginning
 * and end of the journal, and in the journal index to locate
 * other transactions.
 *
 * Both fields are raw 4-byte arrays.  They are converted to and
 * from the in-core journal_pos_t by journal_pos_decode() and
 * journal_pos_encode().
 */
typedef struct {
	unsigned char	serial[4];  /* SOA serial before update. */
	/*
	 * XXXRTH  Should offset be 8 bytes?
	 * XXXDCL ... probably, since isc_offset_t is 8 bytes on many OSs.
	 * XXXAG  ... but we will not be able to seek >2G anyway on many
	 *            platforms as long as we are using fseek() rather
	 *            than lseek().
	 */
	unsigned char	offset[4];  /* Offset from beginning of file. */
} journal_rawpos_t;
194
/*
 * The on-disk representation of the journal header.
 * All numbers are stored in big-endian order.
 */

/*
 * The header is of a fixed size, with some spare room for future
 * extensions.
 */
#define JOURNAL_HEADER_SIZE 64 /* Bytes. */

/*
 * The union overlays the meaningful header fields ('h') with a byte
 * array of JOURNAL_HEADER_SIZE bytes ('pad').  journal_file_create()
 * INSISTs that the size of the union equals JOURNAL_HEADER_SIZE.
 */
typedef union {
	struct {
		/* File format version ID. */
		unsigned char 		format[16];
		/* Position of the first addressable transaction */
		journal_rawpos_t 	begin;
		/* Position of the next (yet nonexistent) transaction. */
		journal_rawpos_t 	end;
		/* Number of index entries following the header. */
		unsigned char 		index_size[4];
	} h;
	/* Pad the header to a fixed size. */
	unsigned char pad[JOURNAL_HEADER_SIZE];
} journal_rawheader_t;
220
/*
 * The on-disk representation of the transaction header.
 * There is one of these at the beginning of each transaction.
 * Each field is a raw 4-byte array; journal_read_xhdr() and
 * journal_write_xhdr() convert to and from journal_xhdr_t.
 */
typedef struct {
	unsigned char	size[4]; 	/* In bytes, excluding header. */
	unsigned char	serial0[4];	/* SOA serial before update. */
	unsigned char	serial1[4];	/* SOA serial after update. */
} journal_rawxhdr_t;

/*
 * The on-disk representation of the RR header.
 * There is one of these at the beginning of each RR.
 * journal_read_rrhdr() converts it to journal_rrhdr_t.
 */
typedef struct {
	unsigned char	size[4]; 	/* In bytes, excluding header. */
} journal_rawrrhdr_t;
238
/*
 * The in-core representation of a journal position
 * (the decoded form of journal_rawpos_t).
 */
typedef struct {
	isc_uint32_t	serial;		/* SOA serial at this position. */
	isc_offset_t	offset;		/* Offset from beginning of file. */
} journal_pos_t;

/*
 * A position with an offset of zero is treated as unused/invalid.
 * POS_INVALIDATE() clears both the offset and the serial.
 */
#define POS_VALID(pos) 		((pos).offset != 0)
#define POS_INVALIDATE(pos) 	((pos).offset = 0, (pos).serial = 0)

/*
 * The in-core representation of the journal header
 * (the decoded form of journal_rawheader_t).
 */
typedef struct {
	unsigned char 	format[16];	/* File format version ID. */
	journal_pos_t 	begin;		/* First addressable transaction. */
	journal_pos_t 	end;		/* Next (nonexistent) transaction. */
	isc_uint32_t	index_size;	/* Number of index entries. */
} journal_header_t;

/*
 * The in-core representation of the transaction header.
 */

typedef struct {
	isc_uint32_t	size;		/* In bytes, excluding header. */
	isc_uint32_t	serial0;	/* SOA serial before update. */
	isc_uint32_t	serial1;	/* SOA serial after update. */
} journal_xhdr_t;

/*
 * The in-core representation of the RR header.
 */
typedef struct {
	isc_uint32_t	size;		/* In bytes, excluding header. */
} journal_rrhdr_t;
273
274
/*
 * Initial contents to store in the header of a newly created
 * journal file.
 *
 * The header starts with the magic string ";BIND LOG V9\n"
 * to identify the file as a BIND 9 journal file.  An ASCII
 * identification string is used rather than a binary magic
 * number to be consistent with BIND 8 (BIND 8 journal files
 * are ASCII text files).
 *
 * The initializer is positional: format, begin, end, index_size.
 * journal_open() also compares the format field of an existing
 * file against this value.
 */

static journal_header_t
initial_journal_header = { ";BIND LOG V9\n", { 0, 0 }, { 0, 0 }, 0 };

/*
 * True when the begin and end offsets of header 'h' coincide.
 */
#define JOURNAL_EMPTY(h) ((h)->begin.offset == (h)->end.offset)
290
/*
 * The state of a journal object.
 */
typedef enum {
	JOURNAL_STATE_INVALID,		/* Set while journal_open() is
					   still setting the object up */
	JOURNAL_STATE_READ,		/* Opened without write access */
	JOURNAL_STATE_WRITE,		/* Opened with write access */
	JOURNAL_STATE_TRANSACTION	/* Set by
					   dns_journal_begin_transaction() */
} journal_state_t;

/*
 * The in-core journal object.
 */
struct dns_journal {
	unsigned int		magic;		/* JOUR */
	isc_mem_t		*mctx;		/* Memory context */
	journal_state_t		state;
	const char 		*filename;	/* Journal file name */
	FILE *			fp;		/* File handle */
	isc_offset_t		offset;		/* Current file offset */
	journal_header_t 	header;		/* In-core journal header */
	unsigned char		*rawindex;	/* In-core buffer for journal
						   index in on-disk format */
	journal_pos_t		*index;		/* In-core journal index */

	/* Current transaction state (when writing). */
	struct {
		unsigned int	n_soa;		/* Number of SOAs seen */
		journal_pos_t	pos[2];		/* Begin/end position */
	} x;

	/* Iteration state (when reading). */
	struct {
		/* These define the part of the journal we iterate over. */
		journal_pos_t bpos;		/* Position before first, */
		journal_pos_t epos;		/* and after last
						   transaction */
		/* The rest is iterator state. */
		isc_uint32_t current_serial;	/* Current SOA serial */
		isc_buffer_t source;		/* Data from disk */
		isc_buffer_t target;		/* Data from _fromwire check */
		dns_decompress_t dctx;		/* Dummy decompression ctx */
		dns_name_t name;		/* Current domain name */
		dns_rdata_t rdata;		/* Current rdata */
		isc_uint32_t ttl;		/* Current TTL */
		unsigned int xsize;		/* Size of transaction data */
		unsigned int xpos;		/* Current position in it */
		isc_result_t result;		/* Result of last call */
	} it;
};

/*
 * Magic number stored in dns_journal.magic, and the validity test
 * used in REQUIRE() checks on journal arguments.
 */
#define DNS_JOURNAL_MAGIC	ISC_MAGIC('J', 'O', 'U', 'R')
#define DNS_JOURNAL_VALID(t)	ISC_MAGIC_VALID(t, DNS_JOURNAL_MAGIC)
338
339static void
340journal_pos_decode(journal_rawpos_t *raw, journal_pos_t *cooked) {
341	cooked->serial = decode_uint32(raw->serial);
342	cooked->offset = decode_uint32(raw->offset);
343}
344
345static void
346journal_pos_encode(journal_rawpos_t *raw, journal_pos_t *cooked) {
347	encode_uint32(cooked->serial, raw->serial);
348	encode_uint32(cooked->offset, raw->offset);
349}
350
351static void
352journal_header_decode(journal_rawheader_t *raw, journal_header_t *cooked) {
353	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
354	memcpy(cooked->format, raw->h.format, sizeof(cooked->format));
355	journal_pos_decode(&raw->h.begin, &cooked->begin);
356	journal_pos_decode(&raw->h.end, &cooked->end);
357	cooked->index_size = decode_uint32(raw->h.index_size);
358}
359
360static void
361journal_header_encode(journal_header_t *cooked, journal_rawheader_t *raw) {
362	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
363	memset(raw->pad, 0, sizeof(raw->pad));
364	memcpy(raw->h.format, cooked->format, sizeof(raw->h.format));
365	journal_pos_encode(&raw->h.begin, &cooked->begin);
366	journal_pos_encode(&raw->h.end, &cooked->end);
367	encode_uint32(cooked->index_size, raw->h.index_size);
368}
369
370/*
371 * Journal file I/O subroutines, with error checking and reporting.
372 */
373static isc_result_t
374journal_seek(dns_journal_t *j, isc_uint32_t offset) {
375	isc_result_t result;
376	result = isc_stdio_seek(j->fp, (long)offset, SEEK_SET);
377	if (result != ISC_R_SUCCESS) {
378		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
379			      "%s: seek: %s", j->filename,
380			      isc_result_totext(result));
381		return (ISC_R_UNEXPECTED);
382	}
383	j->offset = offset;
384	return (ISC_R_SUCCESS);
385}
386
387static isc_result_t
388journal_read(dns_journal_t *j, void *mem, size_t nbytes) {
389	isc_result_t result;
390
391	result = isc_stdio_read(mem, 1, nbytes, j->fp, NULL);
392	if (result != ISC_R_SUCCESS) {
393		if (result == ISC_R_EOF)
394			return (ISC_R_NOMORE);
395		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
396			      "%s: read: %s",
397			      j->filename, isc_result_totext(result));
398		return (ISC_R_UNEXPECTED);
399	}
400	j->offset += nbytes;
401	return (ISC_R_SUCCESS);
402}
403
404static isc_result_t
405journal_write(dns_journal_t *j, void *mem, size_t nbytes) {
406	isc_result_t result;
407
408	result = isc_stdio_write(mem, 1, nbytes, j->fp, NULL);
409	if (result != ISC_R_SUCCESS) {
410		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
411			      "%s: write: %s",
412			      j->filename, isc_result_totext(result));
413		return (ISC_R_UNEXPECTED);
414	}
415	j->offset += nbytes;
416	return (ISC_R_SUCCESS);
417}
418
419static isc_result_t
420journal_fsync(dns_journal_t *j) {
421	isc_result_t result;
422	result = isc_stdio_flush(j->fp);
423	if (result != ISC_R_SUCCESS) {
424		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
425			      "%s: flush: %s",
426			      j->filename, isc_result_totext(result));
427		return (ISC_R_UNEXPECTED);
428	}
429	result = isc_stdio_sync(j->fp);
430	if (result != ISC_R_SUCCESS) {
431		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
432			      "%s: fsync: %s",
433			      j->filename, isc_result_totext(result));
434		return (ISC_R_UNEXPECTED);
435	}
436	return (ISC_R_SUCCESS);
437}
438
439/*
440 * Read/write a transaction header at the current file position.
441 */
442
443static isc_result_t
444journal_read_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr) {
445	journal_rawxhdr_t raw;
446	isc_result_t result;
447	result = journal_read(j, &raw, sizeof(raw));
448	if (result != ISC_R_SUCCESS)
449		return (result);
450	xhdr->size = decode_uint32(raw.size);
451	xhdr->serial0 = decode_uint32(raw.serial0);
452	xhdr->serial1 = decode_uint32(raw.serial1);
453	return (ISC_R_SUCCESS);
454}
455
456static isc_result_t
457journal_write_xhdr(dns_journal_t *j, isc_uint32_t size,
458		   isc_uint32_t serial0, isc_uint32_t serial1)
459{
460	journal_rawxhdr_t raw;
461	encode_uint32(size, raw.size);
462	encode_uint32(serial0, raw.serial0);
463	encode_uint32(serial1, raw.serial1);
464	return (journal_write(j, &raw, sizeof(raw)));
465}
466
467
468/*
469 * Read an RR header at the current file position.
470 */
471
472static isc_result_t
473journal_read_rrhdr(dns_journal_t *j, journal_rrhdr_t *rrhdr) {
474	journal_rawrrhdr_t raw;
475	isc_result_t result;
476	result = journal_read(j, &raw, sizeof(raw));
477	if (result != ISC_R_SUCCESS)
478		return (result);
479	rrhdr->size = decode_uint32(raw.size);
480	return (ISC_R_SUCCESS);
481}
482
483static isc_result_t
484journal_file_create(isc_mem_t *mctx, const char *filename) {
485	FILE *fp = NULL;
486	isc_result_t result;
487	journal_header_t header;
488	journal_rawheader_t rawheader;
489	int index_size = 56; /* XXX configurable */
490	int size;
491	void *mem; /* Memory for temporary index image. */
492
493	INSIST(sizeof(journal_rawheader_t) == JOURNAL_HEADER_SIZE);
494
495	result = isc_stdio_open(filename, "wb", &fp);
496	if (result != ISC_R_SUCCESS) {
497		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
498			      "%s: create: %s",
499			      filename, isc_result_totext(result));
500		return (ISC_R_UNEXPECTED);
501	}
502
503	header = initial_journal_header;
504	header.index_size = index_size;
505	journal_header_encode(&header, &rawheader);
506
507	size = sizeof(journal_rawheader_t) +
508		index_size * sizeof(journal_rawpos_t);
509
510	mem = isc_mem_get(mctx, size);
511	if (mem == NULL) {
512		(void)isc_stdio_close(fp);
513		(void)isc_file_remove(filename);
514		return (ISC_R_NOMEMORY);
515	}
516	memset(mem, 0, size);
517	memcpy(mem, &rawheader, sizeof(rawheader));
518
519	result = isc_stdio_write(mem, 1, (size_t) size, fp, NULL);
520	if (result != ISC_R_SUCCESS) {
521		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
522				 "%s: write: %s",
523				 filename, isc_result_totext(result));
524		(void)isc_stdio_close(fp);
525		(void)isc_file_remove(filename);
526		isc_mem_put(mctx, mem, size);
527		return (ISC_R_UNEXPECTED);
528	}
529	isc_mem_put(mctx, mem, size);
530
531	result = isc_stdio_close(fp);
532	if (result != ISC_R_SUCCESS) {
533		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
534				 "%s: close: %s",
535				 filename, isc_result_totext(result));
536		(void)isc_file_remove(filename);
537		return (ISC_R_UNEXPECTED);
538	}
539
540	return (ISC_R_SUCCESS);
541}
542
543static isc_result_t
544journal_open(isc_mem_t *mctx, const char *filename, isc_boolean_t write,
545	     isc_boolean_t create, dns_journal_t **journalp) {
546	FILE *fp = NULL;
547	isc_result_t result;
548	journal_rawheader_t rawheader;
549	dns_journal_t *j;
550
551	INSIST(journalp != NULL && *journalp == NULL);
552	j = isc_mem_get(mctx, sizeof(*j));
553	if (j == NULL)
554		return (ISC_R_NOMEMORY);
555
556	j->mctx = mctx;
557	j->state = JOURNAL_STATE_INVALID;
558	j->fp = NULL;
559	j->filename = filename;
560	j->index = NULL;
561	j->rawindex = NULL;
562
563	result = isc_stdio_open(j->filename, write ? "rb+" : "rb", &fp);
564
565	if (result == ISC_R_FILENOTFOUND) {
566		if (create) {
567			isc_log_write(JOURNAL_COMMON_LOGARGS,
568				      ISC_LOG_INFO,
569				      "journal file %s does not exist, "
570				      "creating it",
571				      j->filename);
572			CHECK(journal_file_create(mctx, filename));
573			/*
574			 * Retry.
575			 */
576			result = isc_stdio_open(j->filename, "rb+", &fp);
577		} else {
578			FAIL(ISC_R_NOTFOUND);
579		}
580	}
581	if (result != ISC_R_SUCCESS) {
582		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
583			      "%s: open: %s",
584			      j->filename, isc_result_totext(result));
585		FAIL(ISC_R_UNEXPECTED);
586	}
587
588	j->fp = fp;
589
590	/*
591	 * Set magic early so that seek/read can succeed.
592	 */
593	j->magic = DNS_JOURNAL_MAGIC;
594
595	CHECK(journal_seek(j, 0));
596	CHECK(journal_read(j, &rawheader, sizeof(rawheader)));
597
598	if (memcmp(rawheader.h.format, initial_journal_header.format,
599		   sizeof(initial_journal_header.format)) != 0) {
600		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
601				 "%s: journal format not recognized",
602				 j->filename);
603		FAIL(ISC_R_UNEXPECTED);
604	}
605	journal_header_decode(&rawheader, &j->header);
606
607	/*
608	 * If there is an index, read the raw index into a dynamically
609	 * allocated buffer and then convert it into a cooked index.
610	 */
611	if (j->header.index_size != 0) {
612		unsigned int i;
613		unsigned int rawbytes;
614		unsigned char *p;
615
616		rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
617		j->rawindex = isc_mem_get(mctx, rawbytes);
618		if (j->rawindex == NULL)
619			FAIL(ISC_R_NOMEMORY);
620
621		CHECK(journal_read(j, j->rawindex, rawbytes));
622
623		j->index = isc_mem_get(mctx, j->header.index_size *
624				       sizeof(journal_pos_t));
625		if (j->index == NULL)
626			FAIL(ISC_R_NOMEMORY);
627
628		p = j->rawindex;
629		for (i = 0; i < j->header.index_size; i++) {
630			j->index[i].serial = decode_uint32(p);
631			p += 4;
632			j->index[i].offset = decode_uint32(p);
633			p += 4;
634		}
635		INSIST(p == j->rawindex + rawbytes);
636	}
637	j->offset = -1; /* Invalid, must seek explicitly. */
638
639	/*
640	 * Initialize the iterator.
641	 */
642	dns_name_init(&j->it.name, NULL);
643	dns_rdata_init(&j->it.rdata);
644
645	/*
646	 * Set up empty initial buffers for uncheched and checked
647	 * wire format RR data.  They will be reallocated
648	 * later.
649	 */
650	isc_buffer_init(&j->it.source, NULL, 0);
651	isc_buffer_init(&j->it.target, NULL, 0);
652	dns_decompress_init(&j->it.dctx, -1, DNS_DECOMPRESS_NONE);
653
654	j->state =
655		write ? JOURNAL_STATE_WRITE : JOURNAL_STATE_READ;
656
657	*journalp = j;
658	return (ISC_R_SUCCESS);
659
660 failure:
661	j->magic = 0;
662	if (j->index != NULL) {
663		isc_mem_put(j->mctx, j->index, j->header.index_size *
664			    sizeof(journal_rawpos_t));
665		j->index = NULL;
666	}
667	if (j->fp != NULL)
668		(void)isc_stdio_close(j->fp);
669	isc_mem_put(j->mctx, j, sizeof(*j));
670	return (result);
671}
672
673isc_result_t
674dns_journal_open(isc_mem_t *mctx, const char *filename, isc_boolean_t write,
675		 dns_journal_t **journalp) {
676	return (journal_open(mctx, filename, write, write, journalp));
677}
678
679/*
680 * A comparison function defining the sorting order for
681 * entries in the IXFR-style journal file.
682 *
683 * The IXFR format requires that deletions are sorted before
684 * additions, and within either one, SOA records are sorted
685 * before others.
686 *
687 * Also sort the non-SOA records by type as a courtesy to the
688 * server receiving the IXFR - it may help reduce the amount of
689 * rdataset merging it has to do.
690 */
691static int
692ixfr_order(const void *av, const void *bv) {
693	dns_difftuple_t const * const *ap = av;
694	dns_difftuple_t const * const *bp = bv;
695	dns_difftuple_t const *a = *ap;
696	dns_difftuple_t const *b = *bp;
697	int r;
698
699	r = (b->op == DNS_DIFFOP_DEL) - (a->op == DNS_DIFFOP_DEL);
700	if (r != 0)
701		return (r);
702
703	r = (b->rdata.type == dns_rdatatype_soa) -
704		(a->rdata.type == dns_rdatatype_soa);
705	if (r != 0)
706		return (r);
707
708	r = (a->rdata.type - b->rdata.type);
709	return (r);
710}
711
712/*
713 * Advance '*pos' to the next journal transaction.
714 *
715 * Requires:
716 *	*pos refers to a valid journal transaction.
717 *
718 * Ensures:
719 *	When ISC_R_SUCCESS is returned,
720 *	*pos refers to the next journal transaction.
721 *
722 * Returns one of:
723 *
724 *    ISC_R_SUCCESS
725 *    ISC_R_NOMORE 	*pos pointed at the last transaction
726 *    Other results due to file errors are possible.
727 */
728static isc_result_t
729journal_next(dns_journal_t *j, journal_pos_t *pos) {
730	isc_result_t result;
731	journal_xhdr_t xhdr;
732	REQUIRE(DNS_JOURNAL_VALID(j));
733
734	result = journal_seek(j, pos->offset);
735	if (result != ISC_R_SUCCESS)
736		return (result);
737
738	if (pos->serial == j->header.end.serial)
739		return (ISC_R_NOMORE);
740	/*
741	 * Read the header of the current transaction.
742	 * This will return ISC_R_NOMORE if we are at EOF.
743	 */
744	result = journal_read_xhdr(j, &xhdr);
745	if (result != ISC_R_SUCCESS)
746		return (result);
747
748	/*
749	 * Check serial number consistency.
750	 */
751	if (xhdr.serial0 != pos->serial) {
752		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
753			      "%s: journal file corrupt: "
754			      "expected serial %u, got %u",
755			      j->filename, pos->serial, xhdr.serial0);
756		return (ISC_R_UNEXPECTED);
757	}
758
759	/*
760	 * Check for offset wraparound.
761	 */
762	if ((isc_offset_t)(pos->offset + sizeof(journal_rawxhdr_t) + xhdr.size)
763	    < pos->offset) {
764		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
765			      "%s: offset too large", j->filename);
766		return (ISC_R_UNEXPECTED);
767	}
768
769	pos->offset += sizeof(journal_rawxhdr_t) + xhdr.size;
770	pos->serial = xhdr.serial1;
771	return (ISC_R_SUCCESS);
772}
773
774/*
775 * If the index of the journal 'j' contains an entry "better"
776 * than '*best_guess', replace '*best_guess' with it.
777 *
778 * "Better" means having a serial number closer to 'serial'
779 * but not greater than 'serial'.
780 */
781static void
782index_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *best_guess) {
783	unsigned int i;
784	if (j->index == NULL)
785		return;
786	for (i = 0; i < j->header.index_size; i++) {
787		if (POS_VALID(j->index[i]) &&
788		    DNS_SERIAL_GE(serial, j->index[i].serial) &&
789		    DNS_SERIAL_GT(j->index[i].serial, best_guess->serial))
790			*best_guess = j->index[i];
791	}
792}
793
794/*
795 * Add a new index entry.  If there is no room, make room by removing
796 * the odd-numbered entries and compacting the others into the first
797 * half of the index.  This decimates old index entries exponentially
798 * over time, so that the index always contains a much larger fraction
799 * of recent serial numbers than of old ones.  This is deliberate -
800 * most index searches are for outgoing IXFR, and IXFR tends to request
801 * recent versions more often than old ones.
802 */
803static void
804index_add(dns_journal_t *j, journal_pos_t *pos) {
805	unsigned int i;
806	if (j->index == NULL)
807		return;
808	/*
809	 * Search for a vacant position.
810	 */
811	for (i = 0; i < j->header.index_size; i++) {
812		if (! POS_VALID(j->index[i]))
813			break;
814	}
815	if (i == j->header.index_size) {
816		unsigned int k = 0;
817		/*
818		 * Found no vacant position.  Make some room.
819		 */
820		for (i = 0; i < j->header.index_size; i += 2) {
821			j->index[k++] = j->index[i];
822		}
823		i = k; /* 'i' identifies the first vacant position. */
824		while (k < j->header.index_size) {
825			POS_INVALIDATE(j->index[k]);
826			k++;
827		}
828	}
829	INSIST(i < j->header.index_size);
830	INSIST(! POS_VALID(j->index[i]));
831
832	/*
833	 * Store the new index entry.
834	 */
835	j->index[i] = *pos;
836}
837
838/*
839 * Invalidate any existing index entries that could become
840 * ambiguous when a new transaction with number 'serial' is added.
841 */
842static void
843index_invalidate(dns_journal_t *j, isc_uint32_t serial) {
844	unsigned int i;
845	if (j->index == NULL)
846		return;
847	for (i = 0; i < j->header.index_size; i++) {
848		if (! DNS_SERIAL_GT(serial, j->index[i].serial))
849			POS_INVALIDATE(j->index[i]);
850	}
851}
852
853/*
854 * Try to find a transaction with initial serial number 'serial'
855 * in the journal 'j'.
856 *
857 * If found, store its position at '*pos' and return ISC_R_SUCCESS.
858 *
859 * If 'serial' is current (= the ending serial number of the
860 * last transaction in the journal), set '*pos' to
861 * the position immediately following the last transaction and
862 * return ISC_R_SUCCESS.
863 *
864 * If 'serial' is within the range of addressable serial numbers
865 * covered by the journal but that particular serial number is missing
866 * (from the journal, not just from the index), return ISC_R_NOTFOUND.
867 *
868 * If 'serial' is outside the range of addressable serial numbers
869 * covered by the journal, return ISC_R_RANGE.
870 *
871 */
872static isc_result_t
873journal_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *pos) {
874	isc_result_t result;
875	journal_pos_t current_pos;
876	REQUIRE(DNS_JOURNAL_VALID(j));
877
878	if (DNS_SERIAL_GT(j->header.begin.serial, serial))
879		return (ISC_R_RANGE);
880	if (DNS_SERIAL_GT(serial, j->header.end.serial))
881		return (ISC_R_RANGE);
882	if (serial == j->header.end.serial) {
883		*pos = j->header.end;
884		return (ISC_R_SUCCESS);
885	}
886
887	current_pos = j->header.begin;
888	index_find(j, serial, &current_pos);
889
890	while (current_pos.serial != serial) {
891		if (DNS_SERIAL_GT(current_pos.serial, serial))
892			return (ISC_R_NOTFOUND);
893		result = journal_next(j, &current_pos);
894		if (result != ISC_R_SUCCESS)
895			return (result);
896	}
897	*pos = current_pos;
898	return (ISC_R_SUCCESS);
899}
900
901isc_result_t
902dns_journal_begin_transaction(dns_journal_t *j) {
903	isc_uint32_t offset;
904	isc_result_t result;
905	journal_rawxhdr_t hdr;
906
907	REQUIRE(DNS_JOURNAL_VALID(j));
908	REQUIRE(j->state == JOURNAL_STATE_WRITE);
909
910	/*
911	 * Find the file offset where the new transaction should
912	 * be written, and seek there.
913	 */
914	if (JOURNAL_EMPTY(&j->header)) {
915		offset = sizeof(journal_rawheader_t) +
916			j->header.index_size * sizeof(journal_rawpos_t);
917	} else {
918		offset = j->header.end.offset;
919	}
920	j->x.pos[0].offset = offset;
921	j->x.pos[1].offset = offset; /* Initial value, will be incremented. */
922	j->x.n_soa = 0;
923
924	CHECK(journal_seek(j, offset));
925
926	/*
927	 * Write a dummy transaction header of all zeroes to reserve
928	 * space.  It will be filled in when the transaction is
929	 * finished.
930	 */
931	memset(&hdr, 0, sizeof(hdr));
932	CHECK(journal_write(j, &hdr, sizeof(hdr)));
933	j->x.pos[1].offset = j->offset;
934
935	j->state = JOURNAL_STATE_TRANSACTION;
936	result = ISC_R_SUCCESS;
937 failure:
938	return (result);
939}
940
941isc_result_t
942dns_journal_writediff(dns_journal_t *j, dns_diff_t *diff) {
943	dns_difftuple_t *t;
944	isc_buffer_t buffer;
945	void *mem = NULL;
946	unsigned int size;
947	isc_result_t result;
948	isc_region_t used;
949
950	REQUIRE(DNS_DIFF_VALID(diff));
951	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
952
953	isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "writing to journal");
954	(void)dns_diff_print(diff, NULL);
955
956	/*
957	 * Pass 1: determine the buffer size needed, and
958	 * keep track of SOA serial numbers.
959	 */
960	size = 0;
961	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
962	     t = ISC_LIST_NEXT(t, link))
963	{
964		if (t->rdata.type == dns_rdatatype_soa) {
965			if (j->x.n_soa < 2)
966				j->x.pos[j->x.n_soa].serial =
967					dns_soa_getserial(&t->rdata);
968			j->x.n_soa++;
969		}
970		size += sizeof(journal_rawrrhdr_t);
971		size += t->name.length; /* XXX should have access macro? */
972		size += 10;
973		size += t->rdata.length;
974	}
975
976	mem = isc_mem_get(j->mctx, size);
977	if (mem == NULL)
978		return (ISC_R_NOMEMORY);
979
980	isc_buffer_init(&buffer, mem, size);
981
982	/*
983	 * Pass 2.  Write RRs to buffer.
984	 */
985	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
986	     t = ISC_LIST_NEXT(t, link))
987	{
988		/*
989		 * Write the RR header.
990		 */
991		isc_buffer_putuint32(&buffer, t->name.length + 10 +
992				     t->rdata.length);
993		/*
994		 * Write the owner name, RR header, and RR data.
995		 */
996		isc_buffer_putmem(&buffer, t->name.ndata, t->name.length);
997		isc_buffer_putuint16(&buffer, t->rdata.type);
998		isc_buffer_putuint16(&buffer, t->rdata.rdclass);
999		isc_buffer_putuint32(&buffer, t->ttl);
1000		INSIST(t->rdata.length < 65536);
1001		isc_buffer_putuint16(&buffer, (isc_uint16_t)t->rdata.length);
1002		INSIST(isc_buffer_availablelength(&buffer) >= t->rdata.length);
1003		isc_buffer_putmem(&buffer, t->rdata.data, t->rdata.length);
1004	}
1005
1006	isc_buffer_usedregion(&buffer, &used);
1007	INSIST(used.length == size);
1008
1009	j->x.pos[1].offset += used.length;
1010
1011	/*
1012	 * Write the buffer contents to the journal file.
1013	 */
1014	CHECK(journal_write(j, used.base, used.length));
1015
1016	result = ISC_R_SUCCESS;
1017
1018 failure:
1019	if (mem != NULL)
1020		isc_mem_put(j->mctx, mem, size);
1021	return (result);
1022
1023}
1024
/*
 * Commit the transaction that is currently open on journal 'j'.
 *
 * The transaction is first validated: it must contain exactly two SOAs,
 * its ending serial must be greater than its starting serial (or equal
 * to it when 'bind8_compat' is set), and, if the journal is not empty,
 * its starting serial must equal the journal's current end serial.
 * A failed validation is logged and returns ISC_R_UNEXPECTED directly,
 * leaving 'j->state' unchanged.
 *
 * The transaction data is then synced, the transaction header is written
 * at j->x.pos[0].offset, the journal header and the index are rewritten,
 * and the file is synced again.  On success the journal goes back to
 * JOURNAL_STATE_WRITE.
 *
 * Requires:
 *	'j' is a valid journal in state JOURNAL_STATE_TRANSACTION.
 *
 * Returns:
 *	ISC_R_SUCCESS, ISC_R_UNEXPECTED (malformed transaction), or the
 *	result of whichever CHECK()ed helper call failed.
 */
isc_result_t
dns_journal_commit(dns_journal_t *j) {
	isc_result_t result;
	journal_rawheader_t rawheader;

	REQUIRE(DNS_JOURNAL_VALID(j));
	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);

	/*
	 * Perform some basic consistency checks.
	 */
	if (j->x.n_soa != 2) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: malformed transaction: %d SOAs",
			      j->filename, j->x.n_soa);
		return (ISC_R_UNEXPECTED);
	}
	/*
	 * The serial must move forward; an unchanged serial is tolerated
	 * only in bind8_compat mode.
	 */
	if (! (DNS_SERIAL_GT(j->x.pos[1].serial, j->x.pos[0].serial) ||
	       (bind8_compat &&
		j->x.pos[1].serial == j->x.pos[0].serial)))
	{
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: malformed transaction: serial number "
			      "would decrease", j->filename);
		return (ISC_R_UNEXPECTED);
	}
	/*
	 * A transaction appended to a non-empty journal must start at the
	 * serial where the journal currently ends.
	 */
	if (! JOURNAL_EMPTY(&j->header)) {
		if (j->x.pos[0].serial != j->header.end.serial) {
			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
					 "malformed transaction: "
					 "%s last serial %u != "
					 "transaction first serial %u",
					 j->filename,
					 j->header.end.serial,
					 j->x.pos[0].serial);
			return (ISC_R_UNEXPECTED);
		}
	}

	/*
	 * Some old journal entries may become non-addressable
	 * when we increment the current serial number.  Purge them
	 * by stepping header.begin forward to the first addressable
	 * transaction.  Also purge them from the index.
	 */
	if (! JOURNAL_EMPTY(&j->header)) {
		while (! DNS_SERIAL_GT(j->x.pos[1].serial,
				       j->header.begin.serial)) {
			CHECK(journal_next(j, &j->header.begin));
		}
		index_invalidate(j, j->x.pos[1].serial);
	}
#ifdef notyet
	if (DNS_SERIAL_GT(last_dumped_serial, j->x.pos[1].serial)) {
		force_dump(...);
	}
#endif

	/*
	 * Commit the transaction data to stable storage.
	 */
	CHECK(journal_fsync(j));

	/*
	 * Update the transaction header.
	 *
	 * The size passed is the distance between the transaction's start
	 * and end offsets, less the size of the raw transaction header.
	 */
	CHECK(journal_seek(j, j->x.pos[0].offset));
	CHECK(journal_write_xhdr(j, (j->x.pos[1].offset - j->x.pos[0].offset) -
				 sizeof(journal_rawxhdr_t),
				 j->x.pos[0].serial, j->x.pos[1].serial));

	/*
	 * Update the journal header.
	 *
	 * If the journal was empty, this transaction also becomes its
	 * beginning.
	 */
	if (JOURNAL_EMPTY(&j->header)) {
		j->header.begin = j->x.pos[0];
	}
	j->header.end = j->x.pos[1];
	journal_header_encode(&j->header, &rawheader);
	CHECK(journal_seek(j, 0));
	CHECK(journal_write(j, &rawheader, sizeof(rawheader)));

	/*
	 * Update the index.
	 */
	index_add(j, &j->x.pos[0]);

	/*
	 * Convert the index into on-disk format and write
	 * it to disk.
	 */
	CHECK(index_to_disk(j));

	/*
	 * Commit the header to stable storage.
	 */
	CHECK(journal_fsync(j));

	/*
	 * We no longer have a transaction open.
	 */
	j->state = JOURNAL_STATE_WRITE;

	result = ISC_R_SUCCESS;

 failure:
	return (result);
}
1133
1134isc_result_t
1135dns_journal_write_transaction(dns_journal_t *j, dns_diff_t *diff) {
1136	isc_result_t result;
1137	CHECK(dns_diff_sort(diff, ixfr_order));
1138	CHECK(dns_journal_begin_transaction(j));
1139	CHECK(dns_journal_writediff(j, diff));
1140	CHECK(dns_journal_commit(j));
1141	result = ISC_R_SUCCESS;
1142 failure:
1143	return (result);
1144}
1145
1146void
1147dns_journal_destroy(dns_journal_t **journalp) {
1148	dns_journal_t *j = *journalp;
1149	REQUIRE(DNS_JOURNAL_VALID(j));
1150
1151	j->it.result = ISC_R_FAILURE;
1152	dns_name_invalidate(&j->it.name);
1153	dns_decompress_invalidate(&j->it.dctx);
1154	if (j->rawindex != NULL)
1155		isc_mem_put(j->mctx, j->rawindex, j->header.index_size *
1156			    sizeof(journal_rawpos_t));
1157	if (j->index != NULL)
1158		isc_mem_put(j->mctx, j->index, j->header.index_size *
1159			    sizeof(journal_pos_t));
1160	if (j->it.target.base != NULL)
1161		isc_mem_put(j->mctx, j->it.target.base, j->it.target.length);
1162	if (j->it.source.base != NULL)
1163		isc_mem_put(j->mctx, j->it.source.base, j->it.source.length);
1164
1165	if (j->fp != NULL)
1166		(void)isc_stdio_close(j->fp);
1167	j->magic = 0;
1168	isc_mem_put(j->mctx, j, sizeof(*j));
1169	*journalp = NULL;
1170}
1171
1172/*
1173 * Roll the open journal 'j' into the database 'db'.
1174 * A new database version will be created.
1175 */
1176
1177/* XXX Share code with incoming IXFR? */
1178
1179static isc_result_t
1180roll_forward(dns_journal_t *j, dns_db_t *db) {
1181	isc_buffer_t source;		/* Transaction data from disk */
1182	isc_buffer_t target;		/* Ditto after _fromwire check */
1183	isc_uint32_t db_serial;		/* Database SOA serial */
1184	isc_uint32_t end_serial;	/* Last journal SOA serial */
1185	isc_result_t result;
1186	dns_dbversion_t *ver = NULL;
1187	journal_pos_t pos;
1188	dns_diff_t diff;
1189	unsigned int n_soa = 0;
1190	unsigned int n_put = 0;
1191
1192	REQUIRE(DNS_JOURNAL_VALID(j));
1193	REQUIRE(DNS_DB_VALID(db));
1194
1195	dns_diff_init(j->mctx, &diff);
1196
1197	/*
1198	 * Set up empty initial buffers for uncheched and checked
1199	 * wire format transaction data.  They will be reallocated
1200	 * later.
1201	 */
1202	isc_buffer_init(&source, NULL, 0);
1203	isc_buffer_init(&target, NULL, 0);
1204
1205	/*
1206	 * Create the new database version.
1207	 */
1208	CHECK(dns_db_newversion(db, &ver));
1209
1210	/*
1211	 * Get the current database SOA serial number.
1212	 */
1213	CHECK(dns_db_getsoaserial(db, ver, &db_serial));
1214
1215	/*
1216	 * Locate a journal entry for the current database serial.
1217	 */
1218	CHECK(journal_find(j, db_serial, &pos));
1219	/*
1220	 * XXX do more drastic things, like marking zone stale,
1221	 * if this fails?
1222	 */
1223	/*
1224	 * XXXRTH  The zone code should probably mark the zone as bad and
1225	 *         scream loudly into the log if this is a dynamic update
1226	 *	   log reply that failed.
1227	 */
1228
1229	end_serial = dns_journal_last_serial(j);
1230	if (db_serial == end_serial)
1231		CHECK(DNS_R_UPTODATE);
1232
1233	CHECK(dns_journal_iter_init(j, db_serial, end_serial));
1234
1235	for (result = dns_journal_first_rr(j);
1236	     result == ISC_R_SUCCESS;
1237	     result = dns_journal_next_rr(j))
1238	{
1239		dns_name_t *name;
1240		isc_uint32_t ttl;
1241		dns_rdata_t *rdata;
1242		dns_difftuple_t *tuple = NULL;
1243
1244		name = NULL;
1245		rdata = NULL;
1246		dns_journal_current_rr(j, &name, &ttl, &rdata);
1247
1248		if (rdata->type == dns_rdatatype_soa) {
1249			n_soa++;
1250			if (n_soa == 2)
1251				db_serial = j->it.current_serial;
1252		}
1253
1254		if (n_soa == 3)
1255			n_soa = 1;
1256		if (n_soa == 0) {
1257			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1258					 "%s: journal file corrupt: missing "
1259					 "initial SOA", j->filename);
1260			FAIL(ISC_R_UNEXPECTED);
1261		}
1262		CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ?
1263					   DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1264					   name, ttl, rdata, &tuple));
1265		dns_diff_append(&diff, &tuple);
1266
1267		if (++n_put > 100)  {
1268			isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1269				      "%s: applying diff to database (%u)",
1270				      j->filename, db_serial);
1271			(void)dns_diff_print(&diff, NULL);
1272			CHECK(dns_diff_apply(&diff, db, ver));
1273			dns_diff_clear(&diff);
1274			n_put = 0;
1275		}
1276	}
1277	if (result == ISC_R_NOMORE)
1278		result = ISC_R_SUCCESS;
1279	CHECK(result);
1280
1281	if (n_put != 0) {
1282		isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1283			      "%s: applying final diff to database (%u)",
1284			      j->filename, db_serial);
1285		(void)dns_diff_print(&diff, NULL);
1286		CHECK(dns_diff_apply(&diff, db, ver));
1287		dns_diff_clear(&diff);
1288	}
1289
1290 failure:
1291	if (ver != NULL)
1292		dns_db_closeversion(db, &ver, result == ISC_R_SUCCESS ?
1293				    ISC_TRUE : ISC_FALSE);
1294
1295	if (source.base != NULL)
1296		isc_mem_put(j->mctx, source.base, source.length);
1297	if (target.base != NULL)
1298		isc_mem_put(j->mctx, target.base, target.length);
1299
1300	dns_diff_clear(&diff);
1301
1302	return (result);
1303}
1304
1305isc_result_t
1306dns_journal_rollforward(isc_mem_t *mctx, dns_db_t *db, const char *filename) {
1307	dns_journal_t *j;
1308	isc_result_t result;
1309
1310	REQUIRE(DNS_DB_VALID(db));
1311	REQUIRE(filename != NULL);
1312
1313	j = NULL;
1314	result = dns_journal_open(mctx, filename, ISC_FALSE, &j);
1315	if (result == ISC_R_NOTFOUND) {
1316		isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1317			      "no journal file, but that's OK");
1318		return (DNS_R_NOJOURNAL);
1319	}
1320	if (result != ISC_R_SUCCESS)
1321		return (result);
1322	if (JOURNAL_EMPTY(&j->header))
1323		result = DNS_R_UPTODATE;
1324	else
1325		result = roll_forward(j, db);
1326
1327	dns_journal_destroy(&j);
1328
1329	return (result);
1330}
1331
1332isc_result_t
1333dns_journal_print(isc_mem_t *mctx, const char *filename, FILE *file) {
1334	dns_journal_t *j;
1335	isc_buffer_t source;		/* Transaction data from disk */
1336	isc_buffer_t target;		/* Ditto after _fromwire check */
1337	isc_uint32_t start_serial;		/* Database SOA serial */
1338	isc_uint32_t end_serial;	/* Last journal SOA serial */
1339	isc_result_t result;
1340	dns_diff_t diff;
1341	unsigned int n_soa = 0;
1342	unsigned int n_put = 0;
1343
1344	REQUIRE(filename != NULL);
1345
1346	j = NULL;
1347	result = dns_journal_open(mctx, filename, ISC_FALSE, &j);
1348	if (result == ISC_R_NOTFOUND) {
1349		isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file");
1350		return (DNS_R_NOJOURNAL);
1351	}
1352
1353	if (result != ISC_R_SUCCESS) {
1354		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1355			      "journal open failure: %s: %s",
1356			      isc_result_totext(result), j->filename);
1357		return (result);
1358	}
1359
1360	dns_diff_init(j->mctx, &diff);
1361
1362	/*
1363	 * Set up empty initial buffers for uncheched and checked
1364	 * wire format transaction data.  They will be reallocated
1365	 * later.
1366	 */
1367	isc_buffer_init(&source, NULL, 0);
1368	isc_buffer_init(&target, NULL, 0);
1369
1370	start_serial = dns_journal_first_serial(j);
1371	end_serial = dns_journal_last_serial(j);
1372
1373	CHECK(dns_journal_iter_init(j, start_serial, end_serial));
1374
1375	for (result = dns_journal_first_rr(j);
1376	     result == ISC_R_SUCCESS;
1377	     result = dns_journal_next_rr(j))
1378	{
1379		dns_name_t *name;
1380		isc_uint32_t ttl;
1381		dns_rdata_t *rdata;
1382		dns_difftuple_t *tuple = NULL;
1383
1384		name = NULL;
1385		rdata = NULL;
1386		dns_journal_current_rr(j, &name, &ttl, &rdata);
1387
1388		if (rdata->type == dns_rdatatype_soa)
1389			n_soa++;
1390
1391		if (n_soa == 3)
1392			n_soa = 1;
1393		if (n_soa == 0) {
1394		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1395					 "%s: journal file corrupt: missing "
1396					 "initial SOA", j->filename);
1397			FAIL(ISC_R_UNEXPECTED);
1398		}
1399		CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ?
1400					   DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1401					   name, ttl, rdata, &tuple));
1402		dns_diff_append(&diff, &tuple);
1403
1404		if (++n_put > 100)  {
1405			result = dns_diff_print(&diff, file);
1406			dns_diff_clear(&diff);
1407			n_put = 0;
1408			if (result != ISC_R_SUCCESS)
1409				break;
1410		}
1411	}
1412	if (result == ISC_R_NOMORE)
1413		result = ISC_R_SUCCESS;
1414	CHECK(result);
1415
1416	if (n_put != 0) {
1417		result = dns_diff_print(&diff, file);
1418		dns_diff_clear(&diff);
1419	}
1420	goto cleanup;
1421
1422 failure:
1423	isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1424		      "%s: cannot print: journal file corrupt", j->filename);
1425
1426 cleanup:
1427	if (source.base != NULL)
1428		isc_mem_put(j->mctx, source.base, source.length);
1429	if (target.base != NULL)
1430		isc_mem_put(j->mctx, target.base, target.length);
1431
1432	dns_diff_clear(&diff);
1433	dns_journal_destroy(&j);
1434
1435	return (result);
1436}
1437
1438/**************************************************************************/
1439/*
1440 * Miscellaneous accessors.
1441 */
1442isc_uint32_t dns_journal_first_serial(dns_journal_t *j) {
1443	return (j->header.begin.serial);
1444}
1445
1446isc_uint32_t dns_journal_last_serial(dns_journal_t *j) {
1447	return (j->header.end.serial);
1448}
1449
1450/**************************************************************************/
/*
 * Iteration support.
 *
 * When serving an outgoing IXFR, we transmit a part of the journal starting
 * at the serial number in the IXFR request and ending at the serial
 * number that is current when the IXFR request arrives.  The ending
 * serial number is not necessarily at the end of the journal:
 * the journal may grow while the IXFR is in progress, but we stop
 * when we reach the serial number that was current when the IXFR started.
 */
1461
1462static isc_result_t read_one_rr(dns_journal_t *j);
1463
/*
 * Make sure the buffer 'b' has at least 'size' bytes
 * allocated, and clear it.
 *
 * Requires:
 *	Either b->base is NULL, or it points to b->length bytes of memory
 *	previously allocated by isc_mem_get().
 */
1472
1473static isc_result_t
1474size_buffer(isc_mem_t *mctx, isc_buffer_t *b, unsigned size) {
1475	if (b->length < size) {
1476		void *mem = isc_mem_get(mctx, size);
1477		if (mem == NULL)
1478			return (ISC_R_NOMEMORY);
1479		if (b->base != NULL)
1480			isc_mem_put(mctx, b->base, b->length);
1481		b->base = mem;
1482		b->length = size;
1483	}
1484	isc_buffer_clear(b);
1485	return (ISC_R_SUCCESS);
1486}
1487
1488isc_result_t
1489dns_journal_iter_init(dns_journal_t *j,
1490		      isc_uint32_t begin_serial, isc_uint32_t end_serial)
1491{
1492	isc_result_t result;
1493
1494	CHECK(journal_find(j, begin_serial, &j->it.bpos));
1495	INSIST(j->it.bpos.serial == begin_serial);
1496
1497	CHECK(journal_find(j, end_serial, &j->it.epos));
1498	INSIST(j->it.epos.serial == end_serial);
1499
1500	result = ISC_R_SUCCESS;
1501 failure:
1502	j->it.result = result;
1503	return (j->it.result);
1504}
1505
1506
1507isc_result_t
1508dns_journal_first_rr(dns_journal_t *j) {
1509	isc_result_t result;
1510
1511	/*
1512	 * Seek to the beginning of the first transaction we are
1513	 * interested in.
1514	 */
1515	CHECK(journal_seek(j, j->it.bpos.offset));
1516	j->it.current_serial = j->it.bpos.serial;
1517
1518	j->it.xsize = 0;  /* We have no transaction data yet... */
1519	j->it.xpos = 0;	  /* ...and haven't used any of it. */
1520
1521	return (read_one_rr(j));
1522
1523 failure:
1524	return (result);
1525}
1526
1527static isc_result_t
1528read_one_rr(dns_journal_t *j) {
1529	isc_result_t result;
1530
1531	dns_rdatatype_t rdtype;
1532	dns_rdataclass_t rdclass;
1533	unsigned int rdlen;
1534	isc_uint32_t ttl;
1535	journal_xhdr_t xhdr;
1536	journal_rrhdr_t rrhdr;
1537
1538	INSIST(j->offset <= j->it.epos.offset);
1539	if (j->offset == j->it.epos.offset)
1540		return (ISC_R_NOMORE);
1541	if (j->it.xpos == j->it.xsize) {
1542		/*
1543		 * We are at a transaction boundary.
1544		 * Read another transaction header.
1545		 */
1546		CHECK(journal_read_xhdr(j, &xhdr));
1547		if (xhdr.size == 0) {
1548			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1549				      "%s: journal corrupt: empty transaction",
1550				      j->filename);
1551			FAIL(ISC_R_UNEXPECTED);
1552		}
1553		if (xhdr.serial0 != j->it.current_serial) {
1554			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1555					 "%s: journal file corrupt: "
1556					 "expected serial %u, got %u",
1557					 j->filename,
1558					 j->it.current_serial, xhdr.serial0);
1559			FAIL(ISC_R_UNEXPECTED);
1560		}
1561		j->it.xsize = xhdr.size;
1562		j->it.xpos = 0;
1563	}
1564	/*
1565	 * Read an RR.
1566	 */
1567	result = journal_read_rrhdr(j, &rrhdr);
1568	/*
1569	 * Perform a sanity check on the journal RR size.
1570	 * The smallest possible RR has a 1-byte owner name
1571	 * and a 10-byte header.  The largest possible
1572	 * RR has 65535 bytes of data, a header, and a maximum-
1573	 * size owner name, well below 70 k total.
1574	 */
1575	if (rrhdr.size < 1+10 || rrhdr.size > 70000) {
1576		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1577				 "%s: journal corrupt: impossible RR size "
1578				 "(%d bytes)", j->filename, rrhdr.size);
1579		FAIL(ISC_R_UNEXPECTED);
1580	}
1581
1582	CHECK(size_buffer(j->mctx, &j->it.source, rrhdr.size));
1583	CHECK(journal_read(j, j->it.source.base, rrhdr.size));
1584	isc_buffer_add(&j->it.source, rrhdr.size);
1585
1586	/*
1587	 * The target buffer is made the same size
1588	 * as the source buffer, with the assumption that when
1589	 * no compression in present, the output of dns_*_fromwire()
1590	 * is no larger than the input.
1591	 */
1592	CHECK(size_buffer(j->mctx, &j->it.target, rrhdr.size));
1593
1594	/*
1595	 * Parse the owner name.  We don't know where it
1596	 * ends yet, so we make the entire "remaining"
1597	 * part of the buffer "active".
1598	 */
1599	isc_buffer_setactive(&j->it.source,
1600			     j->it.source.used - j->it.source.current);
1601	CHECK(dns_name_fromwire(&j->it.name, &j->it.source,
1602				&j->it.dctx, 0, &j->it.target));
1603
1604	/*
1605	 * Check that the RR header is there, and parse it.
1606	 */
1607	if (isc_buffer_remaininglength(&j->it.source) < 10)
1608		FAIL(DNS_R_FORMERR);
1609
1610	rdtype = isc_buffer_getuint16(&j->it.source);
1611	rdclass = isc_buffer_getuint16(&j->it.source);
1612	ttl = isc_buffer_getuint32(&j->it.source);
1613	rdlen = isc_buffer_getuint16(&j->it.source);
1614
1615	/*
1616	 * Parse the rdata.
1617	 */
1618	isc_buffer_setactive(&j->it.source, rdlen);
1619	dns_rdata_reset(&j->it.rdata);
1620	CHECK(dns_rdata_fromwire(&j->it.rdata, rdclass,
1621				 rdtype, &j->it.source, &j->it.dctx,
1622				 0, &j->it.target));
1623	j->it.ttl = ttl;
1624
1625	j->it.xpos += sizeof(journal_rawrrhdr_t) + rrhdr.size;
1626	if (rdtype == dns_rdatatype_soa) {
1627		/* XXX could do additional consistency checks here */
1628		j->it.current_serial = dns_soa_getserial(&j->it.rdata);
1629	}
1630
1631	result = ISC_R_SUCCESS;
1632
1633 failure:
1634	j->it.result = result;
1635	return (result);
1636}
1637
1638isc_result_t
1639dns_journal_next_rr(dns_journal_t *j) {
1640	j->it.result = read_one_rr(j);
1641	return (j->it.result);
1642}
1643
1644void
1645dns_journal_current_rr(dns_journal_t *j, dns_name_t **name, isc_uint32_t *ttl,
1646		   dns_rdata_t **rdata)
1647{
1648	REQUIRE(j->it.result == ISC_R_SUCCESS);
1649	*name = &j->it.name;
1650	*ttl = j->it.ttl;
1651	*rdata = &j->it.rdata;
1652}
1653
1654/**************************************************************************/
1655/*
1656 * Generating diffs from databases
1657 */
1658
/*
 * Construct a diff containing all the RRs at the current name of the
 * database iterator 'dbit' in database 'db', version 'ver'.
 * Set '*name' to the current name, and append the diff to 'diff'.
 * All new tuples will have the operation 'op'.
 *
 * Requires: 'name' must have a buffer large enough to hold the name.
 * Typically, a dns_fixedname_t would be used.
 */
1667 */
1668static isc_result_t
1669get_name_diff(dns_db_t *db, dns_dbversion_t *ver, isc_stdtime_t now,
1670	      dns_dbiterator_t *dbit, dns_name_t *name, dns_diffop_t op,
1671	      dns_diff_t *diff)
1672{
1673	isc_result_t result;
1674	dns_dbnode_t *node = NULL;
1675	dns_rdatasetiter_t *rdsiter = NULL;
1676	dns_difftuple_t *tuple = NULL;
1677
1678	result = dns_dbiterator_current(dbit, &node, name);
1679	if (result != ISC_R_SUCCESS)
1680		return (result);
1681
1682	result = dns_db_allrdatasets(db, node, ver, now, &rdsiter);
1683	if (result != ISC_R_SUCCESS)
1684		goto cleanup_node;
1685
1686	for (result = dns_rdatasetiter_first(rdsiter);
1687	     result == ISC_R_SUCCESS;
1688	     result = dns_rdatasetiter_next(rdsiter))
1689	{
1690		dns_rdataset_t rdataset;
1691
1692		dns_rdataset_init(&rdataset);
1693		dns_rdatasetiter_current(rdsiter, &rdataset);
1694
1695		for (result = dns_rdataset_first(&rdataset);
1696		     result == ISC_R_SUCCESS;
1697		     result = dns_rdataset_next(&rdataset))
1698		{
1699			dns_rdata_t rdata = DNS_RDATA_INIT;
1700			dns_rdataset_current(&rdataset, &rdata);
1701			result = dns_difftuple_create(diff->mctx, op, name,
1702						      rdataset.ttl, &rdata,
1703						      &tuple);
1704			if (result != ISC_R_SUCCESS) {
1705				dns_rdataset_disassociate(&rdataset);
1706				goto cleanup_iterator;
1707			}
1708			dns_diff_append(diff, &tuple);
1709		}
1710		dns_rdataset_disassociate(&rdataset);
1711		if (result != ISC_R_NOMORE)
1712			goto cleanup_iterator;
1713	}
1714	if (result != ISC_R_NOMORE)
1715		goto cleanup_iterator;
1716
1717	result = ISC_R_SUCCESS;
1718
1719 cleanup_iterator:
1720	dns_rdatasetiter_destroy(&rdsiter);
1721
1722 cleanup_node:
1723	dns_db_detachnode(db, &node);
1724
1725	return (result);
1726}
1727
1728/*
1729 * Comparison function for use by dns_diff_subtract when sorting
1730 * the diffs to be subtracted.  The sort keys are the rdata type
1731 * and the rdata itself.  The owner name is ignored, because
1732 * it is known to be the same for all tuples.
1733 */
1734static int
1735rdata_order(const void *av, const void *bv) {
1736	dns_difftuple_t const * const *ap = av;
1737	dns_difftuple_t const * const *bp = bv;
1738	dns_difftuple_t const *a = *ap;
1739	dns_difftuple_t const *b = *bp;
1740	int r;
1741	r = (b->rdata.type - a->rdata.type);
1742	if (r != 0)
1743		return (r);
1744	r = dns_rdata_compare(&a->rdata, &b->rdata);
1745	return (r);
1746}
1747
1748static isc_result_t
1749dns_diff_subtract(dns_diff_t diff[2], dns_diff_t *r) {
1750	isc_result_t result;
1751	dns_difftuple_t *p[2];
1752	int i, t;
1753	CHECK(dns_diff_sort(&diff[0], rdata_order));
1754	CHECK(dns_diff_sort(&diff[1], rdata_order));
1755
1756	for (;;) {
1757		p[0] = ISC_LIST_HEAD(diff[0].tuples);
1758		p[1] = ISC_LIST_HEAD(diff[1].tuples);
1759		if (p[0] == NULL && p[1] == NULL)
1760			break;
1761
1762		for (i = 0; i < 2; i++)
1763			if (p[!i] == NULL) {
1764				ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1765				ISC_LIST_APPEND(r->tuples, p[i], link);
1766				goto next;
1767			}
1768		t = rdata_order(&p[0], &p[1]);
1769		if (t < 0) {
1770			ISC_LIST_UNLINK(diff[0].tuples, p[0], link);
1771			ISC_LIST_APPEND(r->tuples, p[0], link);
1772			goto next;
1773		}
1774		if (t > 0) {
1775			ISC_LIST_UNLINK(diff[1].tuples, p[1], link);
1776			ISC_LIST_APPEND(r->tuples, p[1], link);
1777			goto next;
1778		}
1779		INSIST(t == 0);
1780		/*
1781		 * Identical RRs in both databases; skip them both.
1782		 */
1783		for (i = 0; i < 2; i++) {
1784			ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1785			dns_difftuple_free(&p[i]);
1786		}
1787	next: ;
1788	}
1789	result = ISC_R_SUCCESS;
1790 failure:
1791	return (result);
1792}
1793
1794/*
1795 * Compare the databases 'dba' and 'dbb' and generate a journal
1796 * entry containing the changes to make 'dba' from 'dbb' (note
1797 * the order).  This journal entry will consist of a single,
1798 * possibly very large transaction.
1799 */
1800
1801isc_result_t
1802dns_db_diff(isc_mem_t *mctx,
1803	    dns_db_t *dba, dns_dbversion_t *dbvera,
1804	    dns_db_t *dbb, dns_dbversion_t *dbverb,
1805	    const char *journal_filename)
1806{
1807	dns_db_t *db[2];
1808	dns_dbversion_t *ver[2];
1809	dns_dbiterator_t *dbit[2] = { NULL, NULL };
1810	isc_boolean_t have[2] = { ISC_FALSE, ISC_FALSE };
1811	dns_fixedname_t fixname[2];
1812	isc_result_t result, itresult[2];
1813	dns_diff_t diff[2], resultdiff;
1814	int i, t;
1815	dns_journal_t *journal = NULL;
1816
1817	db[0] = dba, db[1] = dbb;
1818	ver[0] = dbvera, ver[1] = dbverb;
1819
1820	dns_diff_init(mctx, &diff[0]);
1821	dns_diff_init(mctx, &diff[1]);
1822	dns_diff_init(mctx, &resultdiff);
1823
1824	dns_fixedname_init(&fixname[0]);
1825	dns_fixedname_init(&fixname[1]);
1826
1827	result = dns_journal_open(mctx, journal_filename, ISC_TRUE, &journal);
1828	if (result != ISC_R_SUCCESS)
1829		return (result);
1830
1831	result = dns_db_createiterator(db[0], ISC_FALSE, &dbit[0]);
1832	if (result != ISC_R_SUCCESS)
1833		goto cleanup_journal;
1834	result = dns_db_createiterator(db[1], ISC_FALSE, &dbit[1]);
1835	if (result != ISC_R_SUCCESS)
1836		goto cleanup_interator0;
1837
1838	itresult[0] = dns_dbiterator_first(dbit[0]);
1839	itresult[1] = dns_dbiterator_first(dbit[1]);
1840
1841	for (;;) {
1842		for (i = 0; i < 2; i++) {
1843			if (! have[i] && itresult[i] == ISC_R_SUCCESS) {
1844				CHECK(get_name_diff(db[i], ver[i], 0, dbit[i],
1845					    dns_fixedname_name(&fixname[i]),
1846					    i == 0 ?
1847					    DNS_DIFFOP_ADD :
1848					    DNS_DIFFOP_DEL,
1849					    &diff[i]));
1850				itresult[i] = dns_dbiterator_next(dbit[i]);
1851				have[i] = ISC_TRUE;
1852			}
1853		}
1854
1855		if (! have[0] && ! have[1]) {
1856			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1857			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1858			break;
1859		}
1860
1861		for (i = 0; i < 2; i++) {
1862			if (! have[!i]) {
1863				ISC_LIST_APPENDLIST(resultdiff.tuples,
1864						    diff[i].tuples, link);
1865				INSIST(ISC_LIST_EMPTY(diff[i].tuples));
1866				have[i] = ISC_FALSE;
1867				goto next;
1868			}
1869		}
1870
1871		t = dns_name_compare(dns_fixedname_name(&fixname[0]),
1872				     dns_fixedname_name(&fixname[1]));
1873		if (t < 0) {
1874			ISC_LIST_APPENDLIST(resultdiff.tuples,
1875					    diff[0].tuples, link);
1876			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1877			have[0] = ISC_FALSE;
1878			continue;
1879		}
1880		if (t > 0) {
1881			ISC_LIST_APPENDLIST(resultdiff.tuples,
1882					    diff[1].tuples, link);
1883			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1884			have[1] = ISC_FALSE;
1885			continue;
1886		}
1887		INSIST(t == 0);
1888		CHECK(dns_diff_subtract(diff, &resultdiff));
1889		INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1890		INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1891		have[0] = have[1] = ISC_FALSE;
1892	next: ;
1893	}
1894	if (itresult[0] != ISC_R_NOMORE)
1895		FAIL(itresult[0]);
1896	if (itresult[1] != ISC_R_NOMORE)
1897		FAIL(itresult[1]);
1898
1899	if (ISC_LIST_EMPTY(resultdiff.tuples)) {
1900		isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no changes");
1901	} else {
1902		CHECK(dns_journal_write_transaction(journal, &resultdiff));
1903	}
1904	INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1905	INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1906
1907 failure:
1908	dns_diff_clear(&resultdiff);
1909	dns_dbiterator_destroy(&dbit[1]);
1910 cleanup_interator0:
1911	dns_dbiterator_destroy(&dbit[0]);
1912 cleanup_journal:
1913	dns_journal_destroy(&journal);
1914	return (result);
1915}
1916
/*
 * Compact the journal file 'filename' by discarding transactions from
 * its beginning and moving the remaining data down so that it directly
 * follows the index, then truncating the file.
 *
 * 'serial' must lie within the journal's serial range; the transactions
 * kept never start later than 'serial'.  'target_size' is the desired
 * file size in bytes; nothing is done if the journal's end offset is
 * already below it.
 *
 * Returns:
 *	ISC_R_SUCCESS	the journal was empty, already small enough, or
 *			has been compacted;
 *	ISC_R_RANGE	'serial' is outside the journal's serial range;
 *	ISC_R_NOSPACE	the data to keep is larger than the gap between
 *			the end of the index and that data;
 *	ISC_R_NOMEMORY	the copy buffer could not be allocated;
 *	otherwise the result of whichever CHECK()ed helper call failed.
 */
isc_result_t
dns_journal_compact(isc_mem_t *mctx, char *filename, isc_uint32_t serial,
		    isc_uint32_t target_size)
{
	unsigned int i;
	journal_pos_t best_guess;
	journal_pos_t current_pos;
	dns_journal_t *j = NULL;
	journal_rawheader_t rawheader;
	unsigned int copy_length;
	unsigned int len;
	char *buf = NULL;
	unsigned int size = 0;
	isc_result_t result;
	unsigned int indexend;

	CHECK(journal_open(mctx, filename, ISC_TRUE, ISC_FALSE, &j));

	if (JOURNAL_EMPTY(&j->header)) {
		dns_journal_destroy(&j);
		return (ISC_R_SUCCESS);
	}

	if (DNS_SERIAL_GT(j->header.begin.serial, serial) ||
	    DNS_SERIAL_GT(serial, j->header.end.serial)) {
		dns_journal_destroy(&j);
		return (ISC_R_RANGE);
	}

	/*
	 * Cope with very small target sizes.
	 *
	 * 'indexend' is the file offset just past the journal header and
	 * the index.
	 */
	indexend = sizeof(journal_rawheader_t) +
		   j->header.index_size * sizeof(journal_rawpos_t);
	if (target_size < indexend * 2)
		target_size = target_size/2 + indexend;

	/*
	 * See if there is any work to do.
	 */
	if ((isc_uint32_t) j->header.end.offset < target_size) {
		dns_journal_destroy(&j);
		return (ISC_R_SUCCESS);
	}

	/*
	 * Remove overhead so space test below can succeed.
	 */
	if (target_size >= indexend)
		target_size -= indexend;

	/*
	 * Find if we can create enough free space.
	 *
	 * First pass: among the indexed positions not after 'serial',
	 * pick the latest one that would still leave at least
	 * target_size / 2 bytes of journal data.
	 */
	best_guess = j->header.begin;
	for (i = 0; i < j->header.index_size; i++) {
		if (POS_VALID(j->index[i]) &&
		    DNS_SERIAL_GE(serial, j->index[i].serial) &&
		    ((isc_uint32_t)(j->header.end.offset - j->index[i].offset)
		     >= target_size / 2) &&
		    j->index[i].offset > best_guess.offset)
			best_guess = j->index[i];
	}

	/*
	 * Second pass: step forward one transaction at a time from the
	 * indexed position for as long as the same conditions hold.
	 */
	current_pos = best_guess;
	while (current_pos.serial != serial) {
		CHECK(journal_next(j, &current_pos));
		if (current_pos.serial == j->header.end.serial)
			break;

		if (DNS_SERIAL_GE(serial, current_pos.serial) &&
		   ((isc_uint32_t)(j->header.end.offset - current_pos.offset)
		     >= (target_size / 2)) &&
		    current_pos.offset > best_guess.offset)
			best_guess = current_pos;
		else
			break;
	}

	INSIST(best_guess.serial != j->header.end.serial);
	if (best_guess.serial != serial)
		CHECK(journal_next(j, &best_guess));

	/*
	 * Enough space to proceed?
	 *
	 * The data to keep must be no larger than the gap between the
	 * end of the index and the start of that data.
	 */
	if ((isc_uint32_t) (j->header.end.offset - best_guess.offset) >
	     (isc_uint32_t) (best_guess.offset - indexend)) {
		dns_journal_destroy(&j);
		return (ISC_R_NOSPACE);
	}

	copy_length = j->header.end.offset - best_guess.offset;

	/*
	 * Invalidate entire index, will be rebuilt at end.
	 */
	for (i = 0; i < j->header.index_size; i++) {
		if (POS_VALID(j->index[i]))
			POS_INVALIDATE(j->index[i]);
	}

	/*
	 * Convert the index into on-disk format and write
	 * it to disk.
	 */
	CHECK(index_to_disk(j));
	CHECK(journal_fsync(j));

	/*
	 * Update the journal header.
	 *
	 * With nothing left to keep, the header is reset to all zeroes;
	 * otherwise the journal now begins at 'best_guess'.
	 */
	if (copy_length == 0) {
		j->header.begin.serial = 0;
		j->header.end.serial = 0;
		j->header.begin.offset = 0;
		j->header.end.offset = 0;
	} else {
		j->header.begin = best_guess;
	}
	journal_header_encode(&j->header, &rawheader);
	CHECK(journal_seek(j, 0));
	CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
	CHECK(journal_fsync(j));

	if (copy_length != 0) {
		/*
		 * Copy best_guess to end into space just freed.
		 *
		 * The copy is done in chunks of at most 64 kB.
		 */
		size = 64*1024;
		if (copy_length < size)
			size = copy_length;
		buf = isc_mem_get(mctx, size);
		if (buf == NULL) {
			result = ISC_R_NOMEMORY;
			goto failure;
		}

		for (i = 0; i < copy_length; i += size) {
			len = (copy_length - i) > size ? size :
							 (copy_length - i);
			CHECK(journal_seek(j, best_guess.offset + i));
			CHECK(journal_read(j, buf, len));
			CHECK(journal_seek(j, indexend + i));
			CHECK(journal_write(j, buf, len));
		}

		CHECK(journal_fsync(j));

		/*
		 * Compute new header.
		 */
		j->header.begin.offset = indexend;
		j->header.end.offset = indexend + copy_length;
		/*
		 * Update the journal header.
		 */
		journal_header_encode(&j->header, &rawheader);
		CHECK(journal_seek(j, 0));
		CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
		CHECK(journal_fsync(j));

		/*
		 * Build new index.
		 */
		current_pos = j->header.begin;
		while (current_pos.serial != j->header.end.serial) {
			index_add(j, &current_pos);
			CHECK(journal_next(j, &current_pos));
		}

		/*
		 * Write index.
		 */
		CHECK(index_to_disk(j));
		CHECK(journal_fsync(j));

		/*
		 * From here on 'indexend' is the new end of the file.
		 */
		indexend = j->header.end.offset;
	}
	/*
	 * Destroy the journal object before truncating the file by name.
	 * The truncation result is deliberately ignored.
	 */
	dns_journal_destroy(&j);
	(void)isc_file_truncate(filename, (isc_offset_t)indexend);
	result = ISC_R_SUCCESS;

 failure:
	if (buf != NULL)
		isc_mem_put(mctx, buf, size);
	if (j != NULL)
		dns_journal_destroy(&j);
	return (result);
}
2107
2108static isc_result_t
2109index_to_disk(dns_journal_t *j) {
2110	isc_result_t result = ISC_R_SUCCESS;
2111
2112	if (j->header.index_size != 0) {
2113		unsigned int i;
2114		unsigned char *p;
2115		unsigned int rawbytes;
2116
2117		rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
2118
2119		p = j->rawindex;
2120		for (i = 0; i < j->header.index_size; i++) {
2121			encode_uint32(j->index[i].serial, p);
2122			p += 4;
2123			encode_uint32(j->index[i].offset, p);
2124			p += 4;
2125		}
2126		INSIST(p == j->rawindex + rawbytes);
2127
2128		CHECK(journal_seek(j, sizeof(journal_rawheader_t)));
2129		CHECK(journal_write(j, j->rawindex, rawbytes));
2130	}
2131failure:
2132	return (result);
2133}
2134