1163837Spjd/*-
2163837Spjd * Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
3163837Spjd * All rights reserved.
4163837Spjd *
5163837Spjd * Redistribution and use in source and binary forms, with or without
6163837Spjd * modification, are permitted provided that the following conditions
7163837Spjd * are met:
8163837Spjd * 1. Redistributions of source code must retain the above copyright
9163837Spjd *    notice, this list of conditions and the following disclaimer.
10163837Spjd * 2. Redistributions in binary form must reproduce the above copyright
11163837Spjd *    notice, this list of conditions and the following disclaimer in the
12163837Spjd *    documentation and/or other materials provided with the distribution.
13163837Spjd *
14163837Spjd * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15163837Spjd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16163837Spjd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17163837Spjd * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18163837Spjd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19163837Spjd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20163837Spjd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21163837Spjd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22163837Spjd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23163837Spjd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24163837Spjd * SUCH DAMAGE.
25163837Spjd *
26163837Spjd * $FreeBSD$
27163837Spjd */
28163837Spjd
29163837Spjd#ifndef	_G_JOURNAL_H_
30163837Spjd#define	_G_JOURNAL_H_
31163837Spjd
32163837Spjd#include <sys/endian.h>
33163837Spjd#include <sys/md5.h>
34163837Spjd#ifdef _KERNEL
35163837Spjd#include <sys/bio.h>
36163837Spjd#endif
37163837Spjd
38163837Spjd#define	G_JOURNAL_CLASS_NAME	"JOURNAL"
39163837Spjd
40163837Spjd#define	G_JOURNAL_MAGIC		"GEOM::JOURNAL"
41163837Spjd/*
42163837Spjd * Version history:
43163837Spjd * 0 - Initial version number.
44163837Spjd */
45163837Spjd#define	G_JOURNAL_VERSION	0
46163837Spjd
47163837Spjd#ifdef _KERNEL
48163837Spjdextern int g_journal_debug;
49163837Spjd
/*
 * Print a debug message when g_journal_debug is at least `lvl'.
 *
 * The output is "GEOM_JOURNAL", then "[lvl]" whenever g_journal_debug is
 * greater than zero, then ": ", the caller's printf-style message and a
 * newline.  The variadic arguments are a format string followed by its
 * arguments; they are not evaluated when the message is suppressed.
 *
 * The level is parenthesized and converted to unsigned int before being
 * handed to the "%u" conversion so that the argument type matches the
 * format regardless of the expression the caller passes.
 */
#define	GJ_DEBUG(lvl, ...)	do {					\
	if (g_journal_debug >= (lvl)) {					\
		printf("GEOM_JOURNAL");					\
		if (g_journal_debug > 0)				\
			printf("[%u]", (unsigned int)(lvl));		\
		printf(": ");						\
		printf(__VA_ARGS__);					\
		printf("\n");						\
	}								\
} while (0)
/*
 * Like a debug message, but followed by a description of the bio `bp'.
 *
 * When g_journal_debug is at least `lvl', prints "GEOM_JOURNAL", then
 * "[lvl]" whenever g_journal_debug is greater than zero, then ": ", the
 * caller's printf-style message, a space, the output of g_print_bio(bp)
 * and a newline.  Nothing is evaluated when the message is suppressed.
 *
 * The level is parenthesized and converted to unsigned int so that the
 * argument matches the "%u" conversion.
 */
#define	GJ_LOGREQ(lvl, bp, ...)	do {					\
	if (g_journal_debug >= (lvl)) {					\
		printf("GEOM_JOURNAL");					\
		if (g_journal_debug > 0)				\
			printf("[%u]", (unsigned int)(lvl));		\
		printf(": ");						\
		printf(__VA_ARGS__);					\
		printf(" ");						\
		g_print_bio(bp);					\
		printf("\n");						\
	}								\
} while (0)
72163837Spjd
/*
 * Evaluates to non-zero when the current journal offset is exactly one
 * journal-provider sector past the active journal's start offset and
 * sc_current_count is zero.
 */
#define	JEMPTY(sc)							\
	(((sc)->sc_journal_offset - (sc)->sc_jprovider->sectorsize ==	\
	    (sc)->sc_active.jj_offset) &&				\
	    ((sc)->sc_current_count == 0))
77163837Spjd
78163837Spjd#define	GJ_BIO_REGULAR		0x00
79163837Spjd#define	GJ_BIO_READ		0x01
80163837Spjd#define	GJ_BIO_JOURNAL		0x02
81163837Spjd#define	GJ_BIO_COPY		0x03
82163837Spjd#define	GJ_BIO_MASK		0x0f
83163837Spjd
84163837Spjd#if 0
85163837Spjd#define	GJF_BIO_DONT_FREE	0x10
86163837Spjd#define	GJF_BIO_MASK		0xf0
87163837Spjd#endif
88163837Spjd
89163837Spjd#define	GJF_DEVICE_HARDCODED		0x0001
90163837Spjd#define	GJF_DEVICE_DESTROY		0x0010
91163837Spjd#define	GJF_DEVICE_SWITCH		0x0020
92163837Spjd#define	GJF_DEVICE_BEFORE_SWITCH	0x0040
93163837Spjd#define	GJF_DEVICE_CLEAN		0x0080
94163837Spjd#define	GJF_DEVICE_CHECKSUM		0x0100
95163837Spjd
96163837Spjd#define	GJ_HARD_LIMIT		64
97163837Spjd
98163837Spjd/*
99163837Spjd * We keep pointers to journaled data in bio structure and because we
100163837Spjd * need to store two off_t values (offset in data provider and offset in
101163837Spjd * journal), we have to borrow bio_completed field for this.
102163837Spjd */
103163837Spjd#define	bio_joffset	bio_completed
104163837Spjd/*
105163837Spjd * Use bio_caller1 field as a pointer in queue.
106163837Spjd */
107163837Spjd#define	bio_next	bio_caller1
108163837Spjd
109163837Spjd/*
 * There are two such structures maintained inside each journaled device.
 * One describes the active part of the journal, where recent requests are
 * stored.  The second describes the last consistent part of the journal
 * with requests that are copied to the destination provider.
114163837Spjd */
/* Description of one part of the journal. */
struct g_journal_journal {
	/* Cached journal entries. */
	struct bio	*jj_queue;
	/* Journal's start offset. */
	off_t		 jj_offset;
};
119163837Spjd
120163837Spjdstruct g_journal_softc {
121163837Spjd	uint32_t	 sc_id;
122163837Spjd	uint8_t		 sc_type;
123163837Spjd	uint8_t		 sc_orig_type;
124163837Spjd	struct g_geom	*sc_geom;
125163837Spjd	u_int		 sc_flags;
126163837Spjd	struct mtx	 sc_mtx;
127163837Spjd	off_t		 sc_mediasize;
128163837Spjd	u_int		 sc_sectorsize;
129163837Spjd#define	GJ_FLUSH_DATA		0x01
130163837Spjd#define	GJ_FLUSH_JOURNAL	0x02
131163837Spjd	u_int		 sc_bio_flush;
132163837Spjd
133163837Spjd	uint32_t	 sc_journal_id;
134163837Spjd	uint32_t	 sc_journal_next_id;
135163837Spjd	int		 sc_journal_copying;
136163837Spjd	off_t		 sc_journal_offset;
137163837Spjd	off_t		 sc_journal_previous_id;
138163837Spjd
139163837Spjd	struct bio_queue_head sc_back_queue;
140163837Spjd	struct bio_queue_head sc_regular_queue;
141163837Spjd
142163837Spjd	struct bio_queue_head sc_delayed_queue;
143163837Spjd	int		 sc_delayed_count;
144163837Spjd
145163837Spjd	struct bio	*sc_current_queue;
146163837Spjd	int		 sc_current_count;
147163837Spjd
148163837Spjd	struct bio	*sc_flush_queue;
149163837Spjd	int		 sc_flush_count;
150163837Spjd	int		 sc_flush_in_progress;
151163837Spjd
152163837Spjd	struct bio	*sc_copy_queue;
153163837Spjd	int		 sc_copy_in_progress;
154163837Spjd
155163837Spjd	struct g_consumer *sc_dconsumer;
156163837Spjd	struct g_consumer *sc_jconsumer;
157163837Spjd
158163837Spjd	struct g_journal_journal sc_inactive;
159163837Spjd	struct g_journal_journal sc_active;
160163837Spjd
161163837Spjd	off_t		 sc_jstart;	/* Journal space start offset. */
162163837Spjd	off_t		 sc_jend;	/* Journal space end offset. */
163163837Spjd
164163837Spjd	struct callout	 sc_callout;
165163837Spjd	struct proc	*sc_worker;
166185693Strasz
167185693Strasz	struct root_hold_token *sc_rootmount;
168163837Spjd};
169163837Spjd#define	sc_dprovider	sc_dconsumer->provider
170163837Spjd#define	sc_jprovider	sc_jconsumer->provider
171163837Spjd#define	sc_name		sc_dprovider->name
172163837Spjd
/*
 * Push `bp' onto the front of the singly-linked queue whose first element
 * is `head'.  `head' must be an lvalue; it is updated to point at `bp'.
 */
#define	GJQ_INSERT_HEAD(head, bp)	do {				\
	(bp)->bio_next = (head);	/* old first element follows */	\
	(head) = (bp);			/* bp becomes the first one */	\
} while (0)
/*
 * Link `bp' into the queue directly behind `pbp'.  When `pbp' is NULL the
 * element is inserted at the head of the queue instead.
 */
#define	GJQ_INSERT_AFTER(head, bp, pbp)	do {				\
	if ((pbp) != NULL) {						\
		(bp)->bio_next = (pbp)->bio_next;			\
		(pbp)->bio_next = (bp);					\
	} else {							\
		GJQ_INSERT_HEAD(head, bp);				\
	}								\
} while (0)
185163837Spjd#define	GJQ_FIRST(head)	(head)
/*
 * Unlink `bp' from the queue starting at `head' and clear its link.
 *
 * If `bp' is the first element, `head' (an lvalue) is advanced to the next
 * one.  Otherwise the queue is walked to find the element preceding `bp';
 * the KASSERTs check that such an element was found.
 */
#define	GJQ_REMOVE(head, bp)	do {					\
	struct bio *_gjq_prev;						\
									\
	if ((head) == (bp)) {						\
		(head) = (bp)->bio_next;				\
	} else {							\
		_gjq_prev = (head);					\
		while (_gjq_prev->bio_next != NULL &&			\
		    _gjq_prev->bio_next != (bp))			\
			_gjq_prev = _gjq_prev->bio_next;		\
		KASSERT(_gjq_prev->bio_next != NULL,			\
		    ("NULL bio_next"));					\
		KASSERT(_gjq_prev->bio_next == (bp),			\
		    ("bio_next != bp"));				\
		_gjq_prev->bio_next = (bp)->bio_next;			\
	}								\
	(bp)->bio_next = NULL;						\
} while (0)
/*
 * Iterate over the queue starting at `head', assigning each element in turn
 * to `bp'.  The link of the current element is read after the loop body
 * runs.
 */
#define	GJQ_FOREACH(head, bp)						\
	for ((bp) = (head);						\
	    (bp) != NULL;						\
	    (bp) = (bp)->bio_next)
205163837Spjd
/* Magic string stored, with its terminating NUL, in jh_magic. */
#define	GJ_HEADER_MAGIC	"GJHDR"

/*
 * Journal header: the magic followed by this journal's ID and the ID of the
 * next journal.  The structure is packed.
 */
struct g_journal_header {
	char		jh_magic[sizeof(GJ_HEADER_MAGIC)];
	uint32_t	jh_journal_id;
	uint32_t	jh_journal_next_id;
} __packed;
213163837Spjd
/*
 * One entry of a journal record header: three 64-bit values, packed.
 * NOTE(review): presumably je_joffset is the offset in the journal and
 * je_offset the offset in the data provider - confirm against the users.
 */
struct g_journal_entry {
	uint64_t	je_joffset;
	uint64_t	je_offset;
	uint64_t	je_length;
} __packed;
219163837Spjd
220163837Spjd#define	GJ_RECORD_HEADER_MAGIC		"GJRHDR"
221163837Spjd#define	GJ_RECORD_HEADER_NENTRIES	(20)
222163837Spjd#define	GJ_RECORD_MAX_SIZE(sc)	\
223163837Spjd	((sc)->sc_jprovider->sectorsize + GJ_RECORD_HEADER_NENTRIES * MAXPHYS)
/*
 * Wrap `offset' (an lvalue) back to the start of the journal space when a
 * maximum-size record placed at `offset' would reach or pass sc_jend.
 * A level 2 debug message is logged whenever the wrap happens.
 */
#define	GJ_VALIDATE_OFFSET(offset, sc)	do {				\
	if ((offset) + GJ_RECORD_MAX_SIZE(sc) >= (sc)->sc_jend) {	\
		GJ_DEBUG(2, "Starting from the begining (%s).",		\
		    (sc)->sc_name);					\
		(offset) = (sc)->sc_jstart;				\
	}								\
} while (0)
231163837Spjd
232163837Spjdstruct g_journal_record_header {
233163837Spjd	char		jrh_magic[sizeof(GJ_RECORD_HEADER_MAGIC)];
234163837Spjd	uint32_t	jrh_journal_id;
235163837Spjd	uint16_t	jrh_nentries;
236163837Spjd	u_char		jrh_sum[8];
237163837Spjd	struct g_journal_entry jrh_entries[GJ_RECORD_HEADER_NENTRIES];
238163837Spjd} __packed;
239163837Spjd
/* Callback taking a mount point and returning an int. */
typedef int (g_journal_clean_t)(struct mount *mp);
/* Callback taking a consumer and returning nothing. */
typedef void (g_journal_dirty_t)(struct g_consumer *cp);

/*
 * Description of a file system known to the journal class: its type name
 * and the two callbacks above.
 */
struct g_journal_desc {
	const char		*jd_fstype;
	g_journal_clean_t	*jd_clean;
	g_journal_dirty_t	*jd_dirty;
};
248163837Spjd
249163837Spjd/* Supported file systems. */
250163837Spjdextern const struct g_journal_desc g_journal_ufs;
251163837Spjd
/*
 * Record the current binuptime() in `bt', but only when g_journal_debug is
 * at least `lvl'; otherwise `bt' is left untouched.
 */
#define	GJ_TIMER_START(lvl, bt)	do {					\
	if (g_journal_debug >= (lvl)) {					\
		binuptime(bt);						\
	}								\
} while (0)
/*
 * Report the time elapsed since the binuptime() value stored in `bt'.
 *
 * When g_journal_debug is at least `lvl', the current uptime is sampled,
 * `bt' is subtracted from it and the difference is converted to a timeval.
 * The output is "GEOM_JOURNAL", then "[lvl]" whenever g_journal_debug is
 * greater than zero, then ": ", the caller's printf-style message and
 * ": <seconds>.<microseconds>s" followed by a newline.  `bt' itself is not
 * modified.
 *
 * The level is parenthesized and converted to unsigned int so that the
 * argument matches the "%u" conversion.
 */
#define	GJ_TIMER_STOP(lvl, bt, ...)	do {				\
	if (g_journal_debug >= (lvl)) {					\
		struct bintime _bt2;					\
		struct timeval _tv;					\
									\
		binuptime(&_bt2);					\
		bintime_sub(&_bt2, bt);					\
		bintime2timeval(&_bt2, &_tv);				\
		printf("GEOM_JOURNAL");					\
		if (g_journal_debug > 0)				\
			printf("[%u]", (unsigned int)(lvl));		\
		printf(": ");						\
		printf(__VA_ARGS__);					\
		printf(": %jd.%06jds\n", (intmax_t)_tv.tv_sec,		\
		    (intmax_t)_tv.tv_usec);				\
	}								\
} while (0)
273163837Spjd#endif	/* _KERNEL */
274163837Spjd
275163837Spjd#define	GJ_TYPE_DATA		0x01
276163837Spjd#define	GJ_TYPE_JOURNAL		0x02
277163837Spjd#define	GJ_TYPE_COMPLETE	(GJ_TYPE_DATA|GJ_TYPE_JOURNAL)
278163837Spjd
279163837Spjd#define	GJ_FLAG_CLEAN		0x01
280163837Spjd#define	GJ_FLAG_CHECKSUM	0x02
281163837Spjd
282163837Spjdstruct g_journal_metadata {
283163837Spjd	char		md_magic[16];	/* Magic value. */
284163837Spjd	uint32_t	md_version;	/* Version number. */
285163837Spjd	uint32_t	md_id;		/* Journal unique ID. */
286163837Spjd	uint8_t		md_type;	/* Provider type. */
287163837Spjd	uint64_t	md_jstart;	/* Journal space start offset. */
288163837Spjd	uint64_t	md_jend;	/* Journal space end offset. */
289163837Spjd	uint64_t	md_joffset;	/* Last known consistent journal offset. */
290163837Spjd	uint32_t	md_jid;		/* Last known consistent journal ID. */
291163837Spjd	uint64_t	md_flags;	/* Journal flags. */
292163837Spjd	char		md_provider[16]; /* Hardcoded provider. */
293163837Spjd	uint64_t	md_provsize;	/* Provider's size. */
294163837Spjd	u_char		md_hash[16];	/* MD5 hash. */
295163837Spjd};
296163837Spjdstatic __inline void
297163837Spjdjournal_metadata_encode(struct g_journal_metadata *md, u_char *data)
298163837Spjd{
299163837Spjd	MD5_CTX ctx;
300163837Spjd
301163837Spjd	bcopy(md->md_magic, data, 16);
302163837Spjd	le32enc(data + 16, md->md_version);
303163837Spjd	le32enc(data + 20, md->md_id);
304163837Spjd	*(data + 24) = md->md_type;
305163837Spjd	le64enc(data + 25, md->md_jstart);
306163837Spjd	le64enc(data + 33, md->md_jend);
307163837Spjd	le64enc(data + 41, md->md_joffset);
308163837Spjd	le32enc(data + 49, md->md_jid);
309163837Spjd	le64enc(data + 53, md->md_flags);
310163837Spjd	bcopy(md->md_provider, data + 61, 16);
311163837Spjd	le64enc(data + 77, md->md_provsize);
312163837Spjd	MD5Init(&ctx);
313163837Spjd	MD5Update(&ctx, data, 85);
314163837Spjd	MD5Final(md->md_hash, &ctx);
315163837Spjd	bcopy(md->md_hash, data + 85, 16);
316163837Spjd}
317163837Spjdstatic __inline int
318163837Spjdjournal_metadata_decode_v0(const u_char *data, struct g_journal_metadata *md)
319163837Spjd{
320163837Spjd	MD5_CTX ctx;
321163837Spjd
322163837Spjd	md->md_id = le32dec(data + 20);
323163837Spjd	md->md_type = *(data + 24);
324163837Spjd	md->md_jstart = le64dec(data + 25);
325163837Spjd	md->md_jend = le64dec(data + 33);
326163837Spjd	md->md_joffset = le64dec(data + 41);
327163837Spjd	md->md_jid = le32dec(data + 49);
328163837Spjd	md->md_flags = le64dec(data + 53);
329163837Spjd	bcopy(data + 61, md->md_provider, 16);
330163837Spjd	md->md_provsize = le64dec(data + 77);
331163837Spjd	MD5Init(&ctx);
332163837Spjd	MD5Update(&ctx, data, 85);
333163837Spjd	MD5Final(md->md_hash, &ctx);
334163837Spjd	if (bcmp(md->md_hash, data + 85, 16) != 0)
335163837Spjd		return (EINVAL);
336163837Spjd	return (0);
337163837Spjd}
338163837Spjdstatic __inline int
339163837Spjdjournal_metadata_decode(const u_char *data, struct g_journal_metadata *md)
340163837Spjd{
341163837Spjd	int error;
342163837Spjd
343163837Spjd	bcopy(data, md->md_magic, 16);
344163837Spjd	md->md_version = le32dec(data + 16);
345163837Spjd	switch (md->md_version) {
346163837Spjd	case 0:
347163837Spjd		error = journal_metadata_decode_v0(data, md);
348163837Spjd		break;
349163837Spjd	default:
350163837Spjd		error = EINVAL;
351163837Spjd		break;
352163837Spjd	}
353163837Spjd	return (error);
354163837Spjd}
355163837Spjd
356163837Spjdstatic __inline void
357163837Spjdjournal_metadata_dump(const struct g_journal_metadata *md)
358163837Spjd{
359163837Spjd	static const char hex[] = "0123456789abcdef";
360163837Spjd	char hash[16 * 2 + 1];
361163837Spjd	u_int i;
362163837Spjd
363163837Spjd	printf("     magic: %s\n", md->md_magic);
364163837Spjd	printf("   version: %u\n", (u_int)md->md_version);
365163837Spjd	printf("        id: %u\n", (u_int)md->md_id);
366163837Spjd	printf("      type: %u\n", (u_int)md->md_type);
367163837Spjd	printf("     start: %ju\n", (uintmax_t)md->md_jstart);
368163837Spjd	printf("       end: %ju\n", (uintmax_t)md->md_jend);
369163837Spjd	printf("   joffset: %ju\n", (uintmax_t)md->md_joffset);
370163837Spjd	printf("       jid: %u\n", (u_int)md->md_jid);
371163837Spjd	printf("     flags: %u\n", (u_int)md->md_flags);
372163837Spjd	printf("hcprovider: %s\n", md->md_provider);
373163837Spjd	printf("  provsize: %ju\n", (uintmax_t)md->md_provsize);
374163837Spjd	bzero(hash, sizeof(hash));
375163837Spjd	for (i = 0; i < 16; i++) {
376163837Spjd		hash[i * 2] = hex[md->md_hash[i] >> 4];
377163837Spjd		hash[i * 2 + 1] = hex[md->md_hash[i] & 0x0f];
378163837Spjd	}
379163837Spjd	printf("  MD5 hash: %s\n", hash);
380163837Spjd}
381163837Spjd#endif	/* !_G_JOURNAL_H_ */
382