secondary.c revision 209182
1216294Ssyrinx/*-
2216294Ssyrinx * Copyright (c) 2009-2010 The FreeBSD Foundation
3216294Ssyrinx * All rights reserved.
4216294Ssyrinx *
5216294Ssyrinx * This software was developed by Pawel Jakub Dawidek under sponsorship from
6216294Ssyrinx * the FreeBSD Foundation.
7216294Ssyrinx *
8216294Ssyrinx * Redistribution and use in source and binary forms, with or without
9216294Ssyrinx * modification, are permitted provided that the following conditions
10216294Ssyrinx * are met:
11216294Ssyrinx * 1. Redistributions of source code must retain the above copyright
12216294Ssyrinx *    notice, this list of conditions and the following disclaimer.
13216294Ssyrinx * 2. Redistributions in binary form must reproduce the above copyright
14216294Ssyrinx *    notice, this list of conditions and the following disclaimer in the
15216294Ssyrinx *    documentation and/or other materials provided with the distribution.
16216294Ssyrinx *
17216294Ssyrinx * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
18216294Ssyrinx * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19216294Ssyrinx * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20216294Ssyrinx * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
21216294Ssyrinx * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22216294Ssyrinx * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23216294Ssyrinx * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24216294Ssyrinx * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25216294Ssyrinx * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26216294Ssyrinx * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27216294Ssyrinx * SUCH DAMAGE.
28216294Ssyrinx */
29216294Ssyrinx
30216294Ssyrinx#include <sys/cdefs.h>
31216294Ssyrinx__FBSDID("$FreeBSD: head/sbin/hastd/secondary.c 209182 2010-06-14 21:41:22Z pjd $");
32216294Ssyrinx
33216294Ssyrinx#include <sys/param.h>
34216294Ssyrinx#include <sys/time.h>
35216294Ssyrinx#include <sys/bio.h>
36216294Ssyrinx#include <sys/disk.h>
37216294Ssyrinx#include <sys/stat.h>
38216294Ssyrinx
39216294Ssyrinx#include <assert.h>
40216294Ssyrinx#include <err.h>
41216294Ssyrinx#include <errno.h>
42216294Ssyrinx#include <fcntl.h>
43216294Ssyrinx#include <libgeom.h>
44216294Ssyrinx#include <pthread.h>
45216294Ssyrinx#include <stdint.h>
46216294Ssyrinx#include <stdio.h>
47216294Ssyrinx#include <string.h>
48216294Ssyrinx#include <sysexits.h>
49216294Ssyrinx#include <unistd.h>
50216294Ssyrinx
51216294Ssyrinx#include <activemap.h>
52216294Ssyrinx#include <nv.h>
53216294Ssyrinx#include <pjdlog.h>
54216294Ssyrinx
55216294Ssyrinx#include "control.h"
56216294Ssyrinx#include "hast.h"
57216294Ssyrinx#include "hast_proto.h"
58216294Ssyrinx#include "hastd.h"
59216294Ssyrinx#include "metadata.h"
60216294Ssyrinx#include "proto.h"
61216294Ssyrinx#include "subr.h"
62216294Ssyrinx#include "synch.h"
63216294Ssyrinx
/*
 * One I/O request as it is handed from the recv thread to the disk thread
 * and finally to the send thread.
 */
struct hio {
	/* Not referenced by the code visible in this file. */
	uint64_t hio_seq;
	/* errno-style result of unpacking/executing the request, 0 if fine. */
	int hio_error;
	/* Request header as received from the primary node. */
	struct nv *hio_nv;
	/* Payload buffer, MAXPHYS bytes, allocated by init_environment(). */
	void *hio_data;
	/* Request type taken from the "cmd" header field (HIO_*). */
	uint8_t hio_cmd;
	/* Byte offset taken from the "offset" header field. */
	uint64_t hio_offset;
	/* Byte count taken from the "length" header field. */
	uint64_t hio_length;
	/* Linkage on the free, disk or send list. */
	TAILQ_ENTRY(hio) hio_next;
};
74216294Ssyrinx
/*
 * Pool of idle request structures. The recv thread takes requests from here
 * and sleeps on the condition variable while the pool is empty; the send
 * thread puts requests back once it is done with them.
 */
static TAILQ_HEAD(, hio) hio_free_list;
static pthread_mutex_t hio_free_list_lock;
static pthread_cond_t hio_free_list_cond;

/*
 * Requests queued by the recv thread for the disk thread, which performs
 * the actual I/O.
 */
static TAILQ_HEAD(, hio) hio_disk_list;
static pthread_mutex_t hio_disk_list_lock;
static pthread_cond_t hio_disk_list_cond;

/*
 * Requests waiting for the send thread: either completed by the disk thread
 * or rejected by requnpack() in the recv thread.
 */
static TAILQ_HEAD(, hio) hio_send_list;
static pthread_mutex_t hio_send_list_lock;
static pthread_cond_t hio_send_list_cond;

/*
 * Number of request structures allocated by init_environment(), i.e. the
 * maximum number of outstanding I/O requests.
 */
#define	HAST_HIO_MAX	256

/* Worker thread entry points; each one is passed the struct hast_resource. */
static void *recv_thread(void *);
static void *disk_thread(void *);
static void *send_thread(void *);
104216294Ssyrinx
105216294Ssyrinxstatic void
106216294Ssyrinxinit_environment(void)
107216294Ssyrinx{
108216294Ssyrinx	struct hio *hio;
109216294Ssyrinx	unsigned int ii;
110216294Ssyrinx
111216294Ssyrinx	/*
112216294Ssyrinx	 * Initialize lists, their locks and theirs condition variables.
113216294Ssyrinx	 */
114216294Ssyrinx	TAILQ_INIT(&hio_free_list);
115216294Ssyrinx	mtx_init(&hio_free_list_lock);
116216294Ssyrinx	cv_init(&hio_free_list_cond);
117216294Ssyrinx	TAILQ_INIT(&hio_disk_list);
118216294Ssyrinx	mtx_init(&hio_disk_list_lock);
119216294Ssyrinx	cv_init(&hio_disk_list_cond);
120216294Ssyrinx	TAILQ_INIT(&hio_send_list);
121216294Ssyrinx	mtx_init(&hio_send_list_lock);
122216294Ssyrinx	cv_init(&hio_send_list_cond);
123216294Ssyrinx
124216294Ssyrinx	/*
125216294Ssyrinx	 * Allocate requests pool and initialize requests.
126216294Ssyrinx	 */
127216294Ssyrinx	for (ii = 0; ii < HAST_HIO_MAX; ii++) {
128216294Ssyrinx		hio = malloc(sizeof(*hio));
129216294Ssyrinx		if (hio == NULL) {
130216294Ssyrinx			errx(EX_TEMPFAIL, "cannot allocate %zu bytes of memory "
131216294Ssyrinx			    "for hio request", sizeof(*hio));
132216294Ssyrinx		}
133216294Ssyrinx		hio->hio_error = 0;
134216294Ssyrinx		hio->hio_data = malloc(MAXPHYS);
135216294Ssyrinx		if (hio->hio_data == NULL) {
136216294Ssyrinx			errx(EX_TEMPFAIL, "cannot allocate %zu bytes of memory "
137216294Ssyrinx			    "for gctl_data", (size_t)MAXPHYS);
138216294Ssyrinx		}
139216294Ssyrinx		TAILQ_INSERT_HEAD(&hio_free_list, hio, hio_next);
140216294Ssyrinx	}
141216294Ssyrinx}
142216294Ssyrinx
/*
 * Read the metadata of the local component; exit with EX_NOINPUT when
 * metadata_read() reports a failure.
 */
static void
init_local(struct hast_resource *res)
{

	if (metadata_read(res, true) >= 0)
		return;
	exit(EX_NOINPUT);
}
150216294Ssyrinx
151216294Ssyrinxstatic void
152216294Ssyrinxinit_remote(struct hast_resource *res, struct nv *nvin)
153216294Ssyrinx{
154216294Ssyrinx	uint64_t resuid;
155216294Ssyrinx	struct nv *nvout;
156216294Ssyrinx	unsigned char *map;
157216294Ssyrinx	size_t mapsize;
158216294Ssyrinx
159216294Ssyrinx	map = NULL;
160216294Ssyrinx	mapsize = 0;
161216294Ssyrinx	nvout = nv_alloc();
162216294Ssyrinx	nv_add_int64(nvout, (int64_t)res->hr_datasize, "datasize");
163216294Ssyrinx	nv_add_int32(nvout, (int32_t)res->hr_extentsize, "extentsize");
164216294Ssyrinx	resuid = nv_get_uint64(nvin, "resuid");
165216294Ssyrinx	res->hr_primary_localcnt = nv_get_uint64(nvin, "localcnt");
166216294Ssyrinx	res->hr_primary_remotecnt = nv_get_uint64(nvin, "remotecnt");
167216294Ssyrinx	nv_add_uint64(nvout, res->hr_secondary_localcnt, "localcnt");
168216294Ssyrinx	nv_add_uint64(nvout, res->hr_secondary_remotecnt, "remotecnt");
169216294Ssyrinx	mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize -
170216294Ssyrinx	    METADATA_SIZE, res->hr_extentsize, res->hr_local_sectorsize);
171216294Ssyrinx	map = malloc(mapsize);
172216294Ssyrinx	if (map == NULL) {
173216294Ssyrinx		pjdlog_exitx(EX_TEMPFAIL,
174216294Ssyrinx		    "Unable to allocate memory (%zu bytes) for activemap.",
175216294Ssyrinx		    mapsize);
176216294Ssyrinx	}
177216294Ssyrinx	nv_add_uint32(nvout, (uint32_t)mapsize, "mapsize");
178216294Ssyrinx	/*
179216294Ssyrinx	 * When we work as primary and secondary is missing we will increase
180216294Ssyrinx	 * localcnt in our metadata. When secondary is connected and synced
181216294Ssyrinx	 * we make localcnt be equal to remotecnt, which means nodes are more
182216294Ssyrinx	 * or less in sync.
183216294Ssyrinx	 * Split-brain condition is when both nodes are not able to communicate
184216294Ssyrinx	 * and are both configured as primary nodes. In turn, they can both
185216294Ssyrinx	 * make incompatible changes to the data and we have to detect that.
186216294Ssyrinx	 * Under split-brain condition we will increase our localcnt on first
187216294Ssyrinx	 * write and remote node will increase its localcnt on first write.
188216294Ssyrinx	 * When we connect we can see that primary's localcnt is greater than
189216294Ssyrinx	 * our remotecnt (primary was modified while we weren't watching) and
190216294Ssyrinx	 * our localcnt is greater than primary's remotecnt (we were modified
191216294Ssyrinx	 * while primary wasn't watching).
192216294Ssyrinx	 * There are many possible combinations which are all gathered below.
193216294Ssyrinx	 * Don't pay too much attention to exact numbers, the more important
194216294Ssyrinx	 * is to compare them. We compare secondary's local with primary's
195216294Ssyrinx	 * remote and secondary's remote with primary's local.
196216294Ssyrinx	 * Note that every case where primary's localcnt is smaller than
197216294Ssyrinx	 * secondary's remotecnt and where secondary's localcnt is smaller than
198216294Ssyrinx	 * primary's remotecnt should be impossible in practise. We will perform
199216294Ssyrinx	 * full synchronization then. Those cases are marked with an asterisk.
200216294Ssyrinx	 * Regular synchronization means that only extents marked as dirty are
201216294Ssyrinx	 * synchronized (regular synchronization).
202216294Ssyrinx	 *
203216294Ssyrinx	 * SECONDARY METADATA PRIMARY METADATA
204216294Ssyrinx	 * local=3 remote=3   local=2 remote=2*  ?! Full sync from secondary.
205216294Ssyrinx	 * local=3 remote=3   local=2 remote=3*  ?! Full sync from primary.
206216294Ssyrinx	 * local=3 remote=3   local=2 remote=4*  ?! Full sync from primary.
207216294Ssyrinx	 * local=3 remote=3   local=3 remote=2   Primary is out-of-date,
208216294Ssyrinx	 *                                       regular sync from secondary.
209216294Ssyrinx	 * local=3 remote=3   local=3 remote=3   Regular sync just in case.
210216294Ssyrinx	 * local=3 remote=3   local=3 remote=4*  ?! Full sync from primary.
211216294Ssyrinx	 * local=3 remote=3   local=4 remote=2   Split-brain condition.
212216294Ssyrinx	 * local=3 remote=3   local=4 remote=3   Secondary out-of-date,
213216294Ssyrinx	 *                                       regular sync from primary.
214216294Ssyrinx	 * local=3 remote=3   local=4 remote=4*  ?! Full sync from primary.
215216294Ssyrinx	 */
216216294Ssyrinx	if (res->hr_resuid == 0) {
217216294Ssyrinx		/*
218216294Ssyrinx		 * Provider is used for the first time. Initialize everything.
219216294Ssyrinx		 */
220216294Ssyrinx		assert(res->hr_secondary_localcnt == 0);
221216294Ssyrinx		res->hr_resuid = resuid;
222216294Ssyrinx		if (metadata_write(res) < 0)
223216294Ssyrinx			exit(EX_NOINPUT);
224216294Ssyrinx		memset(map, 0xff, mapsize);
225216294Ssyrinx		nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
226216294Ssyrinx	} else if (
227216294Ssyrinx	    /* Is primary is out-of-date? */
228216294Ssyrinx	    (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
229216294Ssyrinx	     res->hr_secondary_remotecnt == res->hr_primary_localcnt) ||
230216294Ssyrinx	    /* Node are more or less in sync? */
231216294Ssyrinx	    (res->hr_secondary_localcnt == res->hr_primary_remotecnt &&
232216294Ssyrinx	     res->hr_secondary_remotecnt == res->hr_primary_localcnt) ||
233216294Ssyrinx	    /* Is secondary is out-of-date? */
234216294Ssyrinx	    (res->hr_secondary_localcnt == res->hr_primary_remotecnt &&
235216294Ssyrinx	     res->hr_secondary_remotecnt < res->hr_primary_localcnt)) {
236216294Ssyrinx		/*
237216294Ssyrinx		 * Nodes are more or less in sync or one of the nodes is
238216294Ssyrinx		 * out-of-date.
239216294Ssyrinx		 * It doesn't matter at this point which one, we just have to
240216294Ssyrinx		 * send out local bitmap to the remote node.
241216294Ssyrinx		 */
242216294Ssyrinx		if (pread(res->hr_localfd, map, mapsize, METADATA_SIZE) !=
243216294Ssyrinx		    (ssize_t)mapsize) {
244216294Ssyrinx			pjdlog_exit(LOG_ERR, "Unable to read activemap");
245216294Ssyrinx		}
246216294Ssyrinx		if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
247216294Ssyrinx		     res->hr_secondary_remotecnt == res->hr_primary_localcnt) {
248216294Ssyrinx			/* Primary is out-of-date, sync from secondary. */
249216294Ssyrinx			nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc");
250216294Ssyrinx		} else {
251216294Ssyrinx			/*
252216294Ssyrinx			 * Secondary is out-of-date or counts match.
253216294Ssyrinx			 * Sync from primary.
254216294Ssyrinx			 */
255216294Ssyrinx			nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
256216294Ssyrinx		}
257216294Ssyrinx	} else if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
258216294Ssyrinx	     res->hr_primary_localcnt > res->hr_secondary_remotecnt) {
259216294Ssyrinx		/*
260216294Ssyrinx		 * Not good, we have split-brain condition.
261216294Ssyrinx		 */
262216294Ssyrinx		pjdlog_error("Split-brain detected, exiting.");
263216294Ssyrinx		nv_add_string(nvout, "Split-brain condition!", "errmsg");
264216294Ssyrinx		free(map);
265216294Ssyrinx		map = NULL;
266216294Ssyrinx		mapsize = 0;
267216294Ssyrinx	} else /* if (res->hr_secondary_localcnt < res->hr_primary_remotecnt ||
268216294Ssyrinx	    res->hr_primary_localcnt < res->hr_secondary_remotecnt) */ {
269216294Ssyrinx		/*
270216294Ssyrinx		 * This should never happen in practise, but we will perform
271216294Ssyrinx		 * full synchronization.
272216294Ssyrinx		 */
273216294Ssyrinx		assert(res->hr_secondary_localcnt < res->hr_primary_remotecnt ||
274216294Ssyrinx		    res->hr_primary_localcnt < res->hr_secondary_remotecnt);
275216294Ssyrinx		mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize -
276216294Ssyrinx		    METADATA_SIZE, res->hr_extentsize,
277216294Ssyrinx		    res->hr_local_sectorsize);
278216294Ssyrinx		memset(map, 0xff, mapsize);
279216294Ssyrinx		if (res->hr_secondary_localcnt > res->hr_primary_remotecnt) {
280216294Ssyrinx			/* In this one of five cases sync from secondary. */
281216294Ssyrinx			nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc");
282216294Ssyrinx		} else {
283216294Ssyrinx			/* For the rest four cases sync from primary. */
284216294Ssyrinx			nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
285216294Ssyrinx		}
286216294Ssyrinx		pjdlog_warning("This should never happen, asking for full synchronization (primary(local=%ju, remote=%ju), secondary(local=%ju, remote=%ju)).",
287216294Ssyrinx		    (uintmax_t)res->hr_primary_localcnt,
288216294Ssyrinx		    (uintmax_t)res->hr_primary_remotecnt,
289216294Ssyrinx		    (uintmax_t)res->hr_secondary_localcnt,
290216294Ssyrinx		    (uintmax_t)res->hr_secondary_remotecnt);
291216294Ssyrinx	}
292216294Ssyrinx	if (hast_proto_send(res, res->hr_remotein, nvout, map, mapsize) < 0) {
293216294Ssyrinx		pjdlog_errno(LOG_WARNING, "Unable to send activemap to %s",
294216294Ssyrinx		    res->hr_remoteaddr);
295216294Ssyrinx		nv_free(nvout);
296216294Ssyrinx		exit(EX_TEMPFAIL);
297216294Ssyrinx	}
298216294Ssyrinx	nv_free(nvout);
299216294Ssyrinx	if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
300216294Ssyrinx	     res->hr_primary_localcnt > res->hr_secondary_remotecnt) {
301216294Ssyrinx		/* Exit on split-brain. */
302216294Ssyrinx		exit(EX_CONFIG);
303216294Ssyrinx	}
304216294Ssyrinx}
305216294Ssyrinx
306216294Ssyrinxvoid
307216294Ssyrinxhastd_secondary(struct hast_resource *res, struct nv *nvin)
308216294Ssyrinx{
309216294Ssyrinx	pthread_t td;
310216294Ssyrinx	pid_t pid;
311216294Ssyrinx	int error;
312216294Ssyrinx
313216294Ssyrinx	/*
314216294Ssyrinx	 * Create communication channel between parent and child.
315216294Ssyrinx	 */
316216294Ssyrinx	if (proto_client("socketpair://", &res->hr_ctrl) < 0) {
317216294Ssyrinx		KEEP_ERRNO((void)pidfile_remove(pfh));
318216294Ssyrinx		pjdlog_exit(EX_OSERR,
319216294Ssyrinx		    "Unable to create control sockets between parent and child");
320216294Ssyrinx	}
321216294Ssyrinx
322216294Ssyrinx	pid = fork();
323216294Ssyrinx	if (pid < 0) {
324216294Ssyrinx		KEEP_ERRNO((void)pidfile_remove(pfh));
325216294Ssyrinx		pjdlog_exit(EX_OSERR, "Unable to fork");
326216294Ssyrinx	}
327216294Ssyrinx
328216294Ssyrinx	if (pid > 0) {
329216294Ssyrinx		/* This is parent. */
330216294Ssyrinx		proto_close(res->hr_remotein);
331216294Ssyrinx		res->hr_remotein = NULL;
332216294Ssyrinx		proto_close(res->hr_remoteout);
333216294Ssyrinx		res->hr_remoteout = NULL;
334216294Ssyrinx		res->hr_workerpid = pid;
335216294Ssyrinx		return;
336216294Ssyrinx	}
337216294Ssyrinx	(void)pidfile_close(pfh);
338216294Ssyrinx
339216294Ssyrinx	setproctitle("%s (secondary)", res->hr_name);
340216294Ssyrinx
341216294Ssyrinx	/* Error in setting timeout is not critical, but why should it fail? */
342216294Ssyrinx	if (proto_timeout(res->hr_remotein, 0) < 0)
343216294Ssyrinx		pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
344216294Ssyrinx	if (proto_timeout(res->hr_remoteout, res->hr_timeout) < 0)
345216294Ssyrinx		pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
346216294Ssyrinx
347216294Ssyrinx	init_local(res);
348216294Ssyrinx	init_remote(res, nvin);
349216294Ssyrinx	init_environment();
350216294Ssyrinx
351216294Ssyrinx	error = pthread_create(&td, NULL, recv_thread, res);
352216294Ssyrinx	assert(error == 0);
353216294Ssyrinx	error = pthread_create(&td, NULL, disk_thread, res);
354216294Ssyrinx	assert(error == 0);
355216294Ssyrinx	error = pthread_create(&td, NULL, send_thread, res);
356216294Ssyrinx	assert(error == 0);
357216294Ssyrinx	(void)ctrl_thread(res);
358216294Ssyrinx}
359216294Ssyrinx
360216294Ssyrinxstatic void
361216294Ssyrinxreqlog(int loglevel, int debuglevel, int error, struct hio *hio, const char *fmt, ...)
362216294Ssyrinx{
363216294Ssyrinx	char msg[1024];
364216294Ssyrinx	va_list ap;
365216294Ssyrinx	int len;
366216294Ssyrinx
367216294Ssyrinx	va_start(ap, fmt);
368216294Ssyrinx	len = vsnprintf(msg, sizeof(msg), fmt, ap);
369216294Ssyrinx	va_end(ap);
370216294Ssyrinx	if ((size_t)len < sizeof(msg)) {
371216294Ssyrinx		switch (hio->hio_cmd) {
372216294Ssyrinx		case HIO_READ:
373216294Ssyrinx			(void)snprintf(msg + len, sizeof(msg) - len,
374216294Ssyrinx			    "READ(%ju, %ju).", (uintmax_t)hio->hio_offset,
375216294Ssyrinx			    (uintmax_t)hio->hio_length);
376216294Ssyrinx			break;
377216294Ssyrinx		case HIO_DELETE:
378216294Ssyrinx			(void)snprintf(msg + len, sizeof(msg) - len,
379216294Ssyrinx			    "DELETE(%ju, %ju).", (uintmax_t)hio->hio_offset,
380216294Ssyrinx			    (uintmax_t)hio->hio_length);
381216294Ssyrinx			break;
382216294Ssyrinx		case HIO_FLUSH:
383216294Ssyrinx			(void)snprintf(msg + len, sizeof(msg) - len, "FLUSH.");
384216294Ssyrinx			break;
385216294Ssyrinx		case HIO_WRITE:
386216294Ssyrinx			(void)snprintf(msg + len, sizeof(msg) - len,
387216294Ssyrinx			    "WRITE(%ju, %ju).", (uintmax_t)hio->hio_offset,
388216294Ssyrinx			    (uintmax_t)hio->hio_length);
389216294Ssyrinx			break;
390216294Ssyrinx		default:
391216294Ssyrinx			(void)snprintf(msg + len, sizeof(msg) - len,
392216294Ssyrinx			    "UNKNOWN(%u).", (unsigned int)hio->hio_cmd);
393216294Ssyrinx			break;
394216294Ssyrinx		}
395216294Ssyrinx	}
396216294Ssyrinx	pjdlog_common(loglevel, debuglevel, error, "%s", msg);
397216294Ssyrinx}
398216294Ssyrinx
399216294Ssyrinxstatic int
400216294Ssyrinxrequnpack(struct hast_resource *res, struct hio *hio)
401216294Ssyrinx{
402216294Ssyrinx
403216294Ssyrinx	hio->hio_cmd = nv_get_uint8(hio->hio_nv, "cmd");
404216294Ssyrinx	if (hio->hio_cmd == 0) {
405216294Ssyrinx		pjdlog_error("Header contains no 'cmd' field.");
406216294Ssyrinx		hio->hio_error = EINVAL;
407216294Ssyrinx		goto end;
408216294Ssyrinx	}
409216294Ssyrinx	switch (hio->hio_cmd) {
410216294Ssyrinx	case HIO_READ:
411216294Ssyrinx	case HIO_WRITE:
412216294Ssyrinx	case HIO_DELETE:
413216294Ssyrinx		hio->hio_offset = nv_get_uint64(hio->hio_nv, "offset");
414216294Ssyrinx		if (nv_error(hio->hio_nv) != 0) {
415216294Ssyrinx			pjdlog_error("Header is missing 'offset' field.");
416216294Ssyrinx			hio->hio_error = EINVAL;
417216294Ssyrinx			goto end;
418216294Ssyrinx		}
419216294Ssyrinx		hio->hio_length = nv_get_uint64(hio->hio_nv, "length");
420216294Ssyrinx		if (nv_error(hio->hio_nv) != 0) {
421216294Ssyrinx			pjdlog_error("Header is missing 'length' field.");
422216294Ssyrinx			hio->hio_error = EINVAL;
423216294Ssyrinx			goto end;
424216294Ssyrinx		}
425216294Ssyrinx		if (hio->hio_length == 0) {
426216294Ssyrinx			pjdlog_error("Data length is zero.");
427216294Ssyrinx			hio->hio_error = EINVAL;
428216294Ssyrinx			goto end;
429216294Ssyrinx		}
430216294Ssyrinx		if (hio->hio_length > MAXPHYS) {
431216294Ssyrinx			pjdlog_error("Data length is too large (%ju > %ju).",
432216294Ssyrinx			    (uintmax_t)hio->hio_length, (uintmax_t)MAXPHYS);
433216294Ssyrinx			hio->hio_error = EINVAL;
434216294Ssyrinx			goto end;
435216294Ssyrinx		}
436216294Ssyrinx		if ((hio->hio_offset % res->hr_local_sectorsize) != 0) {
437216294Ssyrinx			pjdlog_error("Offset %ju is not multiple of sector size.",
438216294Ssyrinx			    (uintmax_t)hio->hio_offset);
439216294Ssyrinx			hio->hio_error = EINVAL;
440216294Ssyrinx			goto end;
441216294Ssyrinx		}
442216294Ssyrinx		if ((hio->hio_length % res->hr_local_sectorsize) != 0) {
443216294Ssyrinx			pjdlog_error("Length %ju is not multiple of sector size.",
444216294Ssyrinx			    (uintmax_t)hio->hio_length);
445216294Ssyrinx			hio->hio_error = EINVAL;
446216294Ssyrinx			goto end;
447216294Ssyrinx		}
448216294Ssyrinx		if (hio->hio_offset + hio->hio_length >
449216294Ssyrinx		    (uint64_t)res->hr_datasize) {
450216294Ssyrinx			pjdlog_error("Data offset is too large (%ju > %ju).",
451216294Ssyrinx			    (uintmax_t)(hio->hio_offset + hio->hio_length),
452216294Ssyrinx			    (uintmax_t)res->hr_datasize);
453216294Ssyrinx			hio->hio_error = EINVAL;
454216294Ssyrinx			goto end;
455216294Ssyrinx		}
456216294Ssyrinx		break;
457216294Ssyrinx	default:
458216294Ssyrinx		pjdlog_error("Header contains invalid 'cmd' (%hhu).",
459216294Ssyrinx		    hio->hio_cmd);
460216294Ssyrinx		hio->hio_error = EINVAL;
461216294Ssyrinx		goto end;
462216294Ssyrinx	}
463216294Ssyrinx	hio->hio_error = 0;
464216294Ssyrinxend:
465216294Ssyrinx	return (hio->hio_error);
466216294Ssyrinx}
467216294Ssyrinx
468216294Ssyrinx/*
469216294Ssyrinx * Thread receives requests from the primary node.
470216294Ssyrinx */
471216294Ssyrinxstatic void *
472216294Ssyrinxrecv_thread(void *arg)
473216294Ssyrinx{
474216294Ssyrinx	struct hast_resource *res = arg;
475216294Ssyrinx	struct hio *hio;
476216294Ssyrinx	bool wakeup;
477216294Ssyrinx
478216294Ssyrinx	for (;;) {
479216294Ssyrinx		pjdlog_debug(2, "recv: Taking free request.");
480216294Ssyrinx		mtx_lock(&hio_free_list_lock);
481216294Ssyrinx		while ((hio = TAILQ_FIRST(&hio_free_list)) == NULL) {
482216294Ssyrinx			pjdlog_debug(2, "recv: No free requests, waiting.");
483216294Ssyrinx			cv_wait(&hio_free_list_cond, &hio_free_list_lock);
484216294Ssyrinx		}
485216294Ssyrinx		TAILQ_REMOVE(&hio_free_list, hio, hio_next);
486216294Ssyrinx		mtx_unlock(&hio_free_list_lock);
487216294Ssyrinx		pjdlog_debug(2, "recv: (%p) Got request.", hio);
488216294Ssyrinx		if (hast_proto_recv_hdr(res->hr_remotein, &hio->hio_nv) < 0) {
489216294Ssyrinx			pjdlog_exit(EX_TEMPFAIL,
490216294Ssyrinx			    "Unable to receive request header");
491216294Ssyrinx		}
492216294Ssyrinx		if (requnpack(res, hio) != 0)
493216294Ssyrinx			goto send_queue;
494216294Ssyrinx		reqlog(LOG_DEBUG, 2, -1, hio,
495216294Ssyrinx		    "recv: (%p) Got request header: ", hio);
496216294Ssyrinx		if (hio->hio_cmd == HIO_WRITE) {
497216294Ssyrinx			if (hast_proto_recv_data(res, res->hr_remotein,
498216294Ssyrinx			    hio->hio_nv, hio->hio_data, MAXPHYS) < 0) {
499216294Ssyrinx				pjdlog_exit(EX_TEMPFAIL,
500216294Ssyrinx				    "Unable to receive reply data");
501216294Ssyrinx			}
502216294Ssyrinx		}
503216294Ssyrinx		pjdlog_debug(2, "recv: (%p) Moving request to the disk queue.",
504216294Ssyrinx		    hio);
505216294Ssyrinx		mtx_lock(&hio_disk_list_lock);
506216294Ssyrinx		wakeup = TAILQ_EMPTY(&hio_disk_list);
507216294Ssyrinx		TAILQ_INSERT_TAIL(&hio_disk_list, hio, hio_next);
508216294Ssyrinx		mtx_unlock(&hio_disk_list_lock);
509216294Ssyrinx		if (wakeup)
510216294Ssyrinx			cv_signal(&hio_disk_list_cond);
511216294Ssyrinx		continue;
512216294Ssyrinxsend_queue:
513216294Ssyrinx		pjdlog_debug(2, "recv: (%p) Moving request to the send queue.",
514216294Ssyrinx		    hio);
515216294Ssyrinx		mtx_lock(&hio_send_list_lock);
516216294Ssyrinx		wakeup = TAILQ_EMPTY(&hio_send_list);
517216294Ssyrinx		TAILQ_INSERT_TAIL(&hio_send_list, hio, hio_next);
518216294Ssyrinx		mtx_unlock(&hio_send_list_lock);
519216294Ssyrinx		if (wakeup)
520216294Ssyrinx			cv_signal(&hio_send_list_cond);
521216294Ssyrinx	}
522216294Ssyrinx	/* NOTREACHED */
523216294Ssyrinx	return (NULL);
524216294Ssyrinx}
525216294Ssyrinx
526216294Ssyrinx/*
527216294Ssyrinx * Thread reads from or writes to local component and also handles DELETE and
528216294Ssyrinx * FLUSH requests.
529216294Ssyrinx */
530216294Ssyrinxstatic void *
531216294Ssyrinxdisk_thread(void *arg)
532216294Ssyrinx{
533216294Ssyrinx	struct hast_resource *res = arg;
534216294Ssyrinx	struct hio *hio;
535216294Ssyrinx	ssize_t ret;
536216294Ssyrinx	bool clear_activemap, wakeup;
537216294Ssyrinx
538216294Ssyrinx	clear_activemap = true;
539216294Ssyrinx
540216294Ssyrinx	for (;;) {
541216294Ssyrinx		pjdlog_debug(2, "disk: Taking request.");
542216294Ssyrinx		mtx_lock(&hio_disk_list_lock);
543216294Ssyrinx		while ((hio = TAILQ_FIRST(&hio_disk_list)) == NULL) {
544216294Ssyrinx			pjdlog_debug(2, "disk: No requests, waiting.");
545216294Ssyrinx			cv_wait(&hio_disk_list_cond, &hio_disk_list_lock);
546216294Ssyrinx		}
547216294Ssyrinx		TAILQ_REMOVE(&hio_disk_list, hio, hio_next);
548216294Ssyrinx		mtx_unlock(&hio_disk_list_lock);
549216294Ssyrinx		while (clear_activemap) {
550216294Ssyrinx			unsigned char *map;
551216294Ssyrinx			size_t mapsize;
552216294Ssyrinx
553216294Ssyrinx			/*
554216294Ssyrinx			 * When first request is received, it means that primary
555216294Ssyrinx			 * already received our activemap, merged it and stored
556216294Ssyrinx			 * locally. We can now safely clear our activemap.
557216294Ssyrinx			 */
558216294Ssyrinx			mapsize =
559216294Ssyrinx			    activemap_calc_ondisk_size(res->hr_local_mediasize -
560216294Ssyrinx			    METADATA_SIZE, res->hr_extentsize,
561216294Ssyrinx			    res->hr_local_sectorsize);
562216294Ssyrinx			map = calloc(1, mapsize);
563216294Ssyrinx			if (map == NULL) {
564216294Ssyrinx				pjdlog_warning("Unable to allocate memory to clear local activemap.");
565216294Ssyrinx				break;
566216294Ssyrinx			}
567216294Ssyrinx			if (pwrite(res->hr_localfd, map, mapsize,
568216294Ssyrinx			    METADATA_SIZE) != (ssize_t)mapsize) {
569216294Ssyrinx				pjdlog_errno(LOG_WARNING,
570216294Ssyrinx				    "Unable to store cleared activemap");
571216294Ssyrinx				free(map);
572216294Ssyrinx				break;
573216294Ssyrinx			}
574216294Ssyrinx			free(map);
575216294Ssyrinx			clear_activemap = false;
576216294Ssyrinx			pjdlog_debug(1, "Local activemap cleared.");
577216294Ssyrinx		}
578216294Ssyrinx		reqlog(LOG_DEBUG, 2, -1, hio, "disk: (%p) Got request: ", hio);
579216294Ssyrinx		/* Handle the actual request. */
580216294Ssyrinx		switch (hio->hio_cmd) {
581216294Ssyrinx		case HIO_READ:
582216294Ssyrinx			ret = pread(res->hr_localfd, hio->hio_data,
583216294Ssyrinx			    hio->hio_length,
584216294Ssyrinx			    hio->hio_offset + res->hr_localoff);
585216294Ssyrinx			if (ret < 0)
586216294Ssyrinx				hio->hio_error = errno;
587216294Ssyrinx			else if (ret != (int64_t)hio->hio_length)
588216294Ssyrinx				hio->hio_error = EIO;
589216294Ssyrinx			else
590216294Ssyrinx				hio->hio_error = 0;
591216294Ssyrinx			break;
592216294Ssyrinx		case HIO_WRITE:
593216294Ssyrinx			ret = pwrite(res->hr_localfd, hio->hio_data,
594216294Ssyrinx			    hio->hio_length,
595216294Ssyrinx			    hio->hio_offset + res->hr_localoff);
596216294Ssyrinx			if (ret < 0)
597216294Ssyrinx				hio->hio_error = errno;
598216294Ssyrinx			else if (ret != (int64_t)hio->hio_length)
599216294Ssyrinx				hio->hio_error = EIO;
600216294Ssyrinx			else
601216294Ssyrinx				hio->hio_error = 0;
602216294Ssyrinx			break;
603216294Ssyrinx		case HIO_DELETE:
604216294Ssyrinx			ret = g_delete(res->hr_localfd,
605216294Ssyrinx			    hio->hio_offset + res->hr_localoff,
606216294Ssyrinx			    hio->hio_length);
607216294Ssyrinx			if (ret < 0)
608216294Ssyrinx				hio->hio_error = errno;
609216294Ssyrinx			else
610216294Ssyrinx				hio->hio_error = 0;
611216294Ssyrinx			break;
612216294Ssyrinx		case HIO_FLUSH:
613216294Ssyrinx			ret = g_flush(res->hr_localfd);
614216294Ssyrinx			if (ret < 0)
615				hio->hio_error = errno;
616			else
617				hio->hio_error = 0;
618			break;
619		}
620		if (hio->hio_error != 0) {
621			reqlog(LOG_ERR, 0, hio->hio_error, hio,
622			    "Request failed: ");
623		}
624		pjdlog_debug(2, "disk: (%p) Moving request to the send queue.",
625		    hio);
626		mtx_lock(&hio_send_list_lock);
627		wakeup = TAILQ_EMPTY(&hio_send_list);
628		TAILQ_INSERT_TAIL(&hio_send_list, hio, hio_next);
629		mtx_unlock(&hio_send_list_lock);
630		if (wakeup)
631			cv_signal(&hio_send_list_cond);
632	}
633	/* NOTREACHED */
634	return (NULL);
635}
636
637/*
638 * Thread sends requests back to primary node.
639 */
640static void *
641send_thread(void *arg)
642{
643	struct hast_resource *res = arg;
644	struct nv *nvout;
645	struct hio *hio;
646	void *data;
647	size_t length;
648	bool wakeup;
649
650	for (;;) {
651		pjdlog_debug(2, "send: Taking request.");
652		mtx_lock(&hio_send_list_lock);
653		while ((hio = TAILQ_FIRST(&hio_send_list)) == NULL) {
654			pjdlog_debug(2, "send: No requests, waiting.");
655			cv_wait(&hio_send_list_cond, &hio_send_list_lock);
656		}
657		TAILQ_REMOVE(&hio_send_list, hio, hio_next);
658		mtx_unlock(&hio_send_list_lock);
659		reqlog(LOG_DEBUG, 2, -1, hio, "send: (%p) Got request: ", hio);
660		nvout = nv_alloc();
661		/* Copy sequence number. */
662		nv_add_uint64(nvout, nv_get_uint64(hio->hio_nv, "seq"), "seq");
663		switch (hio->hio_cmd) {
664		case HIO_READ:
665			if (hio->hio_error == 0) {
666				data = hio->hio_data;
667				length = hio->hio_length;
668				break;
669			}
670			/*
671			 * We send no data in case of an error.
672			 */
673			/* FALLTHROUGH */
674		case HIO_DELETE:
675		case HIO_FLUSH:
676		case HIO_WRITE:
677			data = NULL;
678			length = 0;
679			break;
680		default:
681			abort();
682			break;
683		}
684		if (hio->hio_error != 0)
685			nv_add_int16(nvout, hio->hio_error, "error");
686		if (hast_proto_send(res, res->hr_remoteout, nvout, data,
687		    length) < 0) {
688			pjdlog_exit(EX_TEMPFAIL, "Unable to send reply.");
689		}
690		nv_free(nvout);
691		pjdlog_debug(2, "disk: (%p) Moving request to the free queue.",
692		    hio);
693		nv_free(hio->hio_nv);
694		hio->hio_error = 0;
695		mtx_lock(&hio_free_list_lock);
696		wakeup = TAILQ_EMPTY(&hio_free_list);
697		TAILQ_INSERT_TAIL(&hio_free_list, hio, hio_next);
698		mtx_unlock(&hio_free_list_lock);
699		if (wakeup)
700			cv_signal(&hio_free_list_cond);
701	}
702	/* NOTREACHED */
703	return (NULL);
704}
705