1204076Spjd/*-
2204076Spjd * Copyright (c) 2009-2010 The FreeBSD Foundation
3211877Spjd * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
4204076Spjd * All rights reserved.
5204076Spjd *
6204076Spjd * This software was developed by Pawel Jakub Dawidek under sponsorship from
7204076Spjd * the FreeBSD Foundation.
8204076Spjd *
9204076Spjd * Redistribution and use in source and binary forms, with or without
10204076Spjd * modification, are permitted provided that the following conditions
11204076Spjd * are met:
12204076Spjd * 1. Redistributions of source code must retain the above copyright
13204076Spjd *    notice, this list of conditions and the following disclaimer.
14204076Spjd * 2. Redistributions in binary form must reproduce the above copyright
15204076Spjd *    notice, this list of conditions and the following disclaimer in the
16204076Spjd *    documentation and/or other materials provided with the distribution.
17204076Spjd *
18204076Spjd * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
19204076Spjd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20204076Spjd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21204076Spjd * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
22204076Spjd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23204076Spjd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24204076Spjd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25204076Spjd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26204076Spjd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27204076Spjd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28204076Spjd * SUCH DAMAGE.
29204076Spjd */
30204076Spjd
31204076Spjd#include <sys/cdefs.h>
32204076Spjd__FBSDID("$FreeBSD$");
33204076Spjd
34204076Spjd#include <sys/param.h>
35204076Spjd#include <sys/time.h>
36204076Spjd#include <sys/bio.h>
37204076Spjd#include <sys/disk.h>
38204076Spjd#include <sys/stat.h>
39204076Spjd
40204076Spjd#include <err.h>
41204076Spjd#include <errno.h>
42204076Spjd#include <fcntl.h>
43204076Spjd#include <libgeom.h>
44204076Spjd#include <pthread.h>
45213009Spjd#include <signal.h>
46204076Spjd#include <stdint.h>
47204076Spjd#include <stdio.h>
48204076Spjd#include <string.h>
49204076Spjd#include <sysexits.h>
50204076Spjd#include <unistd.h>
51204076Spjd
52204076Spjd#include <activemap.h>
53204076Spjd#include <nv.h>
54204076Spjd#include <pjdlog.h>
55204076Spjd
56204076Spjd#include "control.h"
57212038Spjd#include "event.h"
58204076Spjd#include "hast.h"
59204076Spjd#include "hast_proto.h"
60204076Spjd#include "hastd.h"
61211977Spjd#include "hooks.h"
62204076Spjd#include "metadata.h"
63204076Spjd#include "proto.h"
64204076Spjd#include "subr.h"
65204076Spjd#include "synch.h"
66204076Spjd
/*
 * A single request as it travels between the free, disk and send queues.
 */
struct hio {
	/* Sequence number taken from the "seq" header field. */
	uint64_t	 hio_seq;
	/* Error code recorded for the request, 0 if none. */
	int		 hio_error;
	/* Data buffer attached to the request. */
	void		*hio_data;
	/* Command code taken from the "cmd" header field (HIO_*). */
	uint8_t		 hio_cmd;
	/* Offset taken from the "offset" header field. */
	uint64_t	 hio_offset;
	/* Length taken from the "length" header field. */
	uint64_t	 hio_length;
	/* Set when a WRITE header carries the "memsync" field. */
	bool		 hio_memsync;
	/* Queue linkage. */
	TAILQ_ENTRY(hio) hio_next;
};
77204076Spjd
/*
 * Resource handled by this worker. Assigned in hastd_secondary() and used by
 * secondary_exit() to notify the parent.
 */
static struct hast_resource *gres;

/*
 * Pool of idle request structures. When the pool runs dry, whoever needs
 * a request has to wait until an in-progress one is returned.
 */
static TAILQ_HEAD(, hio) hio_free_list;
static size_t hio_free_list_size = 0;
static pthread_mutex_t hio_free_list_lock;
static pthread_cond_t hio_free_list_cond;
/*
 * Requests waiting for the disk thread (the one performing local I/O).
 */
static TAILQ_HEAD(, hio) hio_disk_list;
static size_t hio_disk_list_size = 0;
static pthread_mutex_t hio_disk_list_lock;
static pthread_cond_t hio_disk_list_cond;
/*
 * Requests waiting for the thread that sends replies back to the primary.
 */
static TAILQ_HEAD(, hio) hio_send_list;
static size_t hio_send_list_size = 0;
static pthread_mutex_t hio_send_list_lock;
static pthread_cond_t hio_send_list_cond;

/*
 * Upper bound on the number of I/O requests in flight; this many request
 * structures are allocated up front by init_environment().
 */
#define	HAST_HIO_MAX	256

/* Worker thread entry points. */
static void *recv_thread(void *arg);
static void *disk_thread(void *arg);
static void *send_thread(void *arg);
111204076Spjd
/*
 * Append request 'hio' to the tail of the hio_<name>_list queue and bump the
 * matching size counter. If the queue had no elements, the queue's condition
 * variable is broadcast; this is done while the queue lock is still held.
 */
#define	QUEUE_INSERT(name, hio)	do {					\
	mtx_lock(&hio_##name##_list_lock);				\
	if (TAILQ_FIRST(&hio_##name##_list) == NULL)			\
		cv_broadcast(&hio_##name##_list_cond);			\
	hio_##name##_list_size++;					\
	TAILQ_INSERT_TAIL(&hio_##name##_list, (hio), hio_next);		\
	mtx_unlock(&hio_##name##_list_lock);				\
} while (0)
/*
 * Detach the first request of the hio_<name>_list queue and store it in
 * 'hio', waiting on the queue's condition variable for as long as the queue
 * has no elements. The matching size counter is decremented.
 */
#define	QUEUE_TAKE(name, hio)	do {					\
	mtx_lock(&hio_##name##_list_lock);				\
	for (;;) {							\
		(hio) = TAILQ_FIRST(&hio_##name##_list);		\
		if ((hio) != NULL)					\
			break;						\
		cv_wait(&hio_##name##_list_cond,			\
		    &hio_##name##_list_lock);				\
	}								\
	PJDLOG_ASSERT(hio_##name##_list_size != 0);			\
	TAILQ_REMOVE(&hio_##name##_list, (hio), hio_next);		\
	hio_##name##_list_size--;					\
	mtx_unlock(&hio_##name##_list_lock);				\
} while (0)
131211877Spjd
132204076Spjdstatic void
133260007Strocinyoutput_status_aux(struct nv *nvout)
134260007Strociny{
135260007Strociny
136260007Strociny	nv_add_uint64(nvout, (uint64_t)hio_free_list_size, "idle_queue_size");
137260007Strociny	nv_add_uint64(nvout, (uint64_t)hio_disk_list_size, "local_queue_size");
138260007Strociny	nv_add_uint64(nvout, (uint64_t)hio_send_list_size, "send_queue_size");
139260007Strociny}
140260007Strociny
141260007Strocinystatic void
142229509Strocinyhio_clear(struct hio *hio)
143229509Strociny{
144229509Strociny
145229509Strociny	hio->hio_seq = 0;
146229509Strociny	hio->hio_error = 0;
147229509Strociny	hio->hio_cmd = HIO_UNDEF;
148229509Strociny	hio->hio_offset = 0;
149229509Strociny	hio->hio_length = 0;
150249236Strociny	hio->hio_memsync = false;
151229509Strociny}
152229509Strociny
153229509Strocinystatic void
154249236Strocinyhio_copy(const struct hio *srchio, struct hio *dsthio)
155249236Strociny{
156249236Strociny
157249236Strociny	/*
158249236Strociny	 * We don't copy hio_error, hio_data and hio_next fields.
159249236Strociny	 */
160249236Strociny
161249236Strociny	dsthio->hio_seq = srchio->hio_seq;
162249236Strociny	dsthio->hio_cmd = srchio->hio_cmd;
163249236Strociny	dsthio->hio_offset = srchio->hio_offset;
164249236Strociny	dsthio->hio_length = srchio->hio_length;
165249236Strociny	dsthio->hio_memsync = srchio->hio_memsync;
166249236Strociny}
167249236Strociny
168249236Strocinystatic void
169204076Spjdinit_environment(void)
170204076Spjd{
171204076Spjd	struct hio *hio;
172204076Spjd	unsigned int ii;
173204076Spjd
174204076Spjd	/*
175204076Spjd	 * Initialize lists, their locks and theirs condition variables.
176204076Spjd	 */
177204076Spjd	TAILQ_INIT(&hio_free_list);
178204076Spjd	mtx_init(&hio_free_list_lock);
179204076Spjd	cv_init(&hio_free_list_cond);
180204076Spjd	TAILQ_INIT(&hio_disk_list);
181204076Spjd	mtx_init(&hio_disk_list_lock);
182204076Spjd	cv_init(&hio_disk_list_cond);
183204076Spjd	TAILQ_INIT(&hio_send_list);
184204076Spjd	mtx_init(&hio_send_list_lock);
185204076Spjd	cv_init(&hio_send_list_cond);
186204076Spjd
187204076Spjd	/*
188204076Spjd	 * Allocate requests pool and initialize requests.
189204076Spjd	 */
190204076Spjd	for (ii = 0; ii < HAST_HIO_MAX; ii++) {
191204076Spjd		hio = malloc(sizeof(*hio));
192204076Spjd		if (hio == NULL) {
193210879Spjd			pjdlog_exitx(EX_TEMPFAIL,
194210879Spjd			    "Unable to allocate memory (%zu bytes) for hio request.",
195210879Spjd			    sizeof(*hio));
196204076Spjd		}
197204076Spjd		hio->hio_data = malloc(MAXPHYS);
198204076Spjd		if (hio->hio_data == NULL) {
199210879Spjd			pjdlog_exitx(EX_TEMPFAIL,
200210879Spjd			    "Unable to allocate memory (%zu bytes) for gctl_data.",
201210879Spjd			    (size_t)MAXPHYS);
202204076Spjd		}
203229509Strociny		hio_clear(hio);
204204076Spjd		TAILQ_INSERT_HEAD(&hio_free_list, hio, hio_next);
205260007Strociny		hio_free_list_size++;
206204076Spjd	}
207204076Spjd}
208204076Spjd
/*
 * Read the local metadata of the resource; exit with EX_NOINPUT when
 * metadata_read() reports failure.
 */
static void
init_local(struct hast_resource *res)
{
	int rv;

	rv = metadata_read(res, true);
	if (rv == -1)
		exit(EX_NOINPUT);
}
216204076Spjd
217204076Spjdstatic void
218204076Spjdinit_remote(struct hast_resource *res, struct nv *nvin)
219204076Spjd{
220204076Spjd	uint64_t resuid;
221204076Spjd	struct nv *nvout;
222204076Spjd	unsigned char *map;
223204076Spjd	size_t mapsize;
224204076Spjd
225223181Strociny#ifdef notyet
226220271Spjd	/* Setup direction. */
227220271Spjd	if (proto_send(res->hr_remoteout, NULL, 0) == -1)
228220271Spjd		pjdlog_errno(LOG_WARNING, "Unable to set connection direction");
229223181Strociny#endif
230220271Spjd
231204076Spjd	nvout = nv_alloc();
232204076Spjd	nv_add_int64(nvout, (int64_t)res->hr_datasize, "datasize");
233204076Spjd	nv_add_int32(nvout, (int32_t)res->hr_extentsize, "extentsize");
234204076Spjd	resuid = nv_get_uint64(nvin, "resuid");
235204076Spjd	res->hr_primary_localcnt = nv_get_uint64(nvin, "localcnt");
236204076Spjd	res->hr_primary_remotecnt = nv_get_uint64(nvin, "remotecnt");
237204076Spjd	nv_add_uint64(nvout, res->hr_secondary_localcnt, "localcnt");
238204076Spjd	nv_add_uint64(nvout, res->hr_secondary_remotecnt, "remotecnt");
239204076Spjd	mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize -
240204076Spjd	    METADATA_SIZE, res->hr_extentsize, res->hr_local_sectorsize);
241204076Spjd	map = malloc(mapsize);
242204076Spjd	if (map == NULL) {
243204076Spjd		pjdlog_exitx(EX_TEMPFAIL,
244204076Spjd		    "Unable to allocate memory (%zu bytes) for activemap.",
245204076Spjd		    mapsize);
246204076Spjd	}
247204076Spjd	/*
248204076Spjd	 * When we work as primary and secondary is missing we will increase
249204076Spjd	 * localcnt in our metadata. When secondary is connected and synced
250204076Spjd	 * we make localcnt be equal to remotecnt, which means nodes are more
251204076Spjd	 * or less in sync.
252204076Spjd	 * Split-brain condition is when both nodes are not able to communicate
253204076Spjd	 * and are both configured as primary nodes. In turn, they can both
254204076Spjd	 * make incompatible changes to the data and we have to detect that.
255204076Spjd	 * Under split-brain condition we will increase our localcnt on first
256204076Spjd	 * write and remote node will increase its localcnt on first write.
257204076Spjd	 * When we connect we can see that primary's localcnt is greater than
258204076Spjd	 * our remotecnt (primary was modified while we weren't watching) and
259204076Spjd	 * our localcnt is greater than primary's remotecnt (we were modified
260204076Spjd	 * while primary wasn't watching).
261204076Spjd	 * There are many possible combinations which are all gathered below.
262204076Spjd	 * Don't pay too much attention to exact numbers, the more important
263204076Spjd	 * is to compare them. We compare secondary's local with primary's
264204076Spjd	 * remote and secondary's remote with primary's local.
265204076Spjd	 * Note that every case where primary's localcnt is smaller than
266204076Spjd	 * secondary's remotecnt and where secondary's localcnt is smaller than
267204076Spjd	 * primary's remotecnt should be impossible in practise. We will perform
268204076Spjd	 * full synchronization then. Those cases are marked with an asterisk.
269204076Spjd	 * Regular synchronization means that only extents marked as dirty are
270204076Spjd	 * synchronized (regular synchronization).
271204076Spjd	 *
272204076Spjd	 * SECONDARY METADATA PRIMARY METADATA
273204076Spjd	 * local=3 remote=3   local=2 remote=2*  ?! Full sync from secondary.
274204076Spjd	 * local=3 remote=3   local=2 remote=3*  ?! Full sync from primary.
275204076Spjd	 * local=3 remote=3   local=2 remote=4*  ?! Full sync from primary.
276204076Spjd	 * local=3 remote=3   local=3 remote=2   Primary is out-of-date,
277204076Spjd	 *                                       regular sync from secondary.
278204076Spjd	 * local=3 remote=3   local=3 remote=3   Regular sync just in case.
279204076Spjd	 * local=3 remote=3   local=3 remote=4*  ?! Full sync from primary.
280204076Spjd	 * local=3 remote=3   local=4 remote=2   Split-brain condition.
281204076Spjd	 * local=3 remote=3   local=4 remote=3   Secondary out-of-date,
282204076Spjd	 *                                       regular sync from primary.
283204076Spjd	 * local=3 remote=3   local=4 remote=4*  ?! Full sync from primary.
284204076Spjd	 */
285204076Spjd	if (res->hr_resuid == 0) {
286204076Spjd		/*
287214284Spjd		 * Provider is used for the first time. If primary node done no
288214284Spjd		 * writes yet as well (we will find "virgin" argument) then
289214284Spjd		 * there is no need to synchronize anything. If primary node
290214284Spjd		 * done any writes already we have to synchronize everything.
291204076Spjd		 */
292218138Spjd		PJDLOG_ASSERT(res->hr_secondary_localcnt == 0);
293204076Spjd		res->hr_resuid = resuid;
294231017Strociny		if (metadata_write(res) == -1)
295204076Spjd			exit(EX_NOINPUT);
296214284Spjd		if (nv_exists(nvin, "virgin")) {
297214284Spjd			free(map);
298214284Spjd			map = NULL;
299214284Spjd			mapsize = 0;
300214284Spjd		} else {
301214284Spjd			memset(map, 0xff, mapsize);
302214284Spjd		}
303220865Spjd		nv_add_int8(nvout, 1, "virgin");
304204076Spjd		nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
305219830Spjd	} else if (res->hr_resuid != resuid) {
306219830Spjd		char errmsg[256];
307219830Spjd
308229509Strociny		free(map);
309219830Spjd		(void)snprintf(errmsg, sizeof(errmsg),
310219830Spjd		    "Resource unique ID mismatch (primary=%ju, secondary=%ju).",
311219830Spjd		    (uintmax_t)resuid, (uintmax_t)res->hr_resuid);
312219830Spjd		pjdlog_error("%s", errmsg);
313219830Spjd		nv_add_string(nvout, errmsg, "errmsg");
314231017Strociny		if (hast_proto_send(res, res->hr_remotein, nvout,
315231017Strociny		    NULL, 0) == -1) {
316231017Strociny			pjdlog_exit(EX_TEMPFAIL,
317231017Strociny			    "Unable to send response to %s",
318219830Spjd			    res->hr_remoteaddr);
319219830Spjd		}
320219831Spjd		nv_free(nvout);
321219830Spjd		exit(EX_CONFIG);
322204076Spjd	} else if (
323229509Strociny	    /* Is primary out-of-date? */
324204076Spjd	    (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
325204076Spjd	     res->hr_secondary_remotecnt == res->hr_primary_localcnt) ||
326229509Strociny	    /* Are the nodes more or less in sync? */
327204076Spjd	    (res->hr_secondary_localcnt == res->hr_primary_remotecnt &&
328204076Spjd	     res->hr_secondary_remotecnt == res->hr_primary_localcnt) ||
329229509Strociny	    /* Is secondary out-of-date? */
330204076Spjd	    (res->hr_secondary_localcnt == res->hr_primary_remotecnt &&
331204076Spjd	     res->hr_secondary_remotecnt < res->hr_primary_localcnt)) {
332204076Spjd		/*
333204076Spjd		 * Nodes are more or less in sync or one of the nodes is
334204076Spjd		 * out-of-date.
335204076Spjd		 * It doesn't matter at this point which one, we just have to
336204076Spjd		 * send out local bitmap to the remote node.
337204076Spjd		 */
338204076Spjd		if (pread(res->hr_localfd, map, mapsize, METADATA_SIZE) !=
339204076Spjd		    (ssize_t)mapsize) {
340204076Spjd			pjdlog_exit(LOG_ERR, "Unable to read activemap");
341204076Spjd		}
342204076Spjd		if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
343204076Spjd		     res->hr_secondary_remotecnt == res->hr_primary_localcnt) {
344204076Spjd			/* Primary is out-of-date, sync from secondary. */
345204076Spjd			nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc");
346204076Spjd		} else {
347204076Spjd			/*
348204076Spjd			 * Secondary is out-of-date or counts match.
349204076Spjd			 * Sync from primary.
350204076Spjd			 */
351204076Spjd			nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
352204076Spjd		}
353204076Spjd	} else if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
354204076Spjd	     res->hr_primary_localcnt > res->hr_secondary_remotecnt) {
355204076Spjd		/*
356204076Spjd		 * Not good, we have split-brain condition.
357204076Spjd		 */
358229509Strociny		free(map);
359204076Spjd		pjdlog_error("Split-brain detected, exiting.");
360204076Spjd		nv_add_string(nvout, "Split-brain condition!", "errmsg");
361231017Strociny		if (hast_proto_send(res, res->hr_remotein, nvout,
362231017Strociny		    NULL, 0) == -1) {
363231017Strociny			pjdlog_exit(EX_TEMPFAIL,
364231017Strociny			    "Unable to send response to %s",
365229509Strociny			    res->hr_remoteaddr);
366229509Strociny		}
367229509Strociny		nv_free(nvout);
368229509Strociny		/* Exit on split-brain. */
369229509Strociny		event_send(res, EVENT_SPLITBRAIN);
370229509Strociny		exit(EX_CONFIG);
371204076Spjd	} else /* if (res->hr_secondary_localcnt < res->hr_primary_remotecnt ||
372204076Spjd	    res->hr_primary_localcnt < res->hr_secondary_remotecnt) */ {
373204076Spjd		/*
374204076Spjd		 * This should never happen in practise, but we will perform
375204076Spjd		 * full synchronization.
376204076Spjd		 */
377218138Spjd		PJDLOG_ASSERT(res->hr_secondary_localcnt < res->hr_primary_remotecnt ||
378204076Spjd		    res->hr_primary_localcnt < res->hr_secondary_remotecnt);
379204076Spjd		mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize -
380204076Spjd		    METADATA_SIZE, res->hr_extentsize,
381204076Spjd		    res->hr_local_sectorsize);
382204076Spjd		memset(map, 0xff, mapsize);
383204076Spjd		if (res->hr_secondary_localcnt > res->hr_primary_remotecnt) {
384204076Spjd			/* In this one of five cases sync from secondary. */
385204076Spjd			nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc");
386204076Spjd		} else {
387204076Spjd			/* For the rest four cases sync from primary. */
388204076Spjd			nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
389204076Spjd		}
390204076Spjd		pjdlog_warning("This should never happen, asking for full synchronization (primary(local=%ju, remote=%ju), secondary(local=%ju, remote=%ju)).",
391204076Spjd		    (uintmax_t)res->hr_primary_localcnt,
392204076Spjd		    (uintmax_t)res->hr_primary_remotecnt,
393204076Spjd		    (uintmax_t)res->hr_secondary_localcnt,
394204076Spjd		    (uintmax_t)res->hr_secondary_remotecnt);
395204076Spjd	}
396220007Spjd	nv_add_uint32(nvout, (uint32_t)mapsize, "mapsize");
397231017Strociny	if (hast_proto_send(res, res->hr_remotein, nvout, map, mapsize) == -1) {
398214276Spjd		pjdlog_exit(EX_TEMPFAIL, "Unable to send activemap to %s",
399204076Spjd		    res->hr_remoteaddr);
400204076Spjd	}
401214275Spjd	if (map != NULL)
402214275Spjd		free(map);
403209182Spjd	nv_free(nvout);
404223181Strociny#ifdef notyet
405220271Spjd	/* Setup direction. */
406220271Spjd	if (proto_recv(res->hr_remotein, NULL, 0) == -1)
407220271Spjd		pjdlog_errno(LOG_WARNING, "Unable to set connection direction");
408223181Strociny#endif
409204076Spjd}
410204076Spjd
411204076Spjdvoid
412204076Spjdhastd_secondary(struct hast_resource *res, struct nv *nvin)
413204076Spjd{
414213009Spjd	sigset_t mask;
415204076Spjd	pthread_t td;
416204076Spjd	pid_t pid;
417219482Strociny	int error, mode, debuglevel;
418204076Spjd
419204076Spjd	/*
420204076Spjd	 * Create communication channel between parent and child.
421204076Spjd	 */
422231017Strociny	if (proto_client(NULL, "socketpair://", &res->hr_ctrl) == -1) {
423204076Spjd		KEEP_ERRNO((void)pidfile_remove(pfh));
424204076Spjd		pjdlog_exit(EX_OSERR,
425204076Spjd		    "Unable to create control sockets between parent and child");
426204076Spjd	}
427212038Spjd	/*
428212038Spjd	 * Create communication channel between child and parent.
429212038Spjd	 */
430231017Strociny	if (proto_client(NULL, "socketpair://", &res->hr_event) == -1) {
431212038Spjd		KEEP_ERRNO((void)pidfile_remove(pfh));
432212038Spjd		pjdlog_exit(EX_OSERR,
433212038Spjd		    "Unable to create event sockets between child and parent");
434212038Spjd	}
435204076Spjd
436204076Spjd	pid = fork();
437231017Strociny	if (pid == -1) {
438204076Spjd		KEEP_ERRNO((void)pidfile_remove(pfh));
439204076Spjd		pjdlog_exit(EX_OSERR, "Unable to fork");
440204076Spjd	}
441204076Spjd
442204076Spjd	if (pid > 0) {
443204076Spjd		/* This is parent. */
444204076Spjd		proto_close(res->hr_remotein);
445204076Spjd		res->hr_remotein = NULL;
446204076Spjd		proto_close(res->hr_remoteout);
447204076Spjd		res->hr_remoteout = NULL;
448212038Spjd		/* Declare that we are receiver. */
449212038Spjd		proto_recv(res->hr_event, NULL, 0);
450218043Spjd		/* Declare that we are sender. */
451218043Spjd		proto_send(res->hr_ctrl, NULL, 0);
452204076Spjd		res->hr_workerpid = pid;
453204076Spjd		return;
454204076Spjd	}
455211977Spjd
456211984Spjd	gres = res;
457260007Strociny	res->output_status_aux = output_status_aux;
458218043Spjd	mode = pjdlog_mode_get();
459219482Strociny	debuglevel = pjdlog_debug_get();
460211984Spjd
461218043Spjd	/* Declare that we are sender. */
462218043Spjd	proto_send(res->hr_event, NULL, 0);
463218043Spjd	/* Declare that we are receiver. */
464218043Spjd	proto_recv(res->hr_ctrl, NULL, 0);
465218043Spjd	descriptors_cleanup(res);
466204076Spjd
467218045Spjd	descriptors_assert(res, mode);
468218045Spjd
469218043Spjd	pjdlog_init(mode);
470219482Strociny	pjdlog_debug_set(debuglevel);
471218043Spjd	pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));
472220005Spjd	setproctitle("%s (%s)", res->hr_name, role2str(res->hr_role));
473204076Spjd
474213009Spjd	PJDLOG_VERIFY(sigemptyset(&mask) == 0);
475213009Spjd	PJDLOG_VERIFY(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
476210880Spjd
477207371Spjd	/* Error in setting timeout is not critical, but why should it fail? */
478231017Strociny	if (proto_timeout(res->hr_remotein, 2 * HAST_KEEPALIVE) == -1)
479207371Spjd		pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
480231017Strociny	if (proto_timeout(res->hr_remoteout, res->hr_timeout) == -1)
481207371Spjd		pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
482207371Spjd
483204076Spjd	init_local(res);
484213007Spjd	init_environment();
485213007Spjd
486221899Spjd	if (drop_privs(res) != 0)
487218049Spjd		exit(EX_CONFIG);
488218214Spjd	pjdlog_info("Privileges successfully dropped.");
489218049Spjd
490213007Spjd	/*
491213007Spjd	 * Create the control thread before sending any event to the parent,
492213007Spjd	 * as we can deadlock when parent sends control request to worker,
493213007Spjd	 * but worker has no control thread started yet, so parent waits.
494213007Spjd	 * In the meantime worker sends an event to the parent, but parent
495213007Spjd	 * is unable to handle the event, because it waits for control
496213007Spjd	 * request response.
497213007Spjd	 */
498213007Spjd	error = pthread_create(&td, NULL, ctrl_thread, res);
499218138Spjd	PJDLOG_ASSERT(error == 0);
500213007Spjd
501204076Spjd	init_remote(res, nvin);
502212038Spjd	event_send(res, EVENT_CONNECT);
503204076Spjd
504204076Spjd	error = pthread_create(&td, NULL, recv_thread, res);
505218138Spjd	PJDLOG_ASSERT(error == 0);
506204076Spjd	error = pthread_create(&td, NULL, disk_thread, res);
507218138Spjd	PJDLOG_ASSERT(error == 0);
508213007Spjd	(void)send_thread(res);
509204076Spjd}
510204076Spjd
511204076Spjdstatic void
512231017Strocinyreqlog(int loglevel, int debuglevel, int error, struct hio *hio,
513231017Strociny    const char *fmt, ...)
514204076Spjd{
515204076Spjd	char msg[1024];
516204076Spjd	va_list ap;
517204076Spjd	int len;
518204076Spjd
519204076Spjd	va_start(ap, fmt);
520204076Spjd	len = vsnprintf(msg, sizeof(msg), fmt, ap);
521204076Spjd	va_end(ap);
522204076Spjd	if ((size_t)len < sizeof(msg)) {
523204076Spjd		switch (hio->hio_cmd) {
524204076Spjd		case HIO_READ:
525204076Spjd			(void)snprintf(msg + len, sizeof(msg) - len,
526204076Spjd			    "READ(%ju, %ju).", (uintmax_t)hio->hio_offset,
527204076Spjd			    (uintmax_t)hio->hio_length);
528204076Spjd			break;
529204076Spjd		case HIO_DELETE:
530204076Spjd			(void)snprintf(msg + len, sizeof(msg) - len,
531204076Spjd			    "DELETE(%ju, %ju).", (uintmax_t)hio->hio_offset,
532204076Spjd			    (uintmax_t)hio->hio_length);
533204076Spjd			break;
534204076Spjd		case HIO_FLUSH:
535204076Spjd			(void)snprintf(msg + len, sizeof(msg) - len, "FLUSH.");
536204076Spjd			break;
537204076Spjd		case HIO_WRITE:
538204076Spjd			(void)snprintf(msg + len, sizeof(msg) - len,
539204076Spjd			    "WRITE(%ju, %ju).", (uintmax_t)hio->hio_offset,
540204076Spjd			    (uintmax_t)hio->hio_length);
541204076Spjd			break;
542211882Spjd		case HIO_KEEPALIVE:
543211882Spjd			(void)snprintf(msg + len, sizeof(msg) - len, "KEEPALIVE.");
544211882Spjd			break;
545204076Spjd		default:
546204076Spjd			(void)snprintf(msg + len, sizeof(msg) - len,
547204076Spjd			    "UNKNOWN(%u).", (unsigned int)hio->hio_cmd);
548204076Spjd			break;
549204076Spjd		}
550204076Spjd	}
551204076Spjd	pjdlog_common(loglevel, debuglevel, error, "%s", msg);
552204076Spjd}
553204076Spjd
554204076Spjdstatic int
555229509Strocinyrequnpack(struct hast_resource *res, struct hio *hio, struct nv *nv)
556204076Spjd{
557204076Spjd
558229509Strociny	hio->hio_cmd = nv_get_uint8(nv, "cmd");
559204076Spjd	if (hio->hio_cmd == 0) {
560204076Spjd		pjdlog_error("Header contains no 'cmd' field.");
561204076Spjd		hio->hio_error = EINVAL;
562204076Spjd		goto end;
563204076Spjd	}
564229509Strociny	if (hio->hio_cmd != HIO_KEEPALIVE) {
565229509Strociny		hio->hio_seq = nv_get_uint64(nv, "seq");
566229509Strociny		if (hio->hio_seq == 0) {
567229509Strociny			pjdlog_error("Header contains no 'seq' field.");
568229509Strociny			hio->hio_error = EINVAL;
569229509Strociny			goto end;
570229509Strociny		}
571229509Strociny	}
572204076Spjd	switch (hio->hio_cmd) {
573222164Spjd	case HIO_FLUSH:
574211882Spjd	case HIO_KEEPALIVE:
575211882Spjd		break;
576249236Strociny	case HIO_WRITE:
577249236Strociny		hio->hio_memsync = nv_exists(nv, "memsync");
578249236Strociny		/* FALLTHROUGH */
579204076Spjd	case HIO_READ:
580204076Spjd	case HIO_DELETE:
581229509Strociny		hio->hio_offset = nv_get_uint64(nv, "offset");
582229509Strociny		if (nv_error(nv) != 0) {
583204076Spjd			pjdlog_error("Header is missing 'offset' field.");
584204076Spjd			hio->hio_error = EINVAL;
585204076Spjd			goto end;
586204076Spjd		}
587229509Strociny		hio->hio_length = nv_get_uint64(nv, "length");
588229509Strociny		if (nv_error(nv) != 0) {
589204076Spjd			pjdlog_error("Header is missing 'length' field.");
590204076Spjd			hio->hio_error = EINVAL;
591204076Spjd			goto end;
592204076Spjd		}
593204076Spjd		if (hio->hio_length == 0) {
594204076Spjd			pjdlog_error("Data length is zero.");
595204076Spjd			hio->hio_error = EINVAL;
596204076Spjd			goto end;
597204076Spjd		}
598252517Strociny		if (hio->hio_cmd != HIO_DELETE && hio->hio_length > MAXPHYS) {
599204076Spjd			pjdlog_error("Data length is too large (%ju > %ju).",
600204076Spjd			    (uintmax_t)hio->hio_length, (uintmax_t)MAXPHYS);
601204076Spjd			hio->hio_error = EINVAL;
602204076Spjd			goto end;
603204076Spjd		}
604204076Spjd		if ((hio->hio_offset % res->hr_local_sectorsize) != 0) {
605204076Spjd			pjdlog_error("Offset %ju is not multiple of sector size.",
606204076Spjd			    (uintmax_t)hio->hio_offset);
607204076Spjd			hio->hio_error = EINVAL;
608204076Spjd			goto end;
609204076Spjd		}
610204076Spjd		if ((hio->hio_length % res->hr_local_sectorsize) != 0) {
611204076Spjd			pjdlog_error("Length %ju is not multiple of sector size.",
612204076Spjd			    (uintmax_t)hio->hio_length);
613204076Spjd			hio->hio_error = EINVAL;
614204076Spjd			goto end;
615204076Spjd		}
616204076Spjd		if (hio->hio_offset + hio->hio_length >
617204076Spjd		    (uint64_t)res->hr_datasize) {
618204076Spjd			pjdlog_error("Data offset is too large (%ju > %ju).",
619204076Spjd			    (uintmax_t)(hio->hio_offset + hio->hio_length),
620204076Spjd			    (uintmax_t)res->hr_datasize);
621204076Spjd			hio->hio_error = EINVAL;
622204076Spjd			goto end;
623204076Spjd		}
624204076Spjd		break;
625204076Spjd	default:
626204076Spjd		pjdlog_error("Header contains invalid 'cmd' (%hhu).",
627204076Spjd		    hio->hio_cmd);
628204076Spjd		hio->hio_error = EINVAL;
629204076Spjd		goto end;
630204076Spjd	}
631204076Spjd	hio->hio_error = 0;
632204076Spjdend:
633204076Spjd	return (hio->hio_error);
634204076Spjd}
635204076Spjd
636212899Spjdstatic __dead2 void
637211984Spjdsecondary_exit(int exitcode, const char *fmt, ...)
638211984Spjd{
639211984Spjd	va_list ap;
640211984Spjd
641218138Spjd	PJDLOG_ASSERT(exitcode != EX_OK);
642211984Spjd	va_start(ap, fmt);
643211984Spjd	pjdlogv_errno(LOG_ERR, fmt, ap);
644211984Spjd	va_end(ap);
645212038Spjd	event_send(gres, EVENT_DISCONNECT);
646211984Spjd	exit(exitcode);
647211984Spjd}
648211984Spjd
649204076Spjd/*
650204076Spjd * Thread receives requests from the primary node.
651204076Spjd */
652204076Spjdstatic void *
653204076Spjdrecv_thread(void *arg)
654204076Spjd{
655204076Spjd	struct hast_resource *res = arg;
656249236Strociny	struct hio *hio, *mshio;
657229509Strociny	struct nv *nv;
658204076Spjd
659204076Spjd	for (;;) {
660204076Spjd		pjdlog_debug(2, "recv: Taking free request.");
661211877Spjd		QUEUE_TAKE(free, hio);
662204076Spjd		pjdlog_debug(2, "recv: (%p) Got request.", hio);
663231017Strociny		if (hast_proto_recv_hdr(res->hr_remotein, &nv) == -1) {
664211984Spjd			secondary_exit(EX_TEMPFAIL,
665204076Spjd			    "Unable to receive request header");
666204076Spjd		}
667229509Strociny		if (requnpack(res, hio, nv) != 0) {
668229509Strociny			nv_free(nv);
669211877Spjd			pjdlog_debug(2,
670211877Spjd			    "recv: (%p) Moving request to the send queue.",
671211877Spjd			    hio);
672211877Spjd			QUEUE_INSERT(send, hio);
673211877Spjd			continue;
674211877Spjd		}
675222228Spjd		switch (hio->hio_cmd) {
676222228Spjd		case HIO_READ:
677222228Spjd			res->hr_stat_read++;
678222228Spjd			break;
679222228Spjd		case HIO_WRITE:
680222228Spjd			res->hr_stat_write++;
681222228Spjd			break;
682222228Spjd		case HIO_DELETE:
683222228Spjd			res->hr_stat_delete++;
684222228Spjd			break;
685222228Spjd		case HIO_FLUSH:
686222228Spjd			res->hr_stat_flush++;
687222228Spjd			break;
688229509Strociny		case HIO_KEEPALIVE:
689229509Strociny			break;
690229509Strociny		default:
691229509Strociny			PJDLOG_ABORT("Unexpected command (cmd=%hhu).",
692229509Strociny			    hio->hio_cmd);
693222228Spjd		}
694204076Spjd		reqlog(LOG_DEBUG, 2, -1, hio,
695204076Spjd		    "recv: (%p) Got request header: ", hio);
696211882Spjd		if (hio->hio_cmd == HIO_KEEPALIVE) {
697229509Strociny			nv_free(nv);
698211882Spjd			pjdlog_debug(2,
699211882Spjd			    "recv: (%p) Moving request to the free queue.",
700211882Spjd			    hio);
701229509Strociny			hio_clear(hio);
702211882Spjd			QUEUE_INSERT(free, hio);
703211882Spjd			continue;
704211882Spjd		} else if (hio->hio_cmd == HIO_WRITE) {
705229509Strociny			if (hast_proto_recv_data(res, res->hr_remotein, nv,
706231017Strociny			    hio->hio_data, MAXPHYS) == -1) {
707211984Spjd				secondary_exit(EX_TEMPFAIL,
708212051Spjd				    "Unable to receive request data");
709204076Spjd			}
710249236Strociny			if (hio->hio_memsync) {
711249236Strociny				/*
712249236Strociny				 * For memsync requests we expect two replies.
713249236Strociny				 * Clone the hio so we can handle both of them.
714249236Strociny				 */
715249236Strociny				pjdlog_debug(2, "recv: Taking free request.");
716249236Strociny				QUEUE_TAKE(free, mshio);
717249236Strociny				pjdlog_debug(2, "recv: (%p) Got request.",
718249236Strociny				    mshio);
719249236Strociny				hio_copy(hio, mshio);
720249236Strociny				mshio->hio_error = 0;
721249236Strociny				/*
722249236Strociny				 * We want to keep 'memsync' tag only on the
723249236Strociny				 * request going onto send queue (mshio).
724249236Strociny				 */
725249236Strociny				hio->hio_memsync = false;
726249236Strociny				pjdlog_debug(2,
727249236Strociny				    "recv: (%p) Moving memsync request to the send queue.",
728249236Strociny				    mshio);
729249236Strociny				QUEUE_INSERT(send, mshio);
730249236Strociny			}
731204076Spjd		}
732229509Strociny		nv_free(nv);
733204076Spjd		pjdlog_debug(2, "recv: (%p) Moving request to the disk queue.",
734204076Spjd		    hio);
735211877Spjd		QUEUE_INSERT(disk, hio);
736204076Spjd	}
737204076Spjd	/* NOTREACHED */
738204076Spjd	return (NULL);
739204076Spjd}
740204076Spjd
741204076Spjd/*
742204076Spjd * Thread reads from or writes to local component and also handles DELETE and
743204076Spjd * FLUSH requests.
744204076Spjd */
745204076Spjdstatic void *
746204076Spjddisk_thread(void *arg)
747204076Spjd{
748204076Spjd	struct hast_resource *res = arg;
749204076Spjd	struct hio *hio;
750204076Spjd	ssize_t ret;
751229509Strociny	bool clear_activemap, logerror;
752204076Spjd
753204076Spjd	clear_activemap = true;
754204076Spjd
755204076Spjd	for (;;) {
756204076Spjd		pjdlog_debug(2, "disk: Taking request.");
757211877Spjd		QUEUE_TAKE(disk, hio);
758204076Spjd		while (clear_activemap) {
759204076Spjd			unsigned char *map;
760204076Spjd			size_t mapsize;
761204076Spjd
762204076Spjd			/*
763204076Spjd			 * When first request is received, it means that primary
764204076Spjd			 * already received our activemap, merged it and stored
765204076Spjd			 * locally. We can now safely clear our activemap.
766204076Spjd			 */
767204076Spjd			mapsize =
768204076Spjd			    activemap_calc_ondisk_size(res->hr_local_mediasize -
769204076Spjd			    METADATA_SIZE, res->hr_extentsize,
770204076Spjd			    res->hr_local_sectorsize);
771204076Spjd			map = calloc(1, mapsize);
772204076Spjd			if (map == NULL) {
773204076Spjd				pjdlog_warning("Unable to allocate memory to clear local activemap.");
774204076Spjd				break;
775204076Spjd			}
776204076Spjd			if (pwrite(res->hr_localfd, map, mapsize,
777204076Spjd			    METADATA_SIZE) != (ssize_t)mapsize) {
778204076Spjd				pjdlog_errno(LOG_WARNING,
779204076Spjd				    "Unable to store cleared activemap");
780204076Spjd				free(map);
781247866Strociny				res->hr_stat_activemap_write_error++;
782204076Spjd				break;
783204076Spjd			}
784204076Spjd			free(map);
785204076Spjd			clear_activemap = false;
786204076Spjd			pjdlog_debug(1, "Local activemap cleared.");
787229509Strociny			break;
788204076Spjd		}
789204076Spjd		reqlog(LOG_DEBUG, 2, -1, hio, "disk: (%p) Got request: ", hio);
790229509Strociny		logerror = true;
791204076Spjd		/* Handle the actual request. */
792204076Spjd		switch (hio->hio_cmd) {
793204076Spjd		case HIO_READ:
794204076Spjd			ret = pread(res->hr_localfd, hio->hio_data,
795204076Spjd			    hio->hio_length,
796204076Spjd			    hio->hio_offset + res->hr_localoff);
797231017Strociny			if (ret == -1)
798204076Spjd				hio->hio_error = errno;
799204076Spjd			else if (ret != (int64_t)hio->hio_length)
800204076Spjd				hio->hio_error = EIO;
801204076Spjd			else
802204076Spjd				hio->hio_error = 0;
803204076Spjd			break;
804204076Spjd		case HIO_WRITE:
805204076Spjd			ret = pwrite(res->hr_localfd, hio->hio_data,
806204076Spjd			    hio->hio_length,
807204076Spjd			    hio->hio_offset + res->hr_localoff);
808231017Strociny			if (ret == -1)
809204076Spjd				hio->hio_error = errno;
810204076Spjd			else if (ret != (int64_t)hio->hio_length)
811204076Spjd				hio->hio_error = EIO;
812204076Spjd			else
813204076Spjd				hio->hio_error = 0;
814204076Spjd			break;
815204076Spjd		case HIO_DELETE:
816204076Spjd			ret = g_delete(res->hr_localfd,
817204076Spjd			    hio->hio_offset + res->hr_localoff,
818204076Spjd			    hio->hio_length);
819231017Strociny			if (ret == -1)
820204076Spjd				hio->hio_error = errno;
821204076Spjd			else
822204076Spjd				hio->hio_error = 0;
823204076Spjd			break;
824204076Spjd		case HIO_FLUSH:
825229509Strociny			if (!res->hr_localflush) {
826229509Strociny				ret = -1;
827229509Strociny				hio->hio_error = EOPNOTSUPP;
828229509Strociny				logerror = false;
829229509Strociny				break;
830229509Strociny			}
831204076Spjd			ret = g_flush(res->hr_localfd);
832231017Strociny			if (ret == -1) {
833229509Strociny				if (errno == EOPNOTSUPP)
834229509Strociny					res->hr_localflush = false;
835204076Spjd				hio->hio_error = errno;
836229509Strociny			} else {
837204076Spjd				hio->hio_error = 0;
838229509Strociny			}
839204076Spjd			break;
840229509Strociny		default:
841229509Strociny			PJDLOG_ABORT("Unexpected command (cmd=%hhu).",
842229509Strociny			    hio->hio_cmd);
843204076Spjd		}
844229509Strociny		if (logerror && hio->hio_error != 0) {
845204076Spjd			reqlog(LOG_ERR, 0, hio->hio_error, hio,
846204076Spjd			    "Request failed: ");
847204076Spjd		}
848204076Spjd		pjdlog_debug(2, "disk: (%p) Moving request to the send queue.",
849204076Spjd		    hio);
850211877Spjd		QUEUE_INSERT(send, hio);
851204076Spjd	}
852204076Spjd	/* NOTREACHED */
853204076Spjd	return (NULL);
854204076Spjd}
855204076Spjd
856204076Spjd/*
857204076Spjd * Thread sends requests back to primary node.
858204076Spjd */
859204076Spjdstatic void *
860204076Spjdsend_thread(void *arg)
861204076Spjd{
862204076Spjd	struct hast_resource *res = arg;
863204076Spjd	struct nv *nvout;
864204076Spjd	struct hio *hio;
865204076Spjd	void *data;
866204076Spjd	size_t length;
867204076Spjd
868204076Spjd	for (;;) {
869204076Spjd		pjdlog_debug(2, "send: Taking request.");
870211877Spjd		QUEUE_TAKE(send, hio);
871204076Spjd		reqlog(LOG_DEBUG, 2, -1, hio, "send: (%p) Got request: ", hio);
872204076Spjd		nvout = nv_alloc();
873204076Spjd		/* Copy sequence number. */
874229509Strociny		nv_add_uint64(nvout, hio->hio_seq, "seq");
875249236Strociny		if (hio->hio_memsync) {
876249236Strociny			PJDLOG_ASSERT(hio->hio_cmd == HIO_WRITE);
877249236Strociny			nv_add_int8(nvout, 1, "received");
878249236Strociny		}
879204076Spjd		switch (hio->hio_cmd) {
880204076Spjd		case HIO_READ:
881204076Spjd			if (hio->hio_error == 0) {
882204076Spjd				data = hio->hio_data;
883204076Spjd				length = hio->hio_length;
884204076Spjd				break;
885204076Spjd			}
886204076Spjd			/*
887204076Spjd			 * We send no data in case of an error.
888204076Spjd			 */
889204076Spjd			/* FALLTHROUGH */
890204076Spjd		case HIO_DELETE:
891204076Spjd		case HIO_FLUSH:
892204076Spjd		case HIO_WRITE:
893204076Spjd			data = NULL;
894204076Spjd			length = 0;
895204076Spjd			break;
896204076Spjd		default:
897229509Strociny			PJDLOG_ABORT("Unexpected command (cmd=%hhu).",
898229509Strociny			    hio->hio_cmd);
899204076Spjd		}
900247866Strociny		if (hio->hio_error != 0) {
901247866Strociny			switch (hio->hio_cmd) {
902247866Strociny			case HIO_READ:
903247866Strociny				res->hr_stat_read_error++;
904247866Strociny				break;
905247866Strociny			case HIO_WRITE:
906247866Strociny				res->hr_stat_write_error++;
907247866Strociny				break;
908247866Strociny			case HIO_DELETE:
909247866Strociny				res->hr_stat_delete_error++;
910247866Strociny				break;
911247866Strociny			case HIO_FLUSH:
912247866Strociny				res->hr_stat_flush_error++;
913247866Strociny				break;
914247866Strociny			}
915204076Spjd			nv_add_int16(nvout, hio->hio_error, "error");
916247866Strociny		}
917204076Spjd		if (hast_proto_send(res, res->hr_remoteout, nvout, data,
918231017Strociny		    length) == -1) {
919231017Strociny			secondary_exit(EX_TEMPFAIL, "Unable to send reply");
920204076Spjd		}
921204076Spjd		nv_free(nvout);
922209185Spjd		pjdlog_debug(2, "send: (%p) Moving request to the free queue.",
923204076Spjd		    hio);
924229509Strociny		hio_clear(hio);
925211877Spjd		QUEUE_INSERT(free, hio);
926204076Spjd	}
927204076Spjd	/* NOTREACHED */
928204076Spjd	return (NULL);
929204076Spjd}
930