/* secondary.c revision 209182 */
1216294Ssyrinx/*- 2216294Ssyrinx * Copyright (c) 2009-2010 The FreeBSD Foundation 3216294Ssyrinx * All rights reserved. 4216294Ssyrinx * 5216294Ssyrinx * This software was developed by Pawel Jakub Dawidek under sponsorship from 6216294Ssyrinx * the FreeBSD Foundation. 7216294Ssyrinx * 8216294Ssyrinx * Redistribution and use in source and binary forms, with or without 9216294Ssyrinx * modification, are permitted provided that the following conditions 10216294Ssyrinx * are met: 11216294Ssyrinx * 1. Redistributions of source code must retain the above copyright 12216294Ssyrinx * notice, this list of conditions and the following disclaimer. 13216294Ssyrinx * 2. Redistributions in binary form must reproduce the above copyright 14216294Ssyrinx * notice, this list of conditions and the following disclaimer in the 15216294Ssyrinx * documentation and/or other materials provided with the distribution. 16216294Ssyrinx * 17216294Ssyrinx * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 18216294Ssyrinx * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19216294Ssyrinx * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20216294Ssyrinx * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 21216294Ssyrinx * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22216294Ssyrinx * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23216294Ssyrinx * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24216294Ssyrinx * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25216294Ssyrinx * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26216294Ssyrinx * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27216294Ssyrinx * SUCH DAMAGE. 
28216294Ssyrinx */ 29216294Ssyrinx 30216294Ssyrinx#include <sys/cdefs.h> 31216294Ssyrinx__FBSDID("$FreeBSD: head/sbin/hastd/secondary.c 209182 2010-06-14 21:41:22Z pjd $"); 32216294Ssyrinx 33216294Ssyrinx#include <sys/param.h> 34216294Ssyrinx#include <sys/time.h> 35216294Ssyrinx#include <sys/bio.h> 36216294Ssyrinx#include <sys/disk.h> 37216294Ssyrinx#include <sys/stat.h> 38216294Ssyrinx 39216294Ssyrinx#include <assert.h> 40216294Ssyrinx#include <err.h> 41216294Ssyrinx#include <errno.h> 42216294Ssyrinx#include <fcntl.h> 43216294Ssyrinx#include <libgeom.h> 44216294Ssyrinx#include <pthread.h> 45216294Ssyrinx#include <stdint.h> 46216294Ssyrinx#include <stdio.h> 47216294Ssyrinx#include <string.h> 48216294Ssyrinx#include <sysexits.h> 49216294Ssyrinx#include <unistd.h> 50216294Ssyrinx 51216294Ssyrinx#include <activemap.h> 52216294Ssyrinx#include <nv.h> 53216294Ssyrinx#include <pjdlog.h> 54216294Ssyrinx 55216294Ssyrinx#include "control.h" 56216294Ssyrinx#include "hast.h" 57216294Ssyrinx#include "hast_proto.h" 58216294Ssyrinx#include "hastd.h" 59216294Ssyrinx#include "metadata.h" 60216294Ssyrinx#include "proto.h" 61216294Ssyrinx#include "subr.h" 62216294Ssyrinx#include "synch.h" 63216294Ssyrinx 64216294Ssyrinxstruct hio { 65216294Ssyrinx uint64_t hio_seq; 66216294Ssyrinx int hio_error; 67216294Ssyrinx struct nv *hio_nv; 68216294Ssyrinx void *hio_data; 69216294Ssyrinx uint8_t hio_cmd; 70216294Ssyrinx uint64_t hio_offset; 71216294Ssyrinx uint64_t hio_length; 72216294Ssyrinx TAILQ_ENTRY(hio) hio_next; 73216294Ssyrinx}; 74216294Ssyrinx 75216294Ssyrinx/* 76216294Ssyrinx * Free list holds unused structures. When free list is empty, we have to wait 77216294Ssyrinx * until some in-progress requests are freed. 
78216294Ssyrinx */ 79216294Ssyrinxstatic TAILQ_HEAD(, hio) hio_free_list; 80216294Ssyrinxstatic pthread_mutex_t hio_free_list_lock; 81216294Ssyrinxstatic pthread_cond_t hio_free_list_cond; 82216294Ssyrinx/* 83216294Ssyrinx * Disk thread (the one that do I/O requests) takes requests from this list. 84216294Ssyrinx */ 85216294Ssyrinxstatic TAILQ_HEAD(, hio) hio_disk_list; 86216294Ssyrinxstatic pthread_mutex_t hio_disk_list_lock; 87216294Ssyrinxstatic pthread_cond_t hio_disk_list_cond; 88216294Ssyrinx/* 89216294Ssyrinx * There is one recv list for every component, although local components don't 90216294Ssyrinx * use recv lists as local requests are done synchronously. 91216294Ssyrinx */ 92216294Ssyrinxstatic TAILQ_HEAD(, hio) hio_send_list; 93216294Ssyrinxstatic pthread_mutex_t hio_send_list_lock; 94216294Ssyrinxstatic pthread_cond_t hio_send_list_cond; 95216294Ssyrinx 96216294Ssyrinx/* 97216294Ssyrinx * Maximum number of outstanding I/O requests. 98216294Ssyrinx */ 99216294Ssyrinx#define HAST_HIO_MAX 256 100216294Ssyrinx 101216294Ssyrinxstatic void *recv_thread(void *arg); 102216294Ssyrinxstatic void *disk_thread(void *arg); 103216294Ssyrinxstatic void *send_thread(void *arg); 104216294Ssyrinx 105216294Ssyrinxstatic void 106216294Ssyrinxinit_environment(void) 107216294Ssyrinx{ 108216294Ssyrinx struct hio *hio; 109216294Ssyrinx unsigned int ii; 110216294Ssyrinx 111216294Ssyrinx /* 112216294Ssyrinx * Initialize lists, their locks and theirs condition variables. 
113216294Ssyrinx */ 114216294Ssyrinx TAILQ_INIT(&hio_free_list); 115216294Ssyrinx mtx_init(&hio_free_list_lock); 116216294Ssyrinx cv_init(&hio_free_list_cond); 117216294Ssyrinx TAILQ_INIT(&hio_disk_list); 118216294Ssyrinx mtx_init(&hio_disk_list_lock); 119216294Ssyrinx cv_init(&hio_disk_list_cond); 120216294Ssyrinx TAILQ_INIT(&hio_send_list); 121216294Ssyrinx mtx_init(&hio_send_list_lock); 122216294Ssyrinx cv_init(&hio_send_list_cond); 123216294Ssyrinx 124216294Ssyrinx /* 125216294Ssyrinx * Allocate requests pool and initialize requests. 126216294Ssyrinx */ 127216294Ssyrinx for (ii = 0; ii < HAST_HIO_MAX; ii++) { 128216294Ssyrinx hio = malloc(sizeof(*hio)); 129216294Ssyrinx if (hio == NULL) { 130216294Ssyrinx errx(EX_TEMPFAIL, "cannot allocate %zu bytes of memory " 131216294Ssyrinx "for hio request", sizeof(*hio)); 132216294Ssyrinx } 133216294Ssyrinx hio->hio_error = 0; 134216294Ssyrinx hio->hio_data = malloc(MAXPHYS); 135216294Ssyrinx if (hio->hio_data == NULL) { 136216294Ssyrinx errx(EX_TEMPFAIL, "cannot allocate %zu bytes of memory " 137216294Ssyrinx "for gctl_data", (size_t)MAXPHYS); 138216294Ssyrinx } 139216294Ssyrinx TAILQ_INSERT_HEAD(&hio_free_list, hio, hio_next); 140216294Ssyrinx } 141216294Ssyrinx} 142216294Ssyrinx 143216294Ssyrinxstatic void 144216294Ssyrinxinit_local(struct hast_resource *res) 145216294Ssyrinx{ 146216294Ssyrinx 147216294Ssyrinx if (metadata_read(res, true) < 0) 148216294Ssyrinx exit(EX_NOINPUT); 149216294Ssyrinx} 150216294Ssyrinx 151216294Ssyrinxstatic void 152216294Ssyrinxinit_remote(struct hast_resource *res, struct nv *nvin) 153216294Ssyrinx{ 154216294Ssyrinx uint64_t resuid; 155216294Ssyrinx struct nv *nvout; 156216294Ssyrinx unsigned char *map; 157216294Ssyrinx size_t mapsize; 158216294Ssyrinx 159216294Ssyrinx map = NULL; 160216294Ssyrinx mapsize = 0; 161216294Ssyrinx nvout = nv_alloc(); 162216294Ssyrinx nv_add_int64(nvout, (int64_t)res->hr_datasize, "datasize"); 163216294Ssyrinx nv_add_int32(nvout, (int32_t)res->hr_extentsize, 
"extentsize"); 164216294Ssyrinx resuid = nv_get_uint64(nvin, "resuid"); 165216294Ssyrinx res->hr_primary_localcnt = nv_get_uint64(nvin, "localcnt"); 166216294Ssyrinx res->hr_primary_remotecnt = nv_get_uint64(nvin, "remotecnt"); 167216294Ssyrinx nv_add_uint64(nvout, res->hr_secondary_localcnt, "localcnt"); 168216294Ssyrinx nv_add_uint64(nvout, res->hr_secondary_remotecnt, "remotecnt"); 169216294Ssyrinx mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize - 170216294Ssyrinx METADATA_SIZE, res->hr_extentsize, res->hr_local_sectorsize); 171216294Ssyrinx map = malloc(mapsize); 172216294Ssyrinx if (map == NULL) { 173216294Ssyrinx pjdlog_exitx(EX_TEMPFAIL, 174216294Ssyrinx "Unable to allocate memory (%zu bytes) for activemap.", 175216294Ssyrinx mapsize); 176216294Ssyrinx } 177216294Ssyrinx nv_add_uint32(nvout, (uint32_t)mapsize, "mapsize"); 178216294Ssyrinx /* 179216294Ssyrinx * When we work as primary and secondary is missing we will increase 180216294Ssyrinx * localcnt in our metadata. When secondary is connected and synced 181216294Ssyrinx * we make localcnt be equal to remotecnt, which means nodes are more 182216294Ssyrinx * or less in sync. 183216294Ssyrinx * Split-brain condition is when both nodes are not able to communicate 184216294Ssyrinx * and are both configured as primary nodes. In turn, they can both 185216294Ssyrinx * make incompatible changes to the data and we have to detect that. 186216294Ssyrinx * Under split-brain condition we will increase our localcnt on first 187216294Ssyrinx * write and remote node will increase its localcnt on first write. 188216294Ssyrinx * When we connect we can see that primary's localcnt is greater than 189216294Ssyrinx * our remotecnt (primary was modified while we weren't watching) and 190216294Ssyrinx * our localcnt is greater than primary's remotecnt (we were modified 191216294Ssyrinx * while primary wasn't watching). 192216294Ssyrinx * There are many possible combinations which are all gathered below. 
193216294Ssyrinx * Don't pay too much attention to exact numbers, the more important 194216294Ssyrinx * is to compare them. We compare secondary's local with primary's 195216294Ssyrinx * remote and secondary's remote with primary's local. 196216294Ssyrinx * Note that every case where primary's localcnt is smaller than 197216294Ssyrinx * secondary's remotecnt and where secondary's localcnt is smaller than 198216294Ssyrinx * primary's remotecnt should be impossible in practise. We will perform 199216294Ssyrinx * full synchronization then. Those cases are marked with an asterisk. 200216294Ssyrinx * Regular synchronization means that only extents marked as dirty are 201216294Ssyrinx * synchronized (regular synchronization). 202216294Ssyrinx * 203216294Ssyrinx * SECONDARY METADATA PRIMARY METADATA 204216294Ssyrinx * local=3 remote=3 local=2 remote=2* ?! Full sync from secondary. 205216294Ssyrinx * local=3 remote=3 local=2 remote=3* ?! Full sync from primary. 206216294Ssyrinx * local=3 remote=3 local=2 remote=4* ?! Full sync from primary. 207216294Ssyrinx * local=3 remote=3 local=3 remote=2 Primary is out-of-date, 208216294Ssyrinx * regular sync from secondary. 209216294Ssyrinx * local=3 remote=3 local=3 remote=3 Regular sync just in case. 210216294Ssyrinx * local=3 remote=3 local=3 remote=4* ?! Full sync from primary. 211216294Ssyrinx * local=3 remote=3 local=4 remote=2 Split-brain condition. 212216294Ssyrinx * local=3 remote=3 local=4 remote=3 Secondary out-of-date, 213216294Ssyrinx * regular sync from primary. 214216294Ssyrinx * local=3 remote=3 local=4 remote=4* ?! Full sync from primary. 215216294Ssyrinx */ 216216294Ssyrinx if (res->hr_resuid == 0) { 217216294Ssyrinx /* 218216294Ssyrinx * Provider is used for the first time. Initialize everything. 
219216294Ssyrinx */ 220216294Ssyrinx assert(res->hr_secondary_localcnt == 0); 221216294Ssyrinx res->hr_resuid = resuid; 222216294Ssyrinx if (metadata_write(res) < 0) 223216294Ssyrinx exit(EX_NOINPUT); 224216294Ssyrinx memset(map, 0xff, mapsize); 225216294Ssyrinx nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc"); 226216294Ssyrinx } else if ( 227216294Ssyrinx /* Is primary is out-of-date? */ 228216294Ssyrinx (res->hr_secondary_localcnt > res->hr_primary_remotecnt && 229216294Ssyrinx res->hr_secondary_remotecnt == res->hr_primary_localcnt) || 230216294Ssyrinx /* Node are more or less in sync? */ 231216294Ssyrinx (res->hr_secondary_localcnt == res->hr_primary_remotecnt && 232216294Ssyrinx res->hr_secondary_remotecnt == res->hr_primary_localcnt) || 233216294Ssyrinx /* Is secondary is out-of-date? */ 234216294Ssyrinx (res->hr_secondary_localcnt == res->hr_primary_remotecnt && 235216294Ssyrinx res->hr_secondary_remotecnt < res->hr_primary_localcnt)) { 236216294Ssyrinx /* 237216294Ssyrinx * Nodes are more or less in sync or one of the nodes is 238216294Ssyrinx * out-of-date. 239216294Ssyrinx * It doesn't matter at this point which one, we just have to 240216294Ssyrinx * send out local bitmap to the remote node. 241216294Ssyrinx */ 242216294Ssyrinx if (pread(res->hr_localfd, map, mapsize, METADATA_SIZE) != 243216294Ssyrinx (ssize_t)mapsize) { 244216294Ssyrinx pjdlog_exit(LOG_ERR, "Unable to read activemap"); 245216294Ssyrinx } 246216294Ssyrinx if (res->hr_secondary_localcnt > res->hr_primary_remotecnt && 247216294Ssyrinx res->hr_secondary_remotecnt == res->hr_primary_localcnt) { 248216294Ssyrinx /* Primary is out-of-date, sync from secondary. */ 249216294Ssyrinx nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc"); 250216294Ssyrinx } else { 251216294Ssyrinx /* 252216294Ssyrinx * Secondary is out-of-date or counts match. 253216294Ssyrinx * Sync from primary. 
254216294Ssyrinx */ 255216294Ssyrinx nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc"); 256216294Ssyrinx } 257216294Ssyrinx } else if (res->hr_secondary_localcnt > res->hr_primary_remotecnt && 258216294Ssyrinx res->hr_primary_localcnt > res->hr_secondary_remotecnt) { 259216294Ssyrinx /* 260216294Ssyrinx * Not good, we have split-brain condition. 261216294Ssyrinx */ 262216294Ssyrinx pjdlog_error("Split-brain detected, exiting."); 263216294Ssyrinx nv_add_string(nvout, "Split-brain condition!", "errmsg"); 264216294Ssyrinx free(map); 265216294Ssyrinx map = NULL; 266216294Ssyrinx mapsize = 0; 267216294Ssyrinx } else /* if (res->hr_secondary_localcnt < res->hr_primary_remotecnt || 268216294Ssyrinx res->hr_primary_localcnt < res->hr_secondary_remotecnt) */ { 269216294Ssyrinx /* 270216294Ssyrinx * This should never happen in practise, but we will perform 271216294Ssyrinx * full synchronization. 272216294Ssyrinx */ 273216294Ssyrinx assert(res->hr_secondary_localcnt < res->hr_primary_remotecnt || 274216294Ssyrinx res->hr_primary_localcnt < res->hr_secondary_remotecnt); 275216294Ssyrinx mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize - 276216294Ssyrinx METADATA_SIZE, res->hr_extentsize, 277216294Ssyrinx res->hr_local_sectorsize); 278216294Ssyrinx memset(map, 0xff, mapsize); 279216294Ssyrinx if (res->hr_secondary_localcnt > res->hr_primary_remotecnt) { 280216294Ssyrinx /* In this one of five cases sync from secondary. */ 281216294Ssyrinx nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc"); 282216294Ssyrinx } else { 283216294Ssyrinx /* For the rest four cases sync from primary. 
*/ 284216294Ssyrinx nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc"); 285216294Ssyrinx } 286216294Ssyrinx pjdlog_warning("This should never happen, asking for full synchronization (primary(local=%ju, remote=%ju), secondary(local=%ju, remote=%ju)).", 287216294Ssyrinx (uintmax_t)res->hr_primary_localcnt, 288216294Ssyrinx (uintmax_t)res->hr_primary_remotecnt, 289216294Ssyrinx (uintmax_t)res->hr_secondary_localcnt, 290216294Ssyrinx (uintmax_t)res->hr_secondary_remotecnt); 291216294Ssyrinx } 292216294Ssyrinx if (hast_proto_send(res, res->hr_remotein, nvout, map, mapsize) < 0) { 293216294Ssyrinx pjdlog_errno(LOG_WARNING, "Unable to send activemap to %s", 294216294Ssyrinx res->hr_remoteaddr); 295216294Ssyrinx nv_free(nvout); 296216294Ssyrinx exit(EX_TEMPFAIL); 297216294Ssyrinx } 298216294Ssyrinx nv_free(nvout); 299216294Ssyrinx if (res->hr_secondary_localcnt > res->hr_primary_remotecnt && 300216294Ssyrinx res->hr_primary_localcnt > res->hr_secondary_remotecnt) { 301216294Ssyrinx /* Exit on split-brain. */ 302216294Ssyrinx exit(EX_CONFIG); 303216294Ssyrinx } 304216294Ssyrinx} 305216294Ssyrinx 306216294Ssyrinxvoid 307216294Ssyrinxhastd_secondary(struct hast_resource *res, struct nv *nvin) 308216294Ssyrinx{ 309216294Ssyrinx pthread_t td; 310216294Ssyrinx pid_t pid; 311216294Ssyrinx int error; 312216294Ssyrinx 313216294Ssyrinx /* 314216294Ssyrinx * Create communication channel between parent and child. 
315216294Ssyrinx */ 316216294Ssyrinx if (proto_client("socketpair://", &res->hr_ctrl) < 0) { 317216294Ssyrinx KEEP_ERRNO((void)pidfile_remove(pfh)); 318216294Ssyrinx pjdlog_exit(EX_OSERR, 319216294Ssyrinx "Unable to create control sockets between parent and child"); 320216294Ssyrinx } 321216294Ssyrinx 322216294Ssyrinx pid = fork(); 323216294Ssyrinx if (pid < 0) { 324216294Ssyrinx KEEP_ERRNO((void)pidfile_remove(pfh)); 325216294Ssyrinx pjdlog_exit(EX_OSERR, "Unable to fork"); 326216294Ssyrinx } 327216294Ssyrinx 328216294Ssyrinx if (pid > 0) { 329216294Ssyrinx /* This is parent. */ 330216294Ssyrinx proto_close(res->hr_remotein); 331216294Ssyrinx res->hr_remotein = NULL; 332216294Ssyrinx proto_close(res->hr_remoteout); 333216294Ssyrinx res->hr_remoteout = NULL; 334216294Ssyrinx res->hr_workerpid = pid; 335216294Ssyrinx return; 336216294Ssyrinx } 337216294Ssyrinx (void)pidfile_close(pfh); 338216294Ssyrinx 339216294Ssyrinx setproctitle("%s (secondary)", res->hr_name); 340216294Ssyrinx 341216294Ssyrinx /* Error in setting timeout is not critical, but why should it fail? 
*/ 342216294Ssyrinx if (proto_timeout(res->hr_remotein, 0) < 0) 343216294Ssyrinx pjdlog_errno(LOG_WARNING, "Unable to set connection timeout"); 344216294Ssyrinx if (proto_timeout(res->hr_remoteout, res->hr_timeout) < 0) 345216294Ssyrinx pjdlog_errno(LOG_WARNING, "Unable to set connection timeout"); 346216294Ssyrinx 347216294Ssyrinx init_local(res); 348216294Ssyrinx init_remote(res, nvin); 349216294Ssyrinx init_environment(); 350216294Ssyrinx 351216294Ssyrinx error = pthread_create(&td, NULL, recv_thread, res); 352216294Ssyrinx assert(error == 0); 353216294Ssyrinx error = pthread_create(&td, NULL, disk_thread, res); 354216294Ssyrinx assert(error == 0); 355216294Ssyrinx error = pthread_create(&td, NULL, send_thread, res); 356216294Ssyrinx assert(error == 0); 357216294Ssyrinx (void)ctrl_thread(res); 358216294Ssyrinx} 359216294Ssyrinx 360216294Ssyrinxstatic void 361216294Ssyrinxreqlog(int loglevel, int debuglevel, int error, struct hio *hio, const char *fmt, ...) 362216294Ssyrinx{ 363216294Ssyrinx char msg[1024]; 364216294Ssyrinx va_list ap; 365216294Ssyrinx int len; 366216294Ssyrinx 367216294Ssyrinx va_start(ap, fmt); 368216294Ssyrinx len = vsnprintf(msg, sizeof(msg), fmt, ap); 369216294Ssyrinx va_end(ap); 370216294Ssyrinx if ((size_t)len < sizeof(msg)) { 371216294Ssyrinx switch (hio->hio_cmd) { 372216294Ssyrinx case HIO_READ: 373216294Ssyrinx (void)snprintf(msg + len, sizeof(msg) - len, 374216294Ssyrinx "READ(%ju, %ju).", (uintmax_t)hio->hio_offset, 375216294Ssyrinx (uintmax_t)hio->hio_length); 376216294Ssyrinx break; 377216294Ssyrinx case HIO_DELETE: 378216294Ssyrinx (void)snprintf(msg + len, sizeof(msg) - len, 379216294Ssyrinx "DELETE(%ju, %ju).", (uintmax_t)hio->hio_offset, 380216294Ssyrinx (uintmax_t)hio->hio_length); 381216294Ssyrinx break; 382216294Ssyrinx case HIO_FLUSH: 383216294Ssyrinx (void)snprintf(msg + len, sizeof(msg) - len, "FLUSH."); 384216294Ssyrinx break; 385216294Ssyrinx case HIO_WRITE: 386216294Ssyrinx (void)snprintf(msg + len, sizeof(msg) - len, 
387216294Ssyrinx "WRITE(%ju, %ju).", (uintmax_t)hio->hio_offset, 388216294Ssyrinx (uintmax_t)hio->hio_length); 389216294Ssyrinx break; 390216294Ssyrinx default: 391216294Ssyrinx (void)snprintf(msg + len, sizeof(msg) - len, 392216294Ssyrinx "UNKNOWN(%u).", (unsigned int)hio->hio_cmd); 393216294Ssyrinx break; 394216294Ssyrinx } 395216294Ssyrinx } 396216294Ssyrinx pjdlog_common(loglevel, debuglevel, error, "%s", msg); 397216294Ssyrinx} 398216294Ssyrinx 399216294Ssyrinxstatic int 400216294Ssyrinxrequnpack(struct hast_resource *res, struct hio *hio) 401216294Ssyrinx{ 402216294Ssyrinx 403216294Ssyrinx hio->hio_cmd = nv_get_uint8(hio->hio_nv, "cmd"); 404216294Ssyrinx if (hio->hio_cmd == 0) { 405216294Ssyrinx pjdlog_error("Header contains no 'cmd' field."); 406216294Ssyrinx hio->hio_error = EINVAL; 407216294Ssyrinx goto end; 408216294Ssyrinx } 409216294Ssyrinx switch (hio->hio_cmd) { 410216294Ssyrinx case HIO_READ: 411216294Ssyrinx case HIO_WRITE: 412216294Ssyrinx case HIO_DELETE: 413216294Ssyrinx hio->hio_offset = nv_get_uint64(hio->hio_nv, "offset"); 414216294Ssyrinx if (nv_error(hio->hio_nv) != 0) { 415216294Ssyrinx pjdlog_error("Header is missing 'offset' field."); 416216294Ssyrinx hio->hio_error = EINVAL; 417216294Ssyrinx goto end; 418216294Ssyrinx } 419216294Ssyrinx hio->hio_length = nv_get_uint64(hio->hio_nv, "length"); 420216294Ssyrinx if (nv_error(hio->hio_nv) != 0) { 421216294Ssyrinx pjdlog_error("Header is missing 'length' field."); 422216294Ssyrinx hio->hio_error = EINVAL; 423216294Ssyrinx goto end; 424216294Ssyrinx } 425216294Ssyrinx if (hio->hio_length == 0) { 426216294Ssyrinx pjdlog_error("Data length is zero."); 427216294Ssyrinx hio->hio_error = EINVAL; 428216294Ssyrinx goto end; 429216294Ssyrinx } 430216294Ssyrinx if (hio->hio_length > MAXPHYS) { 431216294Ssyrinx pjdlog_error("Data length is too large (%ju > %ju).", 432216294Ssyrinx (uintmax_t)hio->hio_length, (uintmax_t)MAXPHYS); 433216294Ssyrinx hio->hio_error = EINVAL; 434216294Ssyrinx goto end; 
435216294Ssyrinx } 436216294Ssyrinx if ((hio->hio_offset % res->hr_local_sectorsize) != 0) { 437216294Ssyrinx pjdlog_error("Offset %ju is not multiple of sector size.", 438216294Ssyrinx (uintmax_t)hio->hio_offset); 439216294Ssyrinx hio->hio_error = EINVAL; 440216294Ssyrinx goto end; 441216294Ssyrinx } 442216294Ssyrinx if ((hio->hio_length % res->hr_local_sectorsize) != 0) { 443216294Ssyrinx pjdlog_error("Length %ju is not multiple of sector size.", 444216294Ssyrinx (uintmax_t)hio->hio_length); 445216294Ssyrinx hio->hio_error = EINVAL; 446216294Ssyrinx goto end; 447216294Ssyrinx } 448216294Ssyrinx if (hio->hio_offset + hio->hio_length > 449216294Ssyrinx (uint64_t)res->hr_datasize) { 450216294Ssyrinx pjdlog_error("Data offset is too large (%ju > %ju).", 451216294Ssyrinx (uintmax_t)(hio->hio_offset + hio->hio_length), 452216294Ssyrinx (uintmax_t)res->hr_datasize); 453216294Ssyrinx hio->hio_error = EINVAL; 454216294Ssyrinx goto end; 455216294Ssyrinx } 456216294Ssyrinx break; 457216294Ssyrinx default: 458216294Ssyrinx pjdlog_error("Header contains invalid 'cmd' (%hhu).", 459216294Ssyrinx hio->hio_cmd); 460216294Ssyrinx hio->hio_error = EINVAL; 461216294Ssyrinx goto end; 462216294Ssyrinx } 463216294Ssyrinx hio->hio_error = 0; 464216294Ssyrinxend: 465216294Ssyrinx return (hio->hio_error); 466216294Ssyrinx} 467216294Ssyrinx 468216294Ssyrinx/* 469216294Ssyrinx * Thread receives requests from the primary node. 
470216294Ssyrinx */ 471216294Ssyrinxstatic void * 472216294Ssyrinxrecv_thread(void *arg) 473216294Ssyrinx{ 474216294Ssyrinx struct hast_resource *res = arg; 475216294Ssyrinx struct hio *hio; 476216294Ssyrinx bool wakeup; 477216294Ssyrinx 478216294Ssyrinx for (;;) { 479216294Ssyrinx pjdlog_debug(2, "recv: Taking free request."); 480216294Ssyrinx mtx_lock(&hio_free_list_lock); 481216294Ssyrinx while ((hio = TAILQ_FIRST(&hio_free_list)) == NULL) { 482216294Ssyrinx pjdlog_debug(2, "recv: No free requests, waiting."); 483216294Ssyrinx cv_wait(&hio_free_list_cond, &hio_free_list_lock); 484216294Ssyrinx } 485216294Ssyrinx TAILQ_REMOVE(&hio_free_list, hio, hio_next); 486216294Ssyrinx mtx_unlock(&hio_free_list_lock); 487216294Ssyrinx pjdlog_debug(2, "recv: (%p) Got request.", hio); 488216294Ssyrinx if (hast_proto_recv_hdr(res->hr_remotein, &hio->hio_nv) < 0) { 489216294Ssyrinx pjdlog_exit(EX_TEMPFAIL, 490216294Ssyrinx "Unable to receive request header"); 491216294Ssyrinx } 492216294Ssyrinx if (requnpack(res, hio) != 0) 493216294Ssyrinx goto send_queue; 494216294Ssyrinx reqlog(LOG_DEBUG, 2, -1, hio, 495216294Ssyrinx "recv: (%p) Got request header: ", hio); 496216294Ssyrinx if (hio->hio_cmd == HIO_WRITE) { 497216294Ssyrinx if (hast_proto_recv_data(res, res->hr_remotein, 498216294Ssyrinx hio->hio_nv, hio->hio_data, MAXPHYS) < 0) { 499216294Ssyrinx pjdlog_exit(EX_TEMPFAIL, 500216294Ssyrinx "Unable to receive reply data"); 501216294Ssyrinx } 502216294Ssyrinx } 503216294Ssyrinx pjdlog_debug(2, "recv: (%p) Moving request to the disk queue.", 504216294Ssyrinx hio); 505216294Ssyrinx mtx_lock(&hio_disk_list_lock); 506216294Ssyrinx wakeup = TAILQ_EMPTY(&hio_disk_list); 507216294Ssyrinx TAILQ_INSERT_TAIL(&hio_disk_list, hio, hio_next); 508216294Ssyrinx mtx_unlock(&hio_disk_list_lock); 509216294Ssyrinx if (wakeup) 510216294Ssyrinx cv_signal(&hio_disk_list_cond); 511216294Ssyrinx continue; 512216294Ssyrinxsend_queue: 513216294Ssyrinx pjdlog_debug(2, "recv: (%p) Moving request to the send 
queue.", 514216294Ssyrinx hio); 515216294Ssyrinx mtx_lock(&hio_send_list_lock); 516216294Ssyrinx wakeup = TAILQ_EMPTY(&hio_send_list); 517216294Ssyrinx TAILQ_INSERT_TAIL(&hio_send_list, hio, hio_next); 518216294Ssyrinx mtx_unlock(&hio_send_list_lock); 519216294Ssyrinx if (wakeup) 520216294Ssyrinx cv_signal(&hio_send_list_cond); 521216294Ssyrinx } 522216294Ssyrinx /* NOTREACHED */ 523216294Ssyrinx return (NULL); 524216294Ssyrinx} 525216294Ssyrinx 526216294Ssyrinx/* 527216294Ssyrinx * Thread reads from or writes to local component and also handles DELETE and 528216294Ssyrinx * FLUSH requests. 529216294Ssyrinx */ 530216294Ssyrinxstatic void * 531216294Ssyrinxdisk_thread(void *arg) 532216294Ssyrinx{ 533216294Ssyrinx struct hast_resource *res = arg; 534216294Ssyrinx struct hio *hio; 535216294Ssyrinx ssize_t ret; 536216294Ssyrinx bool clear_activemap, wakeup; 537216294Ssyrinx 538216294Ssyrinx clear_activemap = true; 539216294Ssyrinx 540216294Ssyrinx for (;;) { 541216294Ssyrinx pjdlog_debug(2, "disk: Taking request."); 542216294Ssyrinx mtx_lock(&hio_disk_list_lock); 543216294Ssyrinx while ((hio = TAILQ_FIRST(&hio_disk_list)) == NULL) { 544216294Ssyrinx pjdlog_debug(2, "disk: No requests, waiting."); 545216294Ssyrinx cv_wait(&hio_disk_list_cond, &hio_disk_list_lock); 546216294Ssyrinx } 547216294Ssyrinx TAILQ_REMOVE(&hio_disk_list, hio, hio_next); 548216294Ssyrinx mtx_unlock(&hio_disk_list_lock); 549216294Ssyrinx while (clear_activemap) { 550216294Ssyrinx unsigned char *map; 551216294Ssyrinx size_t mapsize; 552216294Ssyrinx 553216294Ssyrinx /* 554216294Ssyrinx * When first request is received, it means that primary 555216294Ssyrinx * already received our activemap, merged it and stored 556216294Ssyrinx * locally. We can now safely clear our activemap. 
557216294Ssyrinx */ 558216294Ssyrinx mapsize = 559216294Ssyrinx activemap_calc_ondisk_size(res->hr_local_mediasize - 560216294Ssyrinx METADATA_SIZE, res->hr_extentsize, 561216294Ssyrinx res->hr_local_sectorsize); 562216294Ssyrinx map = calloc(1, mapsize); 563216294Ssyrinx if (map == NULL) { 564216294Ssyrinx pjdlog_warning("Unable to allocate memory to clear local activemap."); 565216294Ssyrinx break; 566216294Ssyrinx } 567216294Ssyrinx if (pwrite(res->hr_localfd, map, mapsize, 568216294Ssyrinx METADATA_SIZE) != (ssize_t)mapsize) { 569216294Ssyrinx pjdlog_errno(LOG_WARNING, 570216294Ssyrinx "Unable to store cleared activemap"); 571216294Ssyrinx free(map); 572216294Ssyrinx break; 573216294Ssyrinx } 574216294Ssyrinx free(map); 575216294Ssyrinx clear_activemap = false; 576216294Ssyrinx pjdlog_debug(1, "Local activemap cleared."); 577216294Ssyrinx } 578216294Ssyrinx reqlog(LOG_DEBUG, 2, -1, hio, "disk: (%p) Got request: ", hio); 579216294Ssyrinx /* Handle the actual request. */ 580216294Ssyrinx switch (hio->hio_cmd) { 581216294Ssyrinx case HIO_READ: 582216294Ssyrinx ret = pread(res->hr_localfd, hio->hio_data, 583216294Ssyrinx hio->hio_length, 584216294Ssyrinx hio->hio_offset + res->hr_localoff); 585216294Ssyrinx if (ret < 0) 586216294Ssyrinx hio->hio_error = errno; 587216294Ssyrinx else if (ret != (int64_t)hio->hio_length) 588216294Ssyrinx hio->hio_error = EIO; 589216294Ssyrinx else 590216294Ssyrinx hio->hio_error = 0; 591216294Ssyrinx break; 592216294Ssyrinx case HIO_WRITE: 593216294Ssyrinx ret = pwrite(res->hr_localfd, hio->hio_data, 594216294Ssyrinx hio->hio_length, 595216294Ssyrinx hio->hio_offset + res->hr_localoff); 596216294Ssyrinx if (ret < 0) 597216294Ssyrinx hio->hio_error = errno; 598216294Ssyrinx else if (ret != (int64_t)hio->hio_length) 599216294Ssyrinx hio->hio_error = EIO; 600216294Ssyrinx else 601216294Ssyrinx hio->hio_error = 0; 602216294Ssyrinx break; 603216294Ssyrinx case HIO_DELETE: 604216294Ssyrinx ret = g_delete(res->hr_localfd, 605216294Ssyrinx 
hio->hio_offset + res->hr_localoff, 606216294Ssyrinx hio->hio_length); 607216294Ssyrinx if (ret < 0) 608216294Ssyrinx hio->hio_error = errno; 609216294Ssyrinx else 610216294Ssyrinx hio->hio_error = 0; 611216294Ssyrinx break; 612216294Ssyrinx case HIO_FLUSH: 613216294Ssyrinx ret = g_flush(res->hr_localfd); 614216294Ssyrinx if (ret < 0) 615 hio->hio_error = errno; 616 else 617 hio->hio_error = 0; 618 break; 619 } 620 if (hio->hio_error != 0) { 621 reqlog(LOG_ERR, 0, hio->hio_error, hio, 622 "Request failed: "); 623 } 624 pjdlog_debug(2, "disk: (%p) Moving request to the send queue.", 625 hio); 626 mtx_lock(&hio_send_list_lock); 627 wakeup = TAILQ_EMPTY(&hio_send_list); 628 TAILQ_INSERT_TAIL(&hio_send_list, hio, hio_next); 629 mtx_unlock(&hio_send_list_lock); 630 if (wakeup) 631 cv_signal(&hio_send_list_cond); 632 } 633 /* NOTREACHED */ 634 return (NULL); 635} 636 637/* 638 * Thread sends requests back to primary node. 639 */ 640static void * 641send_thread(void *arg) 642{ 643 struct hast_resource *res = arg; 644 struct nv *nvout; 645 struct hio *hio; 646 void *data; 647 size_t length; 648 bool wakeup; 649 650 for (;;) { 651 pjdlog_debug(2, "send: Taking request."); 652 mtx_lock(&hio_send_list_lock); 653 while ((hio = TAILQ_FIRST(&hio_send_list)) == NULL) { 654 pjdlog_debug(2, "send: No requests, waiting."); 655 cv_wait(&hio_send_list_cond, &hio_send_list_lock); 656 } 657 TAILQ_REMOVE(&hio_send_list, hio, hio_next); 658 mtx_unlock(&hio_send_list_lock); 659 reqlog(LOG_DEBUG, 2, -1, hio, "send: (%p) Got request: ", hio); 660 nvout = nv_alloc(); 661 /* Copy sequence number. */ 662 nv_add_uint64(nvout, nv_get_uint64(hio->hio_nv, "seq"), "seq"); 663 switch (hio->hio_cmd) { 664 case HIO_READ: 665 if (hio->hio_error == 0) { 666 data = hio->hio_data; 667 length = hio->hio_length; 668 break; 669 } 670 /* 671 * We send no data in case of an error. 
672 */ 673 /* FALLTHROUGH */ 674 case HIO_DELETE: 675 case HIO_FLUSH: 676 case HIO_WRITE: 677 data = NULL; 678 length = 0; 679 break; 680 default: 681 abort(); 682 break; 683 } 684 if (hio->hio_error != 0) 685 nv_add_int16(nvout, hio->hio_error, "error"); 686 if (hast_proto_send(res, res->hr_remoteout, nvout, data, 687 length) < 0) { 688 pjdlog_exit(EX_TEMPFAIL, "Unable to send reply."); 689 } 690 nv_free(nvout); 691 pjdlog_debug(2, "disk: (%p) Moving request to the free queue.", 692 hio); 693 nv_free(hio->hio_nv); 694 hio->hio_error = 0; 695 mtx_lock(&hio_free_list_lock); 696 wakeup = TAILQ_EMPTY(&hio_free_list); 697 TAILQ_INSERT_TAIL(&hio_free_list, hio, hio_next); 698 mtx_unlock(&hio_free_list_lock); 699 if (wakeup) 700 cv_signal(&hio_free_list_cond); 701 } 702 /* NOTREACHED */ 703 return (NULL); 704} 705