/* secondary.c revision 225831 */
1254885Sdumbbell/*- 2254885Sdumbbell * Copyright (c) 2009-2010 The FreeBSD Foundation 3254885Sdumbbell * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org> 4254885Sdumbbell * All rights reserved. 5254885Sdumbbell * 6254885Sdumbbell * This software was developed by Pawel Jakub Dawidek under sponsorship from 7254885Sdumbbell * the FreeBSD Foundation. 8254885Sdumbbell * 9254885Sdumbbell * Redistribution and use in source and binary forms, with or without 10254885Sdumbbell * modification, are permitted provided that the following conditions 11254885Sdumbbell * are met: 12254885Sdumbbell * 1. Redistributions of source code must retain the above copyright 13254885Sdumbbell * notice, this list of conditions and the following disclaimer. 14254885Sdumbbell * 2. Redistributions in binary form must reproduce the above copyright 15254885Sdumbbell * notice, this list of conditions and the following disclaimer in the 16254885Sdumbbell * documentation and/or other materials provided with the distribution. 17254885Sdumbbell * 18254885Sdumbbell * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 19254885Sdumbbell * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20254885Sdumbbell * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21254885Sdumbbell * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 22254885Sdumbbell * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23254885Sdumbbell * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24254885Sdumbbell * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25254885Sdumbbell * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26254885Sdumbbell * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27254885Sdumbbell * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28254885Sdumbbell * SUCH DAMAGE. 
29254885Sdumbbell */ 30254885Sdumbbell 31254885Sdumbbell#include <sys/cdefs.h> 32254885Sdumbbell__FBSDID("$FreeBSD: head/sbin/hastd/secondary.c 225831 2011-09-28 13:13:43Z pjd $"); 33254885Sdumbbell 34254885Sdumbbell#include <sys/param.h> 35254885Sdumbbell#include <sys/time.h> 36254885Sdumbbell#include <sys/bio.h> 37254885Sdumbbell#include <sys/disk.h> 38254885Sdumbbell#include <sys/stat.h> 39254885Sdumbbell 40254885Sdumbbell#include <err.h> 41254885Sdumbbell#include <errno.h> 42254885Sdumbbell#include <fcntl.h> 43254885Sdumbbell#include <libgeom.h> 44254885Sdumbbell#include <pthread.h> 45254885Sdumbbell#include <signal.h> 46254885Sdumbbell#include <stdint.h> 47254885Sdumbbell#include <stdio.h> 48254885Sdumbbell#include <string.h> 49254885Sdumbbell#include <sysexits.h> 50254885Sdumbbell#include <unistd.h> 51254885Sdumbbell 52254885Sdumbbell#include <activemap.h> 53254885Sdumbbell#include <nv.h> 54254885Sdumbbell#include <pjdlog.h> 55254885Sdumbbell 56254885Sdumbbell#include "control.h" 57254885Sdumbbell#include "event.h" 58254885Sdumbbell#include "hast.h" 59254885Sdumbbell#include "hast_proto.h" 60254885Sdumbbell#include "hastd.h" 61254885Sdumbbell#include "hooks.h" 62254885Sdumbbell#include "metadata.h" 63254885Sdumbbell#include "proto.h" 64254885Sdumbbell#include "subr.h" 65254885Sdumbbell#include "synch.h" 66254885Sdumbbell 67254885Sdumbbellstruct hio { 68254885Sdumbbell uint64_t hio_seq; 69254885Sdumbbell int hio_error; 70254885Sdumbbell struct nv *hio_nv; 71254885Sdumbbell void *hio_data; 72254885Sdumbbell uint8_t hio_cmd; 73254885Sdumbbell uint64_t hio_offset; 74254885Sdumbbell uint64_t hio_length; 75254885Sdumbbell TAILQ_ENTRY(hio) hio_next; 76254885Sdumbbell}; 77254885Sdumbbell 78254885Sdumbbellstatic struct hast_resource *gres; 79254885Sdumbbell 80254885Sdumbbell/* 81254885Sdumbbell * Free list holds unused structures. When free list is empty, we have to wait 82254885Sdumbbell * until some in-progress requests are freed. 
83254885Sdumbbell */ 84254885Sdumbbellstatic TAILQ_HEAD(, hio) hio_free_list; 85254885Sdumbbellstatic pthread_mutex_t hio_free_list_lock; 86254885Sdumbbellstatic pthread_cond_t hio_free_list_cond; 87254885Sdumbbell/* 88254885Sdumbbell * Disk thread (the one that do I/O requests) takes requests from this list. 89254885Sdumbbell */ 90254885Sdumbbellstatic TAILQ_HEAD(, hio) hio_disk_list; 91254885Sdumbbellstatic pthread_mutex_t hio_disk_list_lock; 92254885Sdumbbellstatic pthread_cond_t hio_disk_list_cond; 93254885Sdumbbell/* 94254885Sdumbbell * There is one recv list for every component, although local components don't 95254885Sdumbbell * use recv lists as local requests are done synchronously. 96254885Sdumbbell */ 97254885Sdumbbellstatic TAILQ_HEAD(, hio) hio_send_list; 98254885Sdumbbellstatic pthread_mutex_t hio_send_list_lock; 99254885Sdumbbellstatic pthread_cond_t hio_send_list_cond; 100254885Sdumbbell 101254885Sdumbbell/* 102254885Sdumbbell * Maximum number of outstanding I/O requests. 
103254885Sdumbbell */ 104254885Sdumbbell#define HAST_HIO_MAX 256 105254885Sdumbbell 106254885Sdumbbellstatic void *recv_thread(void *arg); 107254885Sdumbbellstatic void *disk_thread(void *arg); 108254885Sdumbbellstatic void *send_thread(void *arg); 109254885Sdumbbell 110254885Sdumbbell#define QUEUE_INSERT(name, hio) do { \ 111254885Sdumbbell bool _wakeup; \ 112254885Sdumbbell \ 113254885Sdumbbell mtx_lock(&hio_##name##_list_lock); \ 114254885Sdumbbell _wakeup = TAILQ_EMPTY(&hio_##name##_list); \ 115254885Sdumbbell TAILQ_INSERT_TAIL(&hio_##name##_list, (hio), hio_next); \ 116254885Sdumbbell mtx_unlock(&hio_##name##_list_lock); \ 117254885Sdumbbell if (_wakeup) \ 118254885Sdumbbell cv_signal(&hio_##name##_list_cond); \ 119254885Sdumbbell} while (0) 120254885Sdumbbell#define QUEUE_TAKE(name, hio) do { \ 121254885Sdumbbell mtx_lock(&hio_##name##_list_lock); \ 122254885Sdumbbell while (((hio) = TAILQ_FIRST(&hio_##name##_list)) == NULL) { \ 123254885Sdumbbell cv_wait(&hio_##name##_list_cond, \ 124254885Sdumbbell &hio_##name##_list_lock); \ 125254885Sdumbbell } \ 126254885Sdumbbell TAILQ_REMOVE(&hio_##name##_list, (hio), hio_next); \ 127254885Sdumbbell mtx_unlock(&hio_##name##_list_lock); \ 128254885Sdumbbell} while (0) 129254885Sdumbbell 130254885Sdumbbellstatic void 131254885Sdumbbellinit_environment(void) 132254885Sdumbbell{ 133254885Sdumbbell struct hio *hio; 134254885Sdumbbell unsigned int ii; 135254885Sdumbbell 136254885Sdumbbell /* 137254885Sdumbbell * Initialize lists, their locks and theirs condition variables. 
138254885Sdumbbell */ 139254885Sdumbbell TAILQ_INIT(&hio_free_list); 140254885Sdumbbell mtx_init(&hio_free_list_lock); 141254885Sdumbbell cv_init(&hio_free_list_cond); 142254885Sdumbbell TAILQ_INIT(&hio_disk_list); 143254885Sdumbbell mtx_init(&hio_disk_list_lock); 144254885Sdumbbell cv_init(&hio_disk_list_cond); 145254885Sdumbbell TAILQ_INIT(&hio_send_list); 146254885Sdumbbell mtx_init(&hio_send_list_lock); 147254885Sdumbbell cv_init(&hio_send_list_cond); 148254885Sdumbbell 149254885Sdumbbell /* 150254885Sdumbbell * Allocate requests pool and initialize requests. 151254885Sdumbbell */ 152254885Sdumbbell for (ii = 0; ii < HAST_HIO_MAX; ii++) { 153254885Sdumbbell hio = malloc(sizeof(*hio)); 154254885Sdumbbell if (hio == NULL) { 155254885Sdumbbell pjdlog_exitx(EX_TEMPFAIL, 156254885Sdumbbell "Unable to allocate memory (%zu bytes) for hio request.", 157254885Sdumbbell sizeof(*hio)); 158254885Sdumbbell } 159254885Sdumbbell hio->hio_error = 0; 160254885Sdumbbell hio->hio_data = malloc(MAXPHYS); 161254885Sdumbbell if (hio->hio_data == NULL) { 162254885Sdumbbell pjdlog_exitx(EX_TEMPFAIL, 163254885Sdumbbell "Unable to allocate memory (%zu bytes) for gctl_data.", 164254885Sdumbbell (size_t)MAXPHYS); 165254885Sdumbbell } 166254885Sdumbbell TAILQ_INSERT_HEAD(&hio_free_list, hio, hio_next); 167254885Sdumbbell } 168254885Sdumbbell} 169254885Sdumbbell 170254885Sdumbbellstatic void 171254885Sdumbbellinit_local(struct hast_resource *res) 172254885Sdumbbell{ 173254885Sdumbbell 174254885Sdumbbell if (metadata_read(res, true) < 0) 175254885Sdumbbell exit(EX_NOINPUT); 176254885Sdumbbell} 177254885Sdumbbell 178254885Sdumbbellstatic void 179254885Sdumbbellinit_remote(struct hast_resource *res, struct nv *nvin) 180254885Sdumbbell{ 181254885Sdumbbell uint64_t resuid; 182254885Sdumbbell struct nv *nvout; 183254885Sdumbbell unsigned char *map; 184254885Sdumbbell size_t mapsize; 185254885Sdumbbell 186254885Sdumbbell#ifdef notyet 187254885Sdumbbell /* Setup direction. 
*/ 188254885Sdumbbell if (proto_send(res->hr_remoteout, NULL, 0) == -1) 189254885Sdumbbell pjdlog_errno(LOG_WARNING, "Unable to set connection direction"); 190254885Sdumbbell#endif 191254885Sdumbbell 192254885Sdumbbell map = NULL; 193254885Sdumbbell mapsize = 0; 194254885Sdumbbell nvout = nv_alloc(); 195254885Sdumbbell nv_add_int64(nvout, (int64_t)res->hr_datasize, "datasize"); 196254885Sdumbbell nv_add_int32(nvout, (int32_t)res->hr_extentsize, "extentsize"); 197254885Sdumbbell resuid = nv_get_uint64(nvin, "resuid"); 198254885Sdumbbell res->hr_primary_localcnt = nv_get_uint64(nvin, "localcnt"); 199254885Sdumbbell res->hr_primary_remotecnt = nv_get_uint64(nvin, "remotecnt"); 200254885Sdumbbell nv_add_uint64(nvout, res->hr_secondary_localcnt, "localcnt"); 201254885Sdumbbell nv_add_uint64(nvout, res->hr_secondary_remotecnt, "remotecnt"); 202254885Sdumbbell mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize - 203254885Sdumbbell METADATA_SIZE, res->hr_extentsize, res->hr_local_sectorsize); 204254885Sdumbbell map = malloc(mapsize); 205254885Sdumbbell if (map == NULL) { 206254885Sdumbbell pjdlog_exitx(EX_TEMPFAIL, 207254885Sdumbbell "Unable to allocate memory (%zu bytes) for activemap.", 208254885Sdumbbell mapsize); 209254885Sdumbbell } 210254885Sdumbbell /* 211254885Sdumbbell * When we work as primary and secondary is missing we will increase 212254885Sdumbbell * localcnt in our metadata. When secondary is connected and synced 213254885Sdumbbell * we make localcnt be equal to remotecnt, which means nodes are more 214254885Sdumbbell * or less in sync. 215254885Sdumbbell * Split-brain condition is when both nodes are not able to communicate 216254885Sdumbbell * and are both configured as primary nodes. In turn, they can both 217254885Sdumbbell * make incompatible changes to the data and we have to detect that. 
218254885Sdumbbell * Under split-brain condition we will increase our localcnt on first 219254885Sdumbbell * write and remote node will increase its localcnt on first write. 220254885Sdumbbell * When we connect we can see that primary's localcnt is greater than 221254885Sdumbbell * our remotecnt (primary was modified while we weren't watching) and 222254885Sdumbbell * our localcnt is greater than primary's remotecnt (we were modified 223254885Sdumbbell * while primary wasn't watching). 224254885Sdumbbell * There are many possible combinations which are all gathered below. 225254885Sdumbbell * Don't pay too much attention to exact numbers, the more important 226254885Sdumbbell * is to compare them. We compare secondary's local with primary's 227254885Sdumbbell * remote and secondary's remote with primary's local. 228254885Sdumbbell * Note that every case where primary's localcnt is smaller than 229254885Sdumbbell * secondary's remotecnt and where secondary's localcnt is smaller than 230254885Sdumbbell * primary's remotecnt should be impossible in practise. We will perform 231254885Sdumbbell * full synchronization then. Those cases are marked with an asterisk. 232254885Sdumbbell * Regular synchronization means that only extents marked as dirty are 233254885Sdumbbell * synchronized (regular synchronization). 234254885Sdumbbell * 235254885Sdumbbell * SECONDARY METADATA PRIMARY METADATA 236254885Sdumbbell * local=3 remote=3 local=2 remote=2* ?! Full sync from secondary. 237254885Sdumbbell * local=3 remote=3 local=2 remote=3* ?! Full sync from primary. 238254885Sdumbbell * local=3 remote=3 local=2 remote=4* ?! Full sync from primary. 239254885Sdumbbell * local=3 remote=3 local=3 remote=2 Primary is out-of-date, 240254885Sdumbbell * regular sync from secondary. 241254885Sdumbbell * local=3 remote=3 local=3 remote=3 Regular sync just in case. 242254885Sdumbbell * local=3 remote=3 local=3 remote=4* ?! Full sync from primary. 
243254885Sdumbbell * local=3 remote=3 local=4 remote=2 Split-brain condition. 244254885Sdumbbell * local=3 remote=3 local=4 remote=3 Secondary out-of-date, 245254885Sdumbbell * regular sync from primary. 246254885Sdumbbell * local=3 remote=3 local=4 remote=4* ?! Full sync from primary. 247254885Sdumbbell */ 248254885Sdumbbell if (res->hr_resuid == 0) { 249254885Sdumbbell /* 250254885Sdumbbell * Provider is used for the first time. If primary node done no 251254885Sdumbbell * writes yet as well (we will find "virgin" argument) then 252254885Sdumbbell * there is no need to synchronize anything. If primary node 253254885Sdumbbell * done any writes already we have to synchronize everything. 254254885Sdumbbell */ 255254885Sdumbbell PJDLOG_ASSERT(res->hr_secondary_localcnt == 0); 256254885Sdumbbell res->hr_resuid = resuid; 257254885Sdumbbell if (metadata_write(res) < 0) 258254885Sdumbbell exit(EX_NOINPUT); 259254885Sdumbbell if (nv_exists(nvin, "virgin")) { 260254885Sdumbbell free(map); 261254885Sdumbbell map = NULL; 262254885Sdumbbell mapsize = 0; 263254885Sdumbbell } else { 264254885Sdumbbell memset(map, 0xff, mapsize); 265254885Sdumbbell } 266254885Sdumbbell nv_add_int8(nvout, 1, "virgin"); 267254885Sdumbbell nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc"); 268254885Sdumbbell } else if (res->hr_resuid != resuid) { 269254885Sdumbbell char errmsg[256]; 270254885Sdumbbell 271254885Sdumbbell (void)snprintf(errmsg, sizeof(errmsg), 272254885Sdumbbell "Resource unique ID mismatch (primary=%ju, secondary=%ju).", 273254885Sdumbbell (uintmax_t)resuid, (uintmax_t)res->hr_resuid); 274254885Sdumbbell pjdlog_error("%s", errmsg); 275254885Sdumbbell nv_add_string(nvout, errmsg, "errmsg"); 276254885Sdumbbell if (hast_proto_send(res, res->hr_remotein, nvout, NULL, 0) < 0) { 277254885Sdumbbell pjdlog_exit(EX_TEMPFAIL, "Unable to send response to %s", 278254885Sdumbbell res->hr_remoteaddr); 279254885Sdumbbell } 280254885Sdumbbell nv_free(nvout); 281254885Sdumbbell exit(EX_CONFIG); 
282254885Sdumbbell } else if ( 283254885Sdumbbell /* Is primary is out-of-date? */ 284254885Sdumbbell (res->hr_secondary_localcnt > res->hr_primary_remotecnt && 285254885Sdumbbell res->hr_secondary_remotecnt == res->hr_primary_localcnt) || 286254885Sdumbbell /* Nodes are more or less in sync? */ 287254885Sdumbbell (res->hr_secondary_localcnt == res->hr_primary_remotecnt && 288254885Sdumbbell res->hr_secondary_remotecnt == res->hr_primary_localcnt) || 289254885Sdumbbell /* Is secondary is out-of-date? */ 290254885Sdumbbell (res->hr_secondary_localcnt == res->hr_primary_remotecnt && 291254885Sdumbbell res->hr_secondary_remotecnt < res->hr_primary_localcnt)) { 292254885Sdumbbell /* 293254885Sdumbbell * Nodes are more or less in sync or one of the nodes is 294254885Sdumbbell * out-of-date. 295254885Sdumbbell * It doesn't matter at this point which one, we just have to 296254885Sdumbbell * send out local bitmap to the remote node. 297254885Sdumbbell */ 298254885Sdumbbell if (pread(res->hr_localfd, map, mapsize, METADATA_SIZE) != 299254885Sdumbbell (ssize_t)mapsize) { 300254885Sdumbbell pjdlog_exit(LOG_ERR, "Unable to read activemap"); 301254885Sdumbbell } 302254885Sdumbbell if (res->hr_secondary_localcnt > res->hr_primary_remotecnt && 303254885Sdumbbell res->hr_secondary_remotecnt == res->hr_primary_localcnt) { 304254885Sdumbbell /* Primary is out-of-date, sync from secondary. */ 305254885Sdumbbell nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc"); 306254885Sdumbbell } else { 307254885Sdumbbell /* 308254885Sdumbbell * Secondary is out-of-date or counts match. 309254885Sdumbbell * Sync from primary. 310254885Sdumbbell */ 311254885Sdumbbell nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc"); 312254885Sdumbbell } 313254885Sdumbbell } else if (res->hr_secondary_localcnt > res->hr_primary_remotecnt && 314254885Sdumbbell res->hr_primary_localcnt > res->hr_secondary_remotecnt) { 315254885Sdumbbell /* 316254885Sdumbbell * Not good, we have split-brain condition. 
317254885Sdumbbell */ 318254885Sdumbbell pjdlog_error("Split-brain detected, exiting."); 319254885Sdumbbell nv_add_string(nvout, "Split-brain condition!", "errmsg"); 320254885Sdumbbell free(map); 321254885Sdumbbell map = NULL; 322254885Sdumbbell mapsize = 0; 323254885Sdumbbell } else /* if (res->hr_secondary_localcnt < res->hr_primary_remotecnt || 324254885Sdumbbell res->hr_primary_localcnt < res->hr_secondary_remotecnt) */ { 325254885Sdumbbell /* 326254885Sdumbbell * This should never happen in practise, but we will perform 327254885Sdumbbell * full synchronization. 328254885Sdumbbell */ 329254885Sdumbbell PJDLOG_ASSERT(res->hr_secondary_localcnt < res->hr_primary_remotecnt || 330254885Sdumbbell res->hr_primary_localcnt < res->hr_secondary_remotecnt); 331254885Sdumbbell mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize - 332254885Sdumbbell METADATA_SIZE, res->hr_extentsize, 333254885Sdumbbell res->hr_local_sectorsize); 334254885Sdumbbell memset(map, 0xff, mapsize); 335254885Sdumbbell if (res->hr_secondary_localcnt > res->hr_primary_remotecnt) { 336254885Sdumbbell /* In this one of five cases sync from secondary. */ 337254885Sdumbbell nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc"); 338254885Sdumbbell } else { 339254885Sdumbbell /* For the rest four cases sync from primary. 
*/ 340254885Sdumbbell nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc"); 341254885Sdumbbell } 342254885Sdumbbell pjdlog_warning("This should never happen, asking for full synchronization (primary(local=%ju, remote=%ju), secondary(local=%ju, remote=%ju)).", 343254885Sdumbbell (uintmax_t)res->hr_primary_localcnt, 344254885Sdumbbell (uintmax_t)res->hr_primary_remotecnt, 345254885Sdumbbell (uintmax_t)res->hr_secondary_localcnt, 346254885Sdumbbell (uintmax_t)res->hr_secondary_remotecnt); 347254885Sdumbbell } 348254885Sdumbbell nv_add_uint32(nvout, (uint32_t)mapsize, "mapsize"); 349254885Sdumbbell if (hast_proto_send(res, res->hr_remotein, nvout, map, mapsize) < 0) { 350254885Sdumbbell pjdlog_exit(EX_TEMPFAIL, "Unable to send activemap to %s", 351254885Sdumbbell res->hr_remoteaddr); 352254885Sdumbbell } 353254885Sdumbbell if (map != NULL) 354254885Sdumbbell free(map); 355254885Sdumbbell nv_free(nvout); 356254885Sdumbbell#ifdef notyet 357254885Sdumbbell /* Setup direction. */ 358254885Sdumbbell if (proto_recv(res->hr_remotein, NULL, 0) == -1) 359254885Sdumbbell pjdlog_errno(LOG_WARNING, "Unable to set connection direction"); 360254885Sdumbbell#endif 361254885Sdumbbell if (res->hr_secondary_localcnt > res->hr_primary_remotecnt && 362254885Sdumbbell res->hr_primary_localcnt > res->hr_secondary_remotecnt) { 363254885Sdumbbell /* Exit on split-brain. */ 364254885Sdumbbell event_send(res, EVENT_SPLITBRAIN); 365254885Sdumbbell exit(EX_CONFIG); 366254885Sdumbbell } 367254885Sdumbbell} 368254885Sdumbbell 369254885Sdumbbellvoid 370254885Sdumbbellhastd_secondary(struct hast_resource *res, struct nv *nvin) 371254885Sdumbbell{ 372254885Sdumbbell sigset_t mask; 373254885Sdumbbell pthread_t td; 374254885Sdumbbell pid_t pid; 375254885Sdumbbell int error, mode, debuglevel; 376254885Sdumbbell 377254885Sdumbbell /* 378254885Sdumbbell * Create communication channel between parent and child. 
379254885Sdumbbell */ 380254885Sdumbbell if (proto_client(NULL, "socketpair://", &res->hr_ctrl) < 0) { 381254885Sdumbbell KEEP_ERRNO((void)pidfile_remove(pfh)); 382254885Sdumbbell pjdlog_exit(EX_OSERR, 383254885Sdumbbell "Unable to create control sockets between parent and child"); 384254885Sdumbbell } 385254885Sdumbbell /* 386254885Sdumbbell * Create communication channel between child and parent. 387254885Sdumbbell */ 388254885Sdumbbell if (proto_client(NULL, "socketpair://", &res->hr_event) < 0) { 389254885Sdumbbell KEEP_ERRNO((void)pidfile_remove(pfh)); 390254885Sdumbbell pjdlog_exit(EX_OSERR, 391254885Sdumbbell "Unable to create event sockets between child and parent"); 392254885Sdumbbell } 393254885Sdumbbell 394254885Sdumbbell pid = fork(); 395254885Sdumbbell if (pid < 0) { 396254885Sdumbbell KEEP_ERRNO((void)pidfile_remove(pfh)); 397254885Sdumbbell pjdlog_exit(EX_OSERR, "Unable to fork"); 398254885Sdumbbell } 399254885Sdumbbell 400254885Sdumbbell if (pid > 0) { 401254885Sdumbbell /* This is parent. */ 402254885Sdumbbell proto_close(res->hr_remotein); 403254885Sdumbbell res->hr_remotein = NULL; 404254885Sdumbbell proto_close(res->hr_remoteout); 405254885Sdumbbell res->hr_remoteout = NULL; 406254885Sdumbbell /* Declare that we are receiver. */ 407254885Sdumbbell proto_recv(res->hr_event, NULL, 0); 408254885Sdumbbell /* Declare that we are sender. */ 409254885Sdumbbell proto_send(res->hr_ctrl, NULL, 0); 410254885Sdumbbell res->hr_workerpid = pid; 411254885Sdumbbell return; 412254885Sdumbbell } 413254885Sdumbbell 414254885Sdumbbell gres = res; 415254885Sdumbbell mode = pjdlog_mode_get(); 416254885Sdumbbell debuglevel = pjdlog_debug_get(); 417254885Sdumbbell 418254885Sdumbbell /* Declare that we are sender. */ 419254885Sdumbbell proto_send(res->hr_event, NULL, 0); 420254885Sdumbbell /* Declare that we are receiver. 
*/ 421254885Sdumbbell proto_recv(res->hr_ctrl, NULL, 0); 422254885Sdumbbell descriptors_cleanup(res); 423254885Sdumbbell 424254885Sdumbbell descriptors_assert(res, mode); 425254885Sdumbbell 426254885Sdumbbell pjdlog_init(mode); 427254885Sdumbbell pjdlog_debug_set(debuglevel); 428254885Sdumbbell pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role)); 429254885Sdumbbell setproctitle("%s (%s)", res->hr_name, role2str(res->hr_role)); 430254885Sdumbbell 431254885Sdumbbell PJDLOG_VERIFY(sigemptyset(&mask) == 0); 432254885Sdumbbell PJDLOG_VERIFY(sigprocmask(SIG_SETMASK, &mask, NULL) == 0); 433254885Sdumbbell 434254885Sdumbbell /* Error in setting timeout is not critical, but why should it fail? */ 435254885Sdumbbell if (proto_timeout(res->hr_remotein, 2 * HAST_KEEPALIVE) < 0) 436254885Sdumbbell pjdlog_errno(LOG_WARNING, "Unable to set connection timeout"); 437254885Sdumbbell if (proto_timeout(res->hr_remoteout, res->hr_timeout) < 0) 438254885Sdumbbell pjdlog_errno(LOG_WARNING, "Unable to set connection timeout"); 439254885Sdumbbell 440254885Sdumbbell init_local(res); 441254885Sdumbbell init_environment(); 442254885Sdumbbell 443254885Sdumbbell if (drop_privs(res) != 0) 444254885Sdumbbell exit(EX_CONFIG); 445254885Sdumbbell pjdlog_info("Privileges successfully dropped."); 446254885Sdumbbell 447254885Sdumbbell /* 448254885Sdumbbell * Create the control thread before sending any event to the parent, 449254885Sdumbbell * as we can deadlock when parent sends control request to worker, 450254885Sdumbbell * but worker has no control thread started yet, so parent waits. 451254885Sdumbbell * In the meantime worker sends an event to the parent, but parent 452254885Sdumbbell * is unable to handle the event, because it waits for control 453254885Sdumbbell * request response. 
454254885Sdumbbell */ 455254885Sdumbbell error = pthread_create(&td, NULL, ctrl_thread, res); 456254885Sdumbbell PJDLOG_ASSERT(error == 0); 457254885Sdumbbell 458254885Sdumbbell init_remote(res, nvin); 459254885Sdumbbell event_send(res, EVENT_CONNECT); 460254885Sdumbbell 461254885Sdumbbell error = pthread_create(&td, NULL, recv_thread, res); 462254885Sdumbbell PJDLOG_ASSERT(error == 0); 463254885Sdumbbell error = pthread_create(&td, NULL, disk_thread, res); 464254885Sdumbbell PJDLOG_ASSERT(error == 0); 465254885Sdumbbell (void)send_thread(res); 466254885Sdumbbell} 467254885Sdumbbell 468254885Sdumbbellstatic void 469254885Sdumbbellreqlog(int loglevel, int debuglevel, int error, struct hio *hio, const char *fmt, ...) 470254885Sdumbbell{ 471254885Sdumbbell char msg[1024]; 472254885Sdumbbell va_list ap; 473254885Sdumbbell int len; 474254885Sdumbbell 475254885Sdumbbell va_start(ap, fmt); 476254885Sdumbbell len = vsnprintf(msg, sizeof(msg), fmt, ap); 477254885Sdumbbell va_end(ap); 478254885Sdumbbell if ((size_t)len < sizeof(msg)) { 479254885Sdumbbell switch (hio->hio_cmd) { 480254885Sdumbbell case HIO_READ: 481254885Sdumbbell (void)snprintf(msg + len, sizeof(msg) - len, 482254885Sdumbbell "READ(%ju, %ju).", (uintmax_t)hio->hio_offset, 483254885Sdumbbell (uintmax_t)hio->hio_length); 484254885Sdumbbell break; 485254885Sdumbbell case HIO_DELETE: 486254885Sdumbbell (void)snprintf(msg + len, sizeof(msg) - len, 487254885Sdumbbell "DELETE(%ju, %ju).", (uintmax_t)hio->hio_offset, 488254885Sdumbbell (uintmax_t)hio->hio_length); 489254885Sdumbbell break; 490254885Sdumbbell case HIO_FLUSH: 491254885Sdumbbell (void)snprintf(msg + len, sizeof(msg) - len, "FLUSH."); 492254885Sdumbbell break; 493254885Sdumbbell case HIO_WRITE: 494254885Sdumbbell (void)snprintf(msg + len, sizeof(msg) - len, 495254885Sdumbbell "WRITE(%ju, %ju).", (uintmax_t)hio->hio_offset, 496254885Sdumbbell (uintmax_t)hio->hio_length); 497254885Sdumbbell break; 498254885Sdumbbell case HIO_KEEPALIVE: 499254885Sdumbbell 
(void)snprintf(msg + len, sizeof(msg) - len, "KEEPALIVE."); 500254885Sdumbbell break; 501254885Sdumbbell default: 502254885Sdumbbell (void)snprintf(msg + len, sizeof(msg) - len, 503254885Sdumbbell "UNKNOWN(%u).", (unsigned int)hio->hio_cmd); 504254885Sdumbbell break; 505254885Sdumbbell } 506254885Sdumbbell } 507254885Sdumbbell pjdlog_common(loglevel, debuglevel, error, "%s", msg); 508254885Sdumbbell} 509254885Sdumbbell 510254885Sdumbbellstatic int 511254885Sdumbbellrequnpack(struct hast_resource *res, struct hio *hio) 512254885Sdumbbell{ 513254885Sdumbbell 514254885Sdumbbell hio->hio_cmd = nv_get_uint8(hio->hio_nv, "cmd"); 515254885Sdumbbell if (hio->hio_cmd == 0) { 516254885Sdumbbell pjdlog_error("Header contains no 'cmd' field."); 517254885Sdumbbell hio->hio_error = EINVAL; 518254885Sdumbbell goto end; 519254885Sdumbbell } 520254885Sdumbbell switch (hio->hio_cmd) { 521254885Sdumbbell case HIO_FLUSH: 522254885Sdumbbell case HIO_KEEPALIVE: 523254885Sdumbbell break; 524254885Sdumbbell case HIO_READ: 525254885Sdumbbell case HIO_WRITE: 526254885Sdumbbell case HIO_DELETE: 527254885Sdumbbell hio->hio_offset = nv_get_uint64(hio->hio_nv, "offset"); 528254885Sdumbbell if (nv_error(hio->hio_nv) != 0) { 529254885Sdumbbell pjdlog_error("Header is missing 'offset' field."); 530254885Sdumbbell hio->hio_error = EINVAL; 531254885Sdumbbell goto end; 532254885Sdumbbell } 533254885Sdumbbell hio->hio_length = nv_get_uint64(hio->hio_nv, "length"); 534254885Sdumbbell if (nv_error(hio->hio_nv) != 0) { 535254885Sdumbbell pjdlog_error("Header is missing 'length' field."); 536254885Sdumbbell hio->hio_error = EINVAL; 537254885Sdumbbell goto end; 538254885Sdumbbell } 539254885Sdumbbell if (hio->hio_length == 0) { 540254885Sdumbbell pjdlog_error("Data length is zero."); 541254885Sdumbbell hio->hio_error = EINVAL; 542254885Sdumbbell goto end; 543254885Sdumbbell } 544254885Sdumbbell if (hio->hio_length > MAXPHYS) { 545254885Sdumbbell pjdlog_error("Data length is too large (%ju > %ju).", 
546254885Sdumbbell (uintmax_t)hio->hio_length, (uintmax_t)MAXPHYS); 547254885Sdumbbell hio->hio_error = EINVAL; 548254885Sdumbbell goto end; 549254885Sdumbbell } 550254885Sdumbbell if ((hio->hio_offset % res->hr_local_sectorsize) != 0) { 551254885Sdumbbell pjdlog_error("Offset %ju is not multiple of sector size.", 552254885Sdumbbell (uintmax_t)hio->hio_offset); 553254885Sdumbbell hio->hio_error = EINVAL; 554254885Sdumbbell goto end; 555254885Sdumbbell } 556254885Sdumbbell if ((hio->hio_length % res->hr_local_sectorsize) != 0) { 557254885Sdumbbell pjdlog_error("Length %ju is not multiple of sector size.", 558254885Sdumbbell (uintmax_t)hio->hio_length); 559254885Sdumbbell hio->hio_error = EINVAL; 560254885Sdumbbell goto end; 561254885Sdumbbell } 562254885Sdumbbell if (hio->hio_offset + hio->hio_length > 563254885Sdumbbell (uint64_t)res->hr_datasize) { 564254885Sdumbbell pjdlog_error("Data offset is too large (%ju > %ju).", 565254885Sdumbbell (uintmax_t)(hio->hio_offset + hio->hio_length), 566254885Sdumbbell (uintmax_t)res->hr_datasize); 567254885Sdumbbell hio->hio_error = EINVAL; 568254885Sdumbbell goto end; 569254885Sdumbbell } 570254885Sdumbbell break; 571254885Sdumbbell default: 572254885Sdumbbell pjdlog_error("Header contains invalid 'cmd' (%hhu).", 573254885Sdumbbell hio->hio_cmd); 574254885Sdumbbell hio->hio_error = EINVAL; 575254885Sdumbbell goto end; 576254885Sdumbbell } 577254885Sdumbbell hio->hio_error = 0; 578254885Sdumbbellend: 579254885Sdumbbell return (hio->hio_error); 580254885Sdumbbell} 581254885Sdumbbell 582254885Sdumbbellstatic __dead2 void 583254885Sdumbbellsecondary_exit(int exitcode, const char *fmt, ...) 
584254885Sdumbbell{ 585254885Sdumbbell va_list ap; 586254885Sdumbbell 587254885Sdumbbell PJDLOG_ASSERT(exitcode != EX_OK); 588254885Sdumbbell va_start(ap, fmt); 589254885Sdumbbell pjdlogv_errno(LOG_ERR, fmt, ap); 590254885Sdumbbell va_end(ap); 591254885Sdumbbell event_send(gres, EVENT_DISCONNECT); 592254885Sdumbbell exit(exitcode); 593254885Sdumbbell} 594254885Sdumbbell 595254885Sdumbbell/* 596254885Sdumbbell * Thread receives requests from the primary node. 597254885Sdumbbell */ 598254885Sdumbbellstatic void * 599254885Sdumbbellrecv_thread(void *arg) 600254885Sdumbbell{ 601254885Sdumbbell struct hast_resource *res = arg; 602254885Sdumbbell struct hio *hio; 603254885Sdumbbell 604254885Sdumbbell for (;;) { 605254885Sdumbbell pjdlog_debug(2, "recv: Taking free request."); 606254885Sdumbbell QUEUE_TAKE(free, hio); 607254885Sdumbbell pjdlog_debug(2, "recv: (%p) Got request.", hio); 608254885Sdumbbell if (hast_proto_recv_hdr(res->hr_remotein, &hio->hio_nv) < 0) { 609254885Sdumbbell secondary_exit(EX_TEMPFAIL, 610254885Sdumbbell "Unable to receive request header"); 611254885Sdumbbell } 612254885Sdumbbell if (requnpack(res, hio) != 0) { 613254885Sdumbbell pjdlog_debug(2, 614254885Sdumbbell "recv: (%p) Moving request to the send queue.", 615254885Sdumbbell hio); 616254885Sdumbbell QUEUE_INSERT(send, hio); 617254885Sdumbbell continue; 618254885Sdumbbell } 619254885Sdumbbell switch (hio->hio_cmd) { 620254885Sdumbbell case HIO_READ: 621254885Sdumbbell res->hr_stat_read++; 622254885Sdumbbell break; 623254885Sdumbbell case HIO_WRITE: 624254885Sdumbbell res->hr_stat_write++; 625254885Sdumbbell break; 626254885Sdumbbell case HIO_DELETE: 627254885Sdumbbell res->hr_stat_delete++; 628254885Sdumbbell break; 629254885Sdumbbell case HIO_FLUSH: 630254885Sdumbbell res->hr_stat_flush++; 631254885Sdumbbell break; 632254885Sdumbbell } 633254885Sdumbbell reqlog(LOG_DEBUG, 2, -1, hio, 634254885Sdumbbell "recv: (%p) Got request header: ", hio); 635254885Sdumbbell if (hio->hio_cmd == 
HIO_KEEPALIVE) { 636254885Sdumbbell pjdlog_debug(2, 637254885Sdumbbell "recv: (%p) Moving request to the free queue.", 638254885Sdumbbell hio); 639254885Sdumbbell nv_free(hio->hio_nv); 640254885Sdumbbell QUEUE_INSERT(free, hio); 641254885Sdumbbell continue; 642254885Sdumbbell } else if (hio->hio_cmd == HIO_WRITE) { 643254885Sdumbbell if (hast_proto_recv_data(res, res->hr_remotein, 644254885Sdumbbell hio->hio_nv, hio->hio_data, MAXPHYS) < 0) { 645254885Sdumbbell secondary_exit(EX_TEMPFAIL, 646254885Sdumbbell "Unable to receive request data"); 647254885Sdumbbell } 648254885Sdumbbell } 649254885Sdumbbell pjdlog_debug(2, "recv: (%p) Moving request to the disk queue.", 650254885Sdumbbell hio); 651254885Sdumbbell QUEUE_INSERT(disk, hio); 652254885Sdumbbell } 653254885Sdumbbell /* NOTREACHED */ 654254885Sdumbbell return (NULL); 655254885Sdumbbell} 656254885Sdumbbell 657254885Sdumbbell/* 658254885Sdumbbell * Thread reads from or writes to local component and also handles DELETE and 659254885Sdumbbell * FLUSH requests. 660254885Sdumbbell */ 661254885Sdumbbellstatic void * 662254885Sdumbbelldisk_thread(void *arg) 663254885Sdumbbell{ 664254885Sdumbbell struct hast_resource *res = arg; 665254885Sdumbbell struct hio *hio; 666254885Sdumbbell ssize_t ret; 667254885Sdumbbell bool clear_activemap; 668254885Sdumbbell 669254885Sdumbbell clear_activemap = true; 670254885Sdumbbell 671254885Sdumbbell for (;;) { 672254885Sdumbbell pjdlog_debug(2, "disk: Taking request."); 673254885Sdumbbell QUEUE_TAKE(disk, hio); 674254885Sdumbbell while (clear_activemap) { 675254885Sdumbbell unsigned char *map; 676254885Sdumbbell size_t mapsize; 677254885Sdumbbell 678254885Sdumbbell /* 679254885Sdumbbell * When first request is received, it means that primary 680254885Sdumbbell * already received our activemap, merged it and stored 681254885Sdumbbell * locally. We can now safely clear our activemap. 
682254885Sdumbbell */ 683254885Sdumbbell mapsize = 684254885Sdumbbell activemap_calc_ondisk_size(res->hr_local_mediasize - 685254885Sdumbbell METADATA_SIZE, res->hr_extentsize, 686254885Sdumbbell res->hr_local_sectorsize); 687254885Sdumbbell map = calloc(1, mapsize); 688254885Sdumbbell if (map == NULL) { 689254885Sdumbbell pjdlog_warning("Unable to allocate memory to clear local activemap."); 690254885Sdumbbell break; 691254885Sdumbbell } 692254885Sdumbbell if (pwrite(res->hr_localfd, map, mapsize, 693254885Sdumbbell METADATA_SIZE) != (ssize_t)mapsize) { 694254885Sdumbbell pjdlog_errno(LOG_WARNING, 695254885Sdumbbell "Unable to store cleared activemap"); 696254885Sdumbbell free(map); 697254885Sdumbbell break; 698254885Sdumbbell } 699254885Sdumbbell free(map); 700254885Sdumbbell clear_activemap = false; 701254885Sdumbbell pjdlog_debug(1, "Local activemap cleared."); 702254885Sdumbbell break; 703254885Sdumbbell } 704254885Sdumbbell reqlog(LOG_DEBUG, 2, -1, hio, "disk: (%p) Got request: ", hio); 705254885Sdumbbell /* Handle the actual request. 
*/ 706254885Sdumbbell switch (hio->hio_cmd) { 707254885Sdumbbell case HIO_READ: 708254885Sdumbbell ret = pread(res->hr_localfd, hio->hio_data, 709254885Sdumbbell hio->hio_length, 710254885Sdumbbell hio->hio_offset + res->hr_localoff); 711254885Sdumbbell if (ret < 0) 712254885Sdumbbell hio->hio_error = errno; 713254885Sdumbbell else if (ret != (int64_t)hio->hio_length) 714254885Sdumbbell hio->hio_error = EIO; 715254885Sdumbbell else 716254885Sdumbbell hio->hio_error = 0; 717254885Sdumbbell break; 718254885Sdumbbell case HIO_WRITE: 719254885Sdumbbell ret = pwrite(res->hr_localfd, hio->hio_data, 720254885Sdumbbell hio->hio_length, 721254885Sdumbbell hio->hio_offset + res->hr_localoff); 722254885Sdumbbell if (ret < 0) 723254885Sdumbbell hio->hio_error = errno; 724254885Sdumbbell else if (ret != (int64_t)hio->hio_length) 725254885Sdumbbell hio->hio_error = EIO; 726254885Sdumbbell else 727254885Sdumbbell hio->hio_error = 0; 728254885Sdumbbell break; 729254885Sdumbbell case HIO_DELETE: 730254885Sdumbbell ret = g_delete(res->hr_localfd, 731254885Sdumbbell hio->hio_offset + res->hr_localoff, 732254885Sdumbbell hio->hio_length); 733254885Sdumbbell if (ret < 0) 734254885Sdumbbell hio->hio_error = errno; 735254885Sdumbbell else 736254885Sdumbbell hio->hio_error = 0; 737254885Sdumbbell break; 738254885Sdumbbell case HIO_FLUSH: 739254885Sdumbbell ret = g_flush(res->hr_localfd); 740254885Sdumbbell if (ret < 0) 741254885Sdumbbell hio->hio_error = errno; 742254885Sdumbbell else 743254885Sdumbbell hio->hio_error = 0; 744254885Sdumbbell break; 745254885Sdumbbell } 746254885Sdumbbell if (hio->hio_error != 0) { 747254885Sdumbbell reqlog(LOG_ERR, 0, hio->hio_error, hio, 748254885Sdumbbell "Request failed: "); 749254885Sdumbbell } 750254885Sdumbbell pjdlog_debug(2, "disk: (%p) Moving request to the send queue.", 751254885Sdumbbell hio); 752254885Sdumbbell QUEUE_INSERT(send, hio); 753254885Sdumbbell } 754254885Sdumbbell /* NOTREACHED */ 755254885Sdumbbell return (NULL); 
756254885Sdumbbell} 757254885Sdumbbell 758254885Sdumbbell/* 759254885Sdumbbell * Thread sends requests back to primary node. 760254885Sdumbbell */ 761254885Sdumbbellstatic void * 762254885Sdumbbellsend_thread(void *arg) 763254885Sdumbbell{ 764254885Sdumbbell struct hast_resource *res = arg; 765254885Sdumbbell struct nv *nvout; 766254885Sdumbbell struct hio *hio; 767254885Sdumbbell void *data; 768254885Sdumbbell size_t length; 769254885Sdumbbell 770254885Sdumbbell for (;;) { 771254885Sdumbbell pjdlog_debug(2, "send: Taking request."); 772254885Sdumbbell QUEUE_TAKE(send, hio); 773254885Sdumbbell reqlog(LOG_DEBUG, 2, -1, hio, "send: (%p) Got request: ", hio); 774254885Sdumbbell nvout = nv_alloc(); 775254885Sdumbbell /* Copy sequence number. */ 776254885Sdumbbell nv_add_uint64(nvout, nv_get_uint64(hio->hio_nv, "seq"), "seq"); 777254885Sdumbbell switch (hio->hio_cmd) { 778254885Sdumbbell case HIO_READ: 779254885Sdumbbell if (hio->hio_error == 0) { 780254885Sdumbbell data = hio->hio_data; 781254885Sdumbbell length = hio->hio_length; 782254885Sdumbbell break; 783254885Sdumbbell } 784254885Sdumbbell /* 785254885Sdumbbell * We send no data in case of an error. 
786254885Sdumbbell */ 787254885Sdumbbell /* FALLTHROUGH */ 788254885Sdumbbell case HIO_DELETE: 789254885Sdumbbell case HIO_FLUSH: 790254885Sdumbbell case HIO_WRITE: 791254885Sdumbbell data = NULL; 792254885Sdumbbell length = 0; 793254885Sdumbbell break; 794254885Sdumbbell default: 795254885Sdumbbell PJDLOG_ABORT("Unexpected command (cmd=%hhu).", 796254885Sdumbbell hio->hio_cmd); 797254885Sdumbbell } 798254885Sdumbbell if (hio->hio_error != 0) 799254885Sdumbbell nv_add_int16(nvout, hio->hio_error, "error"); 800254885Sdumbbell if (hast_proto_send(res, res->hr_remoteout, nvout, data, 801254885Sdumbbell length) < 0) { 802254885Sdumbbell secondary_exit(EX_TEMPFAIL, "Unable to send reply."); 803254885Sdumbbell } 804254885Sdumbbell nv_free(nvout); 805254885Sdumbbell pjdlog_debug(2, "send: (%p) Moving request to the free queue.", 806254885Sdumbbell hio); 807254885Sdumbbell nv_free(hio->hio_nv); 808254885Sdumbbell hio->hio_error = 0; 809254885Sdumbbell QUEUE_INSERT(free, hio); 810254885Sdumbbell } 811254885Sdumbbell /* NOTREACHED */ 812254885Sdumbbell return (NULL); 813254885Sdumbbell} 814254885Sdumbbell