/* secondary.c revision 223181 */
1222656Sed/*- 2222656Sed * Copyright (c) 2009-2010 The FreeBSD Foundation 3214152Sed * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org> 4214152Sed * All rights reserved. 5214152Sed * 6214152Sed * This software was developed by Pawel Jakub Dawidek under sponsorship from 7214152Sed * the FreeBSD Foundation. 8214152Sed * 9214152Sed * Redistribution and use in source and binary forms, with or without 10214152Sed * modification, are permitted provided that the following conditions 11214152Sed * are met: 12214152Sed * 1. Redistributions of source code must retain the above copyright 13214152Sed * notice, this list of conditions and the following disclaimer. 14214152Sed * 2. Redistributions in binary form must reproduce the above copyright 15214152Sed * notice, this list of conditions and the following disclaimer in the 16214152Sed * documentation and/or other materials provided with the distribution. 17214152Sed * 18214152Sed * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 19214152Sed * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20214152Sed * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21214152Sed * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 22214152Sed * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23214152Sed * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24214152Sed * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25214152Sed * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26214152Sed * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27214152Sed * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28214152Sed * SUCH DAMAGE. 
29214152Sed */ 30214152Sed 31214152Sed#include <sys/cdefs.h> 32214152Sed__FBSDID("$FreeBSD: head/sbin/hastd/secondary.c 223181 2011-06-17 07:07:26Z trociny $"); 33214152Sed 34214152Sed#include <sys/param.h> 35214152Sed#include <sys/time.h> 36214152Sed#include <sys/bio.h> 37214152Sed#include <sys/disk.h> 38214152Sed#include <sys/stat.h> 39214152Sed 40214152Sed#include <err.h> 41214152Sed#include <errno.h> 42214152Sed#include <fcntl.h> 43214152Sed#include <libgeom.h> 44214152Sed#include <pthread.h> 45214152Sed#include <signal.h> 46214152Sed#include <stdint.h> 47214152Sed#include <stdio.h> 48214152Sed#include <string.h> 49214152Sed#include <sysexits.h> 50214152Sed#include <unistd.h> 51214152Sed 52214152Sed#include <activemap.h> 53214152Sed#include <nv.h> 54214152Sed#include <pjdlog.h> 55214152Sed 56214152Sed#include "control.h" 57214152Sed#include "event.h" 58214152Sed#include "hast.h" 59214152Sed#include "hast_proto.h" 60214152Sed#include "hastd.h" 61214152Sed#include "hooks.h" 62214152Sed#include "metadata.h" 63#include "proto.h" 64#include "subr.h" 65#include "synch.h" 66 67struct hio { 68 uint64_t hio_seq; 69 int hio_error; 70 struct nv *hio_nv; 71 void *hio_data; 72 uint8_t hio_cmd; 73 uint64_t hio_offset; 74 uint64_t hio_length; 75 TAILQ_ENTRY(hio) hio_next; 76}; 77 78static struct hast_resource *gres; 79 80/* 81 * Free list holds unused structures. When free list is empty, we have to wait 82 * until some in-progress requests are freed. 83 */ 84static TAILQ_HEAD(, hio) hio_free_list; 85static pthread_mutex_t hio_free_list_lock; 86static pthread_cond_t hio_free_list_cond; 87/* 88 * Disk thread (the one that do I/O requests) takes requests from this list. 89 */ 90static TAILQ_HEAD(, hio) hio_disk_list; 91static pthread_mutex_t hio_disk_list_lock; 92static pthread_cond_t hio_disk_list_cond; 93/* 94 * There is one recv list for every component, although local components don't 95 * use recv lists as local requests are done synchronously. 
96 */ 97static TAILQ_HEAD(, hio) hio_send_list; 98static pthread_mutex_t hio_send_list_lock; 99static pthread_cond_t hio_send_list_cond; 100 101/* 102 * Maximum number of outstanding I/O requests. 103 */ 104#define HAST_HIO_MAX 256 105 106static void *recv_thread(void *arg); 107static void *disk_thread(void *arg); 108static void *send_thread(void *arg); 109 110#define QUEUE_INSERT(name, hio) do { \ 111 bool _wakeup; \ 112 \ 113 mtx_lock(&hio_##name##_list_lock); \ 114 _wakeup = TAILQ_EMPTY(&hio_##name##_list); \ 115 TAILQ_INSERT_TAIL(&hio_##name##_list, (hio), hio_next); \ 116 mtx_unlock(&hio_##name##_list_lock); \ 117 if (_wakeup) \ 118 cv_signal(&hio_##name##_list_cond); \ 119} while (0) 120#define QUEUE_TAKE(name, hio) do { \ 121 mtx_lock(&hio_##name##_list_lock); \ 122 while (((hio) = TAILQ_FIRST(&hio_##name##_list)) == NULL) { \ 123 cv_wait(&hio_##name##_list_cond, \ 124 &hio_##name##_list_lock); \ 125 } \ 126 TAILQ_REMOVE(&hio_##name##_list, (hio), hio_next); \ 127 mtx_unlock(&hio_##name##_list_lock); \ 128} while (0) 129 130static void 131init_environment(void) 132{ 133 struct hio *hio; 134 unsigned int ii; 135 136 /* 137 * Initialize lists, their locks and theirs condition variables. 138 */ 139 TAILQ_INIT(&hio_free_list); 140 mtx_init(&hio_free_list_lock); 141 cv_init(&hio_free_list_cond); 142 TAILQ_INIT(&hio_disk_list); 143 mtx_init(&hio_disk_list_lock); 144 cv_init(&hio_disk_list_cond); 145 TAILQ_INIT(&hio_send_list); 146 mtx_init(&hio_send_list_lock); 147 cv_init(&hio_send_list_cond); 148 149 /* 150 * Allocate requests pool and initialize requests. 
151 */ 152 for (ii = 0; ii < HAST_HIO_MAX; ii++) { 153 hio = malloc(sizeof(*hio)); 154 if (hio == NULL) { 155 pjdlog_exitx(EX_TEMPFAIL, 156 "Unable to allocate memory (%zu bytes) for hio request.", 157 sizeof(*hio)); 158 } 159 hio->hio_error = 0; 160 hio->hio_data = malloc(MAXPHYS); 161 if (hio->hio_data == NULL) { 162 pjdlog_exitx(EX_TEMPFAIL, 163 "Unable to allocate memory (%zu bytes) for gctl_data.", 164 (size_t)MAXPHYS); 165 } 166 TAILQ_INSERT_HEAD(&hio_free_list, hio, hio_next); 167 } 168} 169 170static void 171init_local(struct hast_resource *res) 172{ 173 174 if (metadata_read(res, true) < 0) 175 exit(EX_NOINPUT); 176} 177 178static void 179init_remote(struct hast_resource *res, struct nv *nvin) 180{ 181 uint64_t resuid; 182 struct nv *nvout; 183 unsigned char *map; 184 size_t mapsize; 185 186#ifdef notyet 187 /* Setup direction. */ 188 if (proto_send(res->hr_remoteout, NULL, 0) == -1) 189 pjdlog_errno(LOG_WARNING, "Unable to set connection direction"); 190#endif 191 192 map = NULL; 193 mapsize = 0; 194 nvout = nv_alloc(); 195 nv_add_int64(nvout, (int64_t)res->hr_datasize, "datasize"); 196 nv_add_int32(nvout, (int32_t)res->hr_extentsize, "extentsize"); 197 resuid = nv_get_uint64(nvin, "resuid"); 198 res->hr_primary_localcnt = nv_get_uint64(nvin, "localcnt"); 199 res->hr_primary_remotecnt = nv_get_uint64(nvin, "remotecnt"); 200 nv_add_uint64(nvout, res->hr_secondary_localcnt, "localcnt"); 201 nv_add_uint64(nvout, res->hr_secondary_remotecnt, "remotecnt"); 202 mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize - 203 METADATA_SIZE, res->hr_extentsize, res->hr_local_sectorsize); 204 map = malloc(mapsize); 205 if (map == NULL) { 206 pjdlog_exitx(EX_TEMPFAIL, 207 "Unable to allocate memory (%zu bytes) for activemap.", 208 mapsize); 209 } 210 /* 211 * When we work as primary and secondary is missing we will increase 212 * localcnt in our metadata. 
When secondary is connected and synced 213 * we make localcnt be equal to remotecnt, which means nodes are more 214 * or less in sync. 215 * Split-brain condition is when both nodes are not able to communicate 216 * and are both configured as primary nodes. In turn, they can both 217 * make incompatible changes to the data and we have to detect that. 218 * Under split-brain condition we will increase our localcnt on first 219 * write and remote node will increase its localcnt on first write. 220 * When we connect we can see that primary's localcnt is greater than 221 * our remotecnt (primary was modified while we weren't watching) and 222 * our localcnt is greater than primary's remotecnt (we were modified 223 * while primary wasn't watching). 224 * There are many possible combinations which are all gathered below. 225 * Don't pay too much attention to exact numbers, the more important 226 * is to compare them. We compare secondary's local with primary's 227 * remote and secondary's remote with primary's local. 228 * Note that every case where primary's localcnt is smaller than 229 * secondary's remotecnt and where secondary's localcnt is smaller than 230 * primary's remotecnt should be impossible in practise. We will perform 231 * full synchronization then. Those cases are marked with an asterisk. 232 * Regular synchronization means that only extents marked as dirty are 233 * synchronized (regular synchronization). 234 * 235 * SECONDARY METADATA PRIMARY METADATA 236 * local=3 remote=3 local=2 remote=2* ?! Full sync from secondary. 237 * local=3 remote=3 local=2 remote=3* ?! Full sync from primary. 238 * local=3 remote=3 local=2 remote=4* ?! Full sync from primary. 239 * local=3 remote=3 local=3 remote=2 Primary is out-of-date, 240 * regular sync from secondary. 241 * local=3 remote=3 local=3 remote=3 Regular sync just in case. 242 * local=3 remote=3 local=3 remote=4* ?! Full sync from primary. 243 * local=3 remote=3 local=4 remote=2 Split-brain condition. 
244 * local=3 remote=3 local=4 remote=3 Secondary out-of-date, 245 * regular sync from primary. 246 * local=3 remote=3 local=4 remote=4* ?! Full sync from primary. 247 */ 248 if (res->hr_resuid == 0) { 249 /* 250 * Provider is used for the first time. If primary node done no 251 * writes yet as well (we will find "virgin" argument) then 252 * there is no need to synchronize anything. If primary node 253 * done any writes already we have to synchronize everything. 254 */ 255 PJDLOG_ASSERT(res->hr_secondary_localcnt == 0); 256 res->hr_resuid = resuid; 257 if (metadata_write(res) < 0) 258 exit(EX_NOINPUT); 259 if (nv_exists(nvin, "virgin")) { 260 free(map); 261 map = NULL; 262 mapsize = 0; 263 } else { 264 memset(map, 0xff, mapsize); 265 } 266 nv_add_int8(nvout, 1, "virgin"); 267 nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc"); 268 } else if (res->hr_resuid != resuid) { 269 char errmsg[256]; 270 271 (void)snprintf(errmsg, sizeof(errmsg), 272 "Resource unique ID mismatch (primary=%ju, secondary=%ju).", 273 (uintmax_t)resuid, (uintmax_t)res->hr_resuid); 274 pjdlog_error("%s", errmsg); 275 nv_add_string(nvout, errmsg, "errmsg"); 276 if (hast_proto_send(res, res->hr_remotein, nvout, NULL, 0) < 0) { 277 pjdlog_exit(EX_TEMPFAIL, "Unable to send response to %s", 278 res->hr_remoteaddr); 279 } 280 nv_free(nvout); 281 exit(EX_CONFIG); 282 } else if ( 283 /* Is primary is out-of-date? */ 284 (res->hr_secondary_localcnt > res->hr_primary_remotecnt && 285 res->hr_secondary_remotecnt == res->hr_primary_localcnt) || 286 /* Nodes are more or less in sync? */ 287 (res->hr_secondary_localcnt == res->hr_primary_remotecnt && 288 res->hr_secondary_remotecnt == res->hr_primary_localcnt) || 289 /* Is secondary is out-of-date? */ 290 (res->hr_secondary_localcnt == res->hr_primary_remotecnt && 291 res->hr_secondary_remotecnt < res->hr_primary_localcnt)) { 292 /* 293 * Nodes are more or less in sync or one of the nodes is 294 * out-of-date. 
295 * It doesn't matter at this point which one, we just have to 296 * send out local bitmap to the remote node. 297 */ 298 if (pread(res->hr_localfd, map, mapsize, METADATA_SIZE) != 299 (ssize_t)mapsize) { 300 pjdlog_exit(LOG_ERR, "Unable to read activemap"); 301 } 302 if (res->hr_secondary_localcnt > res->hr_primary_remotecnt && 303 res->hr_secondary_remotecnt == res->hr_primary_localcnt) { 304 /* Primary is out-of-date, sync from secondary. */ 305 nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc"); 306 } else { 307 /* 308 * Secondary is out-of-date or counts match. 309 * Sync from primary. 310 */ 311 nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc"); 312 } 313 } else if (res->hr_secondary_localcnt > res->hr_primary_remotecnt && 314 res->hr_primary_localcnt > res->hr_secondary_remotecnt) { 315 /* 316 * Not good, we have split-brain condition. 317 */ 318 pjdlog_error("Split-brain detected, exiting."); 319 nv_add_string(nvout, "Split-brain condition!", "errmsg"); 320 free(map); 321 map = NULL; 322 mapsize = 0; 323 } else /* if (res->hr_secondary_localcnt < res->hr_primary_remotecnt || 324 res->hr_primary_localcnt < res->hr_secondary_remotecnt) */ { 325 /* 326 * This should never happen in practise, but we will perform 327 * full synchronization. 328 */ 329 PJDLOG_ASSERT(res->hr_secondary_localcnt < res->hr_primary_remotecnt || 330 res->hr_primary_localcnt < res->hr_secondary_remotecnt); 331 mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize - 332 METADATA_SIZE, res->hr_extentsize, 333 res->hr_local_sectorsize); 334 memset(map, 0xff, mapsize); 335 if (res->hr_secondary_localcnt > res->hr_primary_remotecnt) { 336 /* In this one of five cases sync from secondary. */ 337 nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc"); 338 } else { 339 /* For the rest four cases sync from primary. 
*/ 340 nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc"); 341 } 342 pjdlog_warning("This should never happen, asking for full synchronization (primary(local=%ju, remote=%ju), secondary(local=%ju, remote=%ju)).", 343 (uintmax_t)res->hr_primary_localcnt, 344 (uintmax_t)res->hr_primary_remotecnt, 345 (uintmax_t)res->hr_secondary_localcnt, 346 (uintmax_t)res->hr_secondary_remotecnt); 347 } 348 nv_add_uint32(nvout, (uint32_t)mapsize, "mapsize"); 349 if (hast_proto_send(res, res->hr_remotein, nvout, map, mapsize) < 0) { 350 pjdlog_exit(EX_TEMPFAIL, "Unable to send activemap to %s", 351 res->hr_remoteaddr); 352 } 353 if (map != NULL) 354 free(map); 355 nv_free(nvout); 356#ifdef notyet 357 /* Setup direction. */ 358 if (proto_recv(res->hr_remotein, NULL, 0) == -1) 359 pjdlog_errno(LOG_WARNING, "Unable to set connection direction"); 360#endif 361 if (res->hr_secondary_localcnt > res->hr_primary_remotecnt && 362 res->hr_primary_localcnt > res->hr_secondary_remotecnt) { 363 /* Exit on split-brain. */ 364 event_send(res, EVENT_SPLITBRAIN); 365 exit(EX_CONFIG); 366 } 367} 368 369void 370hastd_secondary(struct hast_resource *res, struct nv *nvin) 371{ 372 sigset_t mask; 373 pthread_t td; 374 pid_t pid; 375 int error, mode, debuglevel; 376 377 /* 378 * Create communication channel between parent and child. 379 */ 380 if (proto_client(NULL, "socketpair://", &res->hr_ctrl) < 0) { 381 KEEP_ERRNO((void)pidfile_remove(pfh)); 382 pjdlog_exit(EX_OSERR, 383 "Unable to create control sockets between parent and child"); 384 } 385 /* 386 * Create communication channel between child and parent. 387 */ 388 if (proto_client(NULL, "socketpair://", &res->hr_event) < 0) { 389 KEEP_ERRNO((void)pidfile_remove(pfh)); 390 pjdlog_exit(EX_OSERR, 391 "Unable to create event sockets between child and parent"); 392 } 393 394 pid = fork(); 395 if (pid < 0) { 396 KEEP_ERRNO((void)pidfile_remove(pfh)); 397 pjdlog_exit(EX_OSERR, "Unable to fork"); 398 } 399 400 if (pid > 0) { 401 /* This is parent. 
*/ 402 proto_close(res->hr_remotein); 403 res->hr_remotein = NULL; 404 proto_close(res->hr_remoteout); 405 res->hr_remoteout = NULL; 406 /* Declare that we are receiver. */ 407 proto_recv(res->hr_event, NULL, 0); 408 /* Declare that we are sender. */ 409 proto_send(res->hr_ctrl, NULL, 0); 410 res->hr_workerpid = pid; 411 return; 412 } 413 414 gres = res; 415 mode = pjdlog_mode_get(); 416 debuglevel = pjdlog_debug_get(); 417 418 /* Declare that we are sender. */ 419 proto_send(res->hr_event, NULL, 0); 420 /* Declare that we are receiver. */ 421 proto_recv(res->hr_ctrl, NULL, 0); 422 descriptors_cleanup(res); 423 424 descriptors_assert(res, mode); 425 426 pjdlog_init(mode); 427 pjdlog_debug_set(debuglevel); 428 pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role)); 429 setproctitle("%s (%s)", res->hr_name, role2str(res->hr_role)); 430 431 PJDLOG_VERIFY(sigemptyset(&mask) == 0); 432 PJDLOG_VERIFY(sigprocmask(SIG_SETMASK, &mask, NULL) == 0); 433 434 /* Error in setting timeout is not critical, but why should it fail? */ 435 if (proto_timeout(res->hr_remotein, 2 * HAST_KEEPALIVE) < 0) 436 pjdlog_errno(LOG_WARNING, "Unable to set connection timeout"); 437 if (proto_timeout(res->hr_remoteout, res->hr_timeout) < 0) 438 pjdlog_errno(LOG_WARNING, "Unable to set connection timeout"); 439 440 init_local(res); 441 init_environment(); 442 443 if (drop_privs(res) != 0) 444 exit(EX_CONFIG); 445 pjdlog_info("Privileges successfully dropped."); 446 447 /* 448 * Create the control thread before sending any event to the parent, 449 * as we can deadlock when parent sends control request to worker, 450 * but worker has no control thread started yet, so parent waits. 451 * In the meantime worker sends an event to the parent, but parent 452 * is unable to handle the event, because it waits for control 453 * request response. 
454 */ 455 error = pthread_create(&td, NULL, ctrl_thread, res); 456 PJDLOG_ASSERT(error == 0); 457 458 init_remote(res, nvin); 459 event_send(res, EVENT_CONNECT); 460 461 error = pthread_create(&td, NULL, recv_thread, res); 462 PJDLOG_ASSERT(error == 0); 463 error = pthread_create(&td, NULL, disk_thread, res); 464 PJDLOG_ASSERT(error == 0); 465 (void)send_thread(res); 466} 467 468static void 469reqlog(int loglevel, int debuglevel, int error, struct hio *hio, const char *fmt, ...) 470{ 471 char msg[1024]; 472 va_list ap; 473 int len; 474 475 va_start(ap, fmt); 476 len = vsnprintf(msg, sizeof(msg), fmt, ap); 477 va_end(ap); 478 if ((size_t)len < sizeof(msg)) { 479 switch (hio->hio_cmd) { 480 case HIO_READ: 481 (void)snprintf(msg + len, sizeof(msg) - len, 482 "READ(%ju, %ju).", (uintmax_t)hio->hio_offset, 483 (uintmax_t)hio->hio_length); 484 break; 485 case HIO_DELETE: 486 (void)snprintf(msg + len, sizeof(msg) - len, 487 "DELETE(%ju, %ju).", (uintmax_t)hio->hio_offset, 488 (uintmax_t)hio->hio_length); 489 break; 490 case HIO_FLUSH: 491 (void)snprintf(msg + len, sizeof(msg) - len, "FLUSH."); 492 break; 493 case HIO_WRITE: 494 (void)snprintf(msg + len, sizeof(msg) - len, 495 "WRITE(%ju, %ju).", (uintmax_t)hio->hio_offset, 496 (uintmax_t)hio->hio_length); 497 break; 498 case HIO_KEEPALIVE: 499 (void)snprintf(msg + len, sizeof(msg) - len, "KEEPALIVE."); 500 break; 501 default: 502 (void)snprintf(msg + len, sizeof(msg) - len, 503 "UNKNOWN(%u).", (unsigned int)hio->hio_cmd); 504 break; 505 } 506 } 507 pjdlog_common(loglevel, debuglevel, error, "%s", msg); 508} 509 510static int 511requnpack(struct hast_resource *res, struct hio *hio) 512{ 513 514 hio->hio_cmd = nv_get_uint8(hio->hio_nv, "cmd"); 515 if (hio->hio_cmd == 0) { 516 pjdlog_error("Header contains no 'cmd' field."); 517 hio->hio_error = EINVAL; 518 goto end; 519 } 520 switch (hio->hio_cmd) { 521 case HIO_FLUSH: 522 case HIO_KEEPALIVE: 523 break; 524 case HIO_READ: 525 case HIO_WRITE: 526 case HIO_DELETE: 527 
hio->hio_offset = nv_get_uint64(hio->hio_nv, "offset"); 528 if (nv_error(hio->hio_nv) != 0) { 529 pjdlog_error("Header is missing 'offset' field."); 530 hio->hio_error = EINVAL; 531 goto end; 532 } 533 hio->hio_length = nv_get_uint64(hio->hio_nv, "length"); 534 if (nv_error(hio->hio_nv) != 0) { 535 pjdlog_error("Header is missing 'length' field."); 536 hio->hio_error = EINVAL; 537 goto end; 538 } 539 if (hio->hio_length == 0) { 540 pjdlog_error("Data length is zero."); 541 hio->hio_error = EINVAL; 542 goto end; 543 } 544 if (hio->hio_length > MAXPHYS) { 545 pjdlog_error("Data length is too large (%ju > %ju).", 546 (uintmax_t)hio->hio_length, (uintmax_t)MAXPHYS); 547 hio->hio_error = EINVAL; 548 goto end; 549 } 550 if ((hio->hio_offset % res->hr_local_sectorsize) != 0) { 551 pjdlog_error("Offset %ju is not multiple of sector size.", 552 (uintmax_t)hio->hio_offset); 553 hio->hio_error = EINVAL; 554 goto end; 555 } 556 if ((hio->hio_length % res->hr_local_sectorsize) != 0) { 557 pjdlog_error("Length %ju is not multiple of sector size.", 558 (uintmax_t)hio->hio_length); 559 hio->hio_error = EINVAL; 560 goto end; 561 } 562 if (hio->hio_offset + hio->hio_length > 563 (uint64_t)res->hr_datasize) { 564 pjdlog_error("Data offset is too large (%ju > %ju).", 565 (uintmax_t)(hio->hio_offset + hio->hio_length), 566 (uintmax_t)res->hr_datasize); 567 hio->hio_error = EINVAL; 568 goto end; 569 } 570 break; 571 default: 572 pjdlog_error("Header contains invalid 'cmd' (%hhu).", 573 hio->hio_cmd); 574 hio->hio_error = EINVAL; 575 goto end; 576 } 577 hio->hio_error = 0; 578end: 579 return (hio->hio_error); 580} 581 582static __dead2 void 583secondary_exit(int exitcode, const char *fmt, ...) 584{ 585 va_list ap; 586 587 PJDLOG_ASSERT(exitcode != EX_OK); 588 va_start(ap, fmt); 589 pjdlogv_errno(LOG_ERR, fmt, ap); 590 va_end(ap); 591 event_send(gres, EVENT_DISCONNECT); 592 exit(exitcode); 593} 594 595/* 596 * Thread receives requests from the primary node. 
597 */ 598static void * 599recv_thread(void *arg) 600{ 601 struct hast_resource *res = arg; 602 struct hio *hio; 603 604 for (;;) { 605 pjdlog_debug(2, "recv: Taking free request."); 606 QUEUE_TAKE(free, hio); 607 pjdlog_debug(2, "recv: (%p) Got request.", hio); 608 if (hast_proto_recv_hdr(res->hr_remotein, &hio->hio_nv) < 0) { 609 secondary_exit(EX_TEMPFAIL, 610 "Unable to receive request header"); 611 } 612 if (requnpack(res, hio) != 0) { 613 pjdlog_debug(2, 614 "recv: (%p) Moving request to the send queue.", 615 hio); 616 QUEUE_INSERT(send, hio); 617 continue; 618 } 619 switch (hio->hio_cmd) { 620 case HIO_READ: 621 res->hr_stat_read++; 622 break; 623 case HIO_WRITE: 624 res->hr_stat_write++; 625 break; 626 case HIO_DELETE: 627 res->hr_stat_delete++; 628 break; 629 case HIO_FLUSH: 630 res->hr_stat_flush++; 631 break; 632 } 633 reqlog(LOG_DEBUG, 2, -1, hio, 634 "recv: (%p) Got request header: ", hio); 635 if (hio->hio_cmd == HIO_KEEPALIVE) { 636 pjdlog_debug(2, 637 "recv: (%p) Moving request to the free queue.", 638 hio); 639 nv_free(hio->hio_nv); 640 QUEUE_INSERT(free, hio); 641 continue; 642 } else if (hio->hio_cmd == HIO_WRITE) { 643 if (hast_proto_recv_data(res, res->hr_remotein, 644 hio->hio_nv, hio->hio_data, MAXPHYS) < 0) { 645 secondary_exit(EX_TEMPFAIL, 646 "Unable to receive request data"); 647 } 648 } 649 pjdlog_debug(2, "recv: (%p) Moving request to the disk queue.", 650 hio); 651 QUEUE_INSERT(disk, hio); 652 } 653 /* NOTREACHED */ 654 return (NULL); 655} 656 657/* 658 * Thread reads from or writes to local component and also handles DELETE and 659 * FLUSH requests. 
660 */ 661static void * 662disk_thread(void *arg) 663{ 664 struct hast_resource *res = arg; 665 struct hio *hio; 666 ssize_t ret; 667 bool clear_activemap; 668 669 clear_activemap = true; 670 671 for (;;) { 672 pjdlog_debug(2, "disk: Taking request."); 673 QUEUE_TAKE(disk, hio); 674 while (clear_activemap) { 675 unsigned char *map; 676 size_t mapsize; 677 678 /* 679 * When first request is received, it means that primary 680 * already received our activemap, merged it and stored 681 * locally. We can now safely clear our activemap. 682 */ 683 mapsize = 684 activemap_calc_ondisk_size(res->hr_local_mediasize - 685 METADATA_SIZE, res->hr_extentsize, 686 res->hr_local_sectorsize); 687 map = calloc(1, mapsize); 688 if (map == NULL) { 689 pjdlog_warning("Unable to allocate memory to clear local activemap."); 690 break; 691 } 692 if (pwrite(res->hr_localfd, map, mapsize, 693 METADATA_SIZE) != (ssize_t)mapsize) { 694 pjdlog_errno(LOG_WARNING, 695 "Unable to store cleared activemap"); 696 free(map); 697 break; 698 } 699 free(map); 700 clear_activemap = false; 701 pjdlog_debug(1, "Local activemap cleared."); 702 } 703 reqlog(LOG_DEBUG, 2, -1, hio, "disk: (%p) Got request: ", hio); 704 /* Handle the actual request. 
*/ 705 switch (hio->hio_cmd) { 706 case HIO_READ: 707 ret = pread(res->hr_localfd, hio->hio_data, 708 hio->hio_length, 709 hio->hio_offset + res->hr_localoff); 710 if (ret < 0) 711 hio->hio_error = errno; 712 else if (ret != (int64_t)hio->hio_length) 713 hio->hio_error = EIO; 714 else 715 hio->hio_error = 0; 716 break; 717 case HIO_WRITE: 718 ret = pwrite(res->hr_localfd, hio->hio_data, 719 hio->hio_length, 720 hio->hio_offset + res->hr_localoff); 721 if (ret < 0) 722 hio->hio_error = errno; 723 else if (ret != (int64_t)hio->hio_length) 724 hio->hio_error = EIO; 725 else 726 hio->hio_error = 0; 727 break; 728 case HIO_DELETE: 729 ret = g_delete(res->hr_localfd, 730 hio->hio_offset + res->hr_localoff, 731 hio->hio_length); 732 if (ret < 0) 733 hio->hio_error = errno; 734 else 735 hio->hio_error = 0; 736 break; 737 case HIO_FLUSH: 738 ret = g_flush(res->hr_localfd); 739 if (ret < 0) 740 hio->hio_error = errno; 741 else 742 hio->hio_error = 0; 743 break; 744 } 745 if (hio->hio_error != 0) { 746 reqlog(LOG_ERR, 0, hio->hio_error, hio, 747 "Request failed: "); 748 } 749 pjdlog_debug(2, "disk: (%p) Moving request to the send queue.", 750 hio); 751 QUEUE_INSERT(send, hio); 752 } 753 /* NOTREACHED */ 754 return (NULL); 755} 756 757/* 758 * Thread sends requests back to primary node. 759 */ 760static void * 761send_thread(void *arg) 762{ 763 struct hast_resource *res = arg; 764 struct nv *nvout; 765 struct hio *hio; 766 void *data; 767 size_t length; 768 769 for (;;) { 770 pjdlog_debug(2, "send: Taking request."); 771 QUEUE_TAKE(send, hio); 772 reqlog(LOG_DEBUG, 2, -1, hio, "send: (%p) Got request: ", hio); 773 nvout = nv_alloc(); 774 /* Copy sequence number. */ 775 nv_add_uint64(nvout, nv_get_uint64(hio->hio_nv, "seq"), "seq"); 776 switch (hio->hio_cmd) { 777 case HIO_READ: 778 if (hio->hio_error == 0) { 779 data = hio->hio_data; 780 length = hio->hio_length; 781 break; 782 } 783 /* 784 * We send no data in case of an error. 
785 */ 786 /* FALLTHROUGH */ 787 case HIO_DELETE: 788 case HIO_FLUSH: 789 case HIO_WRITE: 790 data = NULL; 791 length = 0; 792 break; 793 default: 794 abort(); 795 break; 796 } 797 if (hio->hio_error != 0) 798 nv_add_int16(nvout, hio->hio_error, "error"); 799 if (hast_proto_send(res, res->hr_remoteout, nvout, data, 800 length) < 0) { 801 secondary_exit(EX_TEMPFAIL, "Unable to send reply."); 802 } 803 nv_free(nvout); 804 pjdlog_debug(2, "send: (%p) Moving request to the free queue.", 805 hio); 806 nv_free(hio->hio_nv); 807 hio->hio_error = 0; 808 QUEUE_INSERT(free, hio); 809 } 810 /* NOTREACHED */ 811 return (NULL); 812} 813