hastd.c revision 210886
1204076Spjd/*- 2204076Spjd * Copyright (c) 2009-2010 The FreeBSD Foundation 3210886Spjd * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org> 4204076Spjd * All rights reserved. 5204076Spjd * 6204076Spjd * This software was developed by Pawel Jakub Dawidek under sponsorship from 7204076Spjd * the FreeBSD Foundation. 8204076Spjd * 9204076Spjd * Redistribution and use in source and binary forms, with or without 10204076Spjd * modification, are permitted provided that the following conditions 11204076Spjd * are met: 12204076Spjd * 1. Redistributions of source code must retain the above copyright 13204076Spjd * notice, this list of conditions and the following disclaimer. 14204076Spjd * 2. Redistributions in binary form must reproduce the above copyright 15204076Spjd * notice, this list of conditions and the following disclaimer in the 16204076Spjd * documentation and/or other materials provided with the distribution. 17204076Spjd * 18204076Spjd * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 19204076Spjd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20204076Spjd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21204076Spjd * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 22204076Spjd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23204076Spjd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24204076Spjd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25204076Spjd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26204076Spjd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27204076Spjd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28204076Spjd * SUCH DAMAGE. 29204076Spjd */ 30204076Spjd 31204076Spjd#include <sys/cdefs.h> 32204076Spjd__FBSDID("$FreeBSD: head/sbin/hastd/hastd.c 210886 2010-08-05 19:16:31Z pjd $"); 33204076Spjd 34204076Spjd#include <sys/param.h> 35204076Spjd#include <sys/linker.h> 36204076Spjd#include <sys/module.h> 37204076Spjd#include <sys/wait.h> 38204076Spjd 39204076Spjd#include <assert.h> 40204076Spjd#include <err.h> 41204076Spjd#include <errno.h> 42204076Spjd#include <libutil.h> 43204076Spjd#include <signal.h> 44204076Spjd#include <stdbool.h> 45204076Spjd#include <stdio.h> 46204076Spjd#include <stdlib.h> 47204076Spjd#include <string.h> 48204076Spjd#include <sysexits.h> 49204076Spjd#include <unistd.h> 50204076Spjd 51204076Spjd#include <activemap.h> 52204076Spjd#include <pjdlog.h> 53204076Spjd 54204076Spjd#include "control.h" 55204076Spjd#include "hast.h" 56204076Spjd#include "hast_proto.h" 57204076Spjd#include "hastd.h" 58204076Spjd#include "subr.h" 59204076Spjd 60204076Spjd/* Path to configuration file. */ 61210886Spjdconst char *cfgpath = HAST_CONFIG; 62204076Spjd/* Hastd configuration. */ 63204076Spjdstatic struct hastd_config *cfg; 64204076Spjd/* Was SIGCHLD signal received? */ 65204076Spjdstatic bool sigchld_received = false; 66204076Spjd/* Was SIGHUP signal received? */ 67210886Spjdbool sighup_received = false; 68204076Spjd/* Was SIGINT or SIGTERM signal received? */ 69204076Spjdbool sigexit_received = false; 70204076Spjd/* PID file handle. */ 71204076Spjdstruct pidfh *pfh; 72204076Spjd 73204076Spjdstatic void 74204076Spjdusage(void) 75204076Spjd{ 76204076Spjd 77204076Spjd errx(EX_USAGE, "[-dFh] [-c config] [-P pidfile]"); 78204076Spjd} 79204076Spjd 80204076Spjdstatic void 81204076Spjdsighandler(int sig) 82204076Spjd{ 83204076Spjd 84204076Spjd switch (sig) { 85204076Spjd case SIGCHLD: 86204076Spjd sigchld_received = true; 87204076Spjd break; 88204076Spjd case SIGHUP: 89204076Spjd sighup_received = true; 90204076Spjd break; 91204076Spjd default: 92204076Spjd assert(!"invalid condition"); 93204076Spjd } 94204076Spjd} 95204076Spjd 96204076Spjdstatic void 97204076Spjdg_gate_load(void) 98204076Spjd{ 99204076Spjd 100204076Spjd if (modfind("g_gate") == -1) { 101204076Spjd /* Not present in kernel, try loading it. */ 102204076Spjd if (kldload("geom_gate") == -1 || modfind("g_gate") == -1) { 103204076Spjd if (errno != EEXIST) { 104204076Spjd pjdlog_exit(EX_OSERR, 105204076Spjd "Unable to load geom_gate module"); 106204076Spjd } 107204076Spjd } 108204076Spjd } 109204076Spjd} 110204076Spjd 111204076Spjdstatic void 112207372Spjdchild_exit_log(unsigned int pid, int status) 113207372Spjd{ 114207372Spjd 115207372Spjd if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { 116207372Spjd pjdlog_debug(1, "Worker process exited gracefully (pid=%u).", 117207372Spjd pid); 118207372Spjd } else if (WIFSIGNALED(status)) { 119207372Spjd pjdlog_error("Worker process killed (pid=%u, signal=%d).", 120207372Spjd pid, WTERMSIG(status)); 121207372Spjd } else { 122207372Spjd pjdlog_error("Worker process exited ungracefully (pid=%u, exitcode=%d).", 123207372Spjd pid, WIFEXITED(status) ? WEXITSTATUS(status) : -1); 124207372Spjd } 125207372Spjd} 126207372Spjd 127207372Spjdstatic void 128204076Spjdchild_exit(void) 129204076Spjd{ 130204076Spjd struct hast_resource *res; 131204076Spjd int status; 132204076Spjd pid_t pid; 133204076Spjd 134204076Spjd while ((pid = wait3(&status, WNOHANG, NULL)) > 0) { 135204076Spjd /* Find resource related to the process that just exited. */ 136204076Spjd TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { 137204076Spjd if (pid == res->hr_workerpid) 138204076Spjd break; 139204076Spjd } 140204076Spjd if (res == NULL) { 141204076Spjd /* 142204076Spjd * This can happen when new connection arrives and we 143204076Spjd * cancel child responsible for the old one. 144204076Spjd */ 145204076Spjd continue; 146204076Spjd } 147204076Spjd pjdlog_prefix_set("[%s] (%s) ", res->hr_name, 148204076Spjd role2str(res->hr_role)); 149207372Spjd child_exit_log(pid, status); 150206696Spjd proto_close(res->hr_ctrl); 151204076Spjd res->hr_workerpid = 0; 152204076Spjd if (res->hr_role == HAST_ROLE_PRIMARY) { 153207372Spjd /* 154207372Spjd * Restart child process if it was killed by signal 155207372Spjd * or exited because of temporary problem. 156207372Spjd */ 157207372Spjd if (WIFSIGNALED(status) || 158207372Spjd (WIFEXITED(status) && 159207372Spjd WEXITSTATUS(status) == EX_TEMPFAIL)) { 160207348Spjd sleep(1); 161207348Spjd pjdlog_info("Restarting worker process."); 162207348Spjd hastd_primary(res); 163207348Spjd } else { 164207348Spjd res->hr_role = HAST_ROLE_INIT; 165207348Spjd pjdlog_info("Changing resource role back to %s.", 166207348Spjd role2str(res->hr_role)); 167207348Spjd } 168204076Spjd } 169204076Spjd pjdlog_prefix_set("%s", ""); 170204076Spjd } 171204076Spjd} 172204076Spjd 173210886Spjdstatic bool 174210886Spjdresource_needs_restart(const struct hast_resource *res0, 175210886Spjd const struct hast_resource *res1) 176210886Spjd{ 177210886Spjd 178210886Spjd assert(strcmp(res0->hr_name, res1->hr_name) == 0); 179210886Spjd 180210886Spjd if (strcmp(res0->hr_provname, res1->hr_provname) != 0) 181210886Spjd return (true); 182210886Spjd if (strcmp(res0->hr_localpath, res1->hr_localpath) != 0) 183210886Spjd return (true); 184210886Spjd if (res0->hr_role == HAST_ROLE_INIT || 185210886Spjd res0->hr_role == HAST_ROLE_SECONDARY) { 186210886Spjd if (strcmp(res0->hr_remoteaddr, res1->hr_remoteaddr) != 0) 187210886Spjd return (true); 188210886Spjd if (res0->hr_replication != res1->hr_replication) 189210886Spjd return (true); 190210886Spjd if (res0->hr_timeout != res1->hr_timeout) 191210886Spjd return (true); 192210886Spjd } 193210886Spjd return (false); 194210886Spjd} 195210886Spjd 196210886Spjdstatic bool 197210886Spjdresource_needs_reload(const struct hast_resource *res0, 198210886Spjd const struct hast_resource *res1) 199210886Spjd{ 200210886Spjd 201210886Spjd assert(strcmp(res0->hr_name, res1->hr_name) == 0); 202210886Spjd assert(strcmp(res0->hr_provname, res1->hr_provname) == 0); 203210886Spjd assert(strcmp(res0->hr_localpath, res1->hr_localpath) == 0); 204210886Spjd 205210886Spjd if (res0->hr_role != HAST_ROLE_PRIMARY) 206210886Spjd return (false); 207210886Spjd 208210886Spjd if (strcmp(res0->hr_remoteaddr, res1->hr_remoteaddr) != 0) 209210886Spjd return (true); 210210886Spjd if (res0->hr_replication != res1->hr_replication) 211210886Spjd return (true); 212210886Spjd if (res0->hr_timeout != res1->hr_timeout) 213210886Spjd return (true); 214210886Spjd return (false); 215210886Spjd} 216210886Spjd 217204076Spjdstatic void 218204076Spjdhastd_reload(void) 219204076Spjd{ 220210886Spjd struct hastd_config *newcfg; 221210886Spjd struct hast_resource *nres, *cres, *tres; 222210886Spjd uint8_t role; 223204076Spjd 224210886Spjd pjdlog_info("Reloading configuration..."); 225210886Spjd 226210886Spjd newcfg = yy_config_parse(cfgpath, false); 227210886Spjd if (newcfg == NULL) 228210886Spjd goto failed; 229210886Spjd 230210886Spjd /* 231210886Spjd * Check if control address has changed. 232210886Spjd */ 233210886Spjd if (strcmp(cfg->hc_controladdr, newcfg->hc_controladdr) != 0) { 234210886Spjd if (proto_server(newcfg->hc_controladdr, 235210886Spjd &newcfg->hc_controlconn) < 0) { 236210886Spjd pjdlog_errno(LOG_ERR, 237210886Spjd "Unable to listen on control address %s", 238210886Spjd newcfg->hc_controladdr); 239210886Spjd goto failed; 240210886Spjd } 241210886Spjd } 242210886Spjd /* 243210886Spjd * Check if listen address has changed. 244210886Spjd */ 245210886Spjd if (strcmp(cfg->hc_listenaddr, newcfg->hc_listenaddr) != 0) { 246210886Spjd if (proto_server(newcfg->hc_listenaddr, 247210886Spjd &newcfg->hc_listenconn) < 0) { 248210886Spjd pjdlog_errno(LOG_ERR, "Unable to listen on address %s", 249210886Spjd newcfg->hc_listenaddr); 250210886Spjd goto failed; 251210886Spjd } 252210886Spjd } 253210886Spjd /* 254210886Spjd * Only when both control and listen sockets are successfully 255210886Spjd * initialized switch them to new configuration. 256210886Spjd */ 257210886Spjd if (newcfg->hc_controlconn != NULL) { 258210886Spjd pjdlog_info("Control socket changed from %s to %s.", 259210886Spjd cfg->hc_controladdr, newcfg->hc_controladdr); 260210886Spjd proto_close(cfg->hc_controlconn); 261210886Spjd cfg->hc_controlconn = newcfg->hc_controlconn; 262210886Spjd newcfg->hc_controlconn = NULL; 263210886Spjd strlcpy(cfg->hc_controladdr, newcfg->hc_controladdr, 264210886Spjd sizeof(cfg->hc_controladdr)); 265210886Spjd } 266210886Spjd if (newcfg->hc_listenconn != NULL) { 267210886Spjd pjdlog_info("Listen socket changed from %s to %s.", 268210886Spjd cfg->hc_listenaddr, newcfg->hc_listenaddr); 269210886Spjd proto_close(cfg->hc_listenconn); 270210886Spjd cfg->hc_listenconn = newcfg->hc_listenconn; 271210886Spjd newcfg->hc_listenconn = NULL; 272210886Spjd strlcpy(cfg->hc_listenaddr, newcfg->hc_listenaddr, 273210886Spjd sizeof(cfg->hc_listenaddr)); 274210886Spjd } 275210886Spjd 276210886Spjd /* 277210886Spjd * Stop and remove resources that were removed from the configuration. 278210886Spjd */ 279210886Spjd TAILQ_FOREACH_SAFE(cres, &cfg->hc_resources, hr_next, tres) { 280210886Spjd TAILQ_FOREACH(nres, &newcfg->hc_resources, hr_next) { 281210886Spjd if (strcmp(cres->hr_name, nres->hr_name) == 0) 282210886Spjd break; 283210886Spjd } 284210886Spjd if (nres == NULL) { 285210886Spjd control_set_role(cres, HAST_ROLE_INIT); 286210886Spjd TAILQ_REMOVE(&cfg->hc_resources, cres, hr_next); 287210886Spjd pjdlog_info("Resource %s removed.", cres->hr_name); 288210886Spjd free(cres); 289210886Spjd } 290210886Spjd } 291210886Spjd /* 292210886Spjd * Move new resources to the current configuration. 293210886Spjd */ 294210886Spjd TAILQ_FOREACH_SAFE(nres, &newcfg->hc_resources, hr_next, tres) { 295210886Spjd TAILQ_FOREACH(cres, &cfg->hc_resources, hr_next) { 296210886Spjd if (strcmp(cres->hr_name, nres->hr_name) == 0) 297210886Spjd break; 298210886Spjd } 299210886Spjd if (cres == NULL) { 300210886Spjd TAILQ_REMOVE(&newcfg->hc_resources, nres, hr_next); 301210886Spjd TAILQ_INSERT_TAIL(&cfg->hc_resources, nres, hr_next); 302210886Spjd pjdlog_info("Resource %s added.", nres->hr_name); 303210886Spjd } 304210886Spjd } 305210886Spjd /* 306210886Spjd * Deal with modified resources. 307210886Spjd * Depending on what has changed exactly we might want to perform 308210886Spjd * different actions. 309210886Spjd * 310210886Spjd * We do full resource restart in the following situations: 311210886Spjd * Resource role is INIT or SECONDARY. 312210886Spjd * Resource role is PRIMARY and path to local component or provider 313210886Spjd * name has changed. 314210886Spjd * In case of PRIMARY, the worker process will be killed and restarted, 315210886Spjd * which also means removing /dev/hast/<name> provider and 316210886Spjd * recreating it. 317210886Spjd * 318210886Spjd * We do just reload (send SIGHUP to worker process) if we act as 319210886Spjd * PRIMARY, but only remote address, replication mode and timeout 320210886Spjd * has changed. For those, there is no need to restart worker process. 321210886Spjd * If PRIMARY receives SIGHUP, it will reconnect if remote address or 322210886Spjd * replication mode has changed or simply set new timeout if only 323210886Spjd * timeout has changed. 324210886Spjd */ 325210886Spjd TAILQ_FOREACH_SAFE(nres, &newcfg->hc_resources, hr_next, tres) { 326210886Spjd TAILQ_FOREACH(cres, &cfg->hc_resources, hr_next) { 327210886Spjd if (strcmp(cres->hr_name, nres->hr_name) == 0) 328210886Spjd break; 329210886Spjd } 330210886Spjd assert(cres != NULL); 331210886Spjd if (resource_needs_restart(cres, nres)) { 332210886Spjd pjdlog_info("Resource %s configuration was modified, restarting it.", 333210886Spjd cres->hr_name); 334210886Spjd role = cres->hr_role; 335210886Spjd control_set_role(cres, HAST_ROLE_INIT); 336210886Spjd TAILQ_REMOVE(&cfg->hc_resources, cres, hr_next); 337210886Spjd free(cres); 338210886Spjd TAILQ_REMOVE(&newcfg->hc_resources, nres, hr_next); 339210886Spjd TAILQ_INSERT_TAIL(&cfg->hc_resources, nres, hr_next); 340210886Spjd control_set_role(nres, role); 341210886Spjd } else if (resource_needs_reload(cres, nres)) { 342210886Spjd pjdlog_info("Resource %s configuration was modified, reloading it.", 343210886Spjd cres->hr_name); 344210886Spjd strlcpy(cres->hr_remoteaddr, nres->hr_remoteaddr, 345210886Spjd sizeof(cres->hr_remoteaddr)); 346210886Spjd cres->hr_replication = nres->hr_replication; 347210886Spjd cres->hr_timeout = nres->hr_timeout; 348210886Spjd if (cres->hr_workerpid != 0) { 349210886Spjd if (kill(cres->hr_workerpid, SIGHUP) < 0) { 350210886Spjd pjdlog_errno(LOG_WARNING, 351210886Spjd "Unable to send SIGHUP to worker process %u", 352210886Spjd (unsigned int)cres->hr_workerpid); 353210886Spjd } 354210886Spjd } 355210886Spjd } 356210886Spjd } 357210886Spjd 358210886Spjd yy_config_free(newcfg); 359210886Spjd pjdlog_info("Configuration reloaded successfully."); 360210886Spjd return; 361210886Spjdfailed: 362210886Spjd if (newcfg != NULL) { 363210886Spjd if (newcfg->hc_controlconn != NULL) 364210886Spjd proto_close(newcfg->hc_controlconn); 365210886Spjd if (newcfg->hc_listenconn != NULL) 366210886Spjd proto_close(newcfg->hc_listenconn); 367210886Spjd yy_config_free(newcfg); 368210886Spjd } 369210886Spjd pjdlog_warning("Configuration not reloaded."); 370204076Spjd} 371204076Spjd 372204076Spjdstatic void 373204076Spjdlisten_accept(void) 374204076Spjd{ 375204076Spjd struct hast_resource *res; 376204076Spjd struct proto_conn *conn; 377204076Spjd struct nv *nvin, *nvout, *nverr; 378204076Spjd const char *resname; 379204076Spjd const unsigned char *token; 380204076Spjd char laddr[256], raddr[256]; 381204076Spjd size_t size; 382204076Spjd pid_t pid; 383204076Spjd int status; 384204076Spjd 385204076Spjd proto_local_address(cfg->hc_listenconn, laddr, sizeof(laddr)); 386204076Spjd pjdlog_debug(1, "Accepting connection to %s.", laddr); 387204076Spjd 388204076Spjd if (proto_accept(cfg->hc_listenconn, &conn) < 0) { 389204076Spjd pjdlog_errno(LOG_ERR, "Unable to accept connection %s", laddr); 390204076Spjd return; 391204076Spjd } 392204076Spjd 393204076Spjd proto_local_address(conn, laddr, sizeof(laddr)); 394204076Spjd proto_remote_address(conn, raddr, sizeof(raddr)); 395209185Spjd pjdlog_info("Connection from %s to %s.", raddr, laddr); 396204076Spjd 397207371Spjd /* Error in setting timeout is not critical, but why should it fail? */ 398207371Spjd if (proto_timeout(conn, HAST_TIMEOUT) < 0) 399207371Spjd pjdlog_errno(LOG_WARNING, "Unable to set connection timeout"); 400207371Spjd 401204076Spjd nvin = nvout = nverr = NULL; 402204076Spjd 403204076Spjd /* 404204076Spjd * Before receiving any data see if remote host have access to any 405204076Spjd * resource. 406204076Spjd */ 407204076Spjd TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { 408204076Spjd if (proto_address_match(conn, res->hr_remoteaddr)) 409204076Spjd break; 410204076Spjd } 411204076Spjd if (res == NULL) { 412204076Spjd pjdlog_error("Client %s isn't known.", raddr); 413204076Spjd goto close; 414204076Spjd } 415204076Spjd /* Ok, remote host can access at least one resource. */ 416204076Spjd 417204076Spjd if (hast_proto_recv_hdr(conn, &nvin) < 0) { 418204076Spjd pjdlog_errno(LOG_ERR, "Unable to receive header from %s", 419204076Spjd raddr); 420204076Spjd goto close; 421204076Spjd } 422204076Spjd 423204076Spjd resname = nv_get_string(nvin, "resource"); 424204076Spjd if (resname == NULL) { 425204076Spjd pjdlog_error("No 'resource' field in the header received from %s.", 426204076Spjd raddr); 427204076Spjd goto close; 428204076Spjd } 429204076Spjd pjdlog_debug(2, "%s: resource=%s", raddr, resname); 430204076Spjd token = nv_get_uint8_array(nvin, &size, "token"); 431204076Spjd /* 432204076Spjd * NULL token means that this is first conection. 433204076Spjd */ 434204076Spjd if (token != NULL && size != sizeof(res->hr_token)) { 435204076Spjd pjdlog_error("Received token of invalid size from %s (expected %zu, got %zu).", 436204076Spjd raddr, sizeof(res->hr_token), size); 437204076Spjd goto close; 438204076Spjd } 439204076Spjd 440204076Spjd /* 441204076Spjd * From now on we want to send errors to the remote node. 442204076Spjd */ 443204076Spjd nverr = nv_alloc(); 444204076Spjd 445204076Spjd /* Find resource related to this connection. */ 446204076Spjd TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { 447204076Spjd if (strcmp(resname, res->hr_name) == 0) 448204076Spjd break; 449204076Spjd } 450204076Spjd /* Have we found the resource? */ 451204076Spjd if (res == NULL) { 452204076Spjd pjdlog_error("No resource '%s' as requested by %s.", 453204076Spjd resname, raddr); 454204076Spjd nv_add_stringf(nverr, "errmsg", "Resource not configured."); 455204076Spjd goto fail; 456204076Spjd } 457204076Spjd 458204076Spjd /* Now that we know resource name setup log prefix. */ 459204076Spjd pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role)); 460204076Spjd 461204076Spjd /* Does the remote host have access to this resource? */ 462204076Spjd if (!proto_address_match(conn, res->hr_remoteaddr)) { 463204076Spjd pjdlog_error("Client %s has no access to the resource.", raddr); 464204076Spjd nv_add_stringf(nverr, "errmsg", "No access to the resource."); 465204076Spjd goto fail; 466204076Spjd } 467204076Spjd /* Is the resource marked as secondary? */ 468204076Spjd if (res->hr_role != HAST_ROLE_SECONDARY) { 469204076Spjd pjdlog_error("We act as %s for the resource and not as %s as requested by %s.", 470204076Spjd role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY), 471204076Spjd raddr); 472204076Spjd nv_add_stringf(nverr, "errmsg", 473204076Spjd "Remote node acts as %s for the resource and not as %s.", 474204076Spjd role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY)); 475204076Spjd goto fail; 476204076Spjd } 477204076Spjd /* Does token (if exists) match? */ 478204076Spjd if (token != NULL && memcmp(token, res->hr_token, 479204076Spjd sizeof(res->hr_token)) != 0) { 480204076Spjd pjdlog_error("Token received from %s doesn't match.", raddr); 481209185Spjd nv_add_stringf(nverr, "errmsg", "Token doesn't match."); 482204076Spjd goto fail; 483204076Spjd } 484204076Spjd /* 485204076Spjd * If there is no token, but we have half-open connection 486204076Spjd * (only remotein) or full connection (worker process is running) 487204076Spjd * we have to cancel those and accept the new connection. 488204076Spjd */ 489204076Spjd if (token == NULL) { 490204076Spjd assert(res->hr_remoteout == NULL); 491204076Spjd pjdlog_debug(1, "Initial connection from %s.", raddr); 492204076Spjd if (res->hr_workerpid != 0) { 493204076Spjd assert(res->hr_remotein == NULL); 494204076Spjd pjdlog_debug(1, 495204076Spjd "Worker process exists (pid=%u), stopping it.", 496204076Spjd (unsigned int)res->hr_workerpid); 497204076Spjd /* Stop child process. */ 498204076Spjd if (kill(res->hr_workerpid, SIGINT) < 0) { 499204076Spjd pjdlog_errno(LOG_ERR, 500204076Spjd "Unable to stop worker process (pid=%u)", 501204076Spjd (unsigned int)res->hr_workerpid); 502204076Spjd /* 503204076Spjd * Other than logging the problem we 504204076Spjd * ignore it - nothing smart to do. 505204076Spjd */ 506204076Spjd } 507204076Spjd /* Wait for it to exit. */ 508204076Spjd else if ((pid = waitpid(res->hr_workerpid, 509204076Spjd &status, 0)) != res->hr_workerpid) { 510207372Spjd /* We can only log the problem. */ 511204076Spjd pjdlog_errno(LOG_ERR, 512204076Spjd "Waiting for worker process (pid=%u) failed", 513204076Spjd (unsigned int)res->hr_workerpid); 514204076Spjd } else { 515207372Spjd child_exit_log(res->hr_workerpid, status); 516204076Spjd } 517204076Spjd res->hr_workerpid = 0; 518204076Spjd } else if (res->hr_remotein != NULL) { 519204076Spjd char oaddr[256]; 520204076Spjd 521204076Spjd proto_remote_address(conn, oaddr, sizeof(oaddr)); 522204076Spjd pjdlog_debug(1, 523204076Spjd "Canceling half-open connection from %s on connection from %s.", 524204076Spjd oaddr, raddr); 525204076Spjd proto_close(res->hr_remotein); 526204076Spjd res->hr_remotein = NULL; 527204076Spjd } 528204076Spjd } 529204076Spjd 530204076Spjd /* 531204076Spjd * Checks and cleanups are done. 532204076Spjd */ 533204076Spjd 534204076Spjd if (token == NULL) { 535204076Spjd arc4random_buf(res->hr_token, sizeof(res->hr_token)); 536204076Spjd nvout = nv_alloc(); 537204076Spjd nv_add_uint8_array(nvout, res->hr_token, 538204076Spjd sizeof(res->hr_token), "token"); 539204076Spjd if (nv_error(nvout) != 0) { 540204076Spjd pjdlog_common(LOG_ERR, 0, nv_error(nvout), 541204076Spjd "Unable to prepare return header for %s", raddr); 542204076Spjd nv_add_stringf(nverr, "errmsg", 543204076Spjd "Remote node was unable to prepare return header: %s.", 544204076Spjd strerror(nv_error(nvout))); 545204076Spjd goto fail; 546204076Spjd } 547204076Spjd if (hast_proto_send(NULL, conn, nvout, NULL, 0) < 0) { 548204076Spjd int error = errno; 549204076Spjd 550204076Spjd pjdlog_errno(LOG_ERR, "Unable to send response to %s", 551204076Spjd raddr); 552204076Spjd nv_add_stringf(nverr, "errmsg", 553204076Spjd "Remote node was unable to send response: %s.", 554204076Spjd strerror(error)); 555204076Spjd goto fail; 556204076Spjd } 557204076Spjd res->hr_remotein = conn; 558204076Spjd pjdlog_debug(1, "Incoming connection from %s configured.", 559204076Spjd raddr); 560204076Spjd } else { 561204076Spjd res->hr_remoteout = conn; 562204076Spjd pjdlog_debug(1, "Outgoing connection to %s configured.", raddr); 563204076Spjd hastd_secondary(res, nvin); 564204076Spjd } 565204076Spjd nv_free(nvin); 566204076Spjd nv_free(nvout); 567204076Spjd nv_free(nverr); 568204076Spjd pjdlog_prefix_set("%s", ""); 569204076Spjd return; 570204076Spjdfail: 571204076Spjd if (nv_error(nverr) != 0) { 572204076Spjd pjdlog_common(LOG_ERR, 0, nv_error(nverr), 573204076Spjd "Unable to prepare error header for %s", raddr); 574204076Spjd goto close; 575204076Spjd } 576204076Spjd if (hast_proto_send(NULL, conn, nverr, NULL, 0) < 0) { 577204076Spjd pjdlog_errno(LOG_ERR, "Unable to send error to %s", raddr); 578204076Spjd goto close; 579204076Spjd } 580204076Spjdclose: 581204076Spjd if (nvin != NULL) 582204076Spjd nv_free(nvin); 583204076Spjd if (nvout != NULL) 584204076Spjd nv_free(nvout); 585204076Spjd if (nverr != NULL) 586204076Spjd nv_free(nverr); 587204076Spjd proto_close(conn); 588204076Spjd pjdlog_prefix_set("%s", ""); 589204076Spjd} 590204076Spjd 591204076Spjdstatic void 592204076Spjdmain_loop(void) 593204076Spjd{ 594204076Spjd fd_set rfds, wfds; 595209177Spjd int cfd, lfd, maxfd, ret; 596204076Spjd 597204076Spjd for (;;) { 598204076Spjd if (sigchld_received) { 599204076Spjd sigchld_received = false; 600204076Spjd child_exit(); 601204076Spjd } 602204076Spjd if (sighup_received) { 603204076Spjd sighup_received = false; 604204076Spjd hastd_reload(); 605204076Spjd } 606204076Spjd 607210886Spjd cfd = proto_descriptor(cfg->hc_controlconn); 608210886Spjd lfd = proto_descriptor(cfg->hc_listenconn); 609210886Spjd maxfd = cfd > lfd ? cfd : lfd; 610210886Spjd 611209177Spjd /* Setup descriptors for select(2). */ 612204076Spjd FD_ZERO(&rfds); 613209177Spjd FD_SET(cfd, &rfds); 614209177Spjd FD_SET(lfd, &rfds); 615204076Spjd FD_ZERO(&wfds); 616209177Spjd FD_SET(cfd, &wfds); 617209177Spjd FD_SET(lfd, &wfds); 618204076Spjd 619204076Spjd ret = select(maxfd + 1, &rfds, &wfds, NULL, NULL); 620204076Spjd if (ret == -1) { 621204076Spjd if (errno == EINTR) 622204076Spjd continue; 623204076Spjd KEEP_ERRNO((void)pidfile_remove(pfh)); 624204076Spjd pjdlog_exit(EX_OSERR, "select() failed"); 625204076Spjd } 626204076Spjd 627209177Spjd if (FD_ISSET(cfd, &rfds) || FD_ISSET(cfd, &wfds)) 628204076Spjd control_handle(cfg); 629209177Spjd if (FD_ISSET(lfd, &rfds) || FD_ISSET(lfd, &wfds)) 630204076Spjd listen_accept(); 631204076Spjd } 632204076Spjd} 633204076Spjd 634204076Spjdint 635204076Spjdmain(int argc, char *argv[]) 636204076Spjd{ 637204076Spjd const char *pidfile; 638204076Spjd pid_t otherpid; 639204076Spjd bool foreground; 640204076Spjd int debuglevel; 641204076Spjd 642204076Spjd g_gate_load(); 643204076Spjd 644204076Spjd foreground = false; 645204076Spjd debuglevel = 0; 646204076Spjd pidfile = HASTD_PIDFILE; 647204076Spjd 648204076Spjd for (;;) { 649204076Spjd int ch; 650204076Spjd 651204076Spjd ch = getopt(argc, argv, "c:dFhP:"); 652204076Spjd if (ch == -1) 653204076Spjd break; 654204076Spjd switch (ch) { 655204076Spjd case 'c': 656204076Spjd cfgpath = optarg; 657204076Spjd break; 658204076Spjd case 'd': 659204076Spjd debuglevel++; 660204076Spjd break; 661204076Spjd case 'F': 662204076Spjd foreground = true; 663204076Spjd break; 664204076Spjd case 'P': 665204076Spjd pidfile = optarg; 666204076Spjd break; 667204076Spjd case 'h': 668204076Spjd default: 669204076Spjd usage(); 670204076Spjd } 671204076Spjd } 672204076Spjd argc -= optind; 673204076Spjd argv += optind; 674204076Spjd 675204076Spjd pjdlog_debug_set(debuglevel); 676204076Spjd 677204076Spjd pfh = pidfile_open(pidfile, 0600, &otherpid); 678204076Spjd if (pfh == NULL) { 679204076Spjd if (errno == EEXIST) { 680204076Spjd pjdlog_exitx(EX_TEMPFAIL, 681204076Spjd "Another hastd is already running, pid: %jd.", 682204076Spjd (intmax_t)otherpid); 683204076Spjd } 684204076Spjd /* If we cannot create pidfile from other reasons, only warn. */ 685210879Spjd pjdlog_errno(LOG_WARNING, "Unable to open or create pidfile"); 686204076Spjd } 687204076Spjd 688210883Spjd cfg = yy_config_parse(cfgpath, true); 689204076Spjd assert(cfg != NULL); 690204076Spjd 691204076Spjd signal(SIGHUP, sighandler); 692204076Spjd signal(SIGCHLD, sighandler); 693204076Spjd 694204076Spjd /* Listen on control address. */ 695204076Spjd if (proto_server(cfg->hc_controladdr, &cfg->hc_controlconn) < 0) { 696204076Spjd KEEP_ERRNO((void)pidfile_remove(pfh)); 697204076Spjd pjdlog_exit(EX_OSERR, "Unable to listen on control address %s", 698204076Spjd cfg->hc_controladdr); 699204076Spjd } 700204076Spjd /* Listen for remote connections. */ 701204076Spjd if (proto_server(cfg->hc_listenaddr, &cfg->hc_listenconn) < 0) { 702204076Spjd KEEP_ERRNO((void)pidfile_remove(pfh)); 703204076Spjd pjdlog_exit(EX_OSERR, "Unable to listen on address %s", 704204076Spjd cfg->hc_listenaddr); 705204076Spjd } 706204076Spjd 707204076Spjd if (!foreground) { 708204076Spjd if (daemon(0, 0) < 0) { 709204076Spjd KEEP_ERRNO((void)pidfile_remove(pfh)); 710204076Spjd pjdlog_exit(EX_OSERR, "Unable to daemonize"); 711204076Spjd } 712204076Spjd 713204076Spjd /* Start logging to syslog. */ 714204076Spjd pjdlog_mode_set(PJDLOG_MODE_SYSLOG); 715204076Spjd 716204076Spjd /* Write PID to a file. */ 717204076Spjd if (pidfile_write(pfh) < 0) { 718204076Spjd pjdlog_errno(LOG_WARNING, 719204076Spjd "Unable to write PID to a file"); 720204076Spjd } 721204076Spjd } 722204076Spjd 723204076Spjd main_loop(); 724204076Spjd 725204076Spjd exit(0); 726204076Spjd} 727