/* g_raid3.c — FreeBSD GEOM RAID3 class, revision 144144 */
/*-
 * Copyright (c) 2004-2005 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
25133808Spjd */ 26133808Spjd 27133808Spjd#include <sys/cdefs.h> 28133808Spjd__FBSDID("$FreeBSD: head/sys/geom/raid3/g_raid3.c 144144 2005-03-26 17:24:19Z pjd $"); 29133808Spjd 30133808Spjd#include <sys/param.h> 31133808Spjd#include <sys/systm.h> 32133808Spjd#include <sys/kernel.h> 33133808Spjd#include <sys/module.h> 34133808Spjd#include <sys/limits.h> 35133808Spjd#include <sys/lock.h> 36133808Spjd#include <sys/mutex.h> 37133808Spjd#include <sys/bio.h> 38133808Spjd#include <sys/sysctl.h> 39133808Spjd#include <sys/malloc.h> 40137257Spjd#include <sys/eventhandler.h> 41133808Spjd#include <vm/uma.h> 42133808Spjd#include <geom/geom.h> 43133808Spjd#include <sys/proc.h> 44133808Spjd#include <sys/kthread.h> 45139451Sjhb#include <sys/sched.h> 46133808Spjd#include <geom/raid3/g_raid3.h> 47133808Spjd 48133808Spjd 49133808Spjdstatic MALLOC_DEFINE(M_RAID3, "raid3 data", "GEOM_RAID3 Data"); 50133808Spjd 51133808SpjdSYSCTL_DECL(_kern_geom); 52133808SpjdSYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff"); 53133825Spjdu_int g_raid3_debug = 0; 54134528SpjdTUNABLE_INT("kern.geom.raid3.debug", &g_raid3_debug); 55133808SpjdSYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0, 56133808Spjd "Debug level"); 57135866Spjdstatic u_int g_raid3_timeout = 4; 58137258SpjdTUNABLE_INT("kern.geom.raid3.timeout", &g_raid3_timeout); 59133808SpjdSYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout, 60133808Spjd 0, "Time to wait on all raid3 components"); 61137258Spjdstatic u_int g_raid3_idletime = 5; 62137258SpjdTUNABLE_INT("kern.geom.raid3.idletime", &g_raid3_idletime); 63137258SpjdSYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RW, 64137258Spjd &g_raid3_idletime, 0, "Mark components as clean when idling"); 65133808Spjdstatic u_int g_raid3_reqs_per_sync = 5; 66133808SpjdSYSCTL_UINT(_kern_geom_raid3, OID_AUTO, reqs_per_sync, CTLFLAG_RW, 67133808Spjd &g_raid3_reqs_per_sync, 0, 68133808Spjd "Number of regular I/O requests 
per synchronization request"); 69139940Spjdstatic u_int g_raid3_syncs_per_sec = 1000; 70133808SpjdSYSCTL_UINT(_kern_geom_raid3, OID_AUTO, syncs_per_sec, CTLFLAG_RW, 71133808Spjd &g_raid3_syncs_per_sec, 0, 72133808Spjd "Number of synchronizations requests per second"); 73133808Spjd 74133808Spjdstatic u_int g_raid3_n64k = 50; 75133808SpjdTUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k); 76133808SpjdSYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0, 77133808Spjd "Maximum number of 64kB allocations"); 78133808Spjdstatic u_int g_raid3_n16k = 200; 79133808SpjdTUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k); 80133808SpjdSYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0, 81133808Spjd "Maximum number of 16kB allocations"); 82133808Spjdstatic u_int g_raid3_n4k = 1200; 83133808SpjdTUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k); 84133808SpjdSYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0, 85133808Spjd "Maximum number of 4kB allocations"); 86133808Spjd 87133808SpjdSYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0, 88133808Spjd "GEOM_RAID3 statistics"); 89134168Spjdstatic u_int g_raid3_parity_mismatch = 0; 90134168SpjdSYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD, 91134168Spjd &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode"); 92133808Spjdstatic u_int g_raid3_64k_requested = 0; 93133808SpjdSYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_requested, CTLFLAG_RD, 94133808Spjd &g_raid3_64k_requested, 0, "Number of requested 64kB allocations"); 95133808Spjdstatic u_int g_raid3_64k_failed = 0; 96133808SpjdSYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_failed, CTLFLAG_RD, 97133808Spjd &g_raid3_64k_failed, 0, "Number of failed 64kB allocations"); 98133808Spjdstatic u_int g_raid3_16k_requested = 0; 99133808SpjdSYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_requested, CTLFLAG_RD, 100133808Spjd &g_raid3_16k_requested, 0, "Number of requested 16kB 
allocations"); 101133808Spjdstatic u_int g_raid3_16k_failed = 0; 102133808SpjdSYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_failed, CTLFLAG_RD, 103133808Spjd &g_raid3_16k_failed, 0, "Number of failed 16kB allocations"); 104133808Spjdstatic u_int g_raid3_4k_requested = 0; 105133808SpjdSYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_requested, CTLFLAG_RD, 106133808Spjd &g_raid3_4k_requested, 0, "Number of requested 4kB allocations"); 107133808Spjdstatic u_int g_raid3_4k_failed = 0; 108133808SpjdSYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_failed, CTLFLAG_RD, 109133808Spjd &g_raid3_4k_failed, 0, "Number of failed 4kB allocations"); 110133808Spjd 111133808Spjd#define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ 112133808Spjd G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ 113133808Spjd msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ 114133808Spjd G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ 115133808Spjd} while (0) 116133808Spjd 117137257Spjdstatic eventhandler_tag g_raid3_ehtag = NULL; 118133808Spjd 119133808Spjdstatic int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp, 120133808Spjd struct g_geom *gp); 121133808Spjdstatic g_taste_t g_raid3_taste; 122137257Spjdstatic void g_raid3_init(struct g_class *mp); 123137257Spjdstatic void g_raid3_fini(struct g_class *mp); 124133808Spjd 125133808Spjdstruct g_class g_raid3_class = { 126133808Spjd .name = G_RAID3_CLASS_NAME, 127133808Spjd .version = G_VERSION, 128133808Spjd .ctlreq = g_raid3_config, 129133808Spjd .taste = g_raid3_taste, 130137257Spjd .destroy_geom = g_raid3_destroy_geom, 131137257Spjd .init = g_raid3_init, 132137257Spjd .fini = g_raid3_fini 133133808Spjd}; 134133808Spjd 135133808Spjd 136133808Spjdstatic void g_raid3_destroy_provider(struct g_raid3_softc *sc); 137139144Spjdstatic int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state); 138139144Spjdstatic void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force); 139133808Spjdstatic 
void g_raid3_dumpconf(struct sbuf *sb, const char *indent, 140133808Spjd struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); 141133808Spjdstatic void g_raid3_sync_stop(struct g_raid3_softc *sc, int type); 142133808Spjd 143133808Spjd 144133808Spjdstatic const char * 145133808Spjdg_raid3_disk_state2str(int state) 146133808Spjd{ 147133808Spjd 148133808Spjd switch (state) { 149133808Spjd case G_RAID3_DISK_STATE_NODISK: 150133808Spjd return ("NODISK"); 151133808Spjd case G_RAID3_DISK_STATE_NONE: 152133808Spjd return ("NONE"); 153133808Spjd case G_RAID3_DISK_STATE_NEW: 154133808Spjd return ("NEW"); 155133808Spjd case G_RAID3_DISK_STATE_ACTIVE: 156133808Spjd return ("ACTIVE"); 157133808Spjd case G_RAID3_DISK_STATE_STALE: 158133808Spjd return ("STALE"); 159133808Spjd case G_RAID3_DISK_STATE_SYNCHRONIZING: 160133808Spjd return ("SYNCHRONIZING"); 161133808Spjd case G_RAID3_DISK_STATE_DISCONNECTED: 162133808Spjd return ("DISCONNECTED"); 163133808Spjd default: 164133808Spjd return ("INVALID"); 165133808Spjd } 166133808Spjd} 167133808Spjd 168133808Spjdstatic const char * 169133808Spjdg_raid3_device_state2str(int state) 170133808Spjd{ 171133808Spjd 172133808Spjd switch (state) { 173133808Spjd case G_RAID3_DEVICE_STATE_STARTING: 174133808Spjd return ("STARTING"); 175133808Spjd case G_RAID3_DEVICE_STATE_DEGRADED: 176133808Spjd return ("DEGRADED"); 177133808Spjd case G_RAID3_DEVICE_STATE_COMPLETE: 178133808Spjd return ("COMPLETE"); 179133808Spjd default: 180133808Spjd return ("INVALID"); 181133808Spjd } 182133808Spjd} 183133808Spjd 184133808Spjdconst char * 185133808Spjdg_raid3_get_diskname(struct g_raid3_disk *disk) 186133808Spjd{ 187133808Spjd 188133808Spjd if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) 189133808Spjd return ("[unknown]"); 190133808Spjd return (disk->d_name); 191133808Spjd} 192133808Spjd 193133808Spjd#define g_raid3_xor(src1, src2, dst, size) \ 194133808Spjd _g_raid3_xor((uint64_t *)(src1), (uint64_t *)(src2), \ 195133808Spjd 
(uint64_t *)(dst), (size_t)size) 196133808Spjdstatic void 197133808Spjd_g_raid3_xor(uint64_t *src1, uint64_t *src2, uint64_t *dst, size_t size) 198133808Spjd{ 199133808Spjd 200133808Spjd KASSERT((size % 128) == 0, ("Invalid size: %zu.", size)); 201133808Spjd for (; size > 0; size -= 128) { 202133808Spjd *dst++ = (*src1++) ^ (*src2++); 203133808Spjd *dst++ = (*src1++) ^ (*src2++); 204133808Spjd *dst++ = (*src1++) ^ (*src2++); 205133808Spjd *dst++ = (*src1++) ^ (*src2++); 206133808Spjd *dst++ = (*src1++) ^ (*src2++); 207133808Spjd *dst++ = (*src1++) ^ (*src2++); 208133808Spjd *dst++ = (*src1++) ^ (*src2++); 209133808Spjd *dst++ = (*src1++) ^ (*src2++); 210133808Spjd *dst++ = (*src1++) ^ (*src2++); 211133808Spjd *dst++ = (*src1++) ^ (*src2++); 212133808Spjd *dst++ = (*src1++) ^ (*src2++); 213133808Spjd *dst++ = (*src1++) ^ (*src2++); 214133808Spjd *dst++ = (*src1++) ^ (*src2++); 215133808Spjd *dst++ = (*src1++) ^ (*src2++); 216133808Spjd *dst++ = (*src1++) ^ (*src2++); 217133808Spjd *dst++ = (*src1++) ^ (*src2++); 218133808Spjd } 219133808Spjd} 220133808Spjd 221134168Spjdstatic int 222134168Spjdg_raid3_is_zero(struct bio *bp) 223134168Spjd{ 224134168Spjd static const uint64_t zeros[] = { 225134168Spjd 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 226134168Spjd }; 227134168Spjd u_char *addr; 228134168Spjd ssize_t size; 229134168Spjd 230134168Spjd size = bp->bio_length; 231134168Spjd addr = (u_char *)bp->bio_data; 232134168Spjd for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) { 233134168Spjd if (bcmp(addr, zeros, sizeof(zeros)) != 0) 234134168Spjd return (0); 235134168Spjd } 236134168Spjd return (1); 237134168Spjd} 238134168Spjd 239133808Spjd/* 240133808Spjd * --- Events handling functions --- 241133808Spjd * Events in geom_raid3 are used to maintain disks and device status 242133808Spjd * from one thread to simplify locking. 
243133808Spjd */ 244133808Spjdstatic void 245133808Spjdg_raid3_event_free(struct g_raid3_event *ep) 246133808Spjd{ 247133808Spjd 248133808Spjd free(ep, M_RAID3); 249133808Spjd} 250133808Spjd 251133808Spjdint 252133808Spjdg_raid3_event_send(void *arg, int state, int flags) 253133808Spjd{ 254133808Spjd struct g_raid3_softc *sc; 255133808Spjd struct g_raid3_disk *disk; 256133808Spjd struct g_raid3_event *ep; 257133808Spjd int error; 258133808Spjd 259133808Spjd ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK); 260133808Spjd G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep); 261133808Spjd if ((flags & G_RAID3_EVENT_DEVICE) != 0) { 262133808Spjd disk = NULL; 263133808Spjd sc = arg; 264133808Spjd } else { 265133808Spjd disk = arg; 266133808Spjd sc = disk->d_softc; 267133808Spjd } 268133808Spjd ep->e_disk = disk; 269133808Spjd ep->e_state = state; 270133808Spjd ep->e_flags = flags; 271133808Spjd ep->e_error = 0; 272133808Spjd mtx_lock(&sc->sc_events_mtx); 273133808Spjd TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); 274133808Spjd mtx_unlock(&sc->sc_events_mtx); 275133808Spjd G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); 276133808Spjd mtx_lock(&sc->sc_queue_mtx); 277133808Spjd wakeup(sc); 278133808Spjd wakeup(&sc->sc_queue); 279133808Spjd mtx_unlock(&sc->sc_queue_mtx); 280133808Spjd if ((flags & G_RAID3_EVENT_DONTWAIT) != 0) 281133808Spjd return (0); 282133808Spjd g_topology_assert(); 283133808Spjd G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep); 284133808Spjd g_topology_unlock(); 285133808Spjd while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) { 286133808Spjd mtx_lock(&sc->sc_events_mtx); 287133808Spjd MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event", 288133808Spjd hz * 5); 289133808Spjd } 290133808Spjd /* Don't even try to use 'sc' here, because it could be already dead. 
*/ 291133808Spjd g_topology_lock(); 292133808Spjd error = ep->e_error; 293133808Spjd g_raid3_event_free(ep); 294133808Spjd return (error); 295133808Spjd} 296133808Spjd 297133808Spjdstatic struct g_raid3_event * 298133808Spjdg_raid3_event_get(struct g_raid3_softc *sc) 299133808Spjd{ 300133808Spjd struct g_raid3_event *ep; 301133808Spjd 302133808Spjd mtx_lock(&sc->sc_events_mtx); 303133808Spjd ep = TAILQ_FIRST(&sc->sc_events); 304133808Spjd mtx_unlock(&sc->sc_events_mtx); 305133808Spjd return (ep); 306133808Spjd} 307133808Spjd 308133808Spjdstatic void 309139144Spjdg_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep) 310139144Spjd{ 311139144Spjd 312139144Spjd mtx_lock(&sc->sc_events_mtx); 313139144Spjd TAILQ_REMOVE(&sc->sc_events, ep, e_next); 314139144Spjd mtx_unlock(&sc->sc_events_mtx); 315139144Spjd} 316139144Spjd 317139144Spjdstatic void 318133808Spjdg_raid3_event_cancel(struct g_raid3_disk *disk) 319133808Spjd{ 320133808Spjd struct g_raid3_softc *sc; 321133808Spjd struct g_raid3_event *ep, *tmpep; 322133808Spjd 323133808Spjd g_topology_assert(); 324133808Spjd 325133808Spjd sc = disk->d_softc; 326133808Spjd mtx_lock(&sc->sc_events_mtx); 327133808Spjd TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { 328133808Spjd if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) 329133808Spjd continue; 330133808Spjd if (ep->e_disk != disk) 331133808Spjd continue; 332133808Spjd TAILQ_REMOVE(&sc->sc_events, ep, e_next); 333133808Spjd if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) 334133808Spjd g_raid3_event_free(ep); 335133808Spjd else { 336133808Spjd ep->e_error = ECANCELED; 337133808Spjd wakeup(ep); 338133808Spjd } 339133808Spjd } 340133808Spjd mtx_unlock(&sc->sc_events_mtx); 341133808Spjd} 342133808Spjd 343133808Spjd/* 344133808Spjd * Return the number of disks in the given state. 345133808Spjd * If state is equal to -1, count all connected disks. 
346133808Spjd */ 347133808Spjdu_int 348133808Spjdg_raid3_ndisks(struct g_raid3_softc *sc, int state) 349133808Spjd{ 350133808Spjd struct g_raid3_disk *disk; 351133839Sobrien u_int n, ndisks; 352133808Spjd 353133839Sobrien for (n = ndisks = 0; n < sc->sc_ndisks; n++) { 354133808Spjd disk = &sc->sc_disks[n]; 355133808Spjd if (disk->d_state == G_RAID3_DISK_STATE_NODISK) 356133808Spjd continue; 357133808Spjd if (state == -1 || disk->d_state == state) 358133808Spjd ndisks++; 359133808Spjd } 360133808Spjd return (ndisks); 361133808Spjd} 362133808Spjd 363133808Spjdstatic u_int 364133808Spjdg_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp) 365133808Spjd{ 366133808Spjd struct bio *bp; 367133808Spjd u_int nreqs = 0; 368133808Spjd 369133808Spjd mtx_lock(&sc->sc_queue_mtx); 370133808Spjd TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { 371133808Spjd if (bp->bio_from == cp) 372133808Spjd nreqs++; 373133808Spjd } 374133808Spjd mtx_unlock(&sc->sc_queue_mtx); 375133808Spjd return (nreqs); 376133808Spjd} 377133808Spjd 378133808Spjdstatic int 379133808Spjdg_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp) 380133808Spjd{ 381133808Spjd 382137256Spjd if (cp->index > 0) { 383133808Spjd G_RAID3_DEBUG(2, 384133808Spjd "I/O requests for %s exist, can't destroy it now.", 385133808Spjd cp->provider->name); 386133808Spjd return (1); 387133808Spjd } 388133808Spjd if (g_raid3_nrequests(sc, cp) > 0) { 389133808Spjd G_RAID3_DEBUG(2, 390133808Spjd "I/O requests for %s in queue, can't destroy it now.", 391133808Spjd cp->provider->name); 392133808Spjd return (1); 393133808Spjd } 394133808Spjd return (0); 395133808Spjd} 396133808Spjd 397133808Spjdstatic void 398139144Spjdg_raid3_destroy_consumer(void *arg, int flags __unused) 399139144Spjd{ 400139144Spjd struct g_consumer *cp; 401139144Spjd 402139144Spjd cp = arg; 403139144Spjd G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); 404139144Spjd g_detach(cp); 405139144Spjd g_destroy_consumer(cp); 
406139144Spjd} 407139144Spjd 408139144Spjdstatic void 409133808Spjdg_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) 410133808Spjd{ 411139144Spjd struct g_provider *pp; 412139144Spjd int retaste_wait; 413133808Spjd 414133808Spjd g_topology_assert(); 415133808Spjd 416133808Spjd cp->private = NULL; 417133808Spjd if (g_raid3_is_busy(sc, cp)) 418133808Spjd return; 419133808Spjd G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name); 420139144Spjd pp = cp->provider; 421139144Spjd retaste_wait = 0; 422139144Spjd if (cp->acw == 1) { 423139144Spjd if ((pp->geom->flags & G_GEOM_WITHER) == 0) 424139144Spjd retaste_wait = 1; 425139144Spjd } 426139144Spjd G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, 427139144Spjd -cp->acw, -cp->ace, 0); 428139144Spjd if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) 429139144Spjd g_access(cp, -cp->acr, -cp->acw, -cp->ace); 430139144Spjd if (retaste_wait) { 431139144Spjd /* 432139144Spjd * After retaste event was send (inside g_access()), we can send 433139144Spjd * event to detach and destroy consumer. 434139144Spjd * A class, which has consumer to the given provider connected 435139144Spjd * will not receive retaste event for the provider. 436139144Spjd * This is the way how I ignore retaste events when I close 437139144Spjd * consumers opened for write: I detach and destroy consumer 438139144Spjd * after retaste event is sent. 
439139144Spjd */ 440139144Spjd g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL); 441139144Spjd return; 442139144Spjd } 443139144Spjd G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name); 444133808Spjd g_detach(cp); 445133808Spjd g_destroy_consumer(cp); 446133808Spjd} 447133808Spjd 448133808Spjdstatic int 449133808Spjdg_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp) 450133808Spjd{ 451144144Spjd struct g_consumer *cp; 452133808Spjd int error; 453133808Spjd 454133808Spjd g_topology_assert(); 455133808Spjd KASSERT(disk->d_consumer == NULL, 456133808Spjd ("Disk already connected (device %s).", disk->d_softc->sc_name)); 457133808Spjd 458144144Spjd cp = g_new_consumer(disk->d_softc->sc_geom); 459144144Spjd error = g_attach(cp, pp); 460144144Spjd if (error != 0) { 461144144Spjd g_destroy_consumer(cp); 462133808Spjd return (error); 463144144Spjd } 464144144Spjd error = g_access(cp, 1, 1, 1); 465139144Spjd if (error != 0) { 466144144Spjd g_detach(cp); 467144144Spjd g_destroy_consumer(cp); 468139144Spjd G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).", 469139144Spjd pp->name, error); 470139144Spjd return (error); 471139144Spjd } 472144144Spjd disk->d_consumer = cp; 473144144Spjd disk->d_consumer->private = disk; 474144144Spjd disk->d_consumer->index = 0; 475133808Spjd G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk)); 476133808Spjd return (0); 477133808Spjd} 478133808Spjd 479133808Spjdstatic void 480133808Spjdg_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) 481133808Spjd{ 482133808Spjd 483133808Spjd g_topology_assert(); 484133808Spjd 485133808Spjd if (cp == NULL) 486133808Spjd return; 487139144Spjd if (cp->provider != NULL) 488133808Spjd g_raid3_kill_consumer(sc, cp); 489139144Spjd else 490133808Spjd g_destroy_consumer(cp); 491133808Spjd} 492133808Spjd 493133808Spjd/* 494133808Spjd * Initialize disk. 
This means allocate memory, create consumer, attach it 495133808Spjd * to the provider and open access (r1w1e1) to it. 496133808Spjd */ 497133808Spjdstatic struct g_raid3_disk * 498133808Spjdg_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp, 499133808Spjd struct g_raid3_metadata *md, int *errorp) 500133808Spjd{ 501133808Spjd struct g_raid3_disk *disk; 502133808Spjd int error; 503133808Spjd 504133808Spjd disk = &sc->sc_disks[md->md_no]; 505133808Spjd error = g_raid3_connect_disk(disk, pp); 506144144Spjd if (error != 0) { 507144144Spjd if (errorp != NULL) 508144144Spjd *errorp = error; 509144144Spjd return (NULL); 510144144Spjd } 511133808Spjd disk->d_state = G_RAID3_DISK_STATE_NONE; 512133808Spjd disk->d_flags = md->md_dflags; 513133808Spjd if (md->md_provider[0] != '\0') 514133808Spjd disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED; 515133808Spjd disk->d_sync.ds_consumer = NULL; 516133808Spjd disk->d_sync.ds_offset = md->md_sync_offset; 517133808Spjd disk->d_sync.ds_offset_done = md->md_sync_offset; 518135863Spjd disk->d_sync.ds_resync = -1; 519139295Spjd disk->d_genid = md->md_genid; 520133808Spjd disk->d_sync.ds_syncid = md->md_syncid; 521133808Spjd if (errorp != NULL) 522133808Spjd *errorp = 0; 523133808Spjd return (disk); 524133808Spjd} 525133808Spjd 526133808Spjdstatic void 527133808Spjdg_raid3_destroy_disk(struct g_raid3_disk *disk) 528133808Spjd{ 529133808Spjd struct g_raid3_softc *sc; 530133808Spjd 531133808Spjd g_topology_assert(); 532133808Spjd 533133808Spjd if (disk->d_state == G_RAID3_DISK_STATE_NODISK) 534133808Spjd return; 535133808Spjd g_raid3_event_cancel(disk); 536133808Spjd sc = disk->d_softc; 537133808Spjd switch (disk->d_state) { 538133808Spjd case G_RAID3_DISK_STATE_SYNCHRONIZING: 539133808Spjd if (sc->sc_syncdisk != NULL) 540133808Spjd g_raid3_sync_stop(sc, 1); 541133808Spjd /* FALLTHROUGH */ 542133808Spjd case G_RAID3_DISK_STATE_NEW: 543133808Spjd case G_RAID3_DISK_STATE_STALE: 544133808Spjd case G_RAID3_DISK_STATE_ACTIVE: 
545133808Spjd g_raid3_disconnect_consumer(sc, disk->d_consumer); 546133808Spjd disk->d_consumer = NULL; 547133808Spjd break; 548133808Spjd default: 549133808Spjd KASSERT(0 == 1, ("Wrong disk state (%s, %s).", 550133808Spjd g_raid3_get_diskname(disk), 551133808Spjd g_raid3_disk_state2str(disk->d_state))); 552133808Spjd } 553133808Spjd disk->d_state = G_RAID3_DISK_STATE_NODISK; 554133808Spjd} 555133808Spjd 556133808Spjdstatic void 557133808Spjdg_raid3_destroy_device(struct g_raid3_softc *sc) 558133808Spjd{ 559133808Spjd struct g_raid3_event *ep; 560137257Spjd struct g_raid3_disk *disk; 561133808Spjd struct g_geom *gp; 562133808Spjd struct g_consumer *cp; 563133808Spjd u_int n; 564133808Spjd 565133808Spjd g_topology_assert(); 566133808Spjd 567133808Spjd gp = sc->sc_geom; 568133808Spjd if (sc->sc_provider != NULL) 569133808Spjd g_raid3_destroy_provider(sc); 570137257Spjd for (n = 0; n < sc->sc_ndisks; n++) { 571137257Spjd disk = &sc->sc_disks[n]; 572139144Spjd if (disk->d_state != G_RAID3_DISK_STATE_NODISK) { 573139144Spjd disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; 574139144Spjd g_raid3_update_metadata(disk); 575139144Spjd g_raid3_destroy_disk(disk); 576139144Spjd } 577137257Spjd } 578133808Spjd while ((ep = g_raid3_event_get(sc)) != NULL) { 579139144Spjd g_raid3_event_remove(sc, ep); 580133808Spjd if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) 581133808Spjd g_raid3_event_free(ep); 582133808Spjd else { 583133808Spjd ep->e_error = ECANCELED; 584133808Spjd ep->e_flags |= G_RAID3_EVENT_DONE; 585133808Spjd G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep); 586133808Spjd mtx_lock(&sc->sc_events_mtx); 587133808Spjd wakeup(ep); 588133808Spjd mtx_unlock(&sc->sc_events_mtx); 589133808Spjd } 590133808Spjd } 591133808Spjd callout_drain(&sc->sc_callout); 592133808Spjd gp->softc = NULL; 593133808Spjd cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer); 594133808Spjd if (cp != NULL) 595133808Spjd g_raid3_disconnect_consumer(sc, cp); 596133808Spjd sc->sc_sync.ds_geom->softc = NULL; 
597133808Spjd g_wither_geom(sc->sc_sync.ds_geom, ENXIO); 598133808Spjd uma_zdestroy(sc->sc_zone_64k); 599133808Spjd uma_zdestroy(sc->sc_zone_16k); 600133808Spjd uma_zdestroy(sc->sc_zone_4k); 601133808Spjd mtx_destroy(&sc->sc_queue_mtx); 602133808Spjd mtx_destroy(&sc->sc_events_mtx); 603133808Spjd G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name); 604133808Spjd g_wither_geom(gp, ENXIO); 605133808Spjd} 606133808Spjd 607133808Spjdstatic void 608133808Spjdg_raid3_orphan(struct g_consumer *cp) 609133808Spjd{ 610133808Spjd struct g_raid3_disk *disk; 611133808Spjd 612133808Spjd g_topology_assert(); 613133808Spjd 614133808Spjd disk = cp->private; 615133808Spjd if (disk == NULL) 616133808Spjd return; 617139671Spjd disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID; 618133808Spjd g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, 619133808Spjd G_RAID3_EVENT_DONTWAIT); 620133808Spjd} 621133808Spjd 622133808Spjdstatic int 623133808Spjdg_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) 624133808Spjd{ 625133808Spjd struct g_raid3_softc *sc; 626133808Spjd struct g_consumer *cp; 627133808Spjd off_t offset, length; 628133808Spjd u_char *sector; 629139144Spjd int error = 0; 630133808Spjd 631133808Spjd g_topology_assert(); 632133808Spjd 633133808Spjd sc = disk->d_softc; 634133808Spjd cp = disk->d_consumer; 635133808Spjd KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); 636133808Spjd KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); 637139144Spjd KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1, 638139144Spjd ("Consumer %s closed? 
(r%dw%de%d).", cp->provider->name, cp->acr, 639139144Spjd cp->acw, cp->ace)); 640133808Spjd length = cp->provider->sectorsize; 641133808Spjd offset = cp->provider->mediasize - length; 642133808Spjd sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO); 643139144Spjd if (md != NULL) 644139144Spjd raid3_metadata_encode(md, sector); 645139144Spjd g_topology_unlock(); 646139144Spjd error = g_write_data(cp, offset, sector, length); 647139144Spjd g_topology_lock(); 648133808Spjd free(sector, M_RAID3); 649133808Spjd if (error != 0) { 650139671Spjd disk->d_softc->sc_bump_id = G_RAID3_BUMP_GENID; 651133808Spjd g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, 652133808Spjd G_RAID3_EVENT_DONTWAIT); 653133808Spjd } 654133808Spjd return (error); 655133808Spjd} 656133808Spjd 657133808Spjdint 658133808Spjdg_raid3_clear_metadata(struct g_raid3_disk *disk) 659133808Spjd{ 660133808Spjd int error; 661133808Spjd 662133808Spjd g_topology_assert(); 663133808Spjd error = g_raid3_write_metadata(disk, NULL); 664133808Spjd if (error == 0) { 665133808Spjd G_RAID3_DEBUG(2, "Metadata on %s cleared.", 666133808Spjd g_raid3_get_diskname(disk)); 667133808Spjd } else { 668133808Spjd G_RAID3_DEBUG(0, 669133808Spjd "Cannot clear metadata on disk %s (error=%d).", 670133808Spjd g_raid3_get_diskname(disk), error); 671133808Spjd } 672133808Spjd return (error); 673133808Spjd} 674133808Spjd 675133808Spjdvoid 676133808Spjdg_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) 677133808Spjd{ 678133808Spjd struct g_raid3_softc *sc; 679142727Spjd struct g_provider *pp; 680133808Spjd 681133808Spjd sc = disk->d_softc; 682133808Spjd strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic)); 683133808Spjd md->md_version = G_RAID3_VERSION; 684133808Spjd strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); 685133808Spjd md->md_id = sc->sc_id; 686133808Spjd md->md_all = sc->sc_ndisks; 687139295Spjd md->md_genid = sc->sc_genid; 688133808Spjd md->md_mediasize = 
sc->sc_mediasize; 689133808Spjd md->md_sectorsize = sc->sc_sectorsize; 690133808Spjd md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK); 691133808Spjd md->md_no = disk->d_no; 692133808Spjd md->md_syncid = disk->d_sync.ds_syncid; 693133808Spjd md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK); 694133808Spjd if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) 695133808Spjd md->md_sync_offset = disk->d_sync.ds_offset_done; 696133808Spjd else 697133808Spjd md->md_sync_offset = 0; 698142727Spjd if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL) 699142727Spjd pp = disk->d_consumer->provider; 700142727Spjd else 701142727Spjd pp = NULL; 702142727Spjd if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL) 703142727Spjd strlcpy(md->md_provider, pp->name, sizeof(md->md_provider)); 704142727Spjd else 705133808Spjd bzero(md->md_provider, sizeof(md->md_provider)); 706142727Spjd if (pp != NULL) 707142727Spjd md->md_provsize = pp->mediasize; 708142727Spjd else 709142727Spjd md->md_provsize = 0; 710133808Spjd} 711133808Spjd 712133808Spjdvoid 713133808Spjdg_raid3_update_metadata(struct g_raid3_disk *disk) 714133808Spjd{ 715133808Spjd struct g_raid3_metadata md; 716133808Spjd int error; 717133808Spjd 718133808Spjd g_topology_assert(); 719133808Spjd g_raid3_fill_metadata(disk, &md); 720133808Spjd error = g_raid3_write_metadata(disk, &md); 721133808Spjd if (error == 0) { 722133808Spjd G_RAID3_DEBUG(2, "Metadata on %s updated.", 723133808Spjd g_raid3_get_diskname(disk)); 724133808Spjd } else { 725133808Spjd G_RAID3_DEBUG(0, 726133808Spjd "Cannot update metadata on disk %s (error=%d).", 727133808Spjd g_raid3_get_diskname(disk), error); 728133808Spjd } 729133808Spjd} 730133808Spjd 731133808Spjdstatic void 732139144Spjdg_raid3_bump_syncid(struct g_raid3_softc *sc) 733133808Spjd{ 734133808Spjd struct g_raid3_disk *disk; 735133808Spjd u_int n; 736133808Spjd 737133808Spjd g_topology_assert(); 738133808Spjd KASSERT(g_raid3_ndisks(sc, 
G_RAID3_DISK_STATE_ACTIVE) > 0, 739133808Spjd ("%s called with no active disks (device=%s).", __func__, 740133808Spjd sc->sc_name)); 741133808Spjd 742133808Spjd sc->sc_syncid++; 743139295Spjd G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name, 744139295Spjd sc->sc_syncid); 745133808Spjd for (n = 0; n < sc->sc_ndisks; n++) { 746133808Spjd disk = &sc->sc_disks[n]; 747133808Spjd if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || 748133808Spjd disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { 749133808Spjd disk->d_sync.ds_syncid = sc->sc_syncid; 750133808Spjd g_raid3_update_metadata(disk); 751133808Spjd } 752133808Spjd } 753133808Spjd} 754133808Spjd 755137258Spjdstatic void 756139295Spjdg_raid3_bump_genid(struct g_raid3_softc *sc) 757139295Spjd{ 758139295Spjd struct g_raid3_disk *disk; 759139295Spjd u_int n; 760139295Spjd 761139295Spjd g_topology_assert(); 762139295Spjd KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0, 763139295Spjd ("%s called with no active disks (device=%s).", __func__, 764139295Spjd sc->sc_name)); 765139295Spjd 766139295Spjd sc->sc_genid++; 767139295Spjd G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name, 768139295Spjd sc->sc_genid); 769139295Spjd for (n = 0; n < sc->sc_ndisks; n++) { 770139295Spjd disk = &sc->sc_disks[n]; 771139295Spjd if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || 772139295Spjd disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { 773139295Spjd disk->d_genid = sc->sc_genid; 774139295Spjd g_raid3_update_metadata(disk); 775139295Spjd } 776139295Spjd } 777139295Spjd} 778139295Spjd 779139295Spjdstatic void 780137258Spjdg_raid3_idle(struct g_raid3_softc *sc) 781137258Spjd{ 782137258Spjd struct g_raid3_disk *disk; 783137258Spjd u_int i; 784137258Spjd 785137258Spjd if (sc->sc_provider == NULL || sc->sc_provider->acw == 0) 786137258Spjd return; 787137258Spjd sc->sc_idle = 1; 788137258Spjd g_topology_lock(); 789137258Spjd for (i = 0; i < sc->sc_ndisks; i++) { 790137258Spjd disk = &sc->sc_disks[i]; 
		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
			continue;
		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
		    g_raid3_get_diskname(disk), sc->sc_name);
		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		g_raid3_update_metadata(disk);
	}
	g_topology_unlock();
}

/*
 * Leave idle state: mark all active components as dirty before the
 * first write is accepted, so an unclean shutdown can be detected.
 */
static void
g_raid3_unidle(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	u_int i;

	sc->sc_idle = 0;
	g_topology_lock();
	for (i = 0; i < sc->sc_ndisks; i++) {
		disk = &sc->sc_disks[i];
		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
			continue;
		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
		    g_raid3_get_diskname(disk), sc->sc_name);
		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
		g_raid3_update_metadata(disk);
	}
	g_topology_unlock();
}

/*
 * Return 1 if we should check if RAID3 device is idling.
 */
static int
g_raid3_check_idle(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	u_int i;

	/* Already idle, or not open for writing: no check needed. */
	if (sc->sc_idle)
		return (0);
	if (sc->sc_provider != NULL && sc->sc_provider->acw == 0)
		return (0);
	/*
	 * Check if there are no in-flight requests.
	 */
	for (i = 0; i < sc->sc_ndisks; i++) {
		disk = &sc->sc_disks[i];
		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
			continue;
		/* d_consumer->index counts requests still in flight. */
		if (disk->d_consumer->index > 0)
			return (0);
	}
	return (1);
}

/*
 * Treat bio_driver1 field in parent bio as list head and field bio_caller1
 * in child bio as pointer to the next element on the list.
 */
#define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1

#define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1

#define	G_RAID3_FOREACH_BIO(pbp, bp)					\
	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
	    (bp) = G_RAID3_NEXT_BIO(bp))

/* Safe variant: the next pointer is saved before the loop body runs. */
#define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
	    (bp) = (tmpbp))

/* Initialize the parent bio's list of child bios to empty. */
static void
g_raid3_init_bio(struct bio *pbp)
{

	G_RAID3_HEAD_BIO(pbp) = NULL;
}

/* Unlink the child bio from its parent's singly-linked list. */
static void
g_raid3_remove_bio(struct bio *cbp)
{
	struct bio *pbp, *bp;

	pbp = cbp->bio_parent;
	if (G_RAID3_HEAD_BIO(pbp) == cbp)
		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
	else {
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == cbp) {
				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
				break;
			}
		}
	}
	G_RAID3_NEXT_BIO(cbp) = NULL;
}

/*
 * Replace dbp with sbp on the parent's list: sbp is first unlinked
 * from its own position, then takes dbp's place.
 */
static void
g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
{
	struct bio
*pbp, *bp; 894134168Spjd 895134168Spjd g_raid3_remove_bio(sbp); 896134168Spjd pbp = dbp->bio_parent; 897134168Spjd G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp); 898134168Spjd if (G_RAID3_HEAD_BIO(pbp) == dbp) 899134168Spjd G_RAID3_HEAD_BIO(pbp) = sbp; 900134168Spjd else { 901134168Spjd G_RAID3_FOREACH_BIO(pbp, bp) { 902134168Spjd if (G_RAID3_NEXT_BIO(bp) == dbp) { 903134168Spjd G_RAID3_NEXT_BIO(bp) = sbp; 904134168Spjd break; 905134168Spjd } 906134168Spjd } 907134168Spjd } 908134168Spjd G_RAID3_NEXT_BIO(dbp) = NULL; 909134168Spjd} 910134168Spjd 911134168Spjdstatic void 912133808Spjdg_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp) 913133808Spjd{ 914133808Spjd struct bio *bp, *pbp; 915133808Spjd size_t size; 916133808Spjd 917133808Spjd pbp = cbp->bio_parent; 918133808Spjd pbp->bio_children--; 919133808Spjd KASSERT(cbp->bio_data != NULL, ("NULL bio_data")); 920133808Spjd size = pbp->bio_length / (sc->sc_ndisks - 1); 921133808Spjd if (size > 16384) 922133808Spjd uma_zfree(sc->sc_zone_64k, cbp->bio_data); 923133808Spjd else if (size > 4096) 924133808Spjd uma_zfree(sc->sc_zone_16k, cbp->bio_data); 925133808Spjd else 926133808Spjd uma_zfree(sc->sc_zone_4k, cbp->bio_data); 927133808Spjd if (G_RAID3_HEAD_BIO(pbp) == cbp) { 928133808Spjd G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp); 929133808Spjd G_RAID3_NEXT_BIO(cbp) = NULL; 930133808Spjd g_destroy_bio(cbp); 931133808Spjd } else { 932133808Spjd G_RAID3_FOREACH_BIO(pbp, bp) { 933133808Spjd if (G_RAID3_NEXT_BIO(bp) == cbp) 934133808Spjd break; 935133808Spjd } 936134168Spjd if (bp != NULL) { 937134168Spjd KASSERT(G_RAID3_NEXT_BIO(bp) != NULL, 938134168Spjd ("NULL bp->bio_driver1")); 939134168Spjd G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp); 940134168Spjd G_RAID3_NEXT_BIO(cbp) = NULL; 941134168Spjd } 942133808Spjd g_destroy_bio(cbp); 943133808Spjd } 944133808Spjd} 945133808Spjd 946133808Spjdstatic struct bio * 947133808Spjdg_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp) 948133808Spjd{ 949133808Spjd 
struct bio *bp, *cbp; 950133808Spjd size_t size; 951133808Spjd 952133808Spjd cbp = g_clone_bio(pbp); 953133808Spjd if (cbp == NULL) 954133808Spjd return (NULL); 955133808Spjd size = pbp->bio_length / (sc->sc_ndisks - 1); 956133808Spjd if (size > 16384) { 957133808Spjd cbp->bio_data = uma_zalloc(sc->sc_zone_64k, M_NOWAIT); 958133808Spjd g_raid3_64k_requested++; 959133808Spjd } else if (size > 4096) { 960133808Spjd cbp->bio_data = uma_zalloc(sc->sc_zone_16k, M_NOWAIT); 961133808Spjd g_raid3_16k_requested++; 962133808Spjd } else { 963133808Spjd cbp->bio_data = uma_zalloc(sc->sc_zone_4k, M_NOWAIT); 964133808Spjd g_raid3_4k_requested++; 965133808Spjd } 966133808Spjd if (cbp->bio_data == NULL) { 967133808Spjd if (size > 16384) 968133808Spjd g_raid3_64k_failed++; 969133808Spjd if (size > 4096) 970133808Spjd g_raid3_16k_failed++; 971133808Spjd else 972133808Spjd g_raid3_4k_failed++; 973133808Spjd pbp->bio_children--; 974133808Spjd g_destroy_bio(cbp); 975133808Spjd return (NULL); 976133808Spjd } 977133808Spjd G_RAID3_NEXT_BIO(cbp) = NULL; 978133808Spjd if (G_RAID3_HEAD_BIO(pbp) == NULL) 979133808Spjd G_RAID3_HEAD_BIO(pbp) = cbp; 980133808Spjd else { 981133808Spjd G_RAID3_FOREACH_BIO(pbp, bp) { 982133808Spjd if (G_RAID3_NEXT_BIO(bp) == NULL) { 983133808Spjd G_RAID3_NEXT_BIO(bp) = cbp; 984133808Spjd break; 985133808Spjd } 986133808Spjd } 987133808Spjd } 988133808Spjd return (cbp); 989133808Spjd} 990133808Spjd 991133808Spjdstatic void 992133808Spjdg_raid3_scatter(struct bio *pbp) 993133808Spjd{ 994133808Spjd struct g_raid3_softc *sc; 995133808Spjd struct g_raid3_disk *disk; 996133808Spjd struct bio *bp, *cbp; 997133808Spjd off_t atom, cadd, padd, left; 998133808Spjd 999133808Spjd sc = pbp->bio_to->geom->softc; 1000133808Spjd bp = NULL; 1001133808Spjd if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { 1002133808Spjd /* 1003133808Spjd * Find bio for which we should calculate data. 
1004133808Spjd */ 1005133808Spjd G_RAID3_FOREACH_BIO(pbp, cbp) { 1006133808Spjd if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { 1007133808Spjd bp = cbp; 1008133808Spjd break; 1009133808Spjd } 1010133808Spjd } 1011133808Spjd KASSERT(bp != NULL, ("NULL parity bio.")); 1012133808Spjd } 1013133808Spjd atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); 1014133808Spjd cadd = padd = 0; 1015133808Spjd for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { 1016133808Spjd G_RAID3_FOREACH_BIO(pbp, cbp) { 1017133808Spjd if (cbp == bp) 1018133808Spjd continue; 1019133808Spjd bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom); 1020133808Spjd padd += atom; 1021133808Spjd } 1022133808Spjd cadd += atom; 1023133808Spjd } 1024133808Spjd if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { 1025133808Spjd struct bio *tmpbp; 1026133808Spjd 1027133808Spjd /* 1028133808Spjd * Calculate parity. 1029133808Spjd */ 1030133808Spjd bzero(bp->bio_data, bp->bio_length); 1031133808Spjd G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { 1032133808Spjd if (cbp == bp) 1033133808Spjd continue; 1034133808Spjd g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data, 1035133808Spjd bp->bio_length); 1036133808Spjd if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0) 1037133808Spjd g_raid3_destroy_bio(sc, cbp); 1038133808Spjd } 1039133808Spjd } 1040133808Spjd G_RAID3_FOREACH_BIO(pbp, cbp) { 1041133808Spjd struct g_consumer *cp; 1042133808Spjd 1043133808Spjd disk = cbp->bio_caller2; 1044133808Spjd cp = disk->d_consumer; 1045133808Spjd cbp->bio_to = cp->provider; 1046133808Spjd G_RAID3_LOGREQ(3, cbp, "Sending request."); 1047139144Spjd KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1, 1048139144Spjd ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, 1049139144Spjd cp->acr, cp->acw, cp->ace)); 1050137256Spjd cp->index++; 1051133808Spjd g_io_request(cbp, cp); 1052133808Spjd } 1053133808Spjd} 1054133808Spjd 1055133808Spjdstatic void 1056133808Spjdg_raid3_gather(struct bio 
*pbp) 1057133808Spjd{ 1058133808Spjd struct g_raid3_softc *sc; 1059133808Spjd struct g_raid3_disk *disk; 1060134124Spjd struct bio *xbp, *fbp, *cbp; 1061133808Spjd off_t atom, cadd, padd, left; 1062133808Spjd 1063133808Spjd sc = pbp->bio_to->geom->softc; 1064134124Spjd /* 1065134124Spjd * Find bio for which we have to calculate data. 1066134124Spjd * While going through this path, check if all requests 1067134124Spjd * succeeded, if not, deny whole request. 1068134124Spjd * If we're in COMPLETE mode, we allow one request to fail, 1069134124Spjd * so if we find one, we're sending it to the parity consumer. 1070134124Spjd * If there are more failed requests, we deny whole request. 1071134124Spjd */ 1072134124Spjd xbp = fbp = NULL; 1073134124Spjd G_RAID3_FOREACH_BIO(pbp, cbp) { 1074134124Spjd if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { 1075134124Spjd KASSERT(xbp == NULL, ("More than one parity bio.")); 1076134124Spjd xbp = cbp; 1077134124Spjd } 1078134124Spjd if (cbp->bio_error == 0) 1079134124Spjd continue; 1080133808Spjd /* 1081134124Spjd * Found failed request. 1082133808Spjd */ 1083134124Spjd G_RAID3_LOGREQ(0, cbp, "Request failed."); 1084134124Spjd disk = cbp->bio_caller2; 1085134124Spjd if (disk != NULL) { 1086133808Spjd /* 1087139295Spjd * Actually this is pointless to bump genid, 1088134124Spjd * because whole device is fucked up. 1089133808Spjd */ 1090139671Spjd sc->sc_bump_id |= G_RAID3_BUMP_GENID; 1091134124Spjd g_raid3_event_send(disk, 1092134124Spjd G_RAID3_DISK_STATE_DISCONNECTED, 1093134124Spjd G_RAID3_EVENT_DONTWAIT); 1094134124Spjd } 1095134124Spjd if (fbp == NULL) { 1096134124Spjd if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) { 1097133808Spjd /* 1098134124Spjd * We are already in degraded mode, so we can't 1099134124Spjd * accept any failures. 
1100133808Spjd */ 1101134124Spjd if (pbp->bio_error == 0) 1102134124Spjd pbp->bio_error = fbp->bio_error; 1103134124Spjd } else { 1104134124Spjd fbp = cbp; 1105133808Spjd } 1106134124Spjd } else { 1107133808Spjd /* 1108134124Spjd * Next failed request, that's too many. 1109133808Spjd */ 1110134124Spjd if (pbp->bio_error == 0) 1111134124Spjd pbp->bio_error = fbp->bio_error; 1112134124Spjd } 1113134124Spjd } 1114134124Spjd if (pbp->bio_error != 0) 1115134124Spjd goto finish; 1116134168Spjd if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) { 1117134168Spjd pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY; 1118134168Spjd if (xbp != fbp) 1119134168Spjd g_raid3_replace_bio(xbp, fbp); 1120134168Spjd g_raid3_destroy_bio(sc, fbp); 1121134168Spjd } else if (fbp != NULL) { 1122134124Spjd struct g_consumer *cp; 1123134124Spjd 1124134124Spjd /* 1125134124Spjd * One request failed, so send the same request to 1126134124Spjd * the parity consumer. 1127134124Spjd */ 1128134124Spjd disk = pbp->bio_driver2; 1129134124Spjd if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { 1130134124Spjd pbp->bio_error = fbp->bio_error; 1131133808Spjd goto finish; 1132133808Spjd } 1133134124Spjd pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; 1134134124Spjd pbp->bio_inbed--; 1135134124Spjd fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR); 1136134124Spjd if (disk->d_no == sc->sc_ndisks - 1) 1137134124Spjd fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; 1138134124Spjd fbp->bio_error = 0; 1139134124Spjd fbp->bio_completed = 0; 1140134124Spjd fbp->bio_children = 0; 1141134124Spjd fbp->bio_inbed = 0; 1142134124Spjd cp = disk->d_consumer; 1143134124Spjd fbp->bio_caller2 = disk; 1144134124Spjd fbp->bio_to = cp->provider; 1145134124Spjd G_RAID3_LOGREQ(3, fbp, "Sending request (recover)."); 1146139144Spjd KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1, 1147134124Spjd ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, 1148134124Spjd cp->acr, cp->acw, cp->ace)); 1149137256Spjd cp->index++; 
1150134124Spjd g_io_request(fbp, cp); 1151134124Spjd return; 1152134124Spjd } 1153134124Spjd if (xbp != NULL) { 1154133808Spjd /* 1155133808Spjd * Calculate parity. 1156133808Spjd */ 1157133808Spjd G_RAID3_FOREACH_BIO(pbp, cbp) { 1158133808Spjd if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) 1159133808Spjd continue; 1160134124Spjd g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_data, 1161134124Spjd xbp->bio_length); 1162133808Spjd } 1163134124Spjd xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY; 1164134168Spjd if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) { 1165134168Spjd if (!g_raid3_is_zero(xbp)) { 1166134168Spjd g_raid3_parity_mismatch++; 1167134168Spjd pbp->bio_error = EIO; 1168134168Spjd goto finish; 1169134168Spjd } 1170134168Spjd g_raid3_destroy_bio(sc, xbp); 1171134168Spjd } 1172133808Spjd } 1173133808Spjd atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); 1174133808Spjd cadd = padd = 0; 1175133808Spjd for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { 1176133808Spjd G_RAID3_FOREACH_BIO(pbp, cbp) { 1177133808Spjd bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom); 1178133808Spjd pbp->bio_completed += atom; 1179133808Spjd padd += atom; 1180133808Spjd } 1181133808Spjd cadd += atom; 1182133808Spjd } 1183133808Spjdfinish: 1184133808Spjd if (pbp->bio_error == 0) 1185133808Spjd G_RAID3_LOGREQ(3, pbp, "Request finished."); 1186134303Spjd else { 1187134303Spjd if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) 1188134303Spjd G_RAID3_LOGREQ(1, pbp, "Verification error."); 1189134303Spjd else 1190134303Spjd G_RAID3_LOGREQ(0, pbp, "Request failed."); 1191134303Spjd } 1192134168Spjd pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK; 1193133808Spjd g_io_deliver(pbp, pbp->bio_error); 1194133808Spjd while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) 1195133808Spjd g_raid3_destroy_bio(sc, cbp); 1196133808Spjd} 1197133808Spjd 1198133808Spjdstatic void 1199133808Spjdg_raid3_done(struct bio *bp) 1200133808Spjd{ 1201133808Spjd struct g_raid3_softc 
 *sc;

	sc = bp->bio_from->geom->softc;
	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
	mtx_lock(&sc->sc_queue_mtx);
	bioq_insert_head(&sc->sc_queue, bp);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
}

/*
 * Handle a completed child of a regular request (runs in the worker
 * thread).  When the last child arrives, finish the parent: reads go
 * through g_raid3_gather(), writes/deletes are delivered here after
 * accounting for failed components.
 */
static void
g_raid3_regular_request(struct bio *cbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *pbp;

	g_topology_assert_not();

	cbp->bio_from->index--;
	pbp = cbp->bio_parent;
	sc = pbp->bio_to->geom->softc;
	disk = cbp->bio_from->private;
	/* A NULL private means the consumer was detached; kill it. */
	if (disk == NULL) {
		g_topology_lock();
		g_raid3_kill_consumer(sc, cbp->bio_from);
		g_topology_unlock();
	}

	G_RAID3_LOGREQ(3, cbp, "Request finished.");
	pbp->bio_inbed++;
	KASSERT(pbp->bio_inbed <= pbp->bio_children,
	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
	    pbp->bio_children));
	/* Wait until every child has completed. */
	if (pbp->bio_inbed != pbp->bio_children)
		return;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		g_raid3_gather(pbp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		int error = 0;

		pbp->bio_completed = pbp->bio_length;
		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
			if (cbp->bio_error != 0) {
				disk = cbp->bio_caller2;
				if (disk != NULL) {
					sc->sc_bump_id |= G_RAID3_BUMP_GENID;
					g_raid3_event_send(disk,
					    G_RAID3_DISK_STATE_DISCONNECTED,
					    G_RAID3_EVENT_DONTWAIT);
				}
				/* One failure is tolerated; more fail pbp. */
				if (error == 0)
					error = cbp->bio_error;
				else if (pbp->bio_error == 0) {
					/*
					 * Next failed request, that's too many.
					 */
					pbp->bio_error = error;
				}
			}
			g_raid3_destroy_bio(sc, cbp);
		}
		if (pbp->bio_error == 0)
			G_RAID3_LOGREQ(3, pbp, "Request finished.");
		else
			G_RAID3_LOGREQ(0, pbp, "Request failed.");
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
		g_io_deliver(pbp, pbp->bio_error);
		break;
	    }
	}
}

/*
 * Completion callback for synchronization requests: flag the bio and
 * hand it back to the worker thread via the device queue.
 */
static void
g_raid3_sync_done(struct bio *bp)
{
	struct g_raid3_softc *sc;

	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
	sc = bp->bio_from->geom->softc;
	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
	mtx_lock(&sc->sc_queue_mtx);
	bioq_insert_head(&sc->sc_queue, bp);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
}

/*
 * GEOM start routine: validate the command and queue the request for
 * the worker thread.
 */
static void
g_raid3_start(struct bio *bp)
{
	struct g_raid3_softc *sc;

	sc = bp->bio_to->geom->softc;
	/*
	 * If sc == NULL or there are no valid disks, provider's error
	 * should be set and g_raid3_start() should not be called at all.
	 */
	KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
	    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
	    ("Provider's error should be set (error=%d)(device=%s).",
	    bp->bio_to->error, bp->bio_to->name));
	G_RAID3_LOGREQ(3, bp, "Request received.");

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	case BIO_GETATTR:
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}
	mtx_lock(&sc->sc_queue_mtx);
	bioq_insert_tail(&sc->sc_queue, bp);
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	wakeup(sc);
	mtx_unlock(&sc->sc_queue_mtx);
}

/*
 * Send one synchronization request.
 */
static void
g_raid3_sync_one(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	struct bio *bp;

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Wrong device state (%s, %s).", sc->sc_name,
	    g_raid3_device_state2str(sc->sc_state)));
	disk = sc->sc_syncdisk;
	KASSERT(disk != NULL, ("No sync disk (%s).", sc->sc_name));
	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
	    ("Disk %s is not marked for synchronization.",
	    g_raid3_get_diskname(disk)));

	bp = g_new_bio();
	if (bp == NULL)
		return;
	bp->bio_parent = NULL;
	bp->bio_cmd = BIO_READ;
	/* Scale the per-component offset up to a whole-array offset. */
	bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
	bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
	bp->bio_cflags = 0;
	bp->bio_done = g_raid3_sync_done;
	bp->bio_data = disk->d_sync.ds_data;
	if (bp->bio_data == NULL) {
		g_destroy_bio(bp);
		return;
	}
	bp->bio_cflags = G_RAID3_BIO_CFLAG_REGSYNC;
	disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
	bp->bio_to = sc->sc_provider;
	G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
	disk->d_sync.ds_consumer->index++;
	g_io_request(bp, disk->d_sync.ds_consumer);
}

/*
 * Handle a completed synchronization request (runs in the worker
 * thread).  A completed READ of array data is rewritten into a WRITE
 * of the reconstructed component data; a completed WRITE advances the
 * synchronization offset.
 */
static void
g_raid3_sync_request(struct bio *bp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;

	bp->bio_from->index--;
	sc = bp->bio_from->geom->softc;
	disk = bp->bio_from->private;
	/* A NULL private means the consumer was detached; kill it. */
	if (disk == NULL) {
		g_topology_lock();
		g_raid3_kill_consumer(sc, bp->bio_from);
		g_topology_unlock();
		g_destroy_bio(bp);
		return;
	}

	/*
	 * Synchronization request.
	 */
	switch (bp->bio_cmd) {
	case BIO_READ:
	    {
		struct g_consumer *cp;
		u_char *dst, *src;
		off_t left;
		u_int atom;

		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		/* Extract this component's share of each sector in place. */
		atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
		dst = src = bp->bio_data;
		if (disk->d_no == sc->sc_ndisks - 1) {
			u_int n;

			/* Parity component. */
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += atom;
				for (n = 1; n < sc->sc_ndisks - 1; n++) {
					g_raid3_xor(src, dst, dst, atom);
					src += atom;
				}
				dst += atom;
			}
		} else {
			/* Regular component. */
			src += atom * disk->d_no;
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += sc->sc_sectorsize;
				dst += atom;
			}
		}
		/* Rescale array offset/length to component offset/length. */
		bp->bio_offset /= sc->sc_ndisks - 1;
		bp->bio_length /= sc->sc_ndisks - 1;
		bp->bio_cmd = BIO_WRITE;
		bp->bio_cflags = 0;
		bp->bio_children = bp->bio_inbed = 0;
		cp = disk->d_consumer;
		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		g_io_request(bp, cp);
		return;
	    }
	case BIO_WRITE:
	    {
		struct g_raid3_disk_sync *sync;

		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
			g_raid3_event_send(disk,
			    G_RAID3_DISK_STATE_DISCONNECTED,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		sync = &disk->d_sync;
		sync->ds_offset_done = bp->bio_offset + bp->bio_length;
		g_destroy_bio(bp);
		/* A pending resync window postpones further bookkeeping. */
		if (sync->ds_resync != -1)
			return;
		if (sync->ds_offset_done ==
		    sc->sc_mediasize / (sc->sc_ndisks - 1)) {
			/*
			 * Disk up-to-date, activate it.
			 */
			g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		} else if (sync->ds_offset_done % (MAXPHYS * 100) == 0) {
			/*
			 * Update offset_done on every 100 blocks.
			 * XXX: This should be configurable.
			 */
			g_topology_lock();
			g_raid3_update_metadata(disk);
			g_topology_unlock();
		}
		return;
	    }
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
		    bp->bio_cmd, sc->sc_name));
		break;
	}
}

/*
 * Split a regular request into per-component child requests and send
 * them to the consumers.  Returns 0 on success or ENOMEM when a child
 * bio could not be allocated (the caller may retry later).
 */
static int
g_raid3_register_request(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp;
	off_t offset, length;
	u_int n, ndisks;
	int round_robin, verify;

	ndisks = 0;
	sc = pbp->bio_to->geom->softc;
	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
	    sc->sc_syncdisk == NULL) {
		g_io_deliver(pbp, EIO);
		return (0);
	}
	g_raid3_init_bio(pbp);
	/* Per-component offset and length. */
	length = pbp->bio_length / (sc->sc_ndisks - 1);
	offset = pbp->bio_offset / (sc->sc_ndisks - 1);
	round_robin = verify = 0;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		/* In VERIFY mode the parity component is read too. */
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
			verify = 1;
			ndisks = sc->sc_ndisks;
		} else {
			verify = 0;
			ndisks = sc->sc_ndisks - 1;
		}
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			round_robin = 1;
		} else {
			round_robin = 0;
		}
		KASSERT(!round_robin || !verify,
		    ("ROUND-ROBIN and VERIFY are mutually exclusive."));
		pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		struct g_raid3_disk_sync *sync;

		/* First write after idling: mark components dirty again. */
		if (sc->sc_idle)
			g_raid3_unidle(sc);

		ndisks = sc->sc_ndisks;

		if (sc->sc_syncdisk == NULL)
			break;
		sync = &sc->sc_syncdisk->d_sync;
		if (offset >= sync->ds_offset)
			break;
		if (offset + length <= sync->ds_offset_done)
			break;
		if (offset >= sync->ds_resync && sync->ds_resync != -1)
			break;
		/* A write into the synchronized area forces a resync window. */
		sync->ds_resync = offset - (offset % MAXPHYS);
		break;
	    }
	}
	for (n = 0; n < ndisks; n++) {
		disk = &sc->sc_disks[n];
		cbp = g_raid3_clone_bio(sc, pbp);
		if (cbp == NULL) {
			/* Out of bio memory: undo and let the caller retry. */
			while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
				g_raid3_destroy_bio(sc, cbp);
			return (ENOMEM);
		}
		cbp->bio_offset = offset;
		cbp->bio_length = length;
		cbp->bio_done = g_raid3_done;
		switch (pbp->bio_cmd) {
		case BIO_READ:
			if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
				/*
				 * Replace invalid component with the parity
				 * component.
				 */
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
			} else if (round_robin &&
			    disk->d_no == sc->sc_round_robin) {
				/*
				 * In round-robin mode skip one data component
				 * and use parity component when reading.
				 */
				pbp->bio_driver2 = disk;
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				sc->sc_round_robin++;
				round_robin = 0;
			} else if (verify && disk->d_no == sc->sc_ndisks - 1) {
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
			}
			break;
		case BIO_WRITE:
		case BIO_DELETE:
			if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
				if (n == ndisks - 1) {
					/*
					 * Active parity component, mark it as such.
					 */
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_PARITY;
				}
			} else {
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
				if (n == ndisks - 1) {
					/*
					 * Parity component is not connected,
					 * so destroy its request.
					 */
					pbp->bio_pflags |=
					    G_RAID3_BIO_PFLAG_NOPARITY;
					g_raid3_destroy_bio(sc, cbp);
					cbp = NULL;
				} else {
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_NODISK;
					disk = NULL;
				}
			}
			break;
		}
		if (cbp != NULL)
			cbp->bio_caller2 = disk;
	}
	switch (pbp->bio_cmd) {
	case BIO_READ:
		if (round_robin) {
			/*
			 * If we are in round-robin mode and 'round_robin' is
			 * still 1, it means, that we skipped parity component
			 * for this read and must reset sc_round_robin field.
			 */
			sc->sc_round_robin = 0;
		}
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			disk = cbp->bio_caller2;
			cp = disk->d_consumer;
			cbp->bio_to = cp->provider;
			G_RAID3_LOGREQ(3, cbp, "Sending request.");
			KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
			    ("Consumer %s not opened (r%dw%de%d).",
			    cp->provider->name, cp->acr, cp->acw, cp->ace));
			cp->index++;
			g_io_request(cbp, cp);
		}
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		/*
		 * Bump syncid on first write.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
			/* Syncid bump touches metadata; needs topology lock. */
			g_topology_lock();
			g_raid3_bump_syncid(sc);
			g_topology_unlock();
		}
		g_raid3_scatter(pbp);
		break;
	}
	return (0);
}

/*
 * Return non-zero when no consumer of the device's data geom or of its
 * synchronization geom is still busy with I/O, i.e. the device can be
 * torn down safely.  Must be called with the topology lock held.
 */
static int
g_raid3_can_destroy(struct g_raid3_softc *sc)
{
	struct g_geom *gp;
	struct g_consumer *cp;

	g_topology_assert();
	gp = sc->sc_geom;
	LIST_FOREACH(cp, &gp->consumer, consumer) {
		if (g_raid3_is_busy(sc, cp))
			return (0);
	}
	gp = sc->sc_sync.ds_geom;
	LIST_FOREACH(cp, &gp->consumer, consumer) {
		if (g_raid3_is_busy(sc, cp))
			return (0);
	}
	G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
	    sc->sc_name);
	return (1);
}

/*
 * Try to destroy the device.  Returns 0 if there is still outstanding
 * I/O, 1 if the caller (the worker thread) may exit: either the device
 * was destroyed and freed here, or - when the WAIT flag is set - the
 * thread that requested destruction was woken up to finish the job.
 */
static int
g_raid3_try_destroy(struct g_raid3_softc *sc)
{

	g_topology_lock();
	if (!g_raid3_can_destroy(sc)) {
		g_topology_unlock();
		return (0);
	}
	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
		/* Somebody is sleeping on sc_worker; let him clean up. */
		g_topology_unlock();
		G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
		    &sc->sc_worker);
		wakeup(&sc->sc_worker);
		sc->sc_worker = NULL;
	} else {
		g_raid3_destroy_device(sc);
		g_topology_unlock();
		free(sc->sc_disks, M_RAID3);
		free(sc, M_RAID3);
	}
	return (1);
}

/*
 * Worker thread.
 *
 * Main per-device event/I/O loop:
 *  1. Events are handled first (they may change device/disk state); the
 *     topology lock is taken with try-lock so the thread never blocks on
 *     it - on failure it retries with a short tsleep().
 *  2. Regular and synchronization I/O requests are then serviced from
 *     sc_queue; after g_raid3_reqs_per_sync regular requests one
 *     synchronization pass is run, paced by g_raid3_syncs_per_sec.
 *  3. When the queue stays empty for g_raid3_idletime seconds the
 *     components are marked clean (g_raid3_idle()).
 * The thread exits via kthread_exit() once the DESTROY flag is set and
 * g_raid3_try_destroy() succeeds.
 */
static void
g_raid3_worker(void *arg)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_raid3_disk_sync *sync;
	struct g_raid3_event *ep;
	struct bio *bp;
	u_int nreqs;

	sc = arg;
	mtx_lock_spin(&sched_lock);
	sched_prio(curthread, PRIBIO);
	mtx_unlock_spin(&sched_lock);

	nreqs = 0;
	for (;;) {
		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
		/*
		 * First take a look at events.
		 * This is important to handle events before any I/O requests.
		 */
		ep = g_raid3_event_get(sc);
		if (ep != NULL && g_topology_try_lock()) {
			g_raid3_event_remove(sc, ep);
			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
				/* Update only device status. */
				G_RAID3_DEBUG(3,
				    "Running event for device %s.",
				    sc->sc_name);
				ep->e_error = 0;
				g_raid3_update_device(sc, 1);
			} else {
				/* Update disk status. */
				G_RAID3_DEBUG(3, "Running event for disk %s.",
				    g_raid3_get_diskname(ep->e_disk));
				ep->e_error = g_raid3_update_disk(ep->e_disk,
				    ep->e_state);
				if (ep->e_error == 0)
					g_raid3_update_device(sc, 0);
			}
			g_topology_unlock();
			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
				KASSERT(ep->e_error == 0,
				    ("Error cannot be handled."));
				g_raid3_event_free(ep);
			} else {
				/* A thread waits for this event; wake it. */
				ep->e_flags |= G_RAID3_EVENT_DONE;
				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
				    ep);
				mtx_lock(&sc->sc_events_mtx);
				wakeup(ep);
				mtx_unlock(&sc->sc_events_mtx);
			}
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				if (g_raid3_try_destroy(sc))
					kthread_exit(0);
			}
			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
			continue;
		}
		/*
		 * Now I/O requests.
		 */
		/* Get first request from the queue. */
		mtx_lock(&sc->sc_queue_mtx);
		bp = bioq_first(&sc->sc_queue);
		if (bp == NULL) {
			if (ep != NULL) {
				/*
				 * No I/O requests and topology lock was
				 * already held? Try again.
				 */
				mtx_unlock(&sc->sc_queue_mtx);
				tsleep(ep, PRIBIO, "r3:top1", hz / 5);
				continue;
			}
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				mtx_unlock(&sc->sc_queue_mtx);
				if (g_raid3_try_destroy(sc))
					kthread_exit(0);
				mtx_lock(&sc->sc_queue_mtx);
			}
		}
		if (sc->sc_syncdisk != NULL &&
		    (bp == NULL || nreqs > g_raid3_reqs_per_sync)) {
			mtx_unlock(&sc->sc_queue_mtx);
			/*
			 * It is time for synchronization...
			 */
			nreqs = 0;
			disk = sc->sc_syncdisk;
			sync = &disk->d_sync;
			if (sync->ds_offset <
			    sc->sc_mediasize / (sc->sc_ndisks - 1) &&
			    sync->ds_offset == sync->ds_offset_done) {
				/* A resync request overrides the position. */
				if (sync->ds_resync != -1) {
					sync->ds_offset = sync->ds_resync;
					sync->ds_offset_done = sync->ds_resync;
					sync->ds_resync = -1;
				}
				g_raid3_sync_one(sc);
			}
			G_RAID3_DEBUG(5, "%s: I'm here 2.", __func__);
			goto sleep;
		}
		if (bp == NULL) {
			if (g_raid3_check_idle(sc)) {
				u_int idletime;

				idletime = g_raid3_idletime;
				if (idletime == 0)
					idletime = 1;
				idletime *= hz;
				if (msleep(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
				    "r3:w1", idletime) == EWOULDBLOCK) {
					G_RAID3_DEBUG(5, "%s: I'm here 3.",
					    __func__);
					/*
					 * No I/O requests in 'idletime'
					 * seconds, so mark components as clean.
					 */
					g_raid3_idle(sc);
				}
				G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
			} else {
				/* Already idle; sleep until a request comes. */
				MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
				    "r3:w2", 0);
				G_RAID3_DEBUG(5, "%s: I'm here 5.", __func__);
			}
			continue;
		}
		nreqs++;
		bioq_remove(&sc->sc_queue, bp);
		mtx_unlock(&sc->sc_queue_mtx);

		if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) {
			g_raid3_regular_request(bp);
		} else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
			u_int timeout, sps;

			g_raid3_sync_request(bp);
sleep:
			/* Throttle synchronization to sps requests/second. */
			sps = atomic_load_acq_int(&g_raid3_syncs_per_sec);
			if (sps == 0) {
				G_RAID3_DEBUG(5, "%s: I'm here 6.", __func__);
				continue;
			}
			if (ep != NULL) {
				/*
				 * We have some pending events, don't sleep now.
				 */
				G_RAID3_DEBUG(5, "%s: I'm here 7.", __func__);
				tsleep(ep, PRIBIO, "r3:top2", hz / 5);
				continue;
			}
			mtx_lock(&sc->sc_queue_mtx);
			if (bioq_first(&sc->sc_queue) != NULL) {
				mtx_unlock(&sc->sc_queue_mtx);
				G_RAID3_DEBUG(5, "%s: I'm here 8.", __func__);
				continue;
			}
			timeout = hz / sps;
			if (timeout == 0)
				timeout = 1;
			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w2",
			    timeout);
		} else {
			/*
			 * Regular request could not be registered (most
			 * likely out of memory); requeue it and back off.
			 */
			if (g_raid3_register_request(bp) != 0) {
				mtx_lock(&sc->sc_queue_mtx);
				bioq_insert_tail(&sc->sc_queue, bp);
				MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx,
				    PRIBIO | PDROP, "r3:lowmem", hz / 10);
			}
		}
		G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
	}
}

/*
 * Open disk's consumer if needed.
 *
 * Keep the disk's DIRTY flag in sync with the provider's write access
 * count: mark the disk dirty while writers are present (pp->acw > 0)
 * and clean once the last writer is gone.
 */
static void
g_raid3_update_access(struct g_raid3_disk *disk)
{
	struct g_provider *pp;

	g_topology_assert();

	pp = disk->d_softc->sc_provider;
	if (pp == NULL)
		return;
	if (pp->acw > 0) {
		if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
			    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
			disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
		}
	} else if (pp->acw == 0) {
		if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
			    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		}
	}
}

/*
 * Start rebuilding the first disk found in SYNCHRONIZING state: attach
 * a dedicated sync consumer to the device's own provider, open it for
 * reading and allocate the synchronization buffer.  Only legal while the
 * device is DEGRADED and no other synchronization is in progress.
 */
static void
g_raid3_sync_start(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	int error;
	u_int n;

	g_topology_assert();

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
	    sc->sc_name, sc->sc_state));
	disk = NULL;
	/* Pick the first component that needs rebuilding. */
	for (n = 0; n < sc->sc_ndisks; n++) {
		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
			continue;
		disk = &sc->sc_disks[n];
		break;
	}
	if (disk == NULL)
		return;

	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
	    g_raid3_get_diskname(disk));
	disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
	KASSERT(disk->d_sync.ds_consumer == NULL,
	    ("Sync consumer already exists (device=%s, disk=%s).",
	    sc->sc_name, g_raid3_get_diskname(disk)));
	disk->d_sync.ds_consumer = g_new_consumer(sc->sc_sync.ds_geom);
	disk->d_sync.ds_consumer->private = disk;
	disk->d_sync.ds_consumer->index = 0;
	/* The sync consumer reads from our own (degraded) provider. */
	error = g_attach(disk->d_sync.ds_consumer, disk->d_softc->sc_provider);
	KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
	    disk->d_softc->sc_name, error));
	error = g_access(disk->d_sync.ds_consumer, 1, 0, 0);
	KASSERT(error == 0, ("Cannot open %s (error=%d).",
	    disk->d_softc->sc_name, error));
	disk->d_sync.ds_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
	sc->sc_syncdisk = disk;
}

/*
 * Stop synchronization process.
 * type: 0 - synchronization finished
 *       1 - synchronization stopped
 * Releases the sync consumer and its data buffer and clears the disk's
 * DIRTY flag.  Must run with the topology lock held, device DEGRADED.
 */
static void
g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
{
	struct g_raid3_disk *disk;

	g_topology_assert();
	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	disk = sc->sc_syncdisk;
	sc->sc_syncdisk = NULL;
	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
	    g_raid3_disk_state2str(disk->d_state)));
	if (disk->d_sync.ds_consumer == NULL)
		return;

	if (type == 0) {
		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
	} else /* if (type == 1) */ {
		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
	}
	g_raid3_kill_consumer(disk->d_softc, disk->d_sync.ds_consumer);
	free(disk->d_sync.ds_data, M_RAID3);
	disk->d_sync.ds_consumer = NULL;
	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
}

/*
 * Create and announce the raid3/<name> provider; start synchronization
 * right away when the device comes up DEGRADED.
 */
static void
g_raid3_launch_provider(struct g_raid3_softc *sc)
{
	struct g_provider *pp;

	g_topology_assert();

	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
	pp->mediasize = sc->sc_mediasize;
	pp->sectorsize = sc->sc_sectorsize;
	sc->sc_provider = pp;
	g_error_provider(pp, 0);
	G_RAID3_DEBUG(0, "Device %s: provider %s launched.", sc->sc_name,
	    pp->name);
	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
		g_raid3_sync_start(sc);
}

/*
 * Withdraw the provider: fail all queued I/O with ENXIO, orphan the
 * provider and stop any synchronization that reads from it.
 */
static void
g_raid3_destroy_provider(struct g_raid3_softc *sc)
{
	struct bio *bp;

	g_topology_assert();
	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
	    sc->sc_name));

	g_error_provider(sc->sc_provider, ENXIO);
	mtx_lock(&sc->sc_queue_mtx);
	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
		bioq_remove(&sc->sc_queue, bp);
		g_io_deliver(bp, ENXIO);
	}
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
	    sc->sc_provider->name);
	sc->sc_provider->flags |= G_PF_WITHER;
	g_orphan_provider(sc->sc_provider, ENXIO);
	sc->sc_provider = NULL;
	if (sc->sc_syncdisk != NULL)
		g_raid3_sync_stop(sc, 1);
}

/*
 * Startup-timeout callout: force the device to start (possibly degraded)
 * even though not all components have appeared.
 */
static void
g_raid3_go(void *arg)
{
	struct g_raid3_softc *sc;

	sc = arg;
	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
	g_raid3_event_send(sc, 0,
	    G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
}

/*
 * Decide the target state for a newly arrived disk by comparing its
 * stored syncid with the device's: equal - ACTIVE (or SYNCHRONIZING/
 * STALE depending on flags), older - needs a full resync, newer - the
 * device was started without the freshest disk and the disk is dropped.
 */
static u_int
g_raid3_determine_state(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;
	u_int state;

	sc = disk->d_softc;
	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
		if ((disk->d_flags &
		    G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
			/* Disk does not need synchronization. */
			state = G_RAID3_DISK_STATE_ACTIVE;
		} else {
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
			    (disk->d_flags &
			    G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
				/*
				 * We can start synchronization from
				 * the stored offset.
				 */
				state = G_RAID3_DISK_STATE_SYNCHRONIZING;
			} else {
				state = G_RAID3_DISK_STATE_STALE;
			}
		}
	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
		/*
		 * Reset all synchronization data for this disk,
		 * because if it even was synchronized, it was
		 * synchronized to disks with different syncid.
		 */
		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		disk->d_sync.ds_syncid = sc->sc_syncid;
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
		    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
			state = G_RAID3_DISK_STATE_SYNCHRONIZING;
		} else {
			state = G_RAID3_DISK_STATE_STALE;
		}
	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
		/*
		 * Not good, NOT GOOD!
		 * It means that device was started on stale disks
		 * and more fresh disk just arrive.
		 * If there were writes, device is broken, sorry.
		 * I think the best choice here is don't touch
		 * this disk and inform the user loudly.
		 */
		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
		    "disk (%s) arrives!! It will not be connected to the "
		    "running device.", sc->sc_name,
		    g_raid3_get_diskname(disk));
		g_raid3_destroy_disk(disk);
		state = G_RAID3_DISK_STATE_NONE;
		/* Return immediately, because disk was destroyed. */
		return (state);
	}
	G_RAID3_DEBUG(3, "State for %s disk: %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
	return (state);
}

/*
 * Update device state.
 *
 * Device state machine.  In STARTING state: once enough components have
 * arrived (or 'force' is set after the startup timeout), elect the
 * biggest genid/syncid, drop broken/stale components, pick components
 * for synchronization and move to COMPLETE or DEGRADED.  In DEGRADED/
 * COMPLETE: bump a pending genid, transition between the two states
 * based on the number of ACTIVE components and (re)launch the provider.
 * Must be called with the topology lock held.
 */
static void
g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
{
	struct g_raid3_disk *disk;
	u_int state;

	g_topology_assert();

	switch (sc->sc_state) {
	case G_RAID3_DEVICE_STATE_STARTING:
	    {
		u_int n, ndirty, ndisks, genid, syncid;

		KASSERT(sc->sc_provider == NULL,
		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
		/*
		 * Are we ready? We are, if all disks are connected or
		 * one disk is missing and 'force' is true.
		 */
		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
			if (!force)
				callout_drain(&sc->sc_callout);
		} else {
			if (force) {
				/*
				 * Timeout expired, so destroy device.
				 */
				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			}
			return;
		}

		/*
		 * Find the biggest genid.
		 */
		genid = 0;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if (disk->d_genid > genid)
				genid = disk->d_genid;
		}
		sc->sc_genid = genid;
		/*
		 * Remove all disks without the biggest genid.
		 */
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if (disk->d_genid < genid) {
				G_RAID3_DEBUG(0,
				    "Component %s (device %s) broken, skipping.",
				    g_raid3_get_diskname(disk), sc->sc_name);
				g_raid3_destroy_disk(disk);
			}
		}

		/*
		 * There must be at least 'sc->sc_ndisks - 1' components
		 * with the same syncid and without SYNCHRONIZING flag.
		 */

		/*
		 * Find the biggest syncid, number of valid components and
		 * number of dirty components.
		 */
		ndirty = ndisks = syncid = 0;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
				ndirty++;
			if (disk->d_sync.ds_syncid > syncid) {
				/* Newer syncid found; restart the count. */
				syncid = disk->d_sync.ds_syncid;
				ndisks = 0;
			} else if (disk->d_sync.ds_syncid < syncid) {
				continue;
			}
			if ((disk->d_flags &
			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
				continue;
			}
			ndisks++;
		}
		/*
		 * Do we have enough valid components?
		 */
		if (ndisks + 1 < sc->sc_ndisks) {
			G_RAID3_DEBUG(0,
			    "Device %s is broken, too few valid components.",
			    sc->sc_name);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			return;
		}
		/*
		 * If there is one DIRTY component and all disks are present,
		 * mark it for synchronization. If there is more than one DIRTY
		 * component, mark parity component for synchronization.
		 */
		if (ndisks == sc->sc_ndisks && ndirty == 1) {
			for (n = 0; n < sc->sc_ndisks; n++) {
				disk = &sc->sc_disks[n];
				if ((disk->d_flags &
				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
					continue;
				}
				disk->d_flags |=
				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
			}
		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
			disk = &sc->sc_disks[sc->sc_ndisks - 1];
			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		}

		sc->sc_syncid = syncid;
		if (force) {
			/* Remember to bump syncid on first write. */
			sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
		}
		if (ndisks == sc->sc_ndisks)
			state = G_RAID3_DEVICE_STATE_COMPLETE;
		else /* if (ndisks == sc->sc_ndisks - 1) */
			state = G_RAID3_DEVICE_STATE_DEGRADED;
		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
		    g_raid3_device_state2str(state));
		sc->sc_state = state;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			state = g_raid3_determine_state(disk);
			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
			if (state == G_RAID3_DISK_STATE_STALE)
				sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
		}
		break;
	    }
	case G_RAID3_DEVICE_STATE_DEGRADED:
		/*
		 * Genid need to be bumped immediately, so do it here.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
			g_raid3_bump_genid(sc);
		}

		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
			return;
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
		    sc->sc_ndisks - 1) {
			/* Too many components lost; give up the device. */
			if (sc->sc_provider != NULL)
				g_raid3_destroy_provider(sc);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			return;
		}
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
		    sc->sc_ndisks) {
			state = G_RAID3_DEVICE_STATE_COMPLETE;
			G_RAID3_DEBUG(1,
			    "Device %s state changed from %s to %s.",
			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
			    g_raid3_device_state2str(state));
			sc->sc_state = state;
		}
		if (sc->sc_provider == NULL)
			g_raid3_launch_provider(sc);
		break;
	case G_RAID3_DEVICE_STATE_COMPLETE:
		/*
		 * Genid need to be bumped immediately, so do it here.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
			g_raid3_bump_genid(sc);
		}

		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
			return;
		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
		    sc->sc_ndisks - 1,
		    ("Too few ACTIVE components in COMPLETE state (device %s).",
		    sc->sc_name));
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
		    sc->sc_ndisks - 1) {
			state = G_RAID3_DEVICE_STATE_DEGRADED;
			G_RAID3_DEBUG(1,
			    "Device %s state changed from %s to %s.",
			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
			    g_raid3_device_state2str(state));
			sc->sc_state = state;
		}
		if (sc->sc_provider == NULL)
			g_raid3_launch_provider(sc);
		break;
	default:
		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state)));
		break;
	}
}

/*
 * Update disk state and device state if needed.
 *
 * Per-disk state machine: validates the transition with KASSERTs, applies
 * it (updating flags, metadata and, where needed, starting or stopping
 * synchronization) and, for a NEW disk on a running device, re-enters
 * itself (via 'goto again') with the state g_raid3_determine_state()
 * chose.  Always returns 0; runs with the topology lock held.
 */
#define	DISK_STATE_CHANGED()	G_RAID3_DEBUG(1,			\
	"Disk %s state changed from %s to %s (device %s).",		\
	g_raid3_get_diskname(disk),					\
	g_raid3_disk_state2str(disk->d_state),				\
	g_raid3_disk_state2str(state), sc->sc_name)
static int
g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	sc = disk->d_softc;
again:
	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
	    g_raid3_disk_state2str(state));
	switch (state) {
	case G_RAID3_DISK_STATE_NEW:
		/*
		 * Possible scenarios:
		 * 1. New disk arrives.
		 */
		/* Previous state should be NONE. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_state = state;
		G_RAID3_DEBUG(0, "Device %s: provider %s detected.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
			break;
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/* Device is running; pick the disk's real state now. */
		state = g_raid3_determine_state(disk);
		if (state != G_RAID3_DISK_STATE_NONE)
			goto again;
		break;
	case G_RAID3_DISK_STATE_ACTIVE:
		/*
		 * Possible scenarios:
		 * 1. New disk does not need synchronization.
		 * 2. Synchronization process finished successfully.
		 */
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/* Previous state should be NEW or SYNCHRONIZING. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		else if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
			g_raid3_sync_stop(sc, 0);
		}
		disk->d_state = state;
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		g_raid3_update_access(disk);
		g_raid3_update_metadata(disk);
		G_RAID3_DEBUG(0, "Device %s: provider %s activated.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		break;
	case G_RAID3_DISK_STATE_STALE:
		/*
		 * Possible scenarios:
		 * 1. Stale disk was connected.
		 */
		/* Previous state should be NEW. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/*
		 * STALE state is only possible if device is marked
		 * NOAUTOSYNC.
		 */
		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		disk->d_state = state;
		g_raid3_update_metadata(disk);
		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		break;
	case G_RAID3_DISK_STATE_SYNCHRONIZING:
		/*
		 * Possible scenarios:
		 * 1. Disk which needs synchronization was connected.
		 */
		/* Previous state should be NEW. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		disk->d_state = state;
		if (sc->sc_provider != NULL) {
			g_raid3_sync_start(sc);
			g_raid3_update_metadata(disk);
		}
		break;
	case G_RAID3_DISK_STATE_DISCONNECTED:
		/*
		 * Possible scenarios:
		 * 1. Device wasn't running yet, but disk disappears.
		 * 2. Disk was active and disappears.
		 * 3. Disk disappears during synchronization process.
		 */
		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			/*
			 * Previous state should be ACTIVE, STALE or
			 * SYNCHRONIZING.
			 */
			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
			    ("Wrong disk state (%s, %s).",
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
			/* Previous state should be NEW. */
			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
			    ("Wrong disk state (%s, %s).",
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
			/*
			 * Reset bumping syncid if disk disappeared in STARTING
			 * state.
			 */
			if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0)
				sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
#ifdef INVARIANTS
		} else {
			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
			    sc->sc_name,
			    g_raid3_device_state2str(sc->sc_state),
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
#endif
		}
		DISK_STATE_CHANGED();
		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
		    sc->sc_name, g_raid3_get_diskname(disk));

		g_raid3_destroy_disk(disk);
		break;
	default:
		KASSERT(1 == 0, ("Unknown state (%u).", state));
		break;
	}
	return (0);
}
#undef	DISK_STATE_CHANGED

/*
 * Read the on-disk metadata sector from a component.  The consumer is
 * opened read-only for the duration of the read and closed again; the
 * topology lock is dropped around the actual I/O.
 */
int
g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();

	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	g_topology_unlock();
	/* Metadata are stored on last sector. */
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
	    &error);
	g_topology_lock();
	g_access(cp, -1, 0, 0);
	if (error != 0) {
		G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
		    cp->provider->name, error);
		if (buf != NULL)
			g_free(buf);
		return (error);
	}

	/* Decode metadata.
*/ 2549133808Spjd error = raid3_metadata_decode(buf, md); 2550133808Spjd g_free(buf); 2551133808Spjd if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0) 2552133808Spjd return (EINVAL); 2553139295Spjd if (md->md_version > G_RAID3_VERSION) { 2554139295Spjd G_RAID3_DEBUG(0, 2555139295Spjd "Kernel module is too old to handle metadata from %s.", 2556139295Spjd cp->provider->name); 2557139295Spjd return (EINVAL); 2558139295Spjd } 2559133808Spjd if (error != 0) { 2560133808Spjd G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", 2561133808Spjd cp->provider->name); 2562133808Spjd return (error); 2563133808Spjd } 2564133808Spjd 2565133808Spjd return (0); 2566133808Spjd} 2567133808Spjd 2568133808Spjdstatic int 2569133808Spjdg_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp, 2570133808Spjd struct g_raid3_metadata *md) 2571133808Spjd{ 2572133808Spjd 2573133808Spjd if (md->md_no >= sc->sc_ndisks) { 2574133808Spjd G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.", 2575133808Spjd pp->name, md->md_no); 2576133808Spjd return (EINVAL); 2577133808Spjd } 2578133808Spjd if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) { 2579133808Spjd G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.", 2580133808Spjd pp->name, md->md_no); 2581133808Spjd return (EEXIST); 2582133808Spjd } 2583133808Spjd if (md->md_all != sc->sc_ndisks) { 2584133808Spjd G_RAID3_DEBUG(1, 2585133808Spjd "Invalid '%s' field on disk %s (device %s), skipping.", 2586133808Spjd "md_all", pp->name, sc->sc_name); 2587133808Spjd return (EINVAL); 2588133808Spjd } 2589133808Spjd if (md->md_mediasize != sc->sc_mediasize) { 2590133808Spjd G_RAID3_DEBUG(1, 2591133808Spjd "Invalid '%s' field on disk %s (device %s), skipping.", 2592133808Spjd "md_mediasize", pp->name, sc->sc_name); 2593133808Spjd return (EINVAL); 2594133808Spjd } 2595133808Spjd if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) { 2596133808Spjd G_RAID3_DEBUG(1, 2597133808Spjd "Invalid '%s' field on disk %s 
(device %s), skipping.", 2598133808Spjd "md_mediasize", pp->name, sc->sc_name); 2599133808Spjd return (EINVAL); 2600133808Spjd } 2601133808Spjd if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) { 2602133808Spjd G_RAID3_DEBUG(1, 2603133808Spjd "Invalid size of disk %s (device %s), skipping.", pp->name, 2604133808Spjd sc->sc_name); 2605133808Spjd return (EINVAL); 2606133808Spjd } 2607133808Spjd if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) { 2608133808Spjd G_RAID3_DEBUG(1, 2609133808Spjd "Invalid '%s' field on disk %s (device %s), skipping.", 2610133808Spjd "md_sectorsize", pp->name, sc->sc_name); 2611133808Spjd return (EINVAL); 2612133808Spjd } 2613133808Spjd if (md->md_sectorsize != sc->sc_sectorsize) { 2614133808Spjd G_RAID3_DEBUG(1, 2615133808Spjd "Invalid '%s' field on disk %s (device %s), skipping.", 2616133808Spjd "md_sectorsize", pp->name, sc->sc_name); 2617133808Spjd return (EINVAL); 2618133808Spjd } 2619133808Spjd if ((sc->sc_sectorsize % pp->sectorsize) != 0) { 2620133808Spjd G_RAID3_DEBUG(1, 2621133808Spjd "Invalid sector size of disk %s (device %s), skipping.", 2622133808Spjd pp->name, sc->sc_name); 2623133808Spjd return (EINVAL); 2624133808Spjd } 2625133808Spjd if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) { 2626133808Spjd G_RAID3_DEBUG(1, 2627133808Spjd "Invalid device flags on disk %s (device %s), skipping.", 2628133808Spjd pp->name, sc->sc_name); 2629133808Spjd return (EINVAL); 2630133808Spjd } 2631134168Spjd if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && 2632134168Spjd (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) { 2633134168Spjd /* 2634134168Spjd * VERIFY and ROUND-ROBIN options are mutally exclusive. 
2635134168Spjd */ 2636134168Spjd G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on " 2637134168Spjd "disk %s (device %s), skipping.", pp->name, sc->sc_name); 2638134168Spjd return (EINVAL); 2639134168Spjd } 2640133808Spjd if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) { 2641133808Spjd G_RAID3_DEBUG(1, 2642133808Spjd "Invalid disk flags on disk %s (device %s), skipping.", 2643133808Spjd pp->name, sc->sc_name); 2644133808Spjd return (EINVAL); 2645133808Spjd } 2646133808Spjd return (0); 2647133808Spjd} 2648133808Spjd 2649139671Spjdint 2650133808Spjdg_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp, 2651133808Spjd struct g_raid3_metadata *md) 2652133808Spjd{ 2653133808Spjd struct g_raid3_disk *disk; 2654133808Spjd int error; 2655133808Spjd 2656133808Spjd g_topology_assert(); 2657133808Spjd G_RAID3_DEBUG(2, "Adding disk %s.", pp->name); 2658133808Spjd 2659133808Spjd error = g_raid3_check_metadata(sc, pp, md); 2660133808Spjd if (error != 0) 2661133808Spjd return (error); 2662139295Spjd if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING && 2663139295Spjd md->md_genid < sc->sc_genid) { 2664139295Spjd G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.", 2665139295Spjd pp->name, sc->sc_name); 2666139295Spjd return (EINVAL); 2667139295Spjd } 2668133808Spjd disk = g_raid3_init_disk(sc, pp, md, &error); 2669133808Spjd if (disk == NULL) 2670133808Spjd return (error); 2671133808Spjd error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW, 2672133808Spjd G_RAID3_EVENT_WAIT); 2673139295Spjd if (error != 0) 2674139295Spjd return (error); 2675139295Spjd if (md->md_version < G_RAID3_VERSION) { 2676139295Spjd G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).", 2677139295Spjd pp->name, md->md_version, G_RAID3_VERSION); 2678139295Spjd g_raid3_update_metadata(disk); 2679139295Spjd } 2680139295Spjd return (0); 2681133808Spjd} 2682133808Spjd 2683133808Spjdstatic int 2684133808Spjdg_raid3_access(struct g_provider *pp, int acr, int acw, int ace) 
2685133808Spjd{ 2686133808Spjd struct g_raid3_softc *sc; 2687133808Spjd struct g_raid3_disk *disk; 2688139144Spjd int dcr, dcw, dce; 2689133808Spjd u_int n; 2690133808Spjd 2691133808Spjd g_topology_assert(); 2692133808Spjd G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, 2693133808Spjd acw, ace); 2694133808Spjd 2695133808Spjd dcr = pp->acr + acr; 2696133808Spjd dcw = pp->acw + acw; 2697133808Spjd dce = pp->ace + ace; 2698133808Spjd 2699133808Spjd sc = pp->geom->softc; 2700133808Spjd if (sc == NULL || 2701137412Spjd g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1 || 2702137412Spjd (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { 2703133808Spjd if (acr <= 0 && acw <= 0 && ace <= 0) 2704133808Spjd return (0); 2705133808Spjd else 2706133808Spjd return (ENXIO); 2707133808Spjd } 2708133808Spjd for (n = 0; n < sc->sc_ndisks; n++) { 2709133808Spjd disk = &sc->sc_disks[n]; 2710133808Spjd if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) 2711133808Spjd continue; 2712139144Spjd /* 2713139144Spjd * Mark disk as dirty on open and unmark on close. 
2714139144Spjd */ 2715139144Spjd if (pp->acw == 0 && dcw > 0) { 2716139144Spjd G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.", 2717139144Spjd g_raid3_get_diskname(disk), sc->sc_name); 2718139144Spjd disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; 2719139144Spjd g_raid3_update_metadata(disk); 2720139144Spjd } else if (pp->acw > 0 && dcw == 0) { 2721139144Spjd G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.", 2722139144Spjd g_raid3_get_diskname(disk), sc->sc_name); 2723139144Spjd disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; 2724139144Spjd g_raid3_update_metadata(disk); 2725133808Spjd } 2726133808Spjd } 2727139144Spjd return (0); 2728133808Spjd} 2729133808Spjd 2730133808Spjdstatic struct g_geom * 2731133808Spjdg_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md) 2732133808Spjd{ 2733133808Spjd struct g_raid3_softc *sc; 2734133808Spjd struct g_geom *gp; 2735133808Spjd int error, timeout; 2736133808Spjd u_int n; 2737133808Spjd 2738133808Spjd g_topology_assert(); 2739133808Spjd G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); 2740133808Spjd 2741133808Spjd /* One disk is minimum. */ 2742133808Spjd if (md->md_all < 1) 2743133808Spjd return (NULL); 2744133808Spjd /* 2745133808Spjd * Action geom. 
2746133808Spjd */ 2747133808Spjd gp = g_new_geomf(mp, "%s", md->md_name); 2748133808Spjd sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO); 2749133808Spjd sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3, 2750133808Spjd M_WAITOK | M_ZERO); 2751133808Spjd gp->start = g_raid3_start; 2752133808Spjd gp->orphan = g_raid3_orphan; 2753133808Spjd gp->access = g_raid3_access; 2754133808Spjd gp->dumpconf = g_raid3_dumpconf; 2755133808Spjd 2756133808Spjd sc->sc_id = md->md_id; 2757133808Spjd sc->sc_mediasize = md->md_mediasize; 2758133808Spjd sc->sc_sectorsize = md->md_sectorsize; 2759133808Spjd sc->sc_ndisks = md->md_all; 2760134124Spjd sc->sc_round_robin = 0; 2761133808Spjd sc->sc_flags = md->md_mflags; 2762139295Spjd sc->sc_bump_id = 0; 2763137258Spjd sc->sc_idle = 0; 2764138374Spjd for (n = 0; n < sc->sc_ndisks; n++) { 2765138374Spjd sc->sc_disks[n].d_softc = sc; 2766138374Spjd sc->sc_disks[n].d_no = n; 2767133808Spjd sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK; 2768138374Spjd } 2769133808Spjd bioq_init(&sc->sc_queue); 2770133808Spjd mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF); 2771133808Spjd TAILQ_INIT(&sc->sc_events); 2772133808Spjd mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF); 2773133808Spjd callout_init(&sc->sc_callout, CALLOUT_MPSAFE); 2774133808Spjd sc->sc_state = G_RAID3_DEVICE_STATE_STARTING; 2775133808Spjd gp->softc = sc; 2776133808Spjd sc->sc_geom = gp; 2777133808Spjd sc->sc_provider = NULL; 2778133808Spjd /* 2779133808Spjd * Synchronization geom. 
2780133808Spjd */ 2781133808Spjd gp = g_new_geomf(mp, "%s.sync", md->md_name); 2782133808Spjd gp->softc = sc; 2783133808Spjd gp->orphan = g_raid3_orphan; 2784133808Spjd sc->sc_sync.ds_geom = gp; 2785133808Spjd sc->sc_zone_64k = uma_zcreate("gr3:64k", 65536, NULL, NULL, NULL, NULL, 2786133808Spjd UMA_ALIGN_PTR, 0); 2787133808Spjd uma_zone_set_max(sc->sc_zone_64k, g_raid3_n64k); 2788133808Spjd sc->sc_zone_16k = uma_zcreate("gr3:16k", 16384, NULL, NULL, NULL, NULL, 2789133808Spjd UMA_ALIGN_PTR, 0); 2790133808Spjd uma_zone_set_max(sc->sc_zone_64k, g_raid3_n16k); 2791133808Spjd sc->sc_zone_4k = uma_zcreate("gr3:4k", 4096, NULL, NULL, NULL, NULL, 2792133808Spjd UMA_ALIGN_PTR, 0); 2793133808Spjd uma_zone_set_max(sc->sc_zone_4k, g_raid3_n4k); 2794133808Spjd error = kthread_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0, 2795133808Spjd "g_raid3 %s", md->md_name); 2796133808Spjd if (error != 0) { 2797133808Spjd G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.", 2798133808Spjd sc->sc_name); 2799133808Spjd uma_zdestroy(sc->sc_zone_64k); 2800133808Spjd uma_zdestroy(sc->sc_zone_16k); 2801133808Spjd uma_zdestroy(sc->sc_zone_4k); 2802133808Spjd g_destroy_geom(sc->sc_sync.ds_geom); 2803133808Spjd mtx_destroy(&sc->sc_events_mtx); 2804133808Spjd mtx_destroy(&sc->sc_queue_mtx); 2805133808Spjd g_destroy_geom(sc->sc_geom); 2806133808Spjd free(sc->sc_disks, M_RAID3); 2807133808Spjd free(sc, M_RAID3); 2808133808Spjd return (NULL); 2809133808Spjd } 2810133808Spjd 2811133808Spjd G_RAID3_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id); 2812133808Spjd 2813133808Spjd /* 2814133808Spjd * Run timeout. 
2815133808Spjd */ 2816133808Spjd timeout = atomic_load_acq_int(&g_raid3_timeout); 2817133808Spjd callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc); 2818133808Spjd return (sc->sc_geom); 2819133808Spjd} 2820133808Spjd 2821133808Spjdint 2822133808Spjdg_raid3_destroy(struct g_raid3_softc *sc, boolean_t force) 2823133808Spjd{ 2824133808Spjd struct g_provider *pp; 2825133808Spjd 2826133808Spjd g_topology_assert(); 2827133808Spjd 2828133808Spjd if (sc == NULL) 2829133808Spjd return (ENXIO); 2830133808Spjd pp = sc->sc_provider; 2831133808Spjd if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { 2832133808Spjd if (force) { 2833139146Spjd G_RAID3_DEBUG(1, "Device %s is still open, so it " 2834133808Spjd "can't be definitely removed.", pp->name); 2835133808Spjd } else { 2836133808Spjd G_RAID3_DEBUG(1, 2837133808Spjd "Device %s is still open (r%dw%de%d).", pp->name, 2838133808Spjd pp->acr, pp->acw, pp->ace); 2839133808Spjd return (EBUSY); 2840133808Spjd } 2841133808Spjd } 2842133808Spjd 2843133808Spjd sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; 2844133808Spjd sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT; 2845133808Spjd g_topology_unlock(); 2846133808Spjd G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); 2847133808Spjd mtx_lock(&sc->sc_queue_mtx); 2848133808Spjd wakeup(sc); 2849133808Spjd wakeup(&sc->sc_queue); 2850133808Spjd mtx_unlock(&sc->sc_queue_mtx); 2851133808Spjd G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker); 2852133808Spjd while (sc->sc_worker != NULL) 2853133808Spjd tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5); 2854133808Spjd G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker); 2855133808Spjd g_topology_lock(); 2856133808Spjd g_raid3_destroy_device(sc); 2857133808Spjd free(sc->sc_disks, M_RAID3); 2858133808Spjd free(sc, M_RAID3); 2859133808Spjd return (0); 2860133808Spjd} 2861133808Spjd 2862133808Spjdstatic void 2863133808Spjdg_raid3_taste_orphan(struct g_consumer *cp) 2864133808Spjd{ 2865133808Spjd 
2866133808Spjd KASSERT(1 == 0, ("%s called while tasting %s.", __func__, 2867133808Spjd cp->provider->name)); 2868133808Spjd} 2869133808Spjd 2870133808Spjdstatic struct g_geom * 2871133808Spjdg_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) 2872133808Spjd{ 2873133808Spjd struct g_raid3_metadata md; 2874133808Spjd struct g_raid3_softc *sc; 2875133808Spjd struct g_consumer *cp; 2876133808Spjd struct g_geom *gp; 2877133808Spjd int error; 2878133808Spjd 2879133808Spjd g_topology_assert(); 2880133808Spjd g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); 2881133808Spjd G_RAID3_DEBUG(2, "Tasting %s.", pp->name); 2882133808Spjd 2883133808Spjd gp = g_new_geomf(mp, "raid3:taste"); 2884133808Spjd /* This orphan function should be never called. */ 2885133808Spjd gp->orphan = g_raid3_taste_orphan; 2886133808Spjd cp = g_new_consumer(gp); 2887133808Spjd g_attach(cp, pp); 2888133808Spjd error = g_raid3_read_metadata(cp, &md); 2889133808Spjd g_detach(cp); 2890133808Spjd g_destroy_consumer(cp); 2891133808Spjd g_destroy_geom(gp); 2892133808Spjd if (error != 0) 2893133808Spjd return (NULL); 2894133808Spjd gp = NULL; 2895133808Spjd 2896133808Spjd if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0) 2897133808Spjd return (NULL); 2898142727Spjd if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) 2899142727Spjd return (NULL); 2900133808Spjd if (g_raid3_debug >= 2) 2901133808Spjd raid3_metadata_dump(&md); 2902133808Spjd 2903133808Spjd /* 2904133808Spjd * Let's check if device already exists. 
2905133808Spjd */ 2906134486Spjd sc = NULL; 2907133808Spjd LIST_FOREACH(gp, &mp->geom, geom) { 2908133808Spjd sc = gp->softc; 2909133808Spjd if (sc == NULL) 2910133808Spjd continue; 2911133808Spjd if (sc->sc_sync.ds_geom == gp) 2912133808Spjd continue; 2913133808Spjd if (strcmp(md.md_name, sc->sc_name) != 0) 2914133808Spjd continue; 2915133808Spjd if (md.md_id != sc->sc_id) { 2916133808Spjd G_RAID3_DEBUG(0, "Device %s already configured.", 2917133808Spjd sc->sc_name); 2918133808Spjd return (NULL); 2919133808Spjd } 2920133808Spjd break; 2921133808Spjd } 2922133808Spjd if (gp == NULL) { 2923133808Spjd gp = g_raid3_create(mp, &md); 2924133808Spjd if (gp == NULL) { 2925133808Spjd G_RAID3_DEBUG(0, "Cannot create device %s.", 2926133808Spjd md.md_name); 2927133808Spjd return (NULL); 2928133808Spjd } 2929133808Spjd sc = gp->softc; 2930133808Spjd } 2931133808Spjd G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); 2932133808Spjd error = g_raid3_add_disk(sc, pp, &md); 2933133808Spjd if (error != 0) { 2934133808Spjd G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).", 2935133808Spjd pp->name, gp->name, error); 2936133808Spjd if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) == 2937133808Spjd sc->sc_ndisks) { 2938133808Spjd g_raid3_destroy(sc, 1); 2939133808Spjd } 2940133808Spjd return (NULL); 2941133808Spjd } 2942133808Spjd return (gp); 2943133808Spjd} 2944133808Spjd 2945133808Spjdstatic int 2946133808Spjdg_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, 2947133808Spjd struct g_geom *gp) 2948133808Spjd{ 2949133808Spjd 2950133808Spjd return (g_raid3_destroy(gp->softc, 0)); 2951133808Spjd} 2952133808Spjd 2953133808Spjdstatic void 2954133808Spjdg_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, 2955133808Spjd struct g_consumer *cp, struct g_provider *pp) 2956133808Spjd{ 2957133808Spjd struct g_raid3_softc *sc; 2958133808Spjd 2959133808Spjd g_topology_assert(); 2960133808Spjd 2961133808Spjd sc = gp->softc; 
2962133808Spjd if (sc == NULL) 2963133808Spjd return; 2964133808Spjd /* Skip synchronization geom. */ 2965133808Spjd if (gp == sc->sc_sync.ds_geom) 2966133808Spjd return; 2967133808Spjd if (pp != NULL) { 2968133808Spjd /* Nothing here. */ 2969133808Spjd } else if (cp != NULL) { 2970133808Spjd struct g_raid3_disk *disk; 2971133808Spjd 2972133808Spjd disk = cp->private; 2973133808Spjd if (disk == NULL) 2974133808Spjd return; 2975133808Spjd sbuf_printf(sb, "%s<Type>", indent); 2976133808Spjd if (disk->d_no == sc->sc_ndisks - 1) 2977133808Spjd sbuf_printf(sb, "PARITY"); 2978133808Spjd else 2979133808Spjd sbuf_printf(sb, "DATA"); 2980133808Spjd sbuf_printf(sb, "</Type>\n"); 2981133808Spjd sbuf_printf(sb, "%s<Number>%u</Number>\n", indent, 2982133808Spjd (u_int)disk->d_no); 2983133808Spjd if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { 2984133808Spjd sbuf_printf(sb, "%s<Synchronized>", indent); 2985133808Spjd if (disk->d_sync.ds_offset_done == 0) 2986133808Spjd sbuf_printf(sb, "0%%"); 2987133808Spjd else { 2988133808Spjd sbuf_printf(sb, "%u%%", 2989133808Spjd (u_int)((disk->d_sync.ds_offset_done * 100) / 2990134421Spjd (sc->sc_mediasize / (sc->sc_ndisks - 1)))); 2991133808Spjd } 2992133808Spjd sbuf_printf(sb, "</Synchronized>\n"); 2993133808Spjd } 2994133808Spjd sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, 2995133808Spjd disk->d_sync.ds_syncid); 2996139295Spjd sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid); 2997133808Spjd sbuf_printf(sb, "%s<Flags>", indent); 2998133808Spjd if (disk->d_flags == 0) 2999133808Spjd sbuf_printf(sb, "NONE"); 3000133808Spjd else { 3001133808Spjd int first = 1; 3002133808Spjd 3003133808Spjd#define ADD_FLAG(flag, name) do { \ 3004133808Spjd if ((disk->d_flags & (flag)) != 0) { \ 3005133808Spjd if (!first) \ 3006133808Spjd sbuf_printf(sb, ", "); \ 3007133808Spjd else \ 3008133808Spjd first = 0; \ 3009133808Spjd sbuf_printf(sb, name); \ 3010133808Spjd } \ 3011133808Spjd} while (0) 3012133808Spjd 
ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY"); 3013133808Spjd ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED"); 3014133808Spjd ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING, 3015133808Spjd "SYNCHRONIZING"); 3016133808Spjd ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); 3017133808Spjd#undef ADD_FLAG 3018133808Spjd } 3019133808Spjd sbuf_printf(sb, "</Flags>\n"); 3020133808Spjd sbuf_printf(sb, "%s<State>%s</State>\n", indent, 3021133808Spjd g_raid3_disk_state2str(disk->d_state)); 3022133808Spjd } else { 3023133808Spjd sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id); 3024133808Spjd sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid); 3025139295Spjd sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid); 3026133808Spjd sbuf_printf(sb, "%s<Flags>", indent); 3027133808Spjd if (sc->sc_flags == 0) 3028133808Spjd sbuf_printf(sb, "NONE"); 3029133808Spjd else { 3030133808Spjd int first = 1; 3031133808Spjd 3032133808Spjd#define ADD_FLAG(flag, name) do { \ 3033133808Spjd if ((sc->sc_flags & (flag)) != 0) { \ 3034133808Spjd if (!first) \ 3035133808Spjd sbuf_printf(sb, ", "); \ 3036133808Spjd else \ 3037133808Spjd first = 0; \ 3038133808Spjd sbuf_printf(sb, name); \ 3039133808Spjd } \ 3040133808Spjd} while (0) 3041133808Spjd ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); 3042134124Spjd ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN, 3043134124Spjd "ROUND-ROBIN"); 3044134168Spjd ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY"); 3045133808Spjd#undef ADD_FLAG 3046133808Spjd } 3047133808Spjd sbuf_printf(sb, "</Flags>\n"); 3048133808Spjd sbuf_printf(sb, "%s<Components>%u</Components>\n", indent, 3049133808Spjd sc->sc_ndisks); 3050133979Spjd sbuf_printf(sb, "%s<State>%s</State>\n", indent, 3051133979Spjd g_raid3_device_state2str(sc->sc_state)); 3052133808Spjd } 3053133808Spjd} 3054133808Spjd 3055137257Spjdstatic void 3056137257Spjdg_raid3_shutdown(void *arg, int howto) 3057137257Spjd{ 3058137257Spjd struct g_class *mp; 3059137257Spjd struct g_geom 
*gp, *gp2; 3060137257Spjd 3061137257Spjd mp = arg; 3062137421Spjd DROP_GIANT(); 3063137257Spjd g_topology_lock(); 3064137257Spjd LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { 3065137257Spjd if (gp->softc == NULL) 3066137257Spjd continue; 3067137257Spjd g_raid3_destroy(gp->softc, 1); 3068137257Spjd } 3069137257Spjd g_topology_unlock(); 3070137421Spjd PICKUP_GIANT(); 3071137257Spjd#if 0 3072137257Spjd tsleep(&gp, PRIBIO, "r3:shutdown", hz * 20); 3073137257Spjd#endif 3074137257Spjd} 3075137257Spjd 3076137257Spjdstatic void 3077137257Spjdg_raid3_init(struct g_class *mp) 3078137257Spjd{ 3079137257Spjd 3080137257Spjd g_raid3_ehtag = EVENTHANDLER_REGISTER(shutdown_post_sync, 3081137257Spjd g_raid3_shutdown, mp, SHUTDOWN_PRI_FIRST); 3082137257Spjd if (g_raid3_ehtag == NULL) 3083137257Spjd G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event."); 3084137257Spjd} 3085137257Spjd 3086137257Spjdstatic void 3087137257Spjdg_raid3_fini(struct g_class *mp) 3088137257Spjd{ 3089137257Spjd 3090137257Spjd if (g_raid3_ehtag == NULL) 3091137257Spjd return; 3092137257Spjd EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_ehtag); 3093137257Spjd} 3094137257Spjd 3095133808SpjdDECLARE_GEOM_CLASS(g_raid3_class, g_raid3); 3096