g_raid3.c revision 223921
1218887Sdim/*- 2218887Sdim * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org> 3218887Sdim * All rights reserved. 4218887Sdim * 5218887Sdim * Redistribution and use in source and binary forms, with or without 6218887Sdim * modification, are permitted provided that the following conditions 7218887Sdim * are met: 8218887Sdim * 1. Redistributions of source code must retain the above copyright 9218887Sdim * notice, this list of conditions and the following disclaimer. 10218887Sdim * 2. Redistributions in binary form must reproduce the above copyright 11218887Sdim * notice, this list of conditions and the following disclaimer in the 12218887Sdim * documentation and/or other materials provided with the distribution. 13218887Sdim * 14218887Sdim * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 15218887Sdim * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16218887Sdim * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17249423Sdim * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 18218887Sdim * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19234353Sdim * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20218887Sdim * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21249423Sdim * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22249423Sdim * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23234353Sdim * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24249423Sdim * SUCH DAMAGE. 
25218887Sdim */ 26218887Sdim 27218887Sdim#include <sys/cdefs.h> 28218887Sdim__FBSDID("$FreeBSD: head/sys/geom/raid3/g_raid3.c 223921 2011-07-11 05:22:31Z ae $"); 29218887Sdim 30218887Sdim#include <sys/param.h> 31218887Sdim#include <sys/systm.h> 32218887Sdim#include <sys/kernel.h> 33218887Sdim#include <sys/module.h> 34218887Sdim#include <sys/limits.h> 35218887Sdim#include <sys/lock.h> 36218887Sdim#include <sys/mutex.h> 37218887Sdim#include <sys/bio.h> 38218887Sdim#include <sys/sbuf.h> 39218887Sdim#include <sys/sysctl.h> 40218887Sdim#include <sys/malloc.h> 41218887Sdim#include <sys/eventhandler.h> 42218887Sdim#include <vm/uma.h> 43218887Sdim#include <geom/geom.h> 44226633Sdim#include <sys/proc.h> 45218887Sdim#include <sys/kthread.h> 46218887Sdim#include <sys/sched.h> 47218887Sdim#include <geom/raid3/g_raid3.h> 48218887Sdim 49218887SdimFEATURE(geom_raid3, "GEOM RAID-3 functionality"); 50218887Sdim 51218887Sdimstatic MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data"); 52218887Sdim 53218887SdimSYSCTL_DECL(_kern_geom); 54218887SdimSYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff"); 55218887Sdimu_int g_raid3_debug = 0; 56218887SdimTUNABLE_INT("kern.geom.raid3.debug", &g_raid3_debug); 57218887SdimSYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0, 58218887Sdim "Debug level"); 59218887Sdimstatic u_int g_raid3_timeout = 4; 60218887SdimTUNABLE_INT("kern.geom.raid3.timeout", &g_raid3_timeout); 61218887SdimSYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout, 62226633Sdim 0, "Time to wait on all raid3 components"); 63218887Sdimstatic u_int g_raid3_idletime = 5; 64218887SdimTUNABLE_INT("kern.geom.raid3.idletime", &g_raid3_idletime); 65218887SdimSYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RW, 66218887Sdim &g_raid3_idletime, 0, "Mark components as clean when idling"); 67218887Sdimstatic u_int g_raid3_disconnect_on_failure = 1; 
TUNABLE_INT("kern.geom.raid3.disconnect_on_failure",
    &g_raid3_disconnect_on_failure);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, disconnect_on_failure, CTLFLAG_RW,
    &g_raid3_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
static u_int g_raid3_syncreqs = 2;
TUNABLE_INT("kern.geom.raid3.sync_requests", &g_raid3_syncreqs);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, sync_requests, CTLFLAG_RDTUN,
    &g_raid3_syncreqs, 0, "Parallel synchronization I/O requests.");
static u_int g_raid3_use_malloc = 0;
TUNABLE_INT("kern.geom.raid3.use_malloc", &g_raid3_use_malloc);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, use_malloc, CTLFLAG_RDTUN,
    &g_raid3_use_malloc, 0, "Use malloc(9) instead of uma(9).");

/*
 * Upper bounds for the per-device UMA zones used for data buffers
 * (see g_raid3_alloc() below); only consulted when use_malloc is 0.
 */
static u_int g_raid3_n64k = 50;
TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0,
    "Maximum number of 64kB allocations");
static u_int g_raid3_n16k = 200;
TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0,
    "Maximum number of 16kB allocations");
static u_int g_raid3_n4k = 1200;
TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0,
    "Maximum number of 4kB allocations");

SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
    "GEOM_RAID3 statistics");
static u_int g_raid3_parity_mismatch = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
    &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");

/*
 * msleep(9) wrapper that logs entry/exit at debug level 4; used by the
 * event code so sleeps on event completion are traceable.
 */
#define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
} while (0)

/* Registered shutdown_pre_sync handler tag (set up in g_raid3_init()). */
static eventhandler_tag g_raid3_pre_sync = NULL;

static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static g_taste_t g_raid3_taste;
static void g_raid3_init(struct g_class *mp);
static void g_raid3_fini(struct g_class *mp);

/* GEOM class descriptor for RAID3; registered with the GEOM framework. */
struct g_class g_raid3_class = {
	.name = G_RAID3_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_raid3_config,
	.taste = g_raid3_taste,
	.destroy_geom = g_raid3_destroy_geom,
	.init = g_raid3_init,
	.fini = g_raid3_fini
};


static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
static int g_raid3_register_request(struct bio *pbp);
static void g_raid3_sync_release(struct g_raid3_softc *sc);


/* Return a printable name for a disk state (for logs and sysctl dumps). */
static const char *
g_raid3_disk_state2str(int state)
{

	switch (state) {
	case G_RAID3_DISK_STATE_NODISK:
		return ("NODISK");
	case G_RAID3_DISK_STATE_NONE:
		return ("NONE");
	case G_RAID3_DISK_STATE_NEW:
		return ("NEW");
	case G_RAID3_DISK_STATE_ACTIVE:
		return ("ACTIVE");
	case G_RAID3_DISK_STATE_STALE:
		return ("STALE");
	case G_RAID3_DISK_STATE_SYNCHRONIZING:
		return ("SYNCHRONIZING");
	case G_RAID3_DISK_STATE_DISCONNECTED:
		return ("DISCONNECTED");
	default:
		return ("INVALID");
	}
}

/* Return a printable name for a device state. */
static const char *
g_raid3_device_state2str(int state)
{

	switch (state) {
	case G_RAID3_DEVICE_STATE_STARTING:
		return ("STARTING");
	case G_RAID3_DEVICE_STATE_DEGRADED:
		return ("DEGRADED");
	case G_RAID3_DEVICE_STATE_COMPLETE:
		return ("COMPLETE");
	default:
		return ("INVALID");
	}
}

/*
 * Return the disk's provider name, or "[unknown]" when the disk has no
 * attached consumer (i.e. no underlying provider to name).
 */
const char *
g_raid3_get_diskname(struct g_raid3_disk *disk)
{

	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
		return ("[unknown]");
	return (disk->d_name);
}

/*
 * Allocate a data buffer of the given size.  Uses malloc(9) when the
 * use_malloc tunable is set or when no per-device UMA zone matches the
 * size; otherwise allocates from the matching zone and updates the
 * zone's requested/failed statistics.
 */
static void *
g_raid3_alloc(struct g_raid3_softc *sc, size_t size, int flags)
{
	void *ptr;
	enum g_raid3_zones zone;

	if (g_raid3_use_malloc ||
	    (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES)
		ptr = malloc(size, M_RAID3, flags);
	else {
		ptr = uma_zalloc_arg(sc->sc_zones[zone].sz_zone,
		    &sc->sc_zones[zone], flags);
		sc->sc_zones[zone].sz_requested++;
		if (ptr == NULL)
			sc->sc_zones[zone].sz_failed++;
	}
	return (ptr);
}

203218887Sdimstatic void 204218887Sdimg_raid3_free(struct g_raid3_softc *sc, void *ptr, size_t size) 205218887Sdim{ 206218887Sdim enum g_raid3_zones zone; 207218887Sdim 208218887Sdim if (g_raid3_use_malloc || 209218887Sdim (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES) 210218887Sdim free(ptr, M_RAID3); 211218887Sdim else { 212218887Sdim uma_zfree_arg(sc->sc_zones[zone].sz_zone, 213218887Sdim ptr, &sc->sc_zones[zone]); 214218887Sdim } 215218887Sdim} 216218887Sdim 217218887Sdimstatic int 218218887Sdimg_raid3_uma_ctor(void *mem, int size, void *arg, int flags) 219218887Sdim{ 220218887Sdim struct g_raid3_zone *sz = arg; 221218887Sdim 222218887Sdim if (sz->sz_max > 0 && sz->sz_inuse == sz->sz_max) 223218887Sdim return (ENOMEM); 224218887Sdim sz->sz_inuse++; 225218887Sdim return (0); 226218887Sdim} 227218887Sdim 228218887Sdimstatic void 229234353Sdimg_raid3_uma_dtor(void *mem, int size, void *arg) 230234353Sdim{ 231234353Sdim struct g_raid3_zone *sz = arg; 232234353Sdim 233234353Sdim sz->sz_inuse--; 234234353Sdim} 235234353Sdim 236234353Sdim#define g_raid3_xor(src, dst, size) \ 237234353Sdim _g_raid3_xor((uint64_t *)(src), \ 238234353Sdim (uint64_t *)(dst), (size_t)size) 239234353Sdimstatic void 240218887Sdim_g_raid3_xor(uint64_t *src, uint64_t *dst, size_t size) 241249423Sdim{ 242218887Sdim 243218887Sdim KASSERT((size % 128) == 0, ("Invalid size: %zu.", size)); 244218887Sdim for (; size > 0; size -= 128) { 245218887Sdim *dst++ ^= (*src++); 246218887Sdim *dst++ ^= (*src++); 247218887Sdim *dst++ ^= (*src++); 248218887Sdim *dst++ ^= (*src++); 249218887Sdim *dst++ ^= (*src++); 250218887Sdim *dst++ ^= (*src++); 251218887Sdim *dst++ ^= (*src++); 252218887Sdim *dst++ ^= (*src++); 253218887Sdim *dst++ ^= (*src++); 254218887Sdim *dst++ ^= (*src++); 255218887Sdim *dst++ ^= (*src++); 256218887Sdim *dst++ ^= (*src++); 257218887Sdim *dst++ ^= (*src++); 258218887Sdim *dst++ ^= (*src++); 259218887Sdim *dst++ ^= (*src++); 260218887Sdim *dst++ ^= (*src++); 261218887Sdim } 
262218887Sdim} 263218887Sdim 264218887Sdimstatic int 265218887Sdimg_raid3_is_zero(struct bio *bp) 266218887Sdim{ 267218887Sdim static const uint64_t zeros[] = { 268218887Sdim 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 269218887Sdim }; 270234353Sdim u_char *addr; 271234353Sdim ssize_t size; 272234353Sdim 273234353Sdim size = bp->bio_length; 274234353Sdim addr = (u_char *)bp->bio_data; 275234353Sdim for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) { 276234353Sdim if (bcmp(addr, zeros, sizeof(zeros)) != 0) 277234353Sdim return (0); 278218887Sdim } 279226633Sdim return (1); 280249423Sdim} 281218887Sdim 282218887Sdim/* 283218887Sdim * --- Events handling functions --- 284249423Sdim * Events in geom_raid3 are used to maintain disks and device status 285218887Sdim * from one thread to simplify locking. 286218887Sdim */ 287218887Sdimstatic void 288218887Sdimg_raid3_event_free(struct g_raid3_event *ep) 289218887Sdim{ 290218887Sdim 291218887Sdim free(ep, M_RAID3); 292218887Sdim} 293218887Sdim 294218887Sdimint 295218887Sdimg_raid3_event_send(void *arg, int state, int flags) 296226633Sdim{ 297218887Sdim struct g_raid3_softc *sc; 298218887Sdim struct g_raid3_disk *disk; 299218887Sdim struct g_raid3_event *ep; 300218887Sdim int error; 301218887Sdim 302218887Sdim ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK); 303218887Sdim G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep); 304218887Sdim if ((flags & G_RAID3_EVENT_DEVICE) != 0) { 305218887Sdim disk = NULL; 306218887Sdim sc = arg; 307218887Sdim } else { 308218887Sdim disk = arg; 309218887Sdim sc = disk->d_softc; 310218887Sdim } 311218887Sdim ep->e_disk = disk; 312218887Sdim ep->e_state = state; 313218887Sdim ep->e_flags = flags; 314218887Sdim ep->e_error = 0; 315234353Sdim mtx_lock(&sc->sc_events_mtx); 316234353Sdim TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); 317234353Sdim mtx_unlock(&sc->sc_events_mtx); 318234353Sdim G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); 319234353Sdim 
mtx_lock(&sc->sc_queue_mtx); 320234353Sdim wakeup(sc); 321226633Sdim wakeup(&sc->sc_queue); 322218887Sdim mtx_unlock(&sc->sc_queue_mtx); 323218887Sdim if ((flags & G_RAID3_EVENT_DONTWAIT) != 0) 324218887Sdim return (0); 325218887Sdim sx_assert(&sc->sc_lock, SX_XLOCKED); 326218887Sdim G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep); 327218887Sdim sx_xunlock(&sc->sc_lock); 328218887Sdim while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) { 329218887Sdim mtx_lock(&sc->sc_events_mtx); 330218887Sdim MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event", 331218887Sdim hz * 5); 332218887Sdim } 333218887Sdim error = ep->e_error; 334218887Sdim g_raid3_event_free(ep); 335218887Sdim sx_xlock(&sc->sc_lock); 336218887Sdim return (error); 337218887Sdim} 338218887Sdim 339218887Sdimstatic struct g_raid3_event * 340218887Sdimg_raid3_event_get(struct g_raid3_softc *sc) 341218887Sdim{ 342218887Sdim struct g_raid3_event *ep; 343218887Sdim 344218887Sdim mtx_lock(&sc->sc_events_mtx); 345218887Sdim ep = TAILQ_FIRST(&sc->sc_events); 346218887Sdim mtx_unlock(&sc->sc_events_mtx); 347218887Sdim return (ep); 348218887Sdim} 349218887Sdim 350218887Sdimstatic void 351218887Sdimg_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep) 352218887Sdim{ 353218887Sdim 354218887Sdim mtx_lock(&sc->sc_events_mtx); 355218887Sdim TAILQ_REMOVE(&sc->sc_events, ep, e_next); 356218887Sdim mtx_unlock(&sc->sc_events_mtx); 357218887Sdim} 358218887Sdim 359218887Sdimstatic void 360218887Sdimg_raid3_event_cancel(struct g_raid3_disk *disk) 361243830Sdim{ 362218887Sdim struct g_raid3_softc *sc; 363218887Sdim struct g_raid3_event *ep, *tmpep; 364218887Sdim 365218887Sdim sc = disk->d_softc; 366218887Sdim sx_assert(&sc->sc_lock, SX_XLOCKED); 367218887Sdim 368218887Sdim mtx_lock(&sc->sc_events_mtx); 369218887Sdim TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { 370218887Sdim if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) 371218887Sdim continue; 372218887Sdim if (ep->e_disk != disk) 373234353Sdim 
continue; 374218887Sdim TAILQ_REMOVE(&sc->sc_events, ep, e_next); 375218887Sdim if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) 376218887Sdim g_raid3_event_free(ep); 377218887Sdim else { 378218887Sdim ep->e_error = ECANCELED; 379218887Sdim wakeup(ep); 380218887Sdim } 381218887Sdim } 382218887Sdim mtx_unlock(&sc->sc_events_mtx); 383218887Sdim} 384218887Sdim 385218887Sdim/* 386218887Sdim * Return the number of disks in the given state. 387218887Sdim * If state is equal to -1, count all connected disks. 388218887Sdim */ 389218887Sdimu_int 390218887Sdimg_raid3_ndisks(struct g_raid3_softc *sc, int state) 391218887Sdim{ 392218887Sdim struct g_raid3_disk *disk; 393218887Sdim u_int n, ndisks; 394218887Sdim 395218887Sdim sx_assert(&sc->sc_lock, SX_LOCKED); 396218887Sdim 397218887Sdim for (n = ndisks = 0; n < sc->sc_ndisks; n++) { 398218887Sdim disk = &sc->sc_disks[n]; 399218887Sdim if (disk->d_state == G_RAID3_DISK_STATE_NODISK) 400218887Sdim continue; 401218887Sdim if (state == -1 || disk->d_state == state) 402218887Sdim ndisks++; 403218887Sdim } 404218887Sdim return (ndisks); 405218887Sdim} 406218887Sdim 407218887Sdimstatic u_int 408218887Sdimg_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp) 409249423Sdim{ 410249423Sdim struct bio *bp; 411249423Sdim u_int nreqs = 0; 412249423Sdim 413249423Sdim mtx_lock(&sc->sc_queue_mtx); 414249423Sdim TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { 415218887Sdim if (bp->bio_from == cp) 416218887Sdim nreqs++; 417218887Sdim } 418249423Sdim mtx_unlock(&sc->sc_queue_mtx); 419218887Sdim return (nreqs); 420218887Sdim} 421218887Sdim 422234353Sdimstatic int 423234353Sdimg_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp) 424234353Sdim{ 425234353Sdim 426234353Sdim if (cp->index > 0) { 427234353Sdim G_RAID3_DEBUG(2, 428234353Sdim "I/O requests for %s exist, can't destroy it now.", 429234353Sdim cp->provider->name); 430234353Sdim return (1); 431234353Sdim } 432234353Sdim if (g_raid3_nrequests(sc, cp) > 0) { 
433234353Sdim G_RAID3_DEBUG(2, 434234353Sdim "I/O requests for %s in queue, can't destroy it now.", 435234353Sdim cp->provider->name); 436218887Sdim return (1); 437218887Sdim } 438218887Sdim return (0); 439218887Sdim} 440218887Sdim 441218887Sdimstatic void 442218887Sdimg_raid3_destroy_consumer(void *arg, int flags __unused) 443218887Sdim{ 444218887Sdim struct g_consumer *cp; 445218887Sdim 446218887Sdim g_topology_assert(); 447218887Sdim 448218887Sdim cp = arg; 449218887Sdim G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); 450226633Sdim g_detach(cp); 451218887Sdim g_destroy_consumer(cp); 452218887Sdim} 453218887Sdim 454226633Sdimstatic void 455243830Sdimg_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) 456218887Sdim{ 457218887Sdim struct g_provider *pp; 458226633Sdim int retaste_wait; 459218887Sdim 460218887Sdim g_topology_assert(); 461218887Sdim 462226633Sdim cp->private = NULL; 463243830Sdim if (g_raid3_is_busy(sc, cp)) 464218887Sdim return; 465218887Sdim G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name); 466226633Sdim pp = cp->provider; 467218887Sdim retaste_wait = 0; 468218887Sdim if (cp->acw == 1) { 469218887Sdim if ((pp->geom->flags & G_GEOM_WITHER) == 0) 470226633Sdim retaste_wait = 1; 471218887Sdim } 472243830Sdim G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, 473218887Sdim -cp->acw, -cp->ace, 0); 474218887Sdim if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) 475226633Sdim g_access(cp, -cp->acr, -cp->acw, -cp->ace); 476226633Sdim if (retaste_wait) { 477243830Sdim /* 478218887Sdim * After retaste event was send (inside g_access()), we can send 479218887Sdim * event to detach and destroy consumer. 480226633Sdim * A class, which has consumer to the given provider connected 481249423Sdim * will not receive retaste event for the provider. 
482218887Sdim * This is the way how I ignore retaste events when I close 483218887Sdim * consumers opened for write: I detach and destroy consumer 484226633Sdim * after retaste event is sent. 485218887Sdim */ 486218887Sdim g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL); 487218887Sdim return; 488226633Sdim } 489218887Sdim G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name); 490218887Sdim g_detach(cp); 491218887Sdim g_destroy_consumer(cp); 492218887Sdim} 493226633Sdim 494226633Sdimstatic int 495218887Sdimg_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp) 496218887Sdim{ 497226633Sdim struct g_consumer *cp; 498226633Sdim int error; 499218887Sdim 500218887Sdim g_topology_assert_not(); 501226633Sdim KASSERT(disk->d_consumer == NULL, 502234353Sdim ("Disk already connected (device %s).", disk->d_softc->sc_name)); 503218887Sdim 504218887Sdim g_topology_lock(); 505234353Sdim cp = g_new_consumer(disk->d_softc->sc_geom); 506234353Sdim error = g_attach(cp, pp); 507234353Sdim if (error != 0) { 508234353Sdim g_destroy_consumer(cp); 509226633Sdim g_topology_unlock(); 510218887Sdim return (error); 511218887Sdim } 512218887Sdim error = g_access(cp, 1, 1, 1); 513226633Sdim g_topology_unlock(); 514226633Sdim if (error != 0) { 515218887Sdim g_detach(cp); 516218887Sdim g_destroy_consumer(cp); 517218887Sdim G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).", 518218887Sdim pp->name, error); 519218887Sdim return (error); 520218887Sdim } 521226633Sdim disk->d_consumer = cp; 522218887Sdim disk->d_consumer->private = disk; 523218887Sdim disk->d_consumer->index = 0; 524218887Sdim G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk)); 525226633Sdim return (0); 526218887Sdim} 527218887Sdim 528218887Sdimstatic void 529234353Sdimg_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) 530234353Sdim{ 531234353Sdim 532234353Sdim g_topology_assert(); 533234353Sdim 534234353Sdim if (cp == NULL) 535234353Sdim return; 536234353Sdim if 
(cp->provider != NULL) 537234353Sdim g_raid3_kill_consumer(sc, cp); 538234353Sdim else 539234353Sdim g_destroy_consumer(cp); 540234353Sdim} 541239462Sdim 542239462Sdim/* 543239462Sdim * Initialize disk. This means allocate memory, create consumer, attach it 544239462Sdim * to the provider and open access (r1w1e1) to it. 545239462Sdim */ 546239462Sdimstatic struct g_raid3_disk * 547239462Sdimg_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp, 548239462Sdim struct g_raid3_metadata *md, int *errorp) 549239462Sdim{ 550239462Sdim struct g_raid3_disk *disk; 551239462Sdim int error; 552239462Sdim 553239462Sdim disk = &sc->sc_disks[md->md_no]; 554239462Sdim error = g_raid3_connect_disk(disk, pp); 555239462Sdim if (error != 0) { 556239462Sdim if (errorp != NULL) 557239462Sdim *errorp = error; 558239462Sdim return (NULL); 559239462Sdim } 560239462Sdim disk->d_state = G_RAID3_DISK_STATE_NONE; 561239462Sdim disk->d_flags = md->md_dflags; 562234353Sdim if (md->md_provider[0] != '\0') 563234353Sdim disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED; 564234353Sdim disk->d_sync.ds_consumer = NULL; 565239462Sdim disk->d_sync.ds_offset = md->md_sync_offset; 566239462Sdim disk->d_sync.ds_offset_done = md->md_sync_offset; 567239462Sdim disk->d_genid = md->md_genid; 568239462Sdim disk->d_sync.ds_syncid = md->md_syncid; 569239462Sdim if (errorp != NULL) 570234353Sdim *errorp = 0; 571234353Sdim return (disk); 572234353Sdim} 573249423Sdim 574249423Sdimstatic void 575249423Sdimg_raid3_destroy_disk(struct g_raid3_disk *disk) 576249423Sdim{ 577249423Sdim struct g_raid3_softc *sc; 578249423Sdim 579249423Sdim g_topology_assert_not(); 580249423Sdim sc = disk->d_softc; 581239462Sdim sx_assert(&sc->sc_lock, SX_XLOCKED); 582239462Sdim 583234353Sdim if (disk->d_state == G_RAID3_DISK_STATE_NODISK) 584234353Sdim return; 585239462Sdim g_raid3_event_cancel(disk); 586239462Sdim switch (disk->d_state) { 587239462Sdim case G_RAID3_DISK_STATE_SYNCHRONIZING: 588239462Sdim if (sc->sc_syncdisk != 
NULL) 589239462Sdim g_raid3_sync_stop(sc, 1); 590218887Sdim /* FALLTHROUGH */ 591218887Sdim case G_RAID3_DISK_STATE_NEW: 592218887Sdim case G_RAID3_DISK_STATE_STALE: 593218887Sdim case G_RAID3_DISK_STATE_ACTIVE: 594218887Sdim g_topology_lock(); 595218887Sdim g_raid3_disconnect_consumer(sc, disk->d_consumer); 596218887Sdim g_topology_unlock(); 597218887Sdim disk->d_consumer = NULL; 598218887Sdim break; 599218887Sdim default: 600218887Sdim KASSERT(0 == 1, ("Wrong disk state (%s, %s).", 601218887Sdim g_raid3_get_diskname(disk), 602218887Sdim g_raid3_disk_state2str(disk->d_state))); 603218887Sdim } 604218887Sdim disk->d_state = G_RAID3_DISK_STATE_NODISK; 605218887Sdim} 606218887Sdim 607218887Sdimstatic void 608218887Sdimg_raid3_destroy_device(struct g_raid3_softc *sc) 609218887Sdim{ 610218887Sdim struct g_raid3_event *ep; 611218887Sdim struct g_raid3_disk *disk; 612218887Sdim struct g_geom *gp; 613218887Sdim struct g_consumer *cp; 614218887Sdim u_int n; 615218887Sdim 616218887Sdim g_topology_assert_not(); 617218887Sdim sx_assert(&sc->sc_lock, SX_XLOCKED); 618218887Sdim 619218887Sdim gp = sc->sc_geom; 620218887Sdim if (sc->sc_provider != NULL) 621218887Sdim g_raid3_destroy_provider(sc); 622218887Sdim for (n = 0; n < sc->sc_ndisks; n++) { 623218887Sdim disk = &sc->sc_disks[n]; 624218887Sdim if (disk->d_state != G_RAID3_DISK_STATE_NODISK) { 625218887Sdim disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; 626218887Sdim g_raid3_update_metadata(disk); 627218887Sdim g_raid3_destroy_disk(disk); 628218887Sdim } 629218887Sdim } 630218887Sdim while ((ep = g_raid3_event_get(sc)) != NULL) { 631218887Sdim g_raid3_event_remove(sc, ep); 632218887Sdim if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) 633218887Sdim g_raid3_event_free(ep); 634218887Sdim else { 635218887Sdim ep->e_error = ECANCELED; 636218887Sdim ep->e_flags |= G_RAID3_EVENT_DONE; 637218887Sdim G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep); 638218887Sdim mtx_lock(&sc->sc_events_mtx); 639218887Sdim wakeup(ep); 640218887Sdim 
mtx_unlock(&sc->sc_events_mtx); 641234353Sdim } 642234353Sdim } 643234353Sdim callout_drain(&sc->sc_callout); 644234353Sdim cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer); 645234353Sdim g_topology_lock(); 646234353Sdim if (cp != NULL) 647234353Sdim g_raid3_disconnect_consumer(sc, cp); 648234353Sdim g_wither_geom(sc->sc_sync.ds_geom, ENXIO); 649234353Sdim G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name); 650234353Sdim g_wither_geom(gp, ENXIO); 651218887Sdim g_topology_unlock(); 652234353Sdim if (!g_raid3_use_malloc) { 653218887Sdim uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone); 654218887Sdim uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone); 655218887Sdim uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone); 656218887Sdim } 657218887Sdim mtx_destroy(&sc->sc_queue_mtx); 658218887Sdim mtx_destroy(&sc->sc_events_mtx); 659218887Sdim sx_xunlock(&sc->sc_lock); 660218887Sdim sx_destroy(&sc->sc_lock); 661218887Sdim} 662218887Sdim 663218887Sdimstatic void 664218887Sdimg_raid3_orphan(struct g_consumer *cp) 665218887Sdim{ 666218887Sdim struct g_raid3_disk *disk; 667218887Sdim 668218887Sdim g_topology_assert(); 669218887Sdim 670218887Sdim disk = cp->private; 671218887Sdim if (disk == NULL) 672218887Sdim return; 673218887Sdim disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID; 674218887Sdim g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, 675218887Sdim G_RAID3_EVENT_DONTWAIT); 676218887Sdim} 677218887Sdim 678218887Sdimstatic int 679218887Sdimg_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) 680218887Sdim{ 681234353Sdim struct g_raid3_softc *sc; 682234353Sdim struct g_consumer *cp; 683234353Sdim off_t offset, length; 684234353Sdim u_char *sector; 685234353Sdim int error = 0; 686239462Sdim 687239462Sdim g_topology_assert_not(); 688239462Sdim sc = disk->d_softc; 689239462Sdim sx_assert(&sc->sc_lock, SX_LOCKED); 690239462Sdim 691239462Sdim cp = disk->d_consumer; 692239462Sdim KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); 
693239462Sdim KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); 694239462Sdim KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, 695239462Sdim ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr, 696239462Sdim cp->acw, cp->ace)); 697239462Sdim length = cp->provider->sectorsize; 698239462Sdim offset = cp->provider->mediasize - length; 699239462Sdim sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO); 700239462Sdim if (md != NULL) 701239462Sdim raid3_metadata_encode(md, sector); 702239462Sdim error = g_write_data(cp, offset, sector, length); 703239462Sdim free(sector, M_RAID3); 704239462Sdim if (error != 0) { 705239462Sdim if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { 706239462Sdim G_RAID3_DEBUG(0, "Cannot write metadata on %s " 707239462Sdim "(device=%s, error=%d).", 708239462Sdim g_raid3_get_diskname(disk), sc->sc_name, error); 709239462Sdim disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; 710239462Sdim } else { 711239462Sdim G_RAID3_DEBUG(1, "Cannot write metadata on %s " 712239462Sdim "(device=%s, error=%d).", 713239462Sdim g_raid3_get_diskname(disk), sc->sc_name, error); 714239462Sdim } 715239462Sdim if (g_raid3_disconnect_on_failure && 716239462Sdim sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { 717218887Sdim sc->sc_bump_id |= G_RAID3_BUMP_GENID; 718218887Sdim g_raid3_event_send(disk, 719218887Sdim G_RAID3_DISK_STATE_DISCONNECTED, 720218887Sdim G_RAID3_EVENT_DONTWAIT); 721234353Sdim } 722234353Sdim } 723234353Sdim return (error); 724234353Sdim} 725234353Sdim 726234353Sdimint 727234353Sdimg_raid3_clear_metadata(struct g_raid3_disk *disk) 728234353Sdim{ 729234353Sdim int error; 730234353Sdim 731234353Sdim g_topology_assert_not(); 732234353Sdim sx_assert(&disk->d_softc->sc_lock, SX_LOCKED); 733234353Sdim 734234353Sdim error = g_raid3_write_metadata(disk, NULL); 735234353Sdim if (error == 0) { 736234353Sdim G_RAID3_DEBUG(2, "Metadata on %s cleared.", 737234353Sdim g_raid3_get_diskname(disk)); 738234353Sdim } else { 
739234353Sdim G_RAID3_DEBUG(0, 740234353Sdim "Cannot clear metadata on disk %s (error=%d).", 741234353Sdim g_raid3_get_diskname(disk), error); 742234353Sdim } 743234353Sdim return (error); 744234353Sdim} 745234353Sdim 746218887Sdimvoid 747218887Sdimg_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) 748218887Sdim{ 749239462Sdim struct g_raid3_softc *sc; 750239462Sdim struct g_provider *pp; 751239462Sdim 752239462Sdim sc = disk->d_softc; 753239462Sdim strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic)); 754239462Sdim md->md_version = G_RAID3_VERSION; 755239462Sdim strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); 756218887Sdim md->md_id = sc->sc_id; 757218887Sdim md->md_all = sc->sc_ndisks; 758218887Sdim md->md_genid = sc->sc_genid; 759218887Sdim md->md_mediasize = sc->sc_mediasize; 760218887Sdim md->md_sectorsize = sc->sc_sectorsize; 761218887Sdim md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK); 762218887Sdim md->md_no = disk->d_no; 763218887Sdim md->md_syncid = disk->d_sync.ds_syncid; 764218887Sdim md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK); 765218887Sdim if (disk->d_state != G_RAID3_DISK_STATE_SYNCHRONIZING) 766218887Sdim md->md_sync_offset = 0; 767243830Sdim else { 768243830Sdim md->md_sync_offset = 769234353Sdim disk->d_sync.ds_offset_done / (sc->sc_ndisks - 1); 770243830Sdim } 771243830Sdim if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL) 772218887Sdim pp = disk->d_consumer->provider; 773218887Sdim else 774218887Sdim pp = NULL; 775234353Sdim if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL) 776234353Sdim strlcpy(md->md_provider, pp->name, sizeof(md->md_provider)); 777234353Sdim else 778218887Sdim bzero(md->md_provider, sizeof(md->md_provider)); 779218887Sdim if (pp != NULL) 780218887Sdim md->md_provsize = pp->mediasize; 781218887Sdim else 782218887Sdim md->md_provsize = 0; 783218887Sdim} 784218887Sdim 785218887Sdimvoid 
g_raid3_update_metadata(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;
	struct g_raid3_metadata md;
	int error;

	g_topology_assert_not();
	sc = disk->d_softc;
	sx_assert(&sc->sc_lock, SX_LOCKED);

	g_raid3_fill_metadata(disk, &md);
	error = g_raid3_write_metadata(disk, &md);
	if (error == 0) {
		G_RAID3_DEBUG(2, "Metadata on %s updated.",
		    g_raid3_get_diskname(disk));
	} else {
		G_RAID3_DEBUG(0,
		    "Cannot update metadata on disk %s (error=%d).",
		    g_raid3_get_diskname(disk), error);
	}
}

/*
 * Increase the device sync generation and propagate it to the metadata
 * of every ACTIVE or SYNCHRONIZING component.
 */
static void
g_raid3_bump_syncid(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	u_int n;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);
	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
	    ("%s called with no active disks (device=%s).", __func__,
	    sc->sc_name));

	sc->sc_syncid++;
	G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
	    sc->sc_syncid);
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			disk->d_sync.ds_syncid = sc->sc_syncid;
			g_raid3_update_metadata(disk);
		}
	}
}

/*
 * Increase the device generation id and propagate it to the metadata of
 * every ACTIVE or SYNCHRONIZING component; mirrors g_raid3_bump_syncid().
 */
static void
g_raid3_bump_genid(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	u_int n;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);
	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
	    ("%s called with no active disks (device=%s).", __func__,
	    sc->sc_name));

	sc->sc_genid++;
	G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
	    sc->sc_genid);
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			disk->d_genid = sc->sc_genid;
			g_raid3_update_metadata(disk);
		}
	}
}

/*
 * If the device has been write-idle long enough, mark all active
 * components clean (clear the DIRTY flag and flush metadata).
 * Returns 0 when nothing remains to be done, or the number of seconds
 * left until the idle timeout expires.
 */
static int
g_raid3_idle(struct g_raid3_softc *sc, int acw)
{
	struct g_raid3_disk *disk;
	u_int i;
	int timeout;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	if (sc->sc_provider == NULL)
		return (0);
	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
		return (0);
	if (sc->sc_idle)
		return (0);
	if (sc->sc_writes > 0)
		return (0);
	/* acw == -1 means "use the provider's current write count". */
	if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) {
		timeout = g_raid3_idletime - (time_uptime - sc->sc_last_write);
		if (timeout > 0)
			return (timeout);
	}
	sc->sc_idle = 1;
	for (i = 0; i < sc->sc_ndisks; i++) {
		disk = &sc->sc_disks[i];
		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
			continue;
		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
		    g_raid3_get_diskname(disk), sc->sc_name);
		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		g_raid3_update_metadata(disk);
	}
	return (0);
}

/*
 * Leave the idle state on a write: mark all active components dirty
 * and record the write timestamp used by g_raid3_idle().
 */
static void
g_raid3_unidle(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	u_int i;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
		return;
	sc->sc_idle = 0;
	sc->sc_last_write = time_uptime;
	for (i = 0; i < sc->sc_ndisks; i++) {
		disk = &sc->sc_disks[i];
		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
			continue;
		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
		    g_raid3_get_diskname(disk), sc->sc_name);
		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
		g_raid3_update_metadata(disk);
	}
}

/*
 * Treat bio_driver1 field in parent bio as list head and field bio_caller1
 * in child bio as pointer to the next element on the list.
 */
#define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1
#define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1

#define	G_RAID3_FOREACH_BIO(pbp, bp)					\
	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
	    (bp) = G_RAID3_NEXT_BIO(bp))

/* Safe variant: 'tmpbp' caches the successor so 'bp' may be unlinked. */
#define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
	    (bp) = (tmpbp))

/* Initialize the parent's child-bio list to empty. */
static void
g_raid3_init_bio(struct bio *pbp)
{

	G_RAID3_HEAD_BIO(pbp) = NULL;
}

/* Unlink child bio 'cbp' from its parent's child list. */
static void
g_raid3_remove_bio(struct bio *cbp)
{
	struct bio *pbp, *bp;

	pbp = cbp->bio_parent;
	if (G_RAID3_HEAD_BIO(pbp) == cbp)
		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
	else {
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == cbp) {
				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
				break;
			}
		}
	}
	G_RAID3_NEXT_BIO(cbp) = NULL;
}

/*
 * Replace 'dbp' with 'sbp' in dbp's parent's child list: 'sbp' is first
 * unlinked from its own position and then takes over dbp's slot.
 */
static void
g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
{
	struct bio *pbp, *bp;

	g_raid3_remove_bio(sbp);
	pbp = dbp->bio_parent;
	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
	if (G_RAID3_HEAD_BIO(pbp) == dbp)
		G_RAID3_HEAD_BIO(pbp) = sbp;
	else {
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == dbp) {
				G_RAID3_NEXT_BIO(bp) = sbp;
				break;
			}
		}
	}
	G_RAID3_NEXT_BIO(dbp) = NULL;
}

/*
 * Unlink child bio 'cbp' from its parent, free its data buffer and
 * destroy it; bio_children of the parent is decremented accordingly.
 */
static void
g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
{
	struct bio *bp, *pbp;
	size_t size;

	pbp = cbp->bio_parent;
	pbp->bio_children--;
	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
	/* Child buffers are one data-component's share of the parent. */
	size = pbp->bio_length / (sc->sc_ndisks - 1);
	g_raid3_free(sc, cbp->bio_data, size);
	if (G_RAID3_HEAD_BIO(pbp) == cbp) {
		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
		G_RAID3_NEXT_BIO(cbp) = NULL;
		g_destroy_bio(cbp);
	} else {
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == cbp)
				break;
		}
		if (bp != NULL) {
			KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
			    ("NULL bp->bio_driver1"));
			G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
			G_RAID3_NEXT_BIO(cbp) = NULL;
		}
		g_destroy_bio(cbp);
	}
}

/*
 * Clone 'pbp', allocate a per-component data buffer for the clone and
 * append it at the tail of the parent's child list.  Regular requests
 * may sleep for memory (M_WAITOK); others must not.  Returns NULL on
 * allocation failure.
 */
static struct bio *
g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
{
	struct bio *bp, *cbp;
	size_t size;
	int memflag;

	cbp = g_clone_bio(pbp);
	if (cbp == NULL)
		return (NULL);
	size = pbp->bio_length / (sc->sc_ndisks - 1);
	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
		memflag = M_WAITOK;
	else
		memflag = M_NOWAIT;
	cbp->bio_data = g_raid3_alloc(sc, size, memflag);
	if (cbp->bio_data == NULL) {
		/* Undo the child accounting done by g_clone_bio(). */
		pbp->bio_children--;
		g_destroy_bio(cbp);
		return (NULL);
	}
	G_RAID3_NEXT_BIO(cbp) = NULL;
	if (G_RAID3_HEAD_BIO(pbp) == NULL)
		G_RAID3_HEAD_BIO(pbp) = cbp;
	else {
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == NULL) {
				G_RAID3_NEXT_BIO(bp) = cbp;
				break;
			}
		}
	}
	return (cbp);
}

/*
 * Split the parent WRITE request across the data components (striping
 * 'atom' bytes per component per sector), compute the parity component
 * by XOR-ing the data components, and send all child requests out.
 */
static void
g_raid3_scatter(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *bp, *cbp, *tmpbp;
	off_t atom, cadd, padd, left;
	int first;

	sc = pbp->bio_to->geom->softc;
	bp = NULL;
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		/*
		 * Find bio for which we should calculate data.
		 */
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
				bp = cbp;
				break;
			}
		}
		KASSERT(bp != NULL, ("NULL parity bio."));
	}
	/* Bytes each data component contributes to one device sector. */
	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
	cadd = padd = 0;
	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if (cbp == bp)
				continue;
			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
			padd += atom;
		}
		cadd += atom;
	}
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		/*
		 * Calculate parity.
		 */
		first = 1;
		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
			if (cbp == bp)
				continue;
			if (first) {
				bcopy(cbp->bio_data, bp->bio_data,
				    bp->bio_length);
				first = 0;
			} else {
				g_raid3_xor(cbp->bio_data, bp->bio_data,
				    bp->bio_length);
			}
			/*
			 * Components with no backing disk were only needed
			 * for the parity computation; drop them now.
			 */
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
				g_raid3_destroy_bio(sc, cbp);
		}
	}
	G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
		struct g_consumer *cp;

		disk = cbp->bio_caller2;
		cp = disk->d_consumer;
		cbp->bio_to = cp->provider;
		G_RAID3_LOGREQ(3, cbp, "Sending request.");
		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		sc->sc_writes++;
		g_io_request(cbp, cp);
	}
}

/*
 * Complete a parent READ request from its child requests.
 */
static void
g_raid3_gather(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *xbp, *fbp, *cbp;
	off_t atom, cadd, padd, left;

	sc = pbp->bio_to->geom->softc;
	/*
	 * Find bio for which we have to calculate data.
	 * While going through this path, check if all requests
	 * succeeded, if not, deny whole request.
	 * If we're in COMPLETE mode, we allow one request to fail,
	 * so if we find one, we're sending it to the parity consumer.
	 * If there are more failed requests, we deny whole request.
	 */
	xbp = fbp = NULL;
	G_RAID3_FOREACH_BIO(pbp, cbp) {
		if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
			KASSERT(xbp == NULL, ("More than one parity bio."));
			xbp = cbp;
		}
		if (cbp->bio_error == 0)
			continue;
		/*
		 * Found failed request.
		 */
		if (fbp == NULL) {
			if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
				/*
				 * We are already in degraded mode, so we can't
				 * accept any failures.
				 */
				if (pbp->bio_error == 0)
					pbp->bio_error = cbp->bio_error;
			} else {
				fbp = cbp;
			}
		} else {
			/*
			 * Next failed request, that's too many.
			 */
			if (pbp->bio_error == 0)
				pbp->bio_error = fbp->bio_error;
		}
		disk = cbp->bio_caller2;
		if (disk == NULL)
			continue;
		/* First failure on a disk logs loudly, repeats quietly. */
		if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
			disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
			G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).",
			    cbp->bio_error);
		} else {
			G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).",
			    cbp->bio_error);
		}
		if (g_raid3_disconnect_on_failure &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
			g_raid3_event_send(disk,
			    G_RAID3_DISK_STATE_DISCONNECTED,
			    G_RAID3_EVENT_DONTWAIT);
		}
	}
	if (pbp->bio_error != 0)
		goto finish;
	if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
		/* VERIFY mode: give up verification, use the parity copy. */
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
		if (xbp != fbp)
			g_raid3_replace_bio(xbp, fbp);
		g_raid3_destroy_bio(sc, fbp);
	} else if (fbp != NULL) {
		struct g_consumer *cp;

		/*
		 * One request failed, so send the same request to
		 * the parity consumer.
		 */
		disk = pbp->bio_driver2;
		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
			pbp->bio_error = fbp->bio_error;
			goto finish;
		}
		pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
		/* Re-issue fbp: reset its completion state for the retry. */
		pbp->bio_inbed--;
		fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
		if (disk->d_no == sc->sc_ndisks - 1)
			fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
		fbp->bio_error = 0;
		fbp->bio_completed = 0;
		fbp->bio_children = 0;
		fbp->bio_inbed = 0;
		cp = disk->d_consumer;
		fbp->bio_caller2 = disk;
		fbp->bio_to = cp->provider;
		G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		g_io_request(fbp, cp);
		return;
	}
	if (xbp != NULL) {
		/*
		 * Calculate parity.
		 */
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
				continue;
			g_raid3_xor(cbp->bio_data, xbp->bio_data,
			    xbp->bio_length);
		}
		xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
			/* In VERIFY mode the XOR result must be all zeros. */
			if (!g_raid3_is_zero(xbp)) {
				g_raid3_parity_mismatch++;
				pbp->bio_error = EIO;
				goto finish;
			}
			g_raid3_destroy_bio(sc, xbp);
		}
	}
	/* De-stripe: interleave child buffers back into the parent. */
	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
	cadd = padd = 0;
	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
			pbp->bio_completed += atom;
			padd += atom;
		}
		cadd += atom;
	}
finish:
	if (pbp->bio_error == 0)
		G_RAID3_LOGREQ(3, pbp, "Request finished.");
	else {
		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
			G_RAID3_LOGREQ(1, pbp, "Verification error.");
		else
			G_RAID3_LOGREQ(0, pbp, "Request failed.");
	}
	pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
	while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
		g_raid3_destroy_bio(sc, cbp);
	g_io_deliver(pbp, pbp->bio_error);
}

/*
 * Completion callback for regular child requests: mark the bio and
 * queue it for the worker thread.
 */
static void
g_raid3_done(struct bio *bp)
{
	struct g_raid3_softc *sc;

	sc = bp->bio_from->geom->softc;
	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
	mtx_lock(&sc->sc_queue_mtx);
	bioq_insert_head(&sc->sc_queue, bp);
	mtx_unlock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
}

/*
 * Worker-side handling of a finished regular child request.  When all
 * children of the parent have come back, READs are finished via
 * g_raid3_gather() and WRITE/DELETE parents are completed here.
 */
static void
g_raid3_regular_request(struct bio *cbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *pbp;

	g_topology_assert_not();

	pbp = cbp->bio_parent;
	sc = pbp->bio_to->geom->softc;
	cbp->bio_from->index--;
	if (cbp->bio_cmd == BIO_WRITE)
		sc->sc_writes--;
	disk = cbp->bio_from->private;
	if (disk == NULL) {
		/* Consumer was orphaned; finish tearing it down. */
		g_topology_lock();
		g_raid3_kill_consumer(sc, cbp->bio_from);
		g_topology_unlock();
	}

	G_RAID3_LOGREQ(3, cbp, "Request finished.");
	pbp->bio_inbed++;
	KASSERT(pbp->bio_inbed <= pbp->bio_children,
	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
	    pbp->bio_children));
	if (pbp->bio_inbed != pbp->bio_children)
		return;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		g_raid3_gather(pbp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		int error = 0;

		pbp->bio_completed = pbp->bio_length;
		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
			if (cbp->bio_error == 0) {
				g_raid3_destroy_bio(sc, cbp);
				continue;
			}

			/* One failed child is tolerated (parity redundancy). */
			if (error == 0)
				error = cbp->bio_error;
			else if (pbp->bio_error == 0) {
				/*
				 * Next failed request, that's too many.
				 */
				pbp->bio_error = error;
			}

			disk = cbp->bio_caller2;
			if (disk == NULL) {
				g_raid3_destroy_bio(sc, cbp);
				continue;
			}

			if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
				disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
				G_RAID3_LOGREQ(0, cbp,
				    "Request failed (error=%d).",
				    cbp->bio_error);
			} else {
				G_RAID3_LOGREQ(1, cbp,
				    "Request failed (error=%d).",
				    cbp->bio_error);
			}
			if (g_raid3_disconnect_on_failure &&
			    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
				sc->sc_bump_id |= G_RAID3_BUMP_GENID;
				g_raid3_event_send(disk,
				    G_RAID3_DISK_STATE_DISCONNECTED,
				    G_RAID3_EVENT_DONTWAIT);
			}
			g_raid3_destroy_bio(sc, cbp);
		}
		if (pbp->bio_error == 0)
			G_RAID3_LOGREQ(3, pbp, "Request finished.");
		else
			G_RAID3_LOGREQ(0, pbp, "Request failed.");
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
		bioq_remove(&sc->sc_inflight, pbp);
		/* Release delayed sync requests if possible. */
		g_raid3_sync_release(sc);
		g_io_deliver(pbp, pbp->bio_error);
		break;
	    }
	}
}

/*
 * Completion callback for synchronization requests: mark the bio and
 * queue it for the worker thread.
 */
static void
g_raid3_sync_done(struct bio *bp)
{
	struct g_raid3_softc *sc;

	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
	sc = bp->bio_from->geom->softc;
	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
	mtx_lock(&sc->sc_queue_mtx);
	bioq_insert_head(&sc->sc_queue, bp);
	mtx_unlock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
}

/*
 * Fan a BIO_FLUSH out to every ACTIVE component.  All clones are staged
 * on a local queue first so that an allocation failure can abort the
 * whole operation before anything was sent.
 */
static void
g_raid3_flush(struct g_raid3_softc *sc, struct bio *bp)
{
	struct bio_queue_head queue;
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp;
	u_int i;

	bioq_init(&queue);
	for (i = 0; i < sc->sc_ndisks; i++) {
		disk = &sc->sc_disks[i];
		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
			continue;
		cbp = g_clone_bio(bp);
		if (cbp == NULL) {
			/* Allocation failed: undo staged clones and bail. */
			for (cbp = bioq_first(&queue); cbp != NULL;
			    cbp = bioq_first(&queue)) {
				bioq_remove(&queue, cbp);
				g_destroy_bio(cbp);
			}
			if (bp->bio_error == 0)
				bp->bio_error = ENOMEM;
			g_io_deliver(bp, bp->bio_error);
			return;
		}
		bioq_insert_tail(&queue, cbp);
		cbp->bio_done = g_std_done;
		cbp->bio_caller1 = disk;
		cbp->bio_to = disk->d_consumer->provider;
	}
	for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		G_RAID3_LOGREQ(3, cbp, "Sending request.");
		disk = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		cp = disk->d_consumer;
		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		g_io_request(cbp, disk->d_consumer);
	}
}

/*
 * GEOM start routine: accept READ/WRITE/DELETE (queued for the worker),
 * handle FLUSH inline and reject everything else.
 */
static void
g_raid3_start(struct bio *bp)
{
	struct g_raid3_softc *sc;

	sc = bp->bio_to->geom->softc;
	/*
	 * If sc == NULL or there are no valid disks, provider's error
	 * should be set and g_raid3_start() should not be called at all.
	 */
	KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
	    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
	    ("Provider's error should be set (error=%d)(device=%s).",
	    bp->bio_to->error, bp->bio_to->name));
	G_RAID3_LOGREQ(3, bp, "Request received.");

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	case BIO_FLUSH:
		g_raid3_flush(sc, bp);
		return;
	case BIO_GETATTR:
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}
	mtx_lock(&sc->sc_queue_mtx);
	bioq_insert_tail(&sc->sc_queue, bp);
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	wakeup(sc);
}

/*
 * Return TRUE if the given request is colliding with a in-progress
 * synchronization request.
1472 */ 1473static int 1474g_raid3_sync_collision(struct g_raid3_softc *sc, struct bio *bp) 1475{ 1476 struct g_raid3_disk *disk; 1477 struct bio *sbp; 1478 off_t rstart, rend, sstart, send; 1479 int i; 1480 1481 disk = sc->sc_syncdisk; 1482 if (disk == NULL) 1483 return (0); 1484 rstart = bp->bio_offset; 1485 rend = bp->bio_offset + bp->bio_length; 1486 for (i = 0; i < g_raid3_syncreqs; i++) { 1487 sbp = disk->d_sync.ds_bios[i]; 1488 if (sbp == NULL) 1489 continue; 1490 sstart = sbp->bio_offset; 1491 send = sbp->bio_length; 1492 if (sbp->bio_cmd == BIO_WRITE) { 1493 sstart *= sc->sc_ndisks - 1; 1494 send *= sc->sc_ndisks - 1; 1495 } 1496 send += sstart; 1497 if (rend > sstart && rstart < send) 1498 return (1); 1499 } 1500 return (0); 1501} 1502 1503/* 1504 * Return TRUE if the given sync request is colliding with a in-progress regular 1505 * request. 1506 */ 1507static int 1508g_raid3_regular_collision(struct g_raid3_softc *sc, struct bio *sbp) 1509{ 1510 off_t rstart, rend, sstart, send; 1511 struct bio *bp; 1512 1513 if (sc->sc_syncdisk == NULL) 1514 return (0); 1515 sstart = sbp->bio_offset; 1516 send = sstart + sbp->bio_length; 1517 TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) { 1518 rstart = bp->bio_offset; 1519 rend = bp->bio_offset + bp->bio_length; 1520 if (rend > sstart && rstart < send) 1521 return (1); 1522 } 1523 return (0); 1524} 1525 1526/* 1527 * Puts request onto delayed queue. 1528 */ 1529static void 1530g_raid3_regular_delay(struct g_raid3_softc *sc, struct bio *bp) 1531{ 1532 1533 G_RAID3_LOGREQ(2, bp, "Delaying request."); 1534 bioq_insert_head(&sc->sc_regular_delayed, bp); 1535} 1536 1537/* 1538 * Puts synchronization request onto delayed queue. 
1539 */ 1540static void 1541g_raid3_sync_delay(struct g_raid3_softc *sc, struct bio *bp) 1542{ 1543 1544 G_RAID3_LOGREQ(2, bp, "Delaying synchronization request."); 1545 bioq_insert_tail(&sc->sc_sync_delayed, bp); 1546} 1547 1548/* 1549 * Releases delayed regular requests which don't collide anymore with sync 1550 * requests. 1551 */ 1552static void 1553g_raid3_regular_release(struct g_raid3_softc *sc) 1554{ 1555 struct bio *bp, *bp2; 1556 1557 TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) { 1558 if (g_raid3_sync_collision(sc, bp)) 1559 continue; 1560 bioq_remove(&sc->sc_regular_delayed, bp); 1561 G_RAID3_LOGREQ(2, bp, "Releasing delayed request (%p).", bp); 1562 mtx_lock(&sc->sc_queue_mtx); 1563 bioq_insert_head(&sc->sc_queue, bp); 1564#if 0 1565 /* 1566 * wakeup() is not needed, because this function is called from 1567 * the worker thread. 1568 */ 1569 wakeup(&sc->sc_queue); 1570#endif 1571 mtx_unlock(&sc->sc_queue_mtx); 1572 } 1573} 1574 1575/* 1576 * Releases delayed sync requests which don't collide anymore with regular 1577 * requests. 1578 */ 1579static void 1580g_raid3_sync_release(struct g_raid3_softc *sc) 1581{ 1582 struct bio *bp, *bp2; 1583 1584 TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) { 1585 if (g_raid3_regular_collision(sc, bp)) 1586 continue; 1587 bioq_remove(&sc->sc_sync_delayed, bp); 1588 G_RAID3_LOGREQ(2, bp, 1589 "Releasing delayed synchronization request."); 1590 g_io_request(bp, bp->bio_from); 1591 } 1592} 1593 1594/* 1595 * Handle synchronization requests. 1596 * Every synchronization request is two-steps process: first, READ request is 1597 * send to active provider and then WRITE request (with read data) to the provider 1598 * beeing synchronized. When WRITE is finished, new synchronization request is 1599 * send. 
 */
static void
g_raid3_sync_request(struct bio *bp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;

	bp->bio_from->index--;
	sc = bp->bio_from->geom->softc;
	disk = bp->bio_from->private;
	if (disk == NULL) {
		/* Consumer was orphaned; release resources and stop. */
		sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
		g_topology_lock();
		g_raid3_kill_consumer(sc, bp->bio_from);
		g_topology_unlock();
		free(bp->bio_data, M_RAID3);
		g_destroy_bio(bp);
		sx_xlock(&sc->sc_lock);
		return;
	}

	/*
	 * Synchronization request.
	 */
	switch (bp->bio_cmd) {
	case BIO_READ:
	    {
		struct g_consumer *cp;
		u_char *dst, *src;
		off_t left;
		u_int atom;

		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		/*
		 * Extract, in place, the one component's worth of data the
		 * target disk needs from the full-width data just read.
		 */
		atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
		dst = src = bp->bio_data;
		if (disk->d_no == sc->sc_ndisks - 1) {
			u_int n;

			/* Parity component. */
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += atom;
				for (n = 1; n < sc->sc_ndisks - 1; n++) {
					g_raid3_xor(src, dst, atom);
					src += atom;
				}
				dst += atom;
			}
		} else {
			/* Regular component. */
			src += atom * disk->d_no;
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += sc->sc_sectorsize;
				dst += atom;
			}
		}
		/* Re-target the same bio as a per-component WRITE. */
		bp->bio_driver1 = bp->bio_driver2 = NULL;
		bp->bio_pflags = 0;
		bp->bio_offset /= sc->sc_ndisks - 1;
		bp->bio_length /= sc->sc_ndisks - 1;
		bp->bio_cmd = BIO_WRITE;
		bp->bio_cflags = 0;
		bp->bio_children = bp->bio_inbed = 0;
		cp = disk->d_consumer;
		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		g_io_request(bp, cp);
		return;
	    }
	case BIO_WRITE:
	    {
		struct g_raid3_disk_sync *sync;
		off_t boffset, moffset;
		void *data;
		int i;

		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
			g_raid3_event_send(disk,
			    G_RAID3_DISK_STATE_DISCONNECTED,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		sync = &disk->d_sync;
		if (sync->ds_offset == sc->sc_mediasize / (sc->sc_ndisks - 1) ||
		    sync->ds_consumer == NULL ||
		    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
			/* Don't send more synchronization requests. */
			sync->ds_inflight--;
			if (sync->ds_bios != NULL) {
				/* bio_caller1 holds this bio's slot index. */
				i = (int)(uintptr_t)bp->bio_caller1;
				sync->ds_bios[i] = NULL;
			}
			free(bp->bio_data, M_RAID3);
			g_destroy_bio(bp);
			if (sync->ds_inflight > 0)
				return;
			if (sync->ds_consumer == NULL ||
			    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				return;
			}
			/*
			 * Disk up-to-date, activate it.
			 */
			g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		}

		/* Send next synchronization request. */
		data = bp->bio_data;
		bzero(bp, sizeof(*bp));
		bp->bio_cmd = BIO_READ;
		bp->bio_offset = sync->ds_offset * (sc->sc_ndisks - 1);
		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
		sync->ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
		bp->bio_done = g_raid3_sync_done;
		bp->bio_data = data;
		bp->bio_from = sync->ds_consumer;
		bp->bio_to = sc->sc_provider;
		G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
		sync->ds_consumer->index++;
		/*
		 * Delay the request if it is colliding with a regular request.
		 */
		if (g_raid3_regular_collision(sc, bp))
			g_raid3_sync_delay(sc, bp);
		else
			g_io_request(bp, sync->ds_consumer);

		/* Release delayed requests if possible. */
		g_raid3_regular_release(sc);

		/* Find the smallest offset. */
		moffset = sc->sc_mediasize;
		for (i = 0; i < g_raid3_syncreqs; i++) {
			bp = sync->ds_bios[i];
			boffset = bp->bio_offset;
			if (bp->bio_cmd == BIO_WRITE)
				boffset *= sc->sc_ndisks - 1;
			if (boffset < moffset)
				moffset = boffset;
		}
		if (sync->ds_offset_done + (MAXPHYS * 100) < moffset) {
			/* Update offset_done on every 100 blocks. */
			sync->ds_offset_done = moffset;
			g_raid3_update_metadata(disk);
		}
		return;
	    }
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
		    bp->bio_cmd, sc->sc_name));
		break;
	}
}

/*
 * Split a regular request into per-component child requests and issue
 * them.  (Definition continues beyond this view.)
 */
static int
g_raid3_register_request(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp, *tmpbp;
	off_t offset, length;
	u_int n, ndisks;
	int round_robin, verify;

	ndisks = 0;
	sc = pbp->bio_to->geom->softc;
	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
	    sc->sc_syncdisk == NULL) {
		g_io_deliver(pbp, EIO);
		return (0);
	}
	g_raid3_init_bio(pbp);
	length = pbp->bio_length / (sc->sc_ndisks - 1);
	offset = pbp->bio_offset / (sc->sc_ndisks - 1);
	round_robin = verify = 0;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
			verify = 1;
			ndisks = sc->sc_ndisks;
		} else {
			verify = 0;
			ndisks = sc->sc_ndisks - 1;
		}
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			round_robin = 1;
		} else {
			round_robin = 0;
		}
		KASSERT(!round_robin || !verify,
		    ("ROUND-ROBIN and VERIFY are mutually exclusive."));
		pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		/*
		 * Delay the request if it is colliding with a synchronization
		 * request.
1822 */ 1823 if (g_raid3_sync_collision(sc, pbp)) { 1824 g_raid3_regular_delay(sc, pbp); 1825 return (0); 1826 } 1827 1828 if (sc->sc_idle) 1829 g_raid3_unidle(sc); 1830 else 1831 sc->sc_last_write = time_uptime; 1832 1833 ndisks = sc->sc_ndisks; 1834 break; 1835 } 1836 for (n = 0; n < ndisks; n++) { 1837 disk = &sc->sc_disks[n]; 1838 cbp = g_raid3_clone_bio(sc, pbp); 1839 if (cbp == NULL) { 1840 while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) 1841 g_raid3_destroy_bio(sc, cbp); 1842 /* 1843 * To prevent deadlock, we must run back up 1844 * with the ENOMEM for failed requests of any 1845 * of our consumers. Our own sync requests 1846 * can stick around, as they are finite. 1847 */ 1848 if ((pbp->bio_cflags & 1849 G_RAID3_BIO_CFLAG_REGULAR) != 0) { 1850 g_io_deliver(pbp, ENOMEM); 1851 return (0); 1852 } 1853 return (ENOMEM); 1854 } 1855 cbp->bio_offset = offset; 1856 cbp->bio_length = length; 1857 cbp->bio_done = g_raid3_done; 1858 switch (pbp->bio_cmd) { 1859 case BIO_READ: 1860 if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { 1861 /* 1862 * Replace invalid component with the parity 1863 * component. 1864 */ 1865 disk = &sc->sc_disks[sc->sc_ndisks - 1]; 1866 cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; 1867 pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; 1868 } else if (round_robin && 1869 disk->d_no == sc->sc_round_robin) { 1870 /* 1871 * In round-robin mode skip one data component 1872 * and use parity component when reading. 
1873 */ 1874 pbp->bio_driver2 = disk; 1875 disk = &sc->sc_disks[sc->sc_ndisks - 1]; 1876 cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; 1877 sc->sc_round_robin++; 1878 round_robin = 0; 1879 } else if (verify && disk->d_no == sc->sc_ndisks - 1) { 1880 cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; 1881 } 1882 break; 1883 case BIO_WRITE: 1884 case BIO_DELETE: 1885 if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || 1886 disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { 1887 if (n == ndisks - 1) { 1888 /* 1889 * Active parity component, mark it as such. 1890 */ 1891 cbp->bio_cflags |= 1892 G_RAID3_BIO_CFLAG_PARITY; 1893 } 1894 } else { 1895 pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; 1896 if (n == ndisks - 1) { 1897 /* 1898 * Parity component is not connected, 1899 * so destroy its request. 1900 */ 1901 pbp->bio_pflags |= 1902 G_RAID3_BIO_PFLAG_NOPARITY; 1903 g_raid3_destroy_bio(sc, cbp); 1904 cbp = NULL; 1905 } else { 1906 cbp->bio_cflags |= 1907 G_RAID3_BIO_CFLAG_NODISK; 1908 disk = NULL; 1909 } 1910 } 1911 break; 1912 } 1913 if (cbp != NULL) 1914 cbp->bio_caller2 = disk; 1915 } 1916 switch (pbp->bio_cmd) { 1917 case BIO_READ: 1918 if (round_robin) { 1919 /* 1920 * If we are in round-robin mode and 'round_robin' is 1921 * still 1, it means, that we skipped parity component 1922 * for this read and must reset sc_round_robin field. 1923 */ 1924 sc->sc_round_robin = 0; 1925 } 1926 G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { 1927 disk = cbp->bio_caller2; 1928 cp = disk->d_consumer; 1929 cbp->bio_to = cp->provider; 1930 G_RAID3_LOGREQ(3, cbp, "Sending request."); 1931 KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, 1932 ("Consumer %s not opened (r%dw%de%d).", 1933 cp->provider->name, cp->acr, cp->acw, cp->ace)); 1934 cp->index++; 1935 g_io_request(cbp, cp); 1936 } 1937 break; 1938 case BIO_WRITE: 1939 case BIO_DELETE: 1940 /* 1941 * Put request onto inflight queue, so we can check if new 1942 * synchronization requests don't collide with it. 
1943 */ 1944 bioq_insert_tail(&sc->sc_inflight, pbp); 1945 1946 /* 1947 * Bump syncid on first write. 1948 */ 1949 if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) { 1950 sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID; 1951 g_raid3_bump_syncid(sc); 1952 } 1953 g_raid3_scatter(pbp); 1954 break; 1955 } 1956 return (0); 1957} 1958 1959static int 1960g_raid3_can_destroy(struct g_raid3_softc *sc) 1961{ 1962 struct g_geom *gp; 1963 struct g_consumer *cp; 1964 1965 g_topology_assert(); 1966 gp = sc->sc_geom; 1967 if (gp->softc == NULL) 1968 return (1); 1969 LIST_FOREACH(cp, &gp->consumer, consumer) { 1970 if (g_raid3_is_busy(sc, cp)) 1971 return (0); 1972 } 1973 gp = sc->sc_sync.ds_geom; 1974 LIST_FOREACH(cp, &gp->consumer, consumer) { 1975 if (g_raid3_is_busy(sc, cp)) 1976 return (0); 1977 } 1978 G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.", 1979 sc->sc_name); 1980 return (1); 1981} 1982 1983static int 1984g_raid3_try_destroy(struct g_raid3_softc *sc) 1985{ 1986 1987 g_topology_assert_not(); 1988 sx_assert(&sc->sc_lock, SX_XLOCKED); 1989 1990 if (sc->sc_rootmount != NULL) { 1991 G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, 1992 sc->sc_rootmount); 1993 root_mount_rel(sc->sc_rootmount); 1994 sc->sc_rootmount = NULL; 1995 } 1996 1997 g_topology_lock(); 1998 if (!g_raid3_can_destroy(sc)) { 1999 g_topology_unlock(); 2000 return (0); 2001 } 2002 sc->sc_geom->softc = NULL; 2003 sc->sc_sync.ds_geom->softc = NULL; 2004 if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) { 2005 g_topology_unlock(); 2006 G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, 2007 &sc->sc_worker); 2008 /* Unlock sc_lock here, as it can be destroyed after wakeup. */ 2009 sx_xunlock(&sc->sc_lock); 2010 wakeup(&sc->sc_worker); 2011 sc->sc_worker = NULL; 2012 } else { 2013 g_topology_unlock(); 2014 g_raid3_destroy_device(sc); 2015 free(sc->sc_disks, M_RAID3); 2016 free(sc, M_RAID3); 2017 } 2018 return (1); 2019} 2020 2021/* 2022 * Worker thread. 
 */
/*
 * Per-device worker: handles state-change events first (they take priority
 * over I/O), then regular and synchronization requests from sc_queue.
 * Runs with sc_lock held except across the MSLEEP()s; exits through
 * kproc_exit() once the device has been destroyed.
 */
static void
g_raid3_worker(void *arg)
{
	struct g_raid3_softc *sc;
	struct g_raid3_event *ep;
	struct bio *bp;
	int timeout;

	sc = arg;
	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	sx_xlock(&sc->sc_lock);
	for (;;) {
		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
		/*
		 * First take a look at events.
		 * This is important to handle events before any I/O requests.
		 */
		ep = g_raid3_event_get(sc);
		if (ep != NULL) {
			g_raid3_event_remove(sc, ep);
			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
				/* Update only device status. */
				G_RAID3_DEBUG(3,
				    "Running event for device %s.",
				    sc->sc_name);
				ep->e_error = 0;
				g_raid3_update_device(sc, 1);
			} else {
				/* Update disk status. */
				G_RAID3_DEBUG(3, "Running event for disk %s.",
				    g_raid3_get_diskname(ep->e_disk));
				ep->e_error = g_raid3_update_disk(ep->e_disk,
				    ep->e_state);
				if (ep->e_error == 0)
					g_raid3_update_device(sc, 0);
			}
			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
				KASSERT(ep->e_error == 0,
				    ("Error cannot be handled."));
				g_raid3_event_free(ep);
			} else {
				/* A thread waits for this event; wake it up. */
				ep->e_flags |= G_RAID3_EVENT_DONE;
				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
				    ep);
				mtx_lock(&sc->sc_events_mtx);
				wakeup(ep);
				mtx_unlock(&sc->sc_events_mtx);
			}
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				if (g_raid3_try_destroy(sc)) {
					curthread->td_pflags &= ~TDP_GEOM;
					G_RAID3_DEBUG(1, "Thread exiting.");
					kproc_exit(0);
				}
			}
			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
			continue;
		}
		/*
		 * Check if we can mark array as CLEAN and if we can't take
		 * how much seconds should we wait.
		 */
		timeout = g_raid3_idle(sc, -1);
		/*
		 * Now I/O requests.
		 */
		/* Get first request from the queue. */
		mtx_lock(&sc->sc_queue_mtx);
		bp = bioq_first(&sc->sc_queue);
		if (bp == NULL) {
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				mtx_unlock(&sc->sc_queue_mtx);
				if (g_raid3_try_destroy(sc)) {
					curthread->td_pflags &= ~TDP_GEOM;
					G_RAID3_DEBUG(1, "Thread exiting.");
					kproc_exit(0);
				}
				mtx_lock(&sc->sc_queue_mtx);
			}
			sx_xunlock(&sc->sc_lock);
			/*
			 * XXX: We can miss an event here, because an event
			 * can be added without sx-device-lock and without
			 * mtx-queue-lock. Maybe I should just stop using
			 * dedicated mutex for events synchronization and
			 * stick with the queue lock?
			 * The event will hang here until next I/O request
			 * or next event is received.
			 */
			/* PDROP releases sc_queue_mtx while sleeping. */
			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1",
			    timeout * hz);
			sx_xlock(&sc->sc_lock);
			G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
			continue;
		}
process:
		bioq_remove(&sc->sc_queue, bp);
		mtx_unlock(&sc->sc_queue_mtx);

		if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
		    (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
			g_raid3_sync_request(bp);	/* READ */
		} else if (bp->bio_to != sc->sc_provider) {
			if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
				g_raid3_regular_request(bp);
			else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0)
				g_raid3_sync_request(bp);	/* WRITE */
			else {
				KASSERT(0,
				    ("Invalid request cflags=0x%hhx to=%s.",
				    bp->bio_cflags, bp->bio_to->name));
			}
		} else if (g_raid3_register_request(bp) != 0) {
			mtx_lock(&sc->sc_queue_mtx);
			bioq_insert_head(&sc->sc_queue, bp);
			/*
			 * We are short in memory, let see if there are finished
			 * request we can free.
			 */
			TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
				if (bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR)
					goto process;
			}
			/*
			 * No finished regular request, so at least keep
			 * synchronization running.
			 */
			TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
				if (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC)
					goto process;
			}
			sx_xunlock(&sc->sc_lock);
			MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, PRIBIO | PDROP,
			    "r3:lowmem", hz / 10);
			sx_xlock(&sc->sc_lock);
		}
		G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
	}
}

/*
 * Keep a component's on-disk DIRTY flag in step with the device idle state
 * and log the transition.  No-op when NOFAILSYNC is set.
 */
static void
g_raid3_update_idle(struct g_raid3_softc *sc, struct g_raid3_disk *disk)
{

	sx_assert(&sc->sc_lock, SX_LOCKED);
	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
		return;
	if (!sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
		    g_raid3_get_diskname(disk), sc->sc_name);
		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
	} else if (sc->sc_idle &&
	    (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
		    g_raid3_get_diskname(disk), sc->sc_name);
		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
	}
}

/*
 * Start synchronization of the (single) component in SYNCHRONIZING state:
 * attach a consumer of the sync geom to our own provider, allocate
 * g_raid3_syncreqs read bios and fire them off.
 */
static void
g_raid3_sync_start(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	struct bio *bp;
	int error;
	u_int n;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
	    sc->sc_name, sc->sc_state));
	/* Find the component that needs synchronization. */
	disk = NULL;
	for (n = 0; n < sc->sc_ndisks; n++) {
		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
			continue;
		disk =
		    &sc->sc_disks[n];
		break;
	}
	if (disk == NULL)
		return;

	/* Drop sc_lock while manipulating the GEOM topology. */
	sx_xunlock(&sc->sc_lock);
	g_topology_lock();
	cp = g_new_consumer(sc->sc_sync.ds_geom);
	error = g_attach(cp, sc->sc_provider);
	KASSERT(error == 0,
	    ("Cannot attach to %s (error=%d).", sc->sc_name, error));
	error = g_access(cp, 1, 0, 0);
	KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
	g_topology_unlock();
	sx_xlock(&sc->sc_lock);

	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
	    g_raid3_get_diskname(disk));
	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) == 0)
		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
	KASSERT(disk->d_sync.ds_consumer == NULL,
	    ("Sync consumer already exists (device=%s, disk=%s).",
	    sc->sc_name, g_raid3_get_diskname(disk)));

	disk->d_sync.ds_consumer = cp;
	disk->d_sync.ds_consumer->private = disk;
	disk->d_sync.ds_consumer->index = 0;
	sc->sc_syncdisk = disk;

	/*
	 * Allocate memory for synchronization bios and initialize them.
	 */
	disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_raid3_syncreqs,
	    M_RAID3, M_WAITOK);
	for (n = 0; n < g_raid3_syncreqs; n++) {
		bp = g_alloc_bio();
		disk->d_sync.ds_bios[n] = bp;
		bp->bio_parent = NULL;
		bp->bio_cmd = BIO_READ;
		bp->bio_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
		bp->bio_cflags = 0;
		/* ds_offset is per-component; bio_offset is provider-wide. */
		bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
		disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
		bp->bio_done = g_raid3_sync_done;
		bp->bio_from = disk->d_sync.ds_consumer;
		bp->bio_to = sc->sc_provider;
		/* Remember this bio's slot index for later lookup. */
		bp->bio_caller1 = (void *)(uintptr_t)n;
	}

	/* Set the number of in-flight synchronization requests. */
	disk->d_sync.ds_inflight = g_raid3_syncreqs;

	/*
	 * Fire off first synchronization requests.
	 */
	for (n = 0; n < g_raid3_syncreqs; n++) {
		bp = disk->d_sync.ds_bios[n];
		G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
		disk->d_sync.ds_consumer->index++;
		/*
		 * Delay the request if it is colliding with a regular request.
		 */
		if (g_raid3_regular_collision(sc, bp))
			g_raid3_sync_delay(sc, bp);
		else
			g_io_request(bp, disk->d_sync.ds_consumer);
	}
}

/*
 * Stop synchronization process.
 * type: 0 - synchronization finished
 *       1 - synchronization stopped
 */
static void
g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
{
	struct g_raid3_disk *disk;
	struct g_consumer *cp;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_LOCKED);

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	disk = sc->sc_syncdisk;
	sc->sc_syncdisk = NULL;
	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
	    g_raid3_disk_state2str(disk->d_state)));
	if (disk->d_sync.ds_consumer == NULL)
		return;

	if (type == 0) {
		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
		    sc->sc_name, g_raid3_get_diskname(disk));
	} else /* if (type == 1) */ {
		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
		    sc->sc_name, g_raid3_get_diskname(disk));
	}
	free(disk->d_sync.ds_bios, M_RAID3);
	disk->d_sync.ds_bios = NULL;
	cp = disk->d_sync.ds_consumer;
	disk->d_sync.ds_consumer = NULL;
	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
	sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock.
 */
	g_topology_lock();
	g_raid3_kill_consumer(sc, cp);
	g_topology_unlock();
	sx_xlock(&sc->sc_lock);
}

/*
 * Create and announce the raid3/<name> provider.  stripesize/stripeoffset
 * are taken from the largest-striped component and scaled by the number of
 * data components (ndisks - 1).  When the device comes up DEGRADED the
 * rebuild is started immediately.
 */
static void
g_raid3_launch_provider(struct g_raid3_softc *sc)
{
	struct g_provider *pp;
	struct g_raid3_disk *disk;
	int n;

	sx_assert(&sc->sc_lock, SX_LOCKED);

	g_topology_lock();
	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
	pp->mediasize = sc->sc_mediasize;
	pp->sectorsize = sc->sc_sectorsize;
	pp->stripesize = 0;
	pp->stripeoffset = 0;
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_consumer && disk->d_consumer->provider &&
		    disk->d_consumer->provider->stripesize > pp->stripesize) {
			pp->stripesize = disk->d_consumer->provider->stripesize;
			pp->stripeoffset = disk->d_consumer->provider->stripeoffset;
		}
	}
	pp->stripesize *= sc->sc_ndisks - 1;
	pp->stripeoffset *= sc->sc_ndisks - 1;
	sc->sc_provider = pp;
	g_error_provider(pp, 0);
	g_topology_unlock();
	G_RAID3_DEBUG(0, "Device %s launched (%u/%u).", pp->name,
	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE), sc->sc_ndisks);

	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
		g_raid3_sync_start(sc);
}

/*
 * Withdraw the provider: fail all queued requests with ENXIO, orphan the
 * provider, and stop any synchronization that was using it.
 */
static void
g_raid3_destroy_provider(struct g_raid3_softc *sc)
{
	struct bio *bp;

	g_topology_assert_not();
	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
	    sc->sc_name));

	g_topology_lock();
	g_error_provider(sc->sc_provider, ENXIO);
	mtx_lock(&sc->sc_queue_mtx);
	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
		bioq_remove(&sc->sc_queue, bp);
		g_io_deliver(bp, ENXIO);
	}
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
	    sc->sc_provider->name);
	sc->sc_provider->flags |= G_PF_WITHER;
	g_orphan_provider(sc->sc_provider, ENXIO);
	g_topology_unlock();
	sc->sc_provider = NULL;
	if (sc->sc_syncdisk != NULL)
		g_raid3_sync_stop(sc, 1);
}

/*
 * Callout handler: force the device to start once the startup timeout has
 * expired without all components arriving.
 */
static void
g_raid3_go(void *arg)
{
	struct g_raid3_softc *sc;

	sc = arg;
	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
	g_raid3_event_send(sc, 0,
	    G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
}

/*
 * Decide the target state for a connecting disk by comparing its stored
 * syncid against the device syncid.  May destroy the disk and return
 * G_RAID3_DISK_STATE_NONE when the disk is fresher than the running device.
 */
static u_int
g_raid3_determine_state(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;
	u_int state;

	sc = disk->d_softc;
	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
		if ((disk->d_flags &
		    G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
			/* Disk does not need synchronization. */
			state = G_RAID3_DISK_STATE_ACTIVE;
		} else {
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
			    (disk->d_flags &
			    G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
				/*
				 * We can start synchronization from
				 * the stored offset.
				 */
				state = G_RAID3_DISK_STATE_SYNCHRONIZING;
			} else {
				state = G_RAID3_DISK_STATE_STALE;
			}
		}
	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
		/*
		 * Reset all synchronization data for this disk,
		 * because if it even was synchronized, it was
		 * synchronized to disks with different syncid.
		 */
		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		disk->d_sync.ds_syncid = sc->sc_syncid;
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
		    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
			state = G_RAID3_DISK_STATE_SYNCHRONIZING;
		} else {
			state = G_RAID3_DISK_STATE_STALE;
		}
	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
		/*
		 * Not good, NOT GOOD!
		 * It means that device was started on stale disks
		 * and more fresh disk just arrive.
		 * If there were writes, device is broken, sorry.
		 * I think the best choice here is don't touch
		 * this disk and inform the user loudly.
		 */
		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
		    "disk (%s) arrives!! It will not be connected to the "
		    "running device.", sc->sc_name,
		    g_raid3_get_diskname(disk));
		g_raid3_destroy_disk(disk);
		state = G_RAID3_DISK_STATE_NONE;
		/* Return immediately, because disk was destroyed. */
		return (state);
	}
	G_RAID3_DEBUG(3, "State for %s disk: %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
	return (state);
}

/*
 * Update device state.  In STARTING state this elects the generation to
 * run with (biggest genid/syncid), discards broken components and picks
 * COMPLETE or DEGRADED; in the running states it reacts to component
 * arrivals/departures, launching or destroying the provider as needed.
 * 'force' is non-zero when the startup timeout fired with one disk missing.
 */
static void
g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
{
	struct g_raid3_disk *disk;
	u_int state;

	sx_assert(&sc->sc_lock, SX_XLOCKED);

	switch (sc->sc_state) {
	case G_RAID3_DEVICE_STATE_STARTING:
	    {
		u_int n, ndirty, ndisks, genid, syncid;

		KASSERT(sc->sc_provider == NULL,
		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
		/*
		 * Are we ready? We are, if all disks are connected or
		 * one disk is missing and 'force' is true.
		 */
		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
			if (!force)
				callout_drain(&sc->sc_callout);
		} else {
			if (force) {
				/*
				 * Timeout expired, so destroy device.
				 */
				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
				G_RAID3_DEBUG(1, "root_mount_rel[%u] %p",
				    __LINE__, sc->sc_rootmount);
				root_mount_rel(sc->sc_rootmount);
				sc->sc_rootmount = NULL;
			}
			return;
		}

		/*
		 * Find the biggest genid.
		 */
		genid = 0;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if (disk->d_genid > genid)
				genid = disk->d_genid;
		}
		sc->sc_genid = genid;
		/*
		 * Remove all disks without the biggest genid.
		 */
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if (disk->d_genid < genid) {
				G_RAID3_DEBUG(0,
				    "Component %s (device %s) broken, skipping.",
				    g_raid3_get_diskname(disk), sc->sc_name);
				g_raid3_destroy_disk(disk);
			}
		}

		/*
		 * There must be at least 'sc->sc_ndisks - 1' components
		 * with the same syncid and without SYNCHRONIZING flag.
		 */

		/*
		 * Find the biggest syncid, number of valid components and
		 * number of dirty components.
		 */
		ndirty = ndisks = syncid = 0;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
				ndirty++;
			if (disk->d_sync.ds_syncid > syncid) {
				/* Newer syncid found; restart the count. */
				syncid = disk->d_sync.ds_syncid;
				ndisks = 0;
			} else if (disk->d_sync.ds_syncid < syncid) {
				continue;
			}
			if ((disk->d_flags &
			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
				continue;
			}
			ndisks++;
		}
		/*
		 * Do we have enough valid components?
		 */
		if (ndisks + 1 < sc->sc_ndisks) {
			G_RAID3_DEBUG(0,
			    "Device %s is broken, too few valid components.",
			    sc->sc_name);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			return;
		}
		/*
		 * If there is one DIRTY component and all disks are present,
		 * mark it for synchronization. If there is more than one DIRTY
		 * component, mark parity component for synchronization.
		 */
		if (ndisks == sc->sc_ndisks && ndirty == 1) {
			for (n = 0; n < sc->sc_ndisks; n++) {
				disk = &sc->sc_disks[n];
				if ((disk->d_flags &
				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
					continue;
				}
				disk->d_flags |=
				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
			}
		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
			disk = &sc->sc_disks[sc->sc_ndisks - 1];
			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		}

		sc->sc_syncid = syncid;
		if (force) {
			/* Remember to bump syncid on first write. */
			sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
		}
		if (ndisks == sc->sc_ndisks)
			state = G_RAID3_DEVICE_STATE_COMPLETE;
		else /* if (ndisks == sc->sc_ndisks - 1) */
			state = G_RAID3_DEVICE_STATE_DEGRADED;
		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
		    g_raid3_device_state2str(state));
		sc->sc_state = state;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			state = g_raid3_determine_state(disk);
			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
			if (state == G_RAID3_DISK_STATE_STALE)
				sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
		}
		break;
	    }
	case G_RAID3_DEVICE_STATE_DEGRADED:
		/*
		 * Genid need to be bumped immediately, so do it here.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
			g_raid3_bump_genid(sc);
		}

		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
			return;
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
		    sc->sc_ndisks - 1) {
			if (sc->sc_provider != NULL)
				g_raid3_destroy_provider(sc);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			return;
		}
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
		    sc->sc_ndisks) {
			state = G_RAID3_DEVICE_STATE_COMPLETE;
			G_RAID3_DEBUG(1,
			    "Device %s state changed from %s to %s.",
			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
			    g_raid3_device_state2str(state));
			sc->sc_state = state;
		}
		if (sc->sc_provider == NULL)
			g_raid3_launch_provider(sc);
		if (sc->sc_rootmount != NULL) {
			G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
			    sc->sc_rootmount);
			root_mount_rel(sc->sc_rootmount);
			sc->sc_rootmount = NULL;
		}
		break;
	case G_RAID3_DEVICE_STATE_COMPLETE:
		/*
		 * Genid need to be bumped immediately, so do it here.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
			g_raid3_bump_genid(sc);
		}

		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
			return;
		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
		    sc->sc_ndisks - 1,
		    ("Too few ACTIVE components in COMPLETE state (device %s).",
		    sc->sc_name));
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
		    sc->sc_ndisks - 1) {
			state = G_RAID3_DEVICE_STATE_DEGRADED;
			G_RAID3_DEBUG(1,
			    "Device %s state changed from %s to %s.",
			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
			    g_raid3_device_state2str(state));
			sc->sc_state = state;
		}
		if (sc->sc_provider == NULL)
			g_raid3_launch_provider(sc);
		if (sc->sc_rootmount != NULL) {
			G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
			    sc->sc_rootmount);
			root_mount_rel(sc->sc_rootmount);
			sc->sc_rootmount = NULL;
		}
		break;
	default:
		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state)));
		break;
	}
}

/*
 * Update disk state and device state if needed.
 */
#define	DISK_STATE_CHANGED()	G_RAID3_DEBUG(1,			\
	"Disk %s state changed from %s to %s (device %s).",		\
	g_raid3_get_diskname(disk),					\
	g_raid3_disk_state2str(disk->d_state),				\
	g_raid3_disk_state2str(state), sc->sc_name)
static int
g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
{
	struct g_raid3_softc *sc;

	sc = disk->d_softc;
	sx_assert(&sc->sc_lock, SX_XLOCKED);

again:
	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
	    g_raid3_disk_state2str(state));
	switch (state) {
	case G_RAID3_DISK_STATE_NEW:
		/*
		 * Possible scenarios:
		 * 1. New disk arrive.
		 */
		/* Previous state should be NONE.
		 */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_state = state;
		G_RAID3_DEBUG(1, "Device %s: provider %s detected.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
			break;
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/* Device is running; decide the disk's final state now. */
		state = g_raid3_determine_state(disk);
		if (state != G_RAID3_DISK_STATE_NONE)
			goto again;
		break;
	case G_RAID3_DISK_STATE_ACTIVE:
		/*
		 * Possible scenarios:
		 * 1. New disk does not need synchronization.
		 * 2. Synchronization process finished successfully.
		 */
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/* Previous state should be NEW or SYNCHRONIZING.
		 */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
			g_raid3_sync_stop(sc, 0);
		}
		disk->d_state = state;
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		g_raid3_update_idle(sc, disk);
		g_raid3_update_metadata(disk);
		G_RAID3_DEBUG(1, "Device %s: provider %s activated.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		break;
	case G_RAID3_DISK_STATE_STALE:
		/*
		 * Possible scenarios:
		 * 1. Stale disk was connected.
		 */
		/* Previous state should be NEW. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/*
		 * STALE state is only possible if device is marked
		 * NOAUTOSYNC.
		 */
		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		disk->d_state = state;
		g_raid3_update_metadata(disk);
		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		break;
	case G_RAID3_DISK_STATE_SYNCHRONIZING:
		/*
		 * Possible scenarios:
		 * 1. Disk which needs synchronization was connected.
		 */
		/* Previous state should be NEW. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		disk->d_state = state;
		if (sc->sc_provider != NULL) {
			g_raid3_sync_start(sc);
			g_raid3_update_metadata(disk);
		}
		break;
	case G_RAID3_DISK_STATE_DISCONNECTED:
		/*
		 * Possible scenarios:
		 * 1. Device wasn't running yet, but disk disappear.
		 * 2. Disk was active and disappear.
		 * 3. Disk disappear during synchronization process.
		 */
		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			/*
			 * Previous state should be ACTIVE, STALE or
			 * SYNCHRONIZING.
			 */
			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
			    ("Wrong disk state (%s, %s).",
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
			/* Previous state should be NEW. */
			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
			    ("Wrong disk state (%s, %s).",
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
			/*
			 * Reset bumping syncid if disk disappeared in STARTING
			 * state.
			 */
			if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0)
				sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
#ifdef	INVARIANTS
		} else {
			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
			    sc->sc_name,
			    g_raid3_device_state2str(sc->sc_state),
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
#endif
		}
		DISK_STATE_CHANGED();
		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
		    sc->sc_name, g_raid3_get_diskname(disk));

		g_raid3_destroy_disk(disk);
		break;
	default:
		KASSERT(1 == 0, ("Unknown state (%u).", state));
		break;
	}
	return (0);
}
#undef	DISK_STATE_CHANGED

/*
 * Read and decode the RAID3 metadata from the last sector of the given
 * consumer's provider.  Returns 0 on success or an errno on failure;
 * metadata validation failures map to EINVAL.  Requires the GEOM topology
 * lock (it is temporarily dropped around the actual read).
 */
int
g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();

	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	g_topology_unlock();
	/* Metadata are stored on last sector. */
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
	    &error);
	g_topology_lock();
	g_access(cp, -1, 0, 0);
	if (buf == NULL) {
		G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
		    cp->provider->name, error);
		return (error);
	}

	/* Decode metadata. */
	error = raid3_metadata_decode(buf, md);
	g_free(buf);
	if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
		return (EINVAL);
	if (md->md_version > G_RAID3_VERSION) {
		G_RAID3_DEBUG(0,
		    "Kernel module is too old to handle metadata from %s.",
		    cp->provider->name);
		return (EINVAL);
	}
	/* Checked after the magic so garbage sectors fail on magic first. */
	if (error != 0) {
		G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
		    cp->provider->name);
		return (error);
	}
	if (md->md_sectorsize > MAXPHYS) {
		G_RAID3_DEBUG(0, "The blocksize is too big.");
		return (EINVAL);
	}

	return (0);
}

/*
 * Sanity-check metadata read from a component against the running device.
 * Returns 0 when the component may be connected, or an errno describing
 * why it must be skipped.
 */
static int
g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md)
{

	if (md->md_no >= sc->sc_ndisks) {
		G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
		    pp->name, md->md_no);
		return (EINVAL);
	}
	if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
		G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
		    pp->name, md->md_no);
		return (EEXIST);
	}
	if (md->md_all != sc->sc_ndisks) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_all", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_mediasize % md->md_sectorsize) != 0) {
		G_RAID3_DEBUG(1, "Invalid metadata (mediasize %% sectorsize != "
		    "0) on disk %s (device %s), skipping.", pp->name,
		    sc->sc_name);
		return (EINVAL);
	}
	if (md->md_mediasize != sc->sc_mediasize) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
"md_mediasize", pp->name, sc->sc_name); 2957 return (EINVAL); 2958 } 2959 if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) { 2960 G_RAID3_DEBUG(1, 2961 "Invalid '%s' field on disk %s (device %s), skipping.", 2962 "md_mediasize", pp->name, sc->sc_name); 2963 return (EINVAL); 2964 } 2965 if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) { 2966 G_RAID3_DEBUG(1, 2967 "Invalid size of disk %s (device %s), skipping.", pp->name, 2968 sc->sc_name); 2969 return (EINVAL); 2970 } 2971 if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) { 2972 G_RAID3_DEBUG(1, 2973 "Invalid '%s' field on disk %s (device %s), skipping.", 2974 "md_sectorsize", pp->name, sc->sc_name); 2975 return (EINVAL); 2976 } 2977 if (md->md_sectorsize != sc->sc_sectorsize) { 2978 G_RAID3_DEBUG(1, 2979 "Invalid '%s' field on disk %s (device %s), skipping.", 2980 "md_sectorsize", pp->name, sc->sc_name); 2981 return (EINVAL); 2982 } 2983 if ((sc->sc_sectorsize % pp->sectorsize) != 0) { 2984 G_RAID3_DEBUG(1, 2985 "Invalid sector size of disk %s (device %s), skipping.", 2986 pp->name, sc->sc_name); 2987 return (EINVAL); 2988 } 2989 if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) { 2990 G_RAID3_DEBUG(1, 2991 "Invalid device flags on disk %s (device %s), skipping.", 2992 pp->name, sc->sc_name); 2993 return (EINVAL); 2994 } 2995 if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && 2996 (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) { 2997 /* 2998 * VERIFY and ROUND-ROBIN options are mutally exclusive. 
2999 */ 3000 G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on " 3001 "disk %s (device %s), skipping.", pp->name, sc->sc_name); 3002 return (EINVAL); 3003 } 3004 if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) { 3005 G_RAID3_DEBUG(1, 3006 "Invalid disk flags on disk %s (device %s), skipping.", 3007 pp->name, sc->sc_name); 3008 return (EINVAL); 3009 } 3010 return (0); 3011} 3012 3013int 3014g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp, 3015 struct g_raid3_metadata *md) 3016{ 3017 struct g_raid3_disk *disk; 3018 int error; 3019 3020 g_topology_assert_not(); 3021 G_RAID3_DEBUG(2, "Adding disk %s.", pp->name); 3022 3023 error = g_raid3_check_metadata(sc, pp, md); 3024 if (error != 0) 3025 return (error); 3026 if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING && 3027 md->md_genid < sc->sc_genid) { 3028 G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.", 3029 pp->name, sc->sc_name); 3030 return (EINVAL); 3031 } 3032 disk = g_raid3_init_disk(sc, pp, md, &error); 3033 if (disk == NULL) 3034 return (error); 3035 error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW, 3036 G_RAID3_EVENT_WAIT); 3037 if (error != 0) 3038 return (error); 3039 if (md->md_version < G_RAID3_VERSION) { 3040 G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).", 3041 pp->name, md->md_version, G_RAID3_VERSION); 3042 g_raid3_update_metadata(disk); 3043 } 3044 return (0); 3045} 3046 3047static void 3048g_raid3_destroy_delayed(void *arg, int flag) 3049{ 3050 struct g_raid3_softc *sc; 3051 int error; 3052 3053 if (flag == EV_CANCEL) { 3054 G_RAID3_DEBUG(1, "Destroying canceled."); 3055 return; 3056 } 3057 sc = arg; 3058 g_topology_unlock(); 3059 sx_xlock(&sc->sc_lock); 3060 KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) == 0, 3061 ("DESTROY flag set on %s.", sc->sc_name)); 3062 KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0, 3063 ("DESTROYING flag not set on %s.", sc->sc_name)); 3064 G_RAID3_DEBUG(0, "Destroying %s (delayed).", 
sc->sc_name); 3065 error = g_raid3_destroy(sc, G_RAID3_DESTROY_SOFT); 3066 if (error != 0) { 3067 G_RAID3_DEBUG(0, "Cannot destroy %s.", sc->sc_name); 3068 sx_xunlock(&sc->sc_lock); 3069 } 3070 g_topology_lock(); 3071} 3072 3073static int 3074g_raid3_access(struct g_provider *pp, int acr, int acw, int ace) 3075{ 3076 struct g_raid3_softc *sc; 3077 int dcr, dcw, dce, error = 0; 3078 3079 g_topology_assert(); 3080 G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, 3081 acw, ace); 3082 3083 sc = pp->geom->softc; 3084 if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0) 3085 return (0); 3086 KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); 3087 3088 dcr = pp->acr + acr; 3089 dcw = pp->acw + acw; 3090 dce = pp->ace + ace; 3091 3092 g_topology_unlock(); 3093 sx_xlock(&sc->sc_lock); 3094 if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0 || 3095 g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) { 3096 if (acr > 0 || acw > 0 || ace > 0) 3097 error = ENXIO; 3098 goto end; 3099 } 3100 if (dcw == 0 && !sc->sc_idle) 3101 g_raid3_idle(sc, dcw); 3102 if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0) { 3103 if (acr > 0 || acw > 0 || ace > 0) { 3104 error = ENXIO; 3105 goto end; 3106 } 3107 if (dcr == 0 && dcw == 0 && dce == 0) { 3108 g_post_event(g_raid3_destroy_delayed, sc, M_WAITOK, 3109 sc, NULL); 3110 } 3111 } 3112end: 3113 sx_xunlock(&sc->sc_lock); 3114 g_topology_lock(); 3115 return (error); 3116} 3117 3118static struct g_geom * 3119g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md) 3120{ 3121 struct g_raid3_softc *sc; 3122 struct g_geom *gp; 3123 int error, timeout; 3124 u_int n; 3125 3126 g_topology_assert(); 3127 G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); 3128 3129 /* One disk is minimum. */ 3130 if (md->md_all < 1) 3131 return (NULL); 3132 /* 3133 * Action geom. 
3134 */ 3135 gp = g_new_geomf(mp, "%s", md->md_name); 3136 sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO); 3137 sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3, 3138 M_WAITOK | M_ZERO); 3139 gp->start = g_raid3_start; 3140 gp->orphan = g_raid3_orphan; 3141 gp->access = g_raid3_access; 3142 gp->dumpconf = g_raid3_dumpconf; 3143 3144 sc->sc_id = md->md_id; 3145 sc->sc_mediasize = md->md_mediasize; 3146 sc->sc_sectorsize = md->md_sectorsize; 3147 sc->sc_ndisks = md->md_all; 3148 sc->sc_round_robin = 0; 3149 sc->sc_flags = md->md_mflags; 3150 sc->sc_bump_id = 0; 3151 sc->sc_idle = 1; 3152 sc->sc_last_write = time_uptime; 3153 sc->sc_writes = 0; 3154 for (n = 0; n < sc->sc_ndisks; n++) { 3155 sc->sc_disks[n].d_softc = sc; 3156 sc->sc_disks[n].d_no = n; 3157 sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK; 3158 } 3159 sx_init(&sc->sc_lock, "graid3:lock"); 3160 bioq_init(&sc->sc_queue); 3161 mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF); 3162 bioq_init(&sc->sc_regular_delayed); 3163 bioq_init(&sc->sc_inflight); 3164 bioq_init(&sc->sc_sync_delayed); 3165 TAILQ_INIT(&sc->sc_events); 3166 mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF); 3167 callout_init(&sc->sc_callout, CALLOUT_MPSAFE); 3168 sc->sc_state = G_RAID3_DEVICE_STATE_STARTING; 3169 gp->softc = sc; 3170 sc->sc_geom = gp; 3171 sc->sc_provider = NULL; 3172 /* 3173 * Synchronization geom. 
3174 */ 3175 gp = g_new_geomf(mp, "%s.sync", md->md_name); 3176 gp->softc = sc; 3177 gp->orphan = g_raid3_orphan; 3178 sc->sc_sync.ds_geom = gp; 3179 3180 if (!g_raid3_use_malloc) { 3181 sc->sc_zones[G_RAID3_ZONE_64K].sz_zone = uma_zcreate("gr3:64k", 3182 65536, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, 3183 UMA_ALIGN_PTR, 0); 3184 sc->sc_zones[G_RAID3_ZONE_64K].sz_inuse = 0; 3185 sc->sc_zones[G_RAID3_ZONE_64K].sz_max = g_raid3_n64k; 3186 sc->sc_zones[G_RAID3_ZONE_64K].sz_requested = 3187 sc->sc_zones[G_RAID3_ZONE_64K].sz_failed = 0; 3188 sc->sc_zones[G_RAID3_ZONE_16K].sz_zone = uma_zcreate("gr3:16k", 3189 16384, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, 3190 UMA_ALIGN_PTR, 0); 3191 sc->sc_zones[G_RAID3_ZONE_16K].sz_inuse = 0; 3192 sc->sc_zones[G_RAID3_ZONE_16K].sz_max = g_raid3_n16k; 3193 sc->sc_zones[G_RAID3_ZONE_16K].sz_requested = 3194 sc->sc_zones[G_RAID3_ZONE_16K].sz_failed = 0; 3195 sc->sc_zones[G_RAID3_ZONE_4K].sz_zone = uma_zcreate("gr3:4k", 3196 4096, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, 3197 UMA_ALIGN_PTR, 0); 3198 sc->sc_zones[G_RAID3_ZONE_4K].sz_inuse = 0; 3199 sc->sc_zones[G_RAID3_ZONE_4K].sz_max = g_raid3_n4k; 3200 sc->sc_zones[G_RAID3_ZONE_4K].sz_requested = 3201 sc->sc_zones[G_RAID3_ZONE_4K].sz_failed = 0; 3202 } 3203 3204 error = kproc_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0, 3205 "g_raid3 %s", md->md_name); 3206 if (error != 0) { 3207 G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.", 3208 sc->sc_name); 3209 if (!g_raid3_use_malloc) { 3210 uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone); 3211 uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone); 3212 uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone); 3213 } 3214 g_destroy_geom(sc->sc_sync.ds_geom); 3215 mtx_destroy(&sc->sc_events_mtx); 3216 mtx_destroy(&sc->sc_queue_mtx); 3217 sx_destroy(&sc->sc_lock); 3218 g_destroy_geom(sc->sc_geom); 3219 free(sc->sc_disks, M_RAID3); 3220 free(sc, M_RAID3); 3221 return (NULL); 3222 } 3223 3224 G_RAID3_DEBUG(1, "Device 
%s created (%u components, id=%u).", 3225 sc->sc_name, sc->sc_ndisks, sc->sc_id); 3226 3227 sc->sc_rootmount = root_mount_hold("GRAID3"); 3228 G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); 3229 3230 /* 3231 * Run timeout. 3232 */ 3233 timeout = atomic_load_acq_int(&g_raid3_timeout); 3234 callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc); 3235 return (sc->sc_geom); 3236} 3237 3238int 3239g_raid3_destroy(struct g_raid3_softc *sc, int how) 3240{ 3241 struct g_provider *pp; 3242 3243 g_topology_assert_not(); 3244 if (sc == NULL) 3245 return (ENXIO); 3246 sx_assert(&sc->sc_lock, SX_XLOCKED); 3247 3248 pp = sc->sc_provider; 3249 if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { 3250 switch (how) { 3251 case G_RAID3_DESTROY_SOFT: 3252 G_RAID3_DEBUG(1, 3253 "Device %s is still open (r%dw%de%d).", pp->name, 3254 pp->acr, pp->acw, pp->ace); 3255 return (EBUSY); 3256 case G_RAID3_DESTROY_DELAYED: 3257 G_RAID3_DEBUG(1, 3258 "Device %s will be destroyed on last close.", 3259 pp->name); 3260 if (sc->sc_syncdisk != NULL) 3261 g_raid3_sync_stop(sc, 1); 3262 sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROYING; 3263 return (EBUSY); 3264 case G_RAID3_DESTROY_HARD: 3265 G_RAID3_DEBUG(1, "Device %s is still open, so it " 3266 "can't be definitely removed.", pp->name); 3267 break; 3268 } 3269 } 3270 3271 g_topology_lock(); 3272 if (sc->sc_geom->softc == NULL) { 3273 g_topology_unlock(); 3274 return (0); 3275 } 3276 sc->sc_geom->softc = NULL; 3277 sc->sc_sync.ds_geom->softc = NULL; 3278 g_topology_unlock(); 3279 3280 sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; 3281 sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT; 3282 G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); 3283 sx_xunlock(&sc->sc_lock); 3284 mtx_lock(&sc->sc_queue_mtx); 3285 wakeup(sc); 3286 wakeup(&sc->sc_queue); 3287 mtx_unlock(&sc->sc_queue_mtx); 3288 G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker); 3289 while (sc->sc_worker != NULL) 3290 tsleep(&sc->sc_worker, PRIBIO, 
"r3:destroy", hz / 5); 3291 G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker); 3292 sx_xlock(&sc->sc_lock); 3293 g_raid3_destroy_device(sc); 3294 free(sc->sc_disks, M_RAID3); 3295 free(sc, M_RAID3); 3296 return (0); 3297} 3298 3299static void 3300g_raid3_taste_orphan(struct g_consumer *cp) 3301{ 3302 3303 KASSERT(1 == 0, ("%s called while tasting %s.", __func__, 3304 cp->provider->name)); 3305} 3306 3307static struct g_geom * 3308g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) 3309{ 3310 struct g_raid3_metadata md; 3311 struct g_raid3_softc *sc; 3312 struct g_consumer *cp; 3313 struct g_geom *gp; 3314 int error; 3315 3316 g_topology_assert(); 3317 g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); 3318 G_RAID3_DEBUG(2, "Tasting %s.", pp->name); 3319 3320 gp = g_new_geomf(mp, "raid3:taste"); 3321 /* This orphan function should be never called. */ 3322 gp->orphan = g_raid3_taste_orphan; 3323 cp = g_new_consumer(gp); 3324 g_attach(cp, pp); 3325 error = g_raid3_read_metadata(cp, &md); 3326 g_detach(cp); 3327 g_destroy_consumer(cp); 3328 g_destroy_geom(gp); 3329 if (error != 0) 3330 return (NULL); 3331 gp = NULL; 3332 3333 if (md.md_provider[0] != '\0' && 3334 !g_compare_names(md.md_provider, pp->name)) 3335 return (NULL); 3336 if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) 3337 return (NULL); 3338 if (g_raid3_debug >= 2) 3339 raid3_metadata_dump(&md); 3340 3341 /* 3342 * Let's check if device already exists. 
3343 */ 3344 sc = NULL; 3345 LIST_FOREACH(gp, &mp->geom, geom) { 3346 sc = gp->softc; 3347 if (sc == NULL) 3348 continue; 3349 if (sc->sc_sync.ds_geom == gp) 3350 continue; 3351 if (strcmp(md.md_name, sc->sc_name) != 0) 3352 continue; 3353 if (md.md_id != sc->sc_id) { 3354 G_RAID3_DEBUG(0, "Device %s already configured.", 3355 sc->sc_name); 3356 return (NULL); 3357 } 3358 break; 3359 } 3360 if (gp == NULL) { 3361 gp = g_raid3_create(mp, &md); 3362 if (gp == NULL) { 3363 G_RAID3_DEBUG(0, "Cannot create device %s.", 3364 md.md_name); 3365 return (NULL); 3366 } 3367 sc = gp->softc; 3368 } 3369 G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); 3370 g_topology_unlock(); 3371 sx_xlock(&sc->sc_lock); 3372 error = g_raid3_add_disk(sc, pp, &md); 3373 if (error != 0) { 3374 G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).", 3375 pp->name, gp->name, error); 3376 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) == 3377 sc->sc_ndisks) { 3378 g_cancel_event(sc); 3379 g_raid3_destroy(sc, G_RAID3_DESTROY_HARD); 3380 g_topology_lock(); 3381 return (NULL); 3382 } 3383 gp = NULL; 3384 } 3385 sx_xunlock(&sc->sc_lock); 3386 g_topology_lock(); 3387 return (gp); 3388} 3389 3390static int 3391g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, 3392 struct g_geom *gp) 3393{ 3394 struct g_raid3_softc *sc; 3395 int error; 3396 3397 g_topology_unlock(); 3398 sc = gp->softc; 3399 sx_xlock(&sc->sc_lock); 3400 g_cancel_event(sc); 3401 error = g_raid3_destroy(gp->softc, G_RAID3_DESTROY_SOFT); 3402 if (error != 0) 3403 sx_xunlock(&sc->sc_lock); 3404 g_topology_lock(); 3405 return (error); 3406} 3407 3408static void 3409g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, 3410 struct g_consumer *cp, struct g_provider *pp) 3411{ 3412 struct g_raid3_softc *sc; 3413 3414 g_topology_assert(); 3415 3416 sc = gp->softc; 3417 if (sc == NULL) 3418 return; 3419 /* Skip synchronization geom. 
*/ 3420 if (gp == sc->sc_sync.ds_geom) 3421 return; 3422 if (pp != NULL) { 3423 /* Nothing here. */ 3424 } else if (cp != NULL) { 3425 struct g_raid3_disk *disk; 3426 3427 disk = cp->private; 3428 if (disk == NULL) 3429 return; 3430 g_topology_unlock(); 3431 sx_xlock(&sc->sc_lock); 3432 sbuf_printf(sb, "%s<Type>", indent); 3433 if (disk->d_no == sc->sc_ndisks - 1) 3434 sbuf_printf(sb, "PARITY"); 3435 else 3436 sbuf_printf(sb, "DATA"); 3437 sbuf_printf(sb, "</Type>\n"); 3438 sbuf_printf(sb, "%s<Number>%u</Number>\n", indent, 3439 (u_int)disk->d_no); 3440 if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { 3441 sbuf_printf(sb, "%s<Synchronized>", indent); 3442 if (disk->d_sync.ds_offset == 0) 3443 sbuf_printf(sb, "0%%"); 3444 else { 3445 sbuf_printf(sb, "%u%%", 3446 (u_int)((disk->d_sync.ds_offset * 100) / 3447 (sc->sc_mediasize / (sc->sc_ndisks - 1)))); 3448 } 3449 sbuf_printf(sb, "</Synchronized>\n"); 3450 } 3451 sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, 3452 disk->d_sync.ds_syncid); 3453 sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid); 3454 sbuf_printf(sb, "%s<Flags>", indent); 3455 if (disk->d_flags == 0) 3456 sbuf_printf(sb, "NONE"); 3457 else { 3458 int first = 1; 3459 3460#define ADD_FLAG(flag, name) do { \ 3461 if ((disk->d_flags & (flag)) != 0) { \ 3462 if (!first) \ 3463 sbuf_printf(sb, ", "); \ 3464 else \ 3465 first = 0; \ 3466 sbuf_printf(sb, name); \ 3467 } \ 3468} while (0) 3469 ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY"); 3470 ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED"); 3471 ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING, 3472 "SYNCHRONIZING"); 3473 ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); 3474 ADD_FLAG(G_RAID3_DISK_FLAG_BROKEN, "BROKEN"); 3475#undef ADD_FLAG 3476 } 3477 sbuf_printf(sb, "</Flags>\n"); 3478 sbuf_printf(sb, "%s<State>%s</State>\n", indent, 3479 g_raid3_disk_state2str(disk->d_state)); 3480 sx_xunlock(&sc->sc_lock); 3481 g_topology_lock(); 3482 } else { 3483 g_topology_unlock(); 3484 
sx_xlock(&sc->sc_lock); 3485 if (!g_raid3_use_malloc) { 3486 sbuf_printf(sb, 3487 "%s<Zone4kRequested>%u</Zone4kRequested>\n", indent, 3488 sc->sc_zones[G_RAID3_ZONE_4K].sz_requested); 3489 sbuf_printf(sb, 3490 "%s<Zone4kFailed>%u</Zone4kFailed>\n", indent, 3491 sc->sc_zones[G_RAID3_ZONE_4K].sz_failed); 3492 sbuf_printf(sb, 3493 "%s<Zone16kRequested>%u</Zone16kRequested>\n", indent, 3494 sc->sc_zones[G_RAID3_ZONE_16K].sz_requested); 3495 sbuf_printf(sb, 3496 "%s<Zone16kFailed>%u</Zone16kFailed>\n", indent, 3497 sc->sc_zones[G_RAID3_ZONE_16K].sz_failed); 3498 sbuf_printf(sb, 3499 "%s<Zone64kRequested>%u</Zone64kRequested>\n", indent, 3500 sc->sc_zones[G_RAID3_ZONE_64K].sz_requested); 3501 sbuf_printf(sb, 3502 "%s<Zone64kFailed>%u</Zone64kFailed>\n", indent, 3503 sc->sc_zones[G_RAID3_ZONE_64K].sz_failed); 3504 } 3505 sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id); 3506 sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid); 3507 sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid); 3508 sbuf_printf(sb, "%s<Flags>", indent); 3509 if (sc->sc_flags == 0) 3510 sbuf_printf(sb, "NONE"); 3511 else { 3512 int first = 1; 3513 3514#define ADD_FLAG(flag, name) do { \ 3515 if ((sc->sc_flags & (flag)) != 0) { \ 3516 if (!first) \ 3517 sbuf_printf(sb, ", "); \ 3518 else \ 3519 first = 0; \ 3520 sbuf_printf(sb, name); \ 3521 } \ 3522} while (0) 3523 ADD_FLAG(G_RAID3_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC"); 3524 ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); 3525 ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN, 3526 "ROUND-ROBIN"); 3527 ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY"); 3528#undef ADD_FLAG 3529 } 3530 sbuf_printf(sb, "</Flags>\n"); 3531 sbuf_printf(sb, "%s<Components>%u</Components>\n", indent, 3532 sc->sc_ndisks); 3533 sbuf_printf(sb, "%s<State>%s</State>\n", indent, 3534 g_raid3_device_state2str(sc->sc_state)); 3535 sx_xunlock(&sc->sc_lock); 3536 g_topology_lock(); 3537 } 3538} 3539 3540static void 3541g_raid3_shutdown_pre_sync(void 
*arg, int howto) 3542{ 3543 struct g_class *mp; 3544 struct g_geom *gp, *gp2; 3545 struct g_raid3_softc *sc; 3546 int error; 3547 3548 mp = arg; 3549 DROP_GIANT(); 3550 g_topology_lock(); 3551 LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { 3552 if ((sc = gp->softc) == NULL) 3553 continue; 3554 /* Skip synchronization geom. */ 3555 if (gp == sc->sc_sync.ds_geom) 3556 continue; 3557 g_topology_unlock(); 3558 sx_xlock(&sc->sc_lock); 3559 g_cancel_event(sc); 3560 error = g_raid3_destroy(sc, G_RAID3_DESTROY_DELAYED); 3561 if (error != 0) 3562 sx_xunlock(&sc->sc_lock); 3563 g_topology_lock(); 3564 } 3565 g_topology_unlock(); 3566 PICKUP_GIANT(); 3567} 3568 3569static void 3570g_raid3_init(struct g_class *mp) 3571{ 3572 3573 g_raid3_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync, 3574 g_raid3_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST); 3575 if (g_raid3_pre_sync == NULL) 3576 G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event."); 3577} 3578 3579static void 3580g_raid3_fini(struct g_class *mp) 3581{ 3582 3583 if (g_raid3_pre_sync != NULL) 3584 EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_raid3_pre_sync); 3585} 3586 3587DECLARE_GEOM_CLASS(g_raid3_class, g_raid3); 3588