1219974Smav/*- 2219974Smav * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org> 3219974Smav * All rights reserved. 4219974Smav * 5219974Smav * Redistribution and use in source and binary forms, with or without 6219974Smav * modification, are permitted provided that the following conditions 7219974Smav * are met: 8219974Smav * 1. Redistributions of source code must retain the above copyright 9219974Smav * notice, this list of conditions and the following disclaimer. 10219974Smav * 2. Redistributions in binary form must reproduce the above copyright 11219974Smav * notice, this list of conditions and the following disclaimer in the 12219974Smav * documentation and/or other materials provided with the distribution. 13219974Smav * 14219974Smav * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 15219974Smav * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16219974Smav * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17219974Smav * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 18219974Smav * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19219974Smav * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20219974Smav * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21219974Smav * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22219974Smav * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23219974Smav * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24219974Smav * SUCH DAMAGE. 25219974Smav */ 26219974Smav 27219974Smav#include <sys/cdefs.h> 28219974Smav__FBSDID("$FreeBSD$"); 29219974Smav 30219974Smav#include <sys/param.h> 31219974Smav#include <sys/bio.h> 32219974Smav#include <sys/endian.h> 33219974Smav#include <sys/kernel.h> 34219974Smav#include <sys/kobj.h> 35219974Smav#include <sys/limits.h> 36219974Smav#include <sys/lock.h> 37219974Smav#include <sys/malloc.h> 38219974Smav#include <sys/mutex.h> 39219974Smav#include <sys/sysctl.h> 40219974Smav#include <sys/systm.h> 41219974Smav#include <geom/geom.h> 42219974Smav#include "geom/raid/g_raid.h" 43219974Smav#include "g_raid_tr_if.h" 44219974Smav 45219974Smav#define N 2 46219974Smav 47240465SmavSYSCTL_DECL(_kern_geom_raid_raid1e); 48219974Smav 49219974Smav#define RAID1E_REBUILD_SLAB (1 << 20) /* One transation in a rebuild */ 50219974Smavstatic int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB; 51267992ShselaskySYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN, 52219974Smav &g_raid1e_rebuild_slab, 0, 53219974Smav "Amount of the disk to rebuild each read/write cycle of the rebuild."); 54219974Smav 55219974Smav#define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */ 56219974Smavstatic int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO; 57267992ShselaskySYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN, 58219974Smav &g_raid1e_rebuild_fair_io, 0, 59219974Smav "Fraction of the I/O bandwidth to use when disk busy for rebuild."); 60219974Smav 61219974Smav#define RAID1E_REBUILD_CLUSTER_IDLE 100 62219974Smavstatic int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE; 63267992ShselaskySYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN, 64219974Smav &g_raid1e_rebuild_cluster_idle, 0, 65219974Smav "Number of slabs to do each time we trigger a rebuild cycle"); 66219974Smav 67219974Smav#define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */ 68219974Smavstatic int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE; 69267992ShselaskySYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN, 70219974Smav &g_raid1e_rebuild_meta_update, 0, 71219974Smav "When to update the meta data."); 72219974Smav 73219974Smavstatic MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data"); 74219974Smav 75219974Smav#define TR_RAID1E_NONE 0 76219974Smav#define TR_RAID1E_REBUILD 1 77219974Smav#define TR_RAID1E_RESYNC 2 78219974Smav 79219974Smav#define TR_RAID1E_F_DOING_SOME 0x1 80219974Smav#define TR_RAID1E_F_LOCKED 0x2 81219974Smav#define TR_RAID1E_F_ABORT 0x4 82219974Smav 83219974Smavstruct g_raid_tr_raid1e_object { 84219974Smav struct g_raid_tr_object trso_base; 85219974Smav int trso_starting; 86219974Smav int trso_stopping; 87219974Smav int trso_type; 88219974Smav int trso_recover_slabs; /* slabs before rest */ 89219974Smav int trso_fair_io; 90219974Smav int trso_meta_update; 91219974Smav int trso_flags; 92219974Smav struct g_raid_subdisk *trso_failed_sd; /* like per volume */ 93219974Smav void *trso_buffer; /* Buffer space */ 94219974Smav off_t trso_lock_pos; /* Locked range start. */ 95219974Smav off_t trso_lock_len; /* Locked range length. */ 96219974Smav struct bio trso_bio; 97219974Smav}; 98219974Smav 99219974Smavstatic g_raid_tr_taste_t g_raid_tr_taste_raid1e; 100219974Smavstatic g_raid_tr_event_t g_raid_tr_event_raid1e; 101219974Smavstatic g_raid_tr_start_t g_raid_tr_start_raid1e; 102219974Smavstatic g_raid_tr_stop_t g_raid_tr_stop_raid1e; 103219974Smavstatic g_raid_tr_iostart_t g_raid_tr_iostart_raid1e; 104219974Smavstatic g_raid_tr_iodone_t g_raid_tr_iodone_raid1e; 105219974Smavstatic g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e; 106219974Smavstatic g_raid_tr_locked_t g_raid_tr_locked_raid1e; 107219974Smavstatic g_raid_tr_idle_t g_raid_tr_idle_raid1e; 108219974Smavstatic g_raid_tr_free_t g_raid_tr_free_raid1e; 109219974Smav 110219974Smavstatic kobj_method_t g_raid_tr_raid1e_methods[] = { 111219974Smav KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1e), 112219974Smav KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1e), 113219974Smav KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1e), 114219974Smav KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1e), 115219974Smav KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1e), 116219974Smav KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1e), 117219974Smav KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e), 118219974Smav KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1e), 119219974Smav KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1e), 120219974Smav KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1e), 121219974Smav { 0, 0 } 122219974Smav}; 123219974Smav 124219974Smavstatic struct g_raid_tr_class g_raid_tr_raid1e_class = { 125219974Smav "RAID1E", 126219974Smav g_raid_tr_raid1e_methods, 127219974Smav sizeof(struct g_raid_tr_raid1e_object), 128240465Smav .trc_enable = 1, 129256610Smav .trc_priority = 200, 130256610Smav .trc_accept_unmapped = 1 131219974Smav}; 132219974Smav 133219974Smavstatic void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr); 134219974Smavstatic void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr, 135219974Smav struct g_raid_subdisk *sd); 136219974Smavstatic int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, 137219974Smav int no, off_t off, off_t len, u_int mask); 138219974Smav 139219974Smavstatic inline void 140219974SmavV2P(struct g_raid_volume *vol, off_t virt, 141219974Smav int *disk, off_t *offset, off_t *start) 142219974Smav{ 143219974Smav off_t nstrip; 144219974Smav u_int strip_size; 145219974Smav 146219974Smav strip_size = vol->v_strip_size; 147219974Smav /* Strip number. */ 148219974Smav nstrip = virt / strip_size; 149219974Smav /* Start position in strip. */ 150219974Smav *start = virt % strip_size; 151219974Smav /* Disk number. */ 152219974Smav *disk = (nstrip * N) % vol->v_disks_count; 153219974Smav /* Strip start position in disk. */ 154219974Smav *offset = ((nstrip * N) / vol->v_disks_count) * strip_size; 155219974Smav} 156219974Smav 157219974Smavstatic inline void 158219974SmavP2V(struct g_raid_volume *vol, int disk, off_t offset, 159219974Smav off_t *virt, int *copy) 160219974Smav{ 161219974Smav off_t nstrip, start; 162219974Smav u_int strip_size; 163219974Smav 164219974Smav strip_size = vol->v_strip_size; 165219974Smav /* Start position in strip. */ 166219974Smav start = offset % strip_size; 167219974Smav /* Physical strip number. */ 168219974Smav nstrip = (offset / strip_size) * vol->v_disks_count + disk; 169219974Smav /* Number of physical strip (copy) inside virtual strip. */ 170219974Smav *copy = nstrip % N; 171219974Smav /* Offset in virtual space. */ 172219974Smav *virt = (nstrip / N) * strip_size + start; 173219974Smav} 174219974Smav 175219974Smavstatic int 176219974Smavg_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol) 177219974Smav{ 178219974Smav struct g_raid_tr_raid1e_object *trs; 179219974Smav 180219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 181219974Smav if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E || 182234603Smav tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA) 183219974Smav return (G_RAID_TR_TASTE_FAIL); 184219974Smav trs->trso_starting = 1; 185219974Smav return (G_RAID_TR_TASTE_SUCCEED); 186219974Smav} 187219974Smav 188219974Smavstatic int 189219974Smavg_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol) 190219974Smav{ 191219974Smav struct g_raid_softc *sc; 192219974Smav struct g_raid_subdisk *sd, *bestsd, *worstsd; 193219974Smav int i, j, state, sstate; 194219974Smav 195219974Smav sc = vol->v_softc; 196219974Smav state = G_RAID_VOLUME_S_OPTIMAL; 197219974Smav for (i = 0; i < vol->v_disks_count / N; i++) { 198219974Smav bestsd = &vol->v_subdisks[i * N]; 199219974Smav for (j = 1; j < N; j++) { 200219974Smav sd = &vol->v_subdisks[i * N + j]; 201219974Smav if (sd->sd_state > bestsd->sd_state) 202219974Smav bestsd = sd; 203219974Smav else if (sd->sd_state == bestsd->sd_state && 204219974Smav (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || 205219974Smav sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && 206219974Smav sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) 207219974Smav bestsd = sd; 208219974Smav } 209219974Smav if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED && 210219974Smav bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) { 211219974Smav /* We found reasonable candidate. */ 212219974Smav G_RAID_DEBUG1(1, sc, 213219974Smav "Promote subdisk %s:%d from %s to ACTIVE.", 214219974Smav vol->v_name, bestsd->sd_pos, 215219974Smav g_raid_subdisk_state2str(bestsd->sd_state)); 216219974Smav g_raid_change_subdisk_state(bestsd, 217219974Smav G_RAID_SUBDISK_S_ACTIVE); 218219974Smav g_raid_write_metadata(sc, 219219974Smav vol, bestsd, bestsd->sd_disk); 220219974Smav } 221219974Smav worstsd = &vol->v_subdisks[i * N]; 222219974Smav for (j = 1; j < N; j++) { 223219974Smav sd = &vol->v_subdisks[i * N + j]; 224219974Smav if (sd->sd_state < worstsd->sd_state) 225219974Smav worstsd = sd; 226219974Smav } 227219974Smav if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) 228219974Smav sstate = G_RAID_VOLUME_S_OPTIMAL; 229219974Smav else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) 230219974Smav sstate = G_RAID_VOLUME_S_SUBOPTIMAL; 231219974Smav else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) 232219974Smav sstate = G_RAID_VOLUME_S_DEGRADED; 233219974Smav else 234219974Smav sstate = G_RAID_VOLUME_S_BROKEN; 235219974Smav if (sstate < state) 236219974Smav state = sstate; 237219974Smav } 238219974Smav return (state); 239219974Smav} 240219974Smav 241219974Smavstatic int 242219974Smavg_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol) 243219974Smav{ 244219974Smav struct g_raid_softc *sc; 245219974Smav struct g_raid_subdisk *sd, *bestsd, *worstsd; 246219974Smav int i, j, state, sstate; 247219974Smav 248219974Smav sc = vol->v_softc; 249219974Smav if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) == 250219974Smav vol->v_disks_count) 251219974Smav return (G_RAID_VOLUME_S_OPTIMAL); 252219974Smav for (i = 0; i < vol->v_disks_count; i++) { 253219974Smav sd = &vol->v_subdisks[i]; 254219974Smav if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) { 255219974Smav /* We found reasonable candidate. */ 256219974Smav G_RAID_DEBUG1(1, sc, 257219974Smav "Promote subdisk %s:%d from %s to STALE.", 258219974Smav vol->v_name, sd->sd_pos, 259219974Smav g_raid_subdisk_state2str(sd->sd_state)); 260219974Smav g_raid_change_subdisk_state(sd, 261219974Smav G_RAID_SUBDISK_S_STALE); 262219974Smav g_raid_write_metadata(sc, vol, sd, sd->sd_disk); 263219974Smav } 264219974Smav } 265219974Smav state = G_RAID_VOLUME_S_OPTIMAL; 266219974Smav for (i = 0; i < vol->v_disks_count; i++) { 267219974Smav bestsd = &vol->v_subdisks[i]; 268219974Smav worstsd = &vol->v_subdisks[i]; 269219974Smav for (j = 1; j < N; j++) { 270219974Smav sd = &vol->v_subdisks[(i + j) % vol->v_disks_count]; 271219974Smav if (sd->sd_state > bestsd->sd_state) 272219974Smav bestsd = sd; 273219974Smav else if (sd->sd_state == bestsd->sd_state && 274219974Smav (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || 275219974Smav sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && 276219974Smav sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) 277219974Smav bestsd = sd; 278219974Smav if (sd->sd_state < worstsd->sd_state) 279219974Smav worstsd = sd; 280219974Smav } 281219974Smav if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) 282219974Smav sstate = G_RAID_VOLUME_S_OPTIMAL; 283219974Smav else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) 284219974Smav sstate = G_RAID_VOLUME_S_SUBOPTIMAL; 285219974Smav else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE) 286219974Smav sstate = G_RAID_VOLUME_S_DEGRADED; 287219974Smav else 288219974Smav sstate = G_RAID_VOLUME_S_BROKEN; 289219974Smav if (sstate < state) 290219974Smav state = sstate; 291219974Smav } 292219974Smav return (state); 293219974Smav} 294219974Smav 295219974Smavstatic int 296219974Smavg_raid_tr_update_state_raid1e(struct g_raid_volume *vol, 297219974Smav struct g_raid_subdisk *sd) 298219974Smav{ 299219974Smav struct g_raid_tr_raid1e_object *trs; 300219974Smav struct g_raid_softc *sc; 301219974Smav u_int s; 302219974Smav 303219974Smav sc = vol->v_softc; 304219974Smav trs = (struct g_raid_tr_raid1e_object *)vol->v_tr; 305219974Smav if (trs->trso_stopping && 306219974Smav (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0) 307219974Smav s = G_RAID_VOLUME_S_STOPPED; 308219974Smav else if (trs->trso_starting) 309219974Smav s = G_RAID_VOLUME_S_STARTING; 310219974Smav else { 311219974Smav if ((vol->v_disks_count % N) == 0) 312219974Smav s = g_raid_tr_update_state_raid1e_even(vol); 313219974Smav else 314219974Smav s = g_raid_tr_update_state_raid1e_odd(vol); 315219974Smav } 316219974Smav if (s != vol->v_state) { 317219974Smav g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 318219974Smav G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, 319219974Smav G_RAID_EVENT_VOLUME); 320219974Smav g_raid_change_volume_state(vol, s); 321219974Smav if (!trs->trso_starting && !trs->trso_stopping) 322219974Smav g_raid_write_metadata(sc, vol, NULL, NULL); 323219974Smav } 324219974Smav if (!trs->trso_starting && !trs->trso_stopping) 325219974Smav g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd); 326219974Smav return (0); 327219974Smav} 328219974Smav 329219974Smavstatic void 330219974Smavg_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, 331219974Smav struct g_raid_disk *disk) 332219974Smav{ 333235270Smav struct g_raid_volume *vol; 334235270Smav 335235270Smav vol = sd->sd_volume; 336219974Smav /* 337219974Smav * We don't fail the last disk in the pack, since it still has decent 338219974Smav * data on it and that's better than failing the disk if it is the root 339219974Smav * file system. 340219974Smav * 341219974Smav * XXX should this be controlled via a tunable? It makes sense for 342219974Smav * the volume that has / on it. I can't think of a case where we'd 343219974Smav * want the volume to go away on this kind of event. 344219974Smav */ 345235270Smav if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) + 346235270Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) + 347235270Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + 348235270Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) < 349235270Smav vol->v_disks_count) && 350235270Smav (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED)) 351219974Smav return; 352219974Smav g_raid_fail_disk(sc, sd, disk); 353219974Smav} 354219974Smav 355219974Smavstatic void 356219974Smavg_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs) 357219974Smav{ 358219974Smav struct g_raid_volume *vol; 359219974Smav struct g_raid_subdisk *sd; 360219974Smav 361219974Smav vol = trs->trso_base.tro_volume; 362219974Smav sd = trs->trso_failed_sd; 363219974Smav g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); 364219974Smav free(trs->trso_buffer, M_TR_RAID1E); 365219974Smav trs->trso_buffer = NULL; 366219974Smav trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; 367219974Smav trs->trso_type = TR_RAID1E_NONE; 368219974Smav trs->trso_recover_slabs = 0; 369219974Smav trs->trso_failed_sd = NULL; 370219974Smav g_raid_tr_update_state_raid1e(vol, NULL); 371219974Smav} 372219974Smav 373219974Smavstatic void 374219974Smavg_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr) 375219974Smav{ 376219974Smav struct g_raid_tr_raid1e_object *trs; 377219974Smav struct g_raid_subdisk *sd; 378219974Smav 379219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 380219974Smav sd = trs->trso_failed_sd; 381219974Smav G_RAID_DEBUG1(0, tr->tro_volume->v_softc, 382219974Smav "Subdisk %s:%d-%s rebuild completed.", 383219974Smav sd->sd_volume->v_name, sd->sd_pos, 384219974Smav sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 385219974Smav g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); 386219974Smav sd->sd_rebuild_pos = 0; 387219974Smav g_raid_tr_raid1e_rebuild_done(trs); 388219974Smav} 389219974Smav 390219974Smavstatic void 391219974Smavg_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr) 392219974Smav{ 393219974Smav struct g_raid_tr_raid1e_object *trs; 394219974Smav struct g_raid_subdisk *sd; 395219974Smav struct g_raid_volume *vol; 396219974Smav 397219974Smav vol = tr->tro_volume; 398219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 399219974Smav sd = trs->trso_failed_sd; 400219974Smav if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) { 401219974Smav G_RAID_DEBUG1(1, vol->v_softc, 402219974Smav "Subdisk %s:%d-%s rebuild is aborting.", 403219974Smav sd->sd_volume->v_name, sd->sd_pos, 404219974Smav sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 405219974Smav trs->trso_flags |= TR_RAID1E_F_ABORT; 406219974Smav } else { 407219974Smav G_RAID_DEBUG1(0, vol->v_softc, 408219974Smav "Subdisk %s:%d-%s rebuild aborted.", 409219974Smav sd->sd_volume->v_name, sd->sd_pos, 410219974Smav sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 411219974Smav trs->trso_flags &= ~TR_RAID1E_F_ABORT; 412219974Smav if (trs->trso_flags & TR_RAID1E_F_LOCKED) { 413219974Smav trs->trso_flags &= ~TR_RAID1E_F_LOCKED; 414219974Smav g_raid_unlock_range(tr->tro_volume, 415219974Smav trs->trso_lock_pos, trs->trso_lock_len); 416219974Smav } 417219974Smav g_raid_tr_raid1e_rebuild_done(trs); 418219974Smav } 419219974Smav} 420219974Smav 421219974Smavstatic void 422219974Smavg_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr) 423219974Smav{ 424219974Smav struct g_raid_tr_raid1e_object *trs; 425219974Smav struct g_raid_softc *sc; 426219974Smav struct g_raid_volume *vol; 427219974Smav struct g_raid_subdisk *sd; 428219974Smav struct bio *bp; 429219974Smav off_t len, virtual, vend, offset, start; 430219974Smav int disk, copy, best; 431219974Smav 432219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 433219974Smav if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) 434219974Smav return; 435219974Smav vol = tr->tro_volume; 436219974Smav sc = vol->v_softc; 437219974Smav sd = trs->trso_failed_sd; 438219974Smav 439219974Smav while (1) { 440219974Smav if (sd->sd_rebuild_pos >= sd->sd_size) { 441219974Smav g_raid_tr_raid1e_rebuild_finish(tr); 442219974Smav return; 443219974Smav } 444219974Smav /* Get virtual offset from physical rebuild position. */ 445219974Smav P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, ©); 446219974Smav /* Get physical offset back to get first stripe position. */ 447219974Smav V2P(vol, virtual, &disk, &offset, &start); 448219974Smav /* Calculate contignous data length. */ 449219974Smav len = MIN(g_raid1e_rebuild_slab, 450219974Smav sd->sd_size - sd->sd_rebuild_pos); 451219974Smav if ((vol->v_disks_count % N) != 0) 452219974Smav len = MIN(len, vol->v_strip_size - start); 453219974Smav /* Find disk with most accurate data. */ 454219974Smav best = g_raid_tr_raid1e_select_read_disk(vol, disk, 455219974Smav offset + start, len, 0); 456219974Smav if (best < 0) { 457219974Smav /* There is no any valid disk. */ 458219974Smav g_raid_tr_raid1e_rebuild_abort(tr); 459219974Smav return; 460219974Smav } else if (best != copy) { 461219974Smav /* Some other disk has better data. */ 462219974Smav break; 463219974Smav } 464219974Smav /* We have the most accurate data. Skip the range. */ 465219974Smav G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju", 466219974Smav sd->sd_rebuild_pos, sd->sd_rebuild_pos + len); 467219974Smav sd->sd_rebuild_pos += len; 468219974Smav } 469219974Smav 470219974Smav bp = &trs->trso_bio; 471219974Smav memset(bp, 0, sizeof(*bp)); 472219974Smav bp->bio_offset = offset + start + 473219974Smav ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0); 474219974Smav bp->bio_length = len; 475219974Smav bp->bio_data = trs->trso_buffer; 476219974Smav bp->bio_cmd = BIO_READ; 477219974Smav bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; 478219974Smav bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count]; 479219974Smav G_RAID_LOGREQ(3, bp, "Queueing rebuild read"); 480219974Smav /* 481219974Smav * If we are crossing stripe boundary, correct affected virtual 482219974Smav * range we should lock. 483219974Smav */ 484219974Smav if (start + len > vol->v_strip_size) { 485219974Smav P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, ©); 486219974Smav len = vend - virtual; 487219974Smav } 488219974Smav trs->trso_flags |= TR_RAID1E_F_DOING_SOME; 489219974Smav trs->trso_flags |= TR_RAID1E_F_LOCKED; 490219974Smav trs->trso_lock_pos = virtual; 491219974Smav trs->trso_lock_len = len; 492219974Smav /* Lock callback starts I/O */ 493219974Smav g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp); 494219974Smav} 495219974Smav 496219974Smavstatic void 497219974Smavg_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr) 498219974Smav{ 499219974Smav struct g_raid_volume *vol; 500219974Smav struct g_raid_tr_raid1e_object *trs; 501219974Smav struct g_raid_subdisk *sd; 502219974Smav 503219974Smav vol = tr->tro_volume; 504219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 505219974Smav if (trs->trso_failed_sd) { 506219974Smav G_RAID_DEBUG1(1, vol->v_softc, 507219974Smav "Already rebuild in start rebuild. pos %jd\n", 508219974Smav (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); 509219974Smav return; 510219974Smav } 511219974Smav sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); 512219974Smav if (sd == NULL) 513219974Smav sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); 514219974Smav if (sd == NULL) { 515219974Smav sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); 516219974Smav if (sd != NULL) { 517219974Smav sd->sd_rebuild_pos = 0; 518219974Smav g_raid_change_subdisk_state(sd, 519219974Smav G_RAID_SUBDISK_S_RESYNC); 520219974Smav g_raid_write_metadata(vol->v_softc, vol, sd, NULL); 521219974Smav } else { 522219974Smav sd = g_raid_get_subdisk(vol, 523219974Smav G_RAID_SUBDISK_S_UNINITIALIZED); 524219974Smav if (sd == NULL) 525219974Smav sd = g_raid_get_subdisk(vol, 526219974Smav G_RAID_SUBDISK_S_NEW); 527219974Smav if (sd != NULL) { 528219974Smav sd->sd_rebuild_pos = 0; 529219974Smav g_raid_change_subdisk_state(sd, 530219974Smav G_RAID_SUBDISK_S_REBUILD); 531219974Smav g_raid_write_metadata(vol->v_softc, 532219974Smav vol, sd, NULL); 533219974Smav } 534219974Smav } 535219974Smav } 536219974Smav if (sd == NULL) { 537219974Smav G_RAID_DEBUG1(1, vol->v_softc, 538219974Smav "No failed disk to rebuild. night night."); 539219974Smav return; 540219974Smav } 541219974Smav trs->trso_failed_sd = sd; 542219974Smav G_RAID_DEBUG1(0, vol->v_softc, 543219974Smav "Subdisk %s:%d-%s rebuild start at %jd.", 544219974Smav sd->sd_volume->v_name, sd->sd_pos, 545219974Smav sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]", 546219974Smav trs->trso_failed_sd->sd_rebuild_pos); 547219974Smav trs->trso_type = TR_RAID1E_REBUILD; 548219974Smav trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK); 549219974Smav trs->trso_meta_update = g_raid1e_rebuild_meta_update; 550219974Smav g_raid_tr_raid1e_rebuild_some(tr); 551219974Smav} 552219974Smav 553219974Smavstatic void 554219974Smavg_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr, 555219974Smav struct g_raid_subdisk *sd) 556219974Smav{ 557219974Smav struct g_raid_volume *vol; 558219974Smav struct g_raid_tr_raid1e_object *trs; 559219974Smav int nr; 560219974Smav 561219974Smav vol = tr->tro_volume; 562219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 563219974Smav if (trs->trso_stopping) 564219974Smav return; 565219974Smav nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + 566219974Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); 567219974Smav switch(trs->trso_type) { 568219974Smav case TR_RAID1E_NONE: 569219974Smav if (vol->v_state < G_RAID_VOLUME_S_DEGRADED) 570219974Smav return; 571219974Smav if (nr == 0) { 572219974Smav nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + 573219974Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + 574219974Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); 575219974Smav if (nr == 0) 576219974Smav return; 577219974Smav } 578219974Smav g_raid_tr_raid1e_rebuild_start(tr); 579219974Smav break; 580219974Smav case TR_RAID1E_REBUILD: 581219974Smav if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 || 582219974Smav trs->trso_failed_sd == sd) 583219974Smav g_raid_tr_raid1e_rebuild_abort(tr); 584219974Smav break; 585219974Smav case TR_RAID1E_RESYNC: 586219974Smav break; 587219974Smav } 588219974Smav} 589219974Smav 590219974Smavstatic int 591219974Smavg_raid_tr_event_raid1e(struct g_raid_tr_object *tr, 592219974Smav struct g_raid_subdisk *sd, u_int event) 593219974Smav{ 594219974Smav 595219974Smav g_raid_tr_update_state_raid1e(tr->tro_volume, sd); 596219974Smav return (0); 597219974Smav} 598219974Smav 599219974Smavstatic int 600219974Smavg_raid_tr_start_raid1e(struct g_raid_tr_object *tr) 601219974Smav{ 602219974Smav struct g_raid_tr_raid1e_object *trs; 603219974Smav struct g_raid_volume *vol; 604219974Smav 605219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 606219974Smav vol = tr->tro_volume; 607219974Smav trs->trso_starting = 0; 608219974Smav g_raid_tr_update_state_raid1e(vol, NULL); 609219974Smav return (0); 610219974Smav} 611219974Smav 612219974Smavstatic int 613219974Smavg_raid_tr_stop_raid1e(struct g_raid_tr_object *tr) 614219974Smav{ 615219974Smav struct g_raid_tr_raid1e_object *trs; 616219974Smav struct g_raid_volume *vol; 617219974Smav 618219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 619219974Smav vol = tr->tro_volume; 620219974Smav trs->trso_starting = 0; 621219974Smav trs->trso_stopping = 1; 622219974Smav g_raid_tr_update_state_raid1e(vol, NULL); 623219974Smav return (0); 624219974Smav} 625219974Smav 626219974Smav/* 627219974Smav * Select the disk to read from. Take into account: subdisk state, running 628219974Smav * error recovery, average disk load, head position and possible cache hits. 629219974Smav */ 630219974Smav#define ABS(x) (((x) >= 0) ? (x) : (-(x))) 631219974Smavstatic int 632219974Smavg_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, 633219974Smav int no, off_t off, off_t len, u_int mask) 634219974Smav{ 635219974Smav struct g_raid_subdisk *sd; 636219974Smav off_t offset; 637219974Smav int i, best, prio, bestprio; 638219974Smav 639219974Smav best = -1; 640219974Smav bestprio = INT_MAX; 641219974Smav for (i = 0; i < N; i++) { 642219974Smav sd = &vol->v_subdisks[(no + i) % vol->v_disks_count]; 643219974Smav offset = off; 644219974Smav if (no + i >= vol->v_disks_count) 645219974Smav offset += vol->v_strip_size; 646219974Smav 647219974Smav prio = G_RAID_SUBDISK_LOAD(sd); 648219974Smav if ((mask & (1 << sd->sd_pos)) != 0) 649219974Smav continue; 650219974Smav switch (sd->sd_state) { 651219974Smav case G_RAID_SUBDISK_S_ACTIVE: 652219974Smav break; 653219974Smav case G_RAID_SUBDISK_S_RESYNC: 654219974Smav if (offset + off < sd->sd_rebuild_pos) 655219974Smav break; 656219974Smav /* FALLTHROUGH */ 657219974Smav case G_RAID_SUBDISK_S_STALE: 658219974Smav prio += i << 24; 659219974Smav break; 660219974Smav case G_RAID_SUBDISK_S_REBUILD: 661219974Smav if (offset + off < sd->sd_rebuild_pos) 662219974Smav break; 663219974Smav /* FALLTHROUGH */ 664219974Smav default: 665219974Smav continue; 666219974Smav } 667219974Smav prio += min(sd->sd_recovery, 255) << 16; 668219974Smav /* If disk head is precisely in position - highly prefer it. */ 669219974Smav if (G_RAID_SUBDISK_POS(sd) == offset) 670219974Smav prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; 671219974Smav else 672219974Smav /* If disk head is close to position - prefer it. */ 673219974Smav if (ABS(G_RAID_SUBDISK_POS(sd) - offset) < 674219974Smav G_RAID_SUBDISK_TRACK_SIZE) 675219974Smav prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE; 676219974Smav if (prio < bestprio) { 677219974Smav bestprio = prio; 678219974Smav best = i; 679219974Smav } 680219974Smav } 681219974Smav return (best); 682219974Smav} 683219974Smav 684219974Smavstatic void 685219974Smavg_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp) 686219974Smav{ 687219974Smav struct g_raid_volume *vol; 688219974Smav struct g_raid_subdisk *sd; 689219974Smav struct bio_queue_head queue; 690219974Smav struct bio *cbp; 691219974Smav char *addr; 692219974Smav off_t offset, start, length, remain; 693219974Smav u_int no, strip_size; 694219974Smav int best; 695219974Smav 696219974Smav vol = tr->tro_volume; 697256610Smav if ((bp->bio_flags & BIO_UNMAPPED) != 0) 698256610Smav addr = NULL; 699256610Smav else 700256610Smav addr = bp->bio_data; 701219974Smav strip_size = vol->v_strip_size; 702219974Smav V2P(vol, bp->bio_offset, &no, &offset, &start); 703219974Smav remain = bp->bio_length; 704219974Smav bioq_init(&queue); 705219974Smav while (remain > 0) { 706219974Smav length = MIN(strip_size - start, remain); 707219974Smav best = g_raid_tr_raid1e_select_read_disk(vol, 708219974Smav no, offset, length, 0); 709219974Smav KASSERT(best >= 0, ("No readable disk in volume %s!", 710219974Smav vol->v_name)); 711219974Smav no += best; 712219974Smav if (no >= vol->v_disks_count) { 713219974Smav no -= vol->v_disks_count; 714219974Smav offset += strip_size; 715219974Smav } 716219974Smav cbp = g_clone_bio(bp); 717219974Smav if (cbp == NULL) 718219974Smav goto failure; 719219974Smav cbp->bio_offset = offset + start; 720219974Smav cbp->bio_length = length; 721256610Smav if ((bp->bio_flags & BIO_UNMAPPED) != 0) { 722256610Smav cbp->bio_ma_offset += (uintptr_t)addr; 723256610Smav cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; 724256610Smav cbp->bio_ma_offset %= PAGE_SIZE; 725256610Smav cbp->bio_ma_n = round_page(cbp->bio_ma_offset + 726256610Smav cbp->bio_length) / PAGE_SIZE; 727256610Smav } else 728256610Smav cbp->bio_data = addr; 729219974Smav cbp->bio_caller1 = &vol->v_subdisks[no]; 730219974Smav bioq_insert_tail(&queue, cbp); 731219974Smav no += N - best; 732219974Smav if (no >= vol->v_disks_count) { 733219974Smav no -= vol->v_disks_count; 734219974Smav offset += strip_size; 735219974Smav } 736219974Smav remain -= length; 737219974Smav addr += length; 738219974Smav start = 0; 739219974Smav } 740256610Smav while ((cbp = bioq_takefirst(&queue)) != NULL) { 741219974Smav sd = cbp->bio_caller1; 742219974Smav cbp->bio_caller1 = NULL; 743219974Smav g_raid_subdisk_iostart(sd, cbp); 744219974Smav } 745219974Smav return; 746219974Smavfailure: 747256610Smav while ((cbp = bioq_takefirst(&queue)) != NULL) 748219974Smav g_destroy_bio(cbp); 749219974Smav if (bp->bio_error == 0) 750219974Smav bp->bio_error = ENOMEM; 751219974Smav g_raid_iodone(bp, bp->bio_error); 752219974Smav} 753219974Smav 754219974Smavstatic void 755219974Smavg_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp) 756219974Smav{ 757219974Smav struct g_raid_volume *vol; 758219974Smav struct g_raid_subdisk *sd; 759219974Smav struct bio_queue_head queue; 760219974Smav struct bio *cbp; 761219974Smav char *addr; 762219974Smav off_t offset, start, length, remain; 763219974Smav u_int no, strip_size; 764219974Smav int i; 765219974Smav 766219974Smav vol = tr->tro_volume; 767256610Smav if ((bp->bio_flags & BIO_UNMAPPED) != 0) 768256610Smav addr = NULL; 769256610Smav else 770256610Smav addr = bp->bio_data; 771219974Smav strip_size = vol->v_strip_size; 772219974Smav V2P(vol, bp->bio_offset, &no, &offset, &start); 773219974Smav remain = bp->bio_length; 774219974Smav bioq_init(&queue); 775219974Smav while (remain > 0) { 776219974Smav length = MIN(strip_size - start, remain); 777219974Smav for (i = 0; i < N; i++) { 778219974Smav sd = &vol->v_subdisks[no]; 779219974Smav switch (sd->sd_state) { 780219974Smav case G_RAID_SUBDISK_S_ACTIVE: 781219974Smav case G_RAID_SUBDISK_S_STALE: 782219974Smav case G_RAID_SUBDISK_S_RESYNC: 783219974Smav break; 784219974Smav case G_RAID_SUBDISK_S_REBUILD: 785219974Smav if (offset + start >= sd->sd_rebuild_pos) 786219974Smav goto nextdisk; 787219974Smav break; 788219974Smav default: 789219974Smav goto nextdisk; 790219974Smav } 791219974Smav cbp = g_clone_bio(bp); 792219974Smav if (cbp == NULL) 793219974Smav goto failure; 794219974Smav cbp->bio_offset = offset + start; 795219974Smav cbp->bio_length = length; 796256610Smav if ((bp->bio_flags & BIO_UNMAPPED) != 0 && 797256610Smav bp->bio_cmd != BIO_DELETE) { 798256610Smav cbp->bio_ma_offset += (uintptr_t)addr; 799256610Smav cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; 800256610Smav cbp->bio_ma_offset %= PAGE_SIZE; 801256610Smav cbp->bio_ma_n = round_page(cbp->bio_ma_offset + 802256610Smav cbp->bio_length) / PAGE_SIZE; 803256610Smav } else 804256610Smav cbp->bio_data = addr; 805219974Smav cbp->bio_caller1 = sd; 806219974Smav bioq_insert_tail(&queue, cbp); 807219974Smavnextdisk: 808219974Smav if (++no >= vol->v_disks_count) { 809219974Smav no = 0; 810219974Smav offset += strip_size; 811219974Smav } 812219974Smav } 813219974Smav remain -= length; 814242323Smav if (bp->bio_cmd != BIO_DELETE) 815242323Smav addr += length; 816219974Smav start = 0; 817219974Smav } 818256610Smav while ((cbp = bioq_takefirst(&queue)) != NULL) { 819219974Smav sd = cbp->bio_caller1; 820219974Smav cbp->bio_caller1 = NULL; 821219974Smav g_raid_subdisk_iostart(sd, cbp); 822219974Smav } 823219974Smav return; 824219974Smavfailure: 825256610Smav while ((cbp = bioq_takefirst(&queue)) != NULL) 826219974Smav g_destroy_bio(cbp); 827219974Smav if (bp->bio_error == 0) 828219974Smav bp->bio_error = ENOMEM; 829219974Smav g_raid_iodone(bp, bp->bio_error); 830219974Smav} 831219974Smav 832219974Smavstatic void 833219974Smavg_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp) 834219974Smav{ 835219974Smav struct g_raid_volume *vol; 836219974Smav struct g_raid_tr_raid1e_object *trs; 837219974Smav 838219974Smav vol = tr->tro_volume; 839219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 840219974Smav if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && 841219974Smav vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL && 842219974Smav vol->v_state != G_RAID_VOLUME_S_DEGRADED) { 843219974Smav g_raid_iodone(bp, EIO); 844219974Smav return; 845219974Smav } 846219974Smav /* 847219974Smav * If we're rebuilding, squeeze in rebuild activity every so often, 848219974Smav * even when the disk is busy. Be sure to only count real I/O 849219974Smav * to the disk. All 'SPECIAL' I/O is traffic generated to the disk 850219974Smav * by this module. 851219974Smav */ 852219974Smav if (trs->trso_failed_sd != NULL && 853219974Smav !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { 854219974Smav /* Make this new or running now round short. */ 855219974Smav trs->trso_recover_slabs = 0; 856219974Smav if (--trs->trso_fair_io <= 0) { 857219974Smav trs->trso_fair_io = g_raid1e_rebuild_fair_io; 858219974Smav g_raid_tr_raid1e_rebuild_some(tr); 859219974Smav } 860219974Smav } 861219974Smav switch (bp->bio_cmd) { 862219974Smav case BIO_READ: 863219974Smav g_raid_tr_iostart_raid1e_read(tr, bp); 864219974Smav break; 865219974Smav case BIO_WRITE: 866242323Smav case BIO_DELETE: 867219974Smav g_raid_tr_iostart_raid1e_write(tr, bp); 868219974Smav break; 869219974Smav case BIO_FLUSH: 870219974Smav g_raid_tr_flush_common(tr, bp); 871219974Smav break; 872219974Smav default: 873219974Smav KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", 874219974Smav bp->bio_cmd, vol->v_name)); 875219974Smav break; 876219974Smav } 877219974Smav} 878219974Smav 879219974Smavstatic void 880219974Smavg_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr, 881219974Smav struct g_raid_subdisk *sd, struct bio *bp) 882219974Smav{ 883219974Smav struct bio *cbp; 884219974Smav struct g_raid_subdisk *nsd; 885219974Smav struct g_raid_volume *vol; 886219974Smav struct bio *pbp; 887219974Smav struct g_raid_tr_raid1e_object *trs; 888219974Smav off_t virtual, offset, start; 889219974Smav uintptr_t mask; 890219974Smav int error, do_write, copy, disk, best; 891219974Smav 892219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 893219974Smav vol = tr->tro_volume; 894219974Smav if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { 895219974Smav if (trs->trso_type == TR_RAID1E_REBUILD) { 896219974Smav nsd = trs->trso_failed_sd; 897219974Smav if (bp->bio_cmd == BIO_READ) { 898219974Smav 899219974Smav /* Immediately abort rebuild, if requested. */ 900219974Smav if (trs->trso_flags & TR_RAID1E_F_ABORT) { 901219974Smav trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; 902219974Smav g_raid_tr_raid1e_rebuild_abort(tr); 903219974Smav return; 904219974Smav } 905219974Smav 906219974Smav /* On read error, skip and cross fingers. */ 907219974Smav if (bp->bio_error != 0) { 908219974Smav G_RAID_LOGREQ(0, bp, 909219974Smav "Read error during rebuild (%d), " 910219974Smav "possible data loss!", 911219974Smav bp->bio_error); 912219974Smav goto rebuild_round_done; 913219974Smav } 914219974Smav 915219974Smav /* 916219974Smav * The read operation finished, queue the 917219974Smav * write and get out. 918219974Smav */ 919219974Smav G_RAID_LOGREQ(3, bp, "Rebuild read done: %d", 920219974Smav bp->bio_error); 921219974Smav bp->bio_cmd = BIO_WRITE; 922219974Smav bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; 923219974Smav bp->bio_offset = nsd->sd_rebuild_pos; 924219974Smav G_RAID_LOGREQ(3, bp, "Queueing rebuild write."); 925219974Smav g_raid_subdisk_iostart(nsd, bp); 926219974Smav } else { 927219974Smav /* 928219974Smav * The write operation just finished. Do 929219974Smav * another. We keep cloning the master bio 930219974Smav * since it has the right buffers allocated to 931219974Smav * it. 932219974Smav */ 933219974Smav G_RAID_LOGREQ(3, bp, "Rebuild write done: %d", 934219974Smav bp->bio_error); 935219974Smav if (bp->bio_error != 0 || 936219974Smav trs->trso_flags & TR_RAID1E_F_ABORT) { 937219974Smav if ((trs->trso_flags & 938219974Smav TR_RAID1E_F_ABORT) == 0) { 939219974Smav g_raid_tr_raid1e_fail_disk(sd->sd_softc, 940219974Smav nsd, nsd->sd_disk); 941219974Smav } 942219974Smav trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; 943219974Smav g_raid_tr_raid1e_rebuild_abort(tr); 944219974Smav return; 945219974Smav } 946219974Smavrebuild_round_done: 947219974Smav trs->trso_flags &= ~TR_RAID1E_F_LOCKED; 948219974Smav g_raid_unlock_range(tr->tro_volume, 949219974Smav trs->trso_lock_pos, trs->trso_lock_len); 950219974Smav nsd->sd_rebuild_pos += bp->bio_length; 951219974Smav if (nsd->sd_rebuild_pos >= nsd->sd_size) { 952219974Smav g_raid_tr_raid1e_rebuild_finish(tr); 953219974Smav return; 954219974Smav } 955219974Smav 956219974Smav /* Abort rebuild if we are stopping */ 957219974Smav if (trs->trso_stopping) { 958219974Smav trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; 959219974Smav g_raid_tr_raid1e_rebuild_abort(tr); 960219974Smav return; 961219974Smav } 962219974Smav 963219974Smav if (--trs->trso_meta_update <= 0) { 964219974Smav g_raid_write_metadata(vol->v_softc, 965219974Smav vol, nsd, nsd->sd_disk); 966219974Smav trs->trso_meta_update = 967219974Smav g_raid1e_rebuild_meta_update; 968219974Smav /* Compensate short rebuild I/Os. */ 969219974Smav if ((vol->v_disks_count % N) != 0 && 970219974Smav vol->v_strip_size < 971219974Smav g_raid1e_rebuild_slab) { 972219974Smav trs->trso_meta_update *= 973219974Smav g_raid1e_rebuild_slab; 974219974Smav trs->trso_meta_update /= 975219974Smav vol->v_strip_size; 976219974Smav } 977219974Smav } 978219974Smav trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; 979219974Smav if (--trs->trso_recover_slabs <= 0) 980219974Smav return; 981219974Smav /* Run next rebuild iteration. */ 982219974Smav g_raid_tr_raid1e_rebuild_some(tr); 983219974Smav } 984219974Smav } else if (trs->trso_type == TR_RAID1E_RESYNC) { 985219974Smav /* 986219974Smav * read good sd, read bad sd in parallel. when both 987219974Smav * done, compare the buffers. write good to the bad 988219974Smav * if different. do the next bit of work. 989219974Smav */ 990219974Smav panic("Somehow, we think we're doing a resync"); 991219974Smav } 992219974Smav return; 993219974Smav } 994219974Smav pbp = bp->bio_parent; 995219974Smav pbp->bio_inbed++; 996219974Smav mask = (intptr_t)bp->bio_caller2; 997219974Smav if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { 998219974Smav /* 999219974Smav * Read failed on first drive. Retry the read error on 1000219974Smav * another disk drive, if available, before erroring out the 1001219974Smav * read. 1002219974Smav */ 1003219974Smav sd->sd_disk->d_read_errs++; 1004219974Smav G_RAID_LOGREQ(0, bp, 1005219974Smav "Read error (%d), %d read errors total", 1006219974Smav bp->bio_error, sd->sd_disk->d_read_errs); 1007219974Smav 1008219974Smav /* 1009219974Smav * If there are too many read errors, we move to degraded. 1010219974Smav * XXX Do we want to FAIL the drive (eg, make the user redo 1011219974Smav * everything to get it back in sync), or just degrade the 1012219974Smav * drive, which kicks off a resync? 1013219974Smav */ 1014219974Smav do_write = 0; 1015219974Smav if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) 1016219974Smav g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); 1017219974Smav else if (mask == 0) 1018219974Smav do_write = 1; 1019219974Smav 1020219974Smav /* Restore what we were doing. */ 1021219974Smav P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©); 1022219974Smav V2P(vol, virtual, &disk, &offset, &start); 1023219974Smav 1024219974Smav /* Find the other disk, and try to do the I/O to it. */ 1025219974Smav mask |= 1 << copy; 1026219974Smav best = g_raid_tr_raid1e_select_read_disk(vol, 1027219974Smav disk, offset, start, mask); 1028219974Smav if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) { 1029219974Smav disk += best; 1030219974Smav if (disk >= vol->v_disks_count) { 1031219974Smav disk -= vol->v_disks_count; 1032219974Smav offset += vol->v_strip_size; 1033219974Smav } 1034219974Smav cbp->bio_offset = offset + start; 1035219974Smav cbp->bio_length = bp->bio_length; 1036219974Smav cbp->bio_data = bp->bio_data; 1037256610Smav cbp->bio_ma = bp->bio_ma; 1038256610Smav cbp->bio_ma_offset = bp->bio_ma_offset; 1039256610Smav cbp->bio_ma_n = bp->bio_ma_n; 1040219974Smav g_destroy_bio(bp); 1041219974Smav nsd = &vol->v_subdisks[disk]; 1042219974Smav G_RAID_LOGREQ(2, cbp, "Retrying read from %d", 1043219974Smav nsd->sd_pos); 1044219974Smav if (do_write) 1045219974Smav mask |= 1 << 31; 1046258780Seadler if ((mask & (1U << 31)) != 0) 1047219974Smav sd->sd_recovery++; 1048219974Smav cbp->bio_caller2 = (void *)mask; 1049219974Smav if (do_write) { 1050219974Smav cbp->bio_caller1 = nsd; 1051219974Smav /* Lock callback starts I/O */ 1052219974Smav g_raid_lock_range(sd->sd_volume, 1053219974Smav virtual, cbp->bio_length, pbp, cbp); 1054219974Smav } else { 1055219974Smav g_raid_subdisk_iostart(nsd, cbp); 1056219974Smav } 1057219974Smav return; 1058219974Smav } 1059219974Smav /* 1060219974Smav * We can't retry. Return the original error by falling 1061219974Smav * through. This will happen when there's only one good disk. 1062219974Smav * We don't need to fail the raid, since its actual state is 1063219974Smav * based on the state of the subdisks. 1064219974Smav */ 1065219974Smav G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); 1066219974Smav } 1067219974Smav if (bp->bio_cmd == BIO_READ && 1068219974Smav bp->bio_error == 0 && 1069258780Seadler (mask & (1U << 31)) != 0) { 1070219974Smav G_RAID_LOGREQ(3, bp, "Recovered data from other drive"); 1071219974Smav 1072219974Smav /* Restore what we were doing. */ 1073219974Smav P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©); 1074219974Smav V2P(vol, virtual, &disk, &offset, &start); 1075219974Smav 1076219974Smav /* Find best disk to write. */ 1077219974Smav best = g_raid_tr_raid1e_select_read_disk(vol, 1078219974Smav disk, offset, start, ~mask); 1079219974Smav if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) { 1080219974Smav disk += best; 1081219974Smav if (disk >= vol->v_disks_count) { 1082219974Smav disk -= vol->v_disks_count; 1083219974Smav offset += vol->v_strip_size; 1084219974Smav } 1085219974Smav cbp->bio_offset = offset + start; 1086219974Smav cbp->bio_cmd = BIO_WRITE; 1087219974Smav cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP; 1088219974Smav cbp->bio_caller2 = (void *)mask; 1089219974Smav g_destroy_bio(bp); 1090219974Smav G_RAID_LOGREQ(2, cbp, 1091219974Smav "Attempting bad sector remap on failing drive."); 1092219974Smav g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp); 1093219974Smav return; 1094219974Smav } 1095219974Smav } 1096258780Seadler if ((mask & (1U << 31)) != 0) { 1097219974Smav /* 1098219974Smav * We're done with a recovery, mark the range as unlocked. 1099298808Spfg * For any write errors, we aggressively fail the disk since 1100219974Smav * there was both a READ and a WRITE error at this location. 1101219974Smav * Both types of errors generally indicates the drive is on 1102219974Smav * the verge of total failure anyway. Better to stop trusting 1103219974Smav * it now. However, we need to reset error to 0 in that case 1104219974Smav * because we're not failing the original I/O which succeeded. 1105219974Smav */ 1106219974Smav 1107219974Smav /* Restore what we were doing. */ 1108219974Smav P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©); 1109219974Smav V2P(vol, virtual, &disk, &offset, &start); 1110219974Smav 1111219974Smav for (copy = 0; copy < N; copy++) { 1112219974Smav if ((mask & (1 << copy) ) != 0) 1113219974Smav vol->v_subdisks[(disk + copy) % 1114219974Smav vol->v_disks_count].sd_recovery--; 1115219974Smav } 1116219974Smav 1117219974Smav if (bp->bio_cmd == BIO_WRITE && bp->bio_error) { 1118219974Smav G_RAID_LOGREQ(0, bp, "Remap write failed: " 1119219974Smav "failing subdisk."); 1120219974Smav g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); 1121219974Smav bp->bio_error = 0; 1122219974Smav } 1123219974Smav G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error); 1124219974Smav g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length); 1125219974Smav } 1126242328Smav if (pbp->bio_cmd != BIO_READ) { 1127235270Smav if (pbp->bio_inbed == 1 || pbp->bio_error != 0) 1128235270Smav pbp->bio_error = bp->bio_error; 1129242328Smav if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) { 1130235270Smav G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk."); 1131235270Smav g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); 1132235270Smav } 1133235270Smav error = pbp->bio_error; 1134235270Smav } else 1135235270Smav error = bp->bio_error; 1136219974Smav g_destroy_bio(bp); 1137219974Smav if (pbp->bio_children == pbp->bio_inbed) { 1138219974Smav pbp->bio_completed = pbp->bio_length; 1139219974Smav g_raid_iodone(pbp, error); 1140219974Smav } 1141219974Smav} 1142219974Smav 1143219974Smavstatic int 1144219974Smavg_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr, 1145219974Smav void *virtual, vm_offset_t physical, off_t boffset, size_t blength) 1146219974Smav{ 1147219974Smav struct g_raid_volume *vol; 1148219974Smav struct g_raid_subdisk *sd; 1149219974Smav struct bio_queue_head queue; 1150219974Smav char *addr; 1151219974Smav off_t offset, start, length, remain; 1152219974Smav u_int no, strip_size; 1153219974Smav int i, error; 1154219974Smav 1155219974Smav vol = tr->tro_volume; 1156219974Smav addr = virtual; 1157219974Smav strip_size = vol->v_strip_size; 1158219974Smav V2P(vol, boffset, &no, &offset, &start); 1159219974Smav remain = blength; 1160219974Smav bioq_init(&queue); 1161219974Smav while (remain > 0) { 1162219974Smav length = MIN(strip_size - start, remain); 1163219974Smav for (i = 0; i < N; i++) { 1164219974Smav sd = &vol->v_subdisks[no]; 1165219974Smav switch (sd->sd_state) { 1166219974Smav case G_RAID_SUBDISK_S_ACTIVE: 1167219974Smav case G_RAID_SUBDISK_S_STALE: 1168219974Smav case G_RAID_SUBDISK_S_RESYNC: 1169219974Smav break; 1170219974Smav case G_RAID_SUBDISK_S_REBUILD: 1171219974Smav if (offset + start >= sd->sd_rebuild_pos) 1172219974Smav goto nextdisk; 1173219974Smav break; 1174219974Smav default: 1175219974Smav goto nextdisk; 1176219974Smav } 1177219974Smav error = g_raid_subdisk_kerneldump(sd, 1178219974Smav addr, 0, offset + start, length); 1179219974Smav if (error != 0) 1180219974Smav return (error); 1181219974Smavnextdisk: 1182219974Smav if (++no >= vol->v_disks_count) { 1183219974Smav no = 0; 1184219974Smav offset += strip_size; 1185219974Smav } 1186219974Smav } 1187219974Smav remain -= length; 1188219974Smav addr += length; 1189219974Smav start = 0; 1190219974Smav } 1191219974Smav return (0); 1192219974Smav} 1193219974Smav 1194219974Smavstatic int 1195219974Smavg_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp) 1196219974Smav{ 1197219974Smav struct bio *bp; 1198219974Smav struct g_raid_subdisk *sd; 1199219974Smav 1200219974Smav bp = (struct bio *)argp; 1201219974Smav sd = (struct g_raid_subdisk *)bp->bio_caller1; 1202219974Smav g_raid_subdisk_iostart(sd, bp); 1203219974Smav 1204219974Smav return (0); 1205219974Smav} 1206219974Smav 1207219974Smavstatic int 1208219974Smavg_raid_tr_idle_raid1e(struct g_raid_tr_object *tr) 1209219974Smav{ 1210219974Smav struct g_raid_tr_raid1e_object *trs; 1211219974Smav struct g_raid_volume *vol; 1212219974Smav 1213219974Smav vol = tr->tro_volume; 1214219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 1215219974Smav trs->trso_fair_io = g_raid1e_rebuild_fair_io; 1216219974Smav trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle; 1217219974Smav /* Compensate short rebuild I/Os. */ 1218219974Smav if ((vol->v_disks_count % N) != 0 && 1219219974Smav vol->v_strip_size < g_raid1e_rebuild_slab) { 1220219974Smav trs->trso_recover_slabs *= g_raid1e_rebuild_slab; 1221219974Smav trs->trso_recover_slabs /= vol->v_strip_size; 1222219974Smav } 1223219974Smav if (trs->trso_type == TR_RAID1E_REBUILD) 1224219974Smav g_raid_tr_raid1e_rebuild_some(tr); 1225219974Smav return (0); 1226219974Smav} 1227219974Smav 1228219974Smavstatic int 1229219974Smavg_raid_tr_free_raid1e(struct g_raid_tr_object *tr) 1230219974Smav{ 1231219974Smav struct g_raid_tr_raid1e_object *trs; 1232219974Smav 1233219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 1234219974Smav 1235219974Smav if (trs->trso_buffer != NULL) { 1236219974Smav free(trs->trso_buffer, M_TR_RAID1E); 1237219974Smav trs->trso_buffer = NULL; 1238219974Smav } 1239219974Smav return (0); 1240219974Smav} 1241219974Smav 1242240465SmavG_RAID_TR_DECLARE(raid1e, "RAID1E"); 1243