/*-
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
25219974Smav */ 26219974Smav 27219974Smav#include <sys/cdefs.h> 28219974Smav__FBSDID("$FreeBSD: head/sys/geom/raid/tr_raid1e.c 240465 2012-09-13 13:27:09Z mav $"); 29219974Smav 30219974Smav#include <sys/param.h> 31219974Smav#include <sys/bio.h> 32219974Smav#include <sys/endian.h> 33219974Smav#include <sys/kernel.h> 34219974Smav#include <sys/kobj.h> 35219974Smav#include <sys/limits.h> 36219974Smav#include <sys/lock.h> 37219974Smav#include <sys/malloc.h> 38219974Smav#include <sys/mutex.h> 39219974Smav#include <sys/sysctl.h> 40219974Smav#include <sys/systm.h> 41219974Smav#include <geom/geom.h> 42219974Smav#include "geom/raid/g_raid.h" 43219974Smav#include "g_raid_tr_if.h" 44219974Smav 45219974Smav#define N 2 46219974Smav 47240465SmavSYSCTL_DECL(_kern_geom_raid_raid1e); 48219974Smav 49219974Smav#define RAID1E_REBUILD_SLAB (1 << 20) /* One transation in a rebuild */ 50219974Smavstatic int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB; 51219974SmavTUNABLE_INT("kern.geom.raid.raid1e.rebuild_slab_size", 52219974Smav &g_raid1e_rebuild_slab); 53219974SmavSYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RW, 54219974Smav &g_raid1e_rebuild_slab, 0, 55219974Smav "Amount of the disk to rebuild each read/write cycle of the rebuild."); 56219974Smav 57219974Smav#define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */ 58219974Smavstatic int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO; 59219974SmavTUNABLE_INT("kern.geom.raid.raid1e.rebuild_fair_io", 60219974Smav &g_raid1e_rebuild_fair_io); 61219974SmavSYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RW, 62219974Smav &g_raid1e_rebuild_fair_io, 0, 63219974Smav "Fraction of the I/O bandwidth to use when disk busy for rebuild."); 64219974Smav 65219974Smav#define RAID1E_REBUILD_CLUSTER_IDLE 100 66219974Smavstatic int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE; 67219974SmavTUNABLE_INT("kern.geom.raid.raid1e.rebuild_cluster_idle", 68219974Smav 
&g_raid1e_rebuild_cluster_idle); 69219974SmavSYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW, 70219974Smav &g_raid1e_rebuild_cluster_idle, 0, 71219974Smav "Number of slabs to do each time we trigger a rebuild cycle"); 72219974Smav 73219974Smav#define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */ 74219974Smavstatic int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE; 75219974SmavTUNABLE_INT("kern.geom.raid.raid1e.rebuild_meta_update", 76219974Smav &g_raid1e_rebuild_meta_update); 77219974SmavSYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RW, 78219974Smav &g_raid1e_rebuild_meta_update, 0, 79219974Smav "When to update the meta data."); 80219974Smav 81219974Smavstatic MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data"); 82219974Smav 83219974Smav#define TR_RAID1E_NONE 0 84219974Smav#define TR_RAID1E_REBUILD 1 85219974Smav#define TR_RAID1E_RESYNC 2 86219974Smav 87219974Smav#define TR_RAID1E_F_DOING_SOME 0x1 88219974Smav#define TR_RAID1E_F_LOCKED 0x2 89219974Smav#define TR_RAID1E_F_ABORT 0x4 90219974Smav 91219974Smavstruct g_raid_tr_raid1e_object { 92219974Smav struct g_raid_tr_object trso_base; 93219974Smav int trso_starting; 94219974Smav int trso_stopping; 95219974Smav int trso_type; 96219974Smav int trso_recover_slabs; /* slabs before rest */ 97219974Smav int trso_fair_io; 98219974Smav int trso_meta_update; 99219974Smav int trso_flags; 100219974Smav struct g_raid_subdisk *trso_failed_sd; /* like per volume */ 101219974Smav void *trso_buffer; /* Buffer space */ 102219974Smav off_t trso_lock_pos; /* Locked range start. */ 103219974Smav off_t trso_lock_len; /* Locked range length. 
*/ 104219974Smav struct bio trso_bio; 105219974Smav}; 106219974Smav 107219974Smavstatic g_raid_tr_taste_t g_raid_tr_taste_raid1e; 108219974Smavstatic g_raid_tr_event_t g_raid_tr_event_raid1e; 109219974Smavstatic g_raid_tr_start_t g_raid_tr_start_raid1e; 110219974Smavstatic g_raid_tr_stop_t g_raid_tr_stop_raid1e; 111219974Smavstatic g_raid_tr_iostart_t g_raid_tr_iostart_raid1e; 112219974Smavstatic g_raid_tr_iodone_t g_raid_tr_iodone_raid1e; 113219974Smavstatic g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e; 114219974Smavstatic g_raid_tr_locked_t g_raid_tr_locked_raid1e; 115219974Smavstatic g_raid_tr_idle_t g_raid_tr_idle_raid1e; 116219974Smavstatic g_raid_tr_free_t g_raid_tr_free_raid1e; 117219974Smav 118219974Smavstatic kobj_method_t g_raid_tr_raid1e_methods[] = { 119219974Smav KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1e), 120219974Smav KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1e), 121219974Smav KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1e), 122219974Smav KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1e), 123219974Smav KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1e), 124219974Smav KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1e), 125219974Smav KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e), 126219974Smav KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1e), 127219974Smav KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1e), 128219974Smav KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1e), 129219974Smav { 0, 0 } 130219974Smav}; 131219974Smav 132219974Smavstatic struct g_raid_tr_class g_raid_tr_raid1e_class = { 133219974Smav "RAID1E", 134219974Smav g_raid_tr_raid1e_methods, 135219974Smav sizeof(struct g_raid_tr_raid1e_object), 136240465Smav .trc_enable = 1, 137219974Smav .trc_priority = 200 138219974Smav}; 139219974Smav 140219974Smavstatic void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr); 141219974Smavstatic void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr, 142219974Smav struct 
g_raid_subdisk *sd); 143219974Smavstatic int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, 144219974Smav int no, off_t off, off_t len, u_int mask); 145219974Smav 146219974Smavstatic inline void 147219974SmavV2P(struct g_raid_volume *vol, off_t virt, 148219974Smav int *disk, off_t *offset, off_t *start) 149219974Smav{ 150219974Smav off_t nstrip; 151219974Smav u_int strip_size; 152219974Smav 153219974Smav strip_size = vol->v_strip_size; 154219974Smav /* Strip number. */ 155219974Smav nstrip = virt / strip_size; 156219974Smav /* Start position in strip. */ 157219974Smav *start = virt % strip_size; 158219974Smav /* Disk number. */ 159219974Smav *disk = (nstrip * N) % vol->v_disks_count; 160219974Smav /* Strip start position in disk. */ 161219974Smav *offset = ((nstrip * N) / vol->v_disks_count) * strip_size; 162219974Smav} 163219974Smav 164219974Smavstatic inline void 165219974SmavP2V(struct g_raid_volume *vol, int disk, off_t offset, 166219974Smav off_t *virt, int *copy) 167219974Smav{ 168219974Smav off_t nstrip, start; 169219974Smav u_int strip_size; 170219974Smav 171219974Smav strip_size = vol->v_strip_size; 172219974Smav /* Start position in strip. */ 173219974Smav start = offset % strip_size; 174219974Smav /* Physical strip number. */ 175219974Smav nstrip = (offset / strip_size) * vol->v_disks_count + disk; 176219974Smav /* Number of physical strip (copy) inside virtual strip. */ 177219974Smav *copy = nstrip % N; 178219974Smav /* Offset in virtual space. 
*/ 179219974Smav *virt = (nstrip / N) * strip_size + start; 180219974Smav} 181219974Smav 182219974Smavstatic int 183219974Smavg_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol) 184219974Smav{ 185219974Smav struct g_raid_tr_raid1e_object *trs; 186219974Smav 187219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 188219974Smav if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E || 189234603Smav tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA) 190219974Smav return (G_RAID_TR_TASTE_FAIL); 191219974Smav trs->trso_starting = 1; 192219974Smav return (G_RAID_TR_TASTE_SUCCEED); 193219974Smav} 194219974Smav 195219974Smavstatic int 196219974Smavg_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol) 197219974Smav{ 198219974Smav struct g_raid_softc *sc; 199219974Smav struct g_raid_subdisk *sd, *bestsd, *worstsd; 200219974Smav int i, j, state, sstate; 201219974Smav 202219974Smav sc = vol->v_softc; 203219974Smav state = G_RAID_VOLUME_S_OPTIMAL; 204219974Smav for (i = 0; i < vol->v_disks_count / N; i++) { 205219974Smav bestsd = &vol->v_subdisks[i * N]; 206219974Smav for (j = 1; j < N; j++) { 207219974Smav sd = &vol->v_subdisks[i * N + j]; 208219974Smav if (sd->sd_state > bestsd->sd_state) 209219974Smav bestsd = sd; 210219974Smav else if (sd->sd_state == bestsd->sd_state && 211219974Smav (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || 212219974Smav sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && 213219974Smav sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) 214219974Smav bestsd = sd; 215219974Smav } 216219974Smav if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED && 217219974Smav bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) { 218219974Smav /* We found reasonable candidate. 
*/ 219219974Smav G_RAID_DEBUG1(1, sc, 220219974Smav "Promote subdisk %s:%d from %s to ACTIVE.", 221219974Smav vol->v_name, bestsd->sd_pos, 222219974Smav g_raid_subdisk_state2str(bestsd->sd_state)); 223219974Smav g_raid_change_subdisk_state(bestsd, 224219974Smav G_RAID_SUBDISK_S_ACTIVE); 225219974Smav g_raid_write_metadata(sc, 226219974Smav vol, bestsd, bestsd->sd_disk); 227219974Smav } 228219974Smav worstsd = &vol->v_subdisks[i * N]; 229219974Smav for (j = 1; j < N; j++) { 230219974Smav sd = &vol->v_subdisks[i * N + j]; 231219974Smav if (sd->sd_state < worstsd->sd_state) 232219974Smav worstsd = sd; 233219974Smav } 234219974Smav if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) 235219974Smav sstate = G_RAID_VOLUME_S_OPTIMAL; 236219974Smav else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) 237219974Smav sstate = G_RAID_VOLUME_S_SUBOPTIMAL; 238219974Smav else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) 239219974Smav sstate = G_RAID_VOLUME_S_DEGRADED; 240219974Smav else 241219974Smav sstate = G_RAID_VOLUME_S_BROKEN; 242219974Smav if (sstate < state) 243219974Smav state = sstate; 244219974Smav } 245219974Smav return (state); 246219974Smav} 247219974Smav 248219974Smavstatic int 249219974Smavg_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol) 250219974Smav{ 251219974Smav struct g_raid_softc *sc; 252219974Smav struct g_raid_subdisk *sd, *bestsd, *worstsd; 253219974Smav int i, j, state, sstate; 254219974Smav 255219974Smav sc = vol->v_softc; 256219974Smav if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) == 257219974Smav vol->v_disks_count) 258219974Smav return (G_RAID_VOLUME_S_OPTIMAL); 259219974Smav for (i = 0; i < vol->v_disks_count; i++) { 260219974Smav sd = &vol->v_subdisks[i]; 261219974Smav if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) { 262219974Smav /* We found reasonable candidate. 
*/ 263219974Smav G_RAID_DEBUG1(1, sc, 264219974Smav "Promote subdisk %s:%d from %s to STALE.", 265219974Smav vol->v_name, sd->sd_pos, 266219974Smav g_raid_subdisk_state2str(sd->sd_state)); 267219974Smav g_raid_change_subdisk_state(sd, 268219974Smav G_RAID_SUBDISK_S_STALE); 269219974Smav g_raid_write_metadata(sc, vol, sd, sd->sd_disk); 270219974Smav } 271219974Smav } 272219974Smav state = G_RAID_VOLUME_S_OPTIMAL; 273219974Smav for (i = 0; i < vol->v_disks_count; i++) { 274219974Smav bestsd = &vol->v_subdisks[i]; 275219974Smav worstsd = &vol->v_subdisks[i]; 276219974Smav for (j = 1; j < N; j++) { 277219974Smav sd = &vol->v_subdisks[(i + j) % vol->v_disks_count]; 278219974Smav if (sd->sd_state > bestsd->sd_state) 279219974Smav bestsd = sd; 280219974Smav else if (sd->sd_state == bestsd->sd_state && 281219974Smav (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || 282219974Smav sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && 283219974Smav sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) 284219974Smav bestsd = sd; 285219974Smav if (sd->sd_state < worstsd->sd_state) 286219974Smav worstsd = sd; 287219974Smav } 288219974Smav if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) 289219974Smav sstate = G_RAID_VOLUME_S_OPTIMAL; 290219974Smav else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) 291219974Smav sstate = G_RAID_VOLUME_S_SUBOPTIMAL; 292219974Smav else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE) 293219974Smav sstate = G_RAID_VOLUME_S_DEGRADED; 294219974Smav else 295219974Smav sstate = G_RAID_VOLUME_S_BROKEN; 296219974Smav if (sstate < state) 297219974Smav state = sstate; 298219974Smav } 299219974Smav return (state); 300219974Smav} 301219974Smav 302219974Smavstatic int 303219974Smavg_raid_tr_update_state_raid1e(struct g_raid_volume *vol, 304219974Smav struct g_raid_subdisk *sd) 305219974Smav{ 306219974Smav struct g_raid_tr_raid1e_object *trs; 307219974Smav struct g_raid_softc *sc; 308219974Smav u_int s; 309219974Smav 310219974Smav sc = vol->v_softc; 311219974Smav trs = (struct 
g_raid_tr_raid1e_object *)vol->v_tr; 312219974Smav if (trs->trso_stopping && 313219974Smav (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0) 314219974Smav s = G_RAID_VOLUME_S_STOPPED; 315219974Smav else if (trs->trso_starting) 316219974Smav s = G_RAID_VOLUME_S_STARTING; 317219974Smav else { 318219974Smav if ((vol->v_disks_count % N) == 0) 319219974Smav s = g_raid_tr_update_state_raid1e_even(vol); 320219974Smav else 321219974Smav s = g_raid_tr_update_state_raid1e_odd(vol); 322219974Smav } 323219974Smav if (s != vol->v_state) { 324219974Smav g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 325219974Smav G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, 326219974Smav G_RAID_EVENT_VOLUME); 327219974Smav g_raid_change_volume_state(vol, s); 328219974Smav if (!trs->trso_starting && !trs->trso_stopping) 329219974Smav g_raid_write_metadata(sc, vol, NULL, NULL); 330219974Smav } 331219974Smav if (!trs->trso_starting && !trs->trso_stopping) 332219974Smav g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd); 333219974Smav return (0); 334219974Smav} 335219974Smav 336219974Smavstatic void 337219974Smavg_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, 338219974Smav struct g_raid_disk *disk) 339219974Smav{ 340235270Smav struct g_raid_volume *vol; 341235270Smav 342235270Smav vol = sd->sd_volume; 343219974Smav /* 344219974Smav * We don't fail the last disk in the pack, since it still has decent 345219974Smav * data on it and that's better than failing the disk if it is the root 346219974Smav * file system. 347219974Smav * 348219974Smav * XXX should this be controlled via a tunable? It makes sense for 349219974Smav * the volume that has / on it. I can't think of a case where we'd 350219974Smav * want the volume to go away on this kind of event. 
351219974Smav */ 352235270Smav if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) + 353235270Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) + 354235270Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + 355235270Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) < 356235270Smav vol->v_disks_count) && 357235270Smav (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED)) 358219974Smav return; 359219974Smav g_raid_fail_disk(sc, sd, disk); 360219974Smav} 361219974Smav 362219974Smavstatic void 363219974Smavg_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs) 364219974Smav{ 365219974Smav struct g_raid_volume *vol; 366219974Smav struct g_raid_subdisk *sd; 367219974Smav 368219974Smav vol = trs->trso_base.tro_volume; 369219974Smav sd = trs->trso_failed_sd; 370219974Smav g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); 371219974Smav free(trs->trso_buffer, M_TR_RAID1E); 372219974Smav trs->trso_buffer = NULL; 373219974Smav trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; 374219974Smav trs->trso_type = TR_RAID1E_NONE; 375219974Smav trs->trso_recover_slabs = 0; 376219974Smav trs->trso_failed_sd = NULL; 377219974Smav g_raid_tr_update_state_raid1e(vol, NULL); 378219974Smav} 379219974Smav 380219974Smavstatic void 381219974Smavg_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr) 382219974Smav{ 383219974Smav struct g_raid_tr_raid1e_object *trs; 384219974Smav struct g_raid_subdisk *sd; 385219974Smav 386219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 387219974Smav sd = trs->trso_failed_sd; 388219974Smav G_RAID_DEBUG1(0, tr->tro_volume->v_softc, 389219974Smav "Subdisk %s:%d-%s rebuild completed.", 390219974Smav sd->sd_volume->v_name, sd->sd_pos, 391219974Smav sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]"); 392219974Smav g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); 393219974Smav sd->sd_rebuild_pos = 0; 394219974Smav g_raid_tr_raid1e_rebuild_done(trs); 395219974Smav} 396219974Smav 397219974Smavstatic void 398219974Smavg_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr) 399219974Smav{ 400219974Smav struct g_raid_tr_raid1e_object *trs; 401219974Smav struct g_raid_subdisk *sd; 402219974Smav struct g_raid_volume *vol; 403219974Smav 404219974Smav vol = tr->tro_volume; 405219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 406219974Smav sd = trs->trso_failed_sd; 407219974Smav if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) { 408219974Smav G_RAID_DEBUG1(1, vol->v_softc, 409219974Smav "Subdisk %s:%d-%s rebuild is aborting.", 410219974Smav sd->sd_volume->v_name, sd->sd_pos, 411219974Smav sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 412219974Smav trs->trso_flags |= TR_RAID1E_F_ABORT; 413219974Smav } else { 414219974Smav G_RAID_DEBUG1(0, vol->v_softc, 415219974Smav "Subdisk %s:%d-%s rebuild aborted.", 416219974Smav sd->sd_volume->v_name, sd->sd_pos, 417219974Smav sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]"); 418219974Smav trs->trso_flags &= ~TR_RAID1E_F_ABORT; 419219974Smav if (trs->trso_flags & TR_RAID1E_F_LOCKED) { 420219974Smav trs->trso_flags &= ~TR_RAID1E_F_LOCKED; 421219974Smav g_raid_unlock_range(tr->tro_volume, 422219974Smav trs->trso_lock_pos, trs->trso_lock_len); 423219974Smav } 424219974Smav g_raid_tr_raid1e_rebuild_done(trs); 425219974Smav } 426219974Smav} 427219974Smav 428219974Smavstatic void 429219974Smavg_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr) 430219974Smav{ 431219974Smav struct g_raid_tr_raid1e_object *trs; 432219974Smav struct g_raid_softc *sc; 433219974Smav struct g_raid_volume *vol; 434219974Smav struct g_raid_subdisk *sd; 435219974Smav struct bio *bp; 436219974Smav off_t len, virtual, vend, offset, start; 437219974Smav int disk, copy, best; 438219974Smav 439219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 440219974Smav if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) 441219974Smav return; 442219974Smav vol = tr->tro_volume; 443219974Smav sc = vol->v_softc; 444219974Smav sd = trs->trso_failed_sd; 445219974Smav 446219974Smav while (1) { 447219974Smav if (sd->sd_rebuild_pos >= sd->sd_size) { 448219974Smav g_raid_tr_raid1e_rebuild_finish(tr); 449219974Smav return; 450219974Smav } 451219974Smav /* Get virtual offset from physical rebuild position. */ 452219974Smav P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, ©); 453219974Smav /* Get physical offset back to get first stripe position. */ 454219974Smav V2P(vol, virtual, &disk, &offset, &start); 455219974Smav /* Calculate contignous data length. */ 456219974Smav len = MIN(g_raid1e_rebuild_slab, 457219974Smav sd->sd_size - sd->sd_rebuild_pos); 458219974Smav if ((vol->v_disks_count % N) != 0) 459219974Smav len = MIN(len, vol->v_strip_size - start); 460219974Smav /* Find disk with most accurate data. 
*/ 461219974Smav best = g_raid_tr_raid1e_select_read_disk(vol, disk, 462219974Smav offset + start, len, 0); 463219974Smav if (best < 0) { 464219974Smav /* There is no any valid disk. */ 465219974Smav g_raid_tr_raid1e_rebuild_abort(tr); 466219974Smav return; 467219974Smav } else if (best != copy) { 468219974Smav /* Some other disk has better data. */ 469219974Smav break; 470219974Smav } 471219974Smav /* We have the most accurate data. Skip the range. */ 472219974Smav G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju", 473219974Smav sd->sd_rebuild_pos, sd->sd_rebuild_pos + len); 474219974Smav sd->sd_rebuild_pos += len; 475219974Smav } 476219974Smav 477219974Smav bp = &trs->trso_bio; 478219974Smav memset(bp, 0, sizeof(*bp)); 479219974Smav bp->bio_offset = offset + start + 480219974Smav ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0); 481219974Smav bp->bio_length = len; 482219974Smav bp->bio_data = trs->trso_buffer; 483219974Smav bp->bio_cmd = BIO_READ; 484219974Smav bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; 485219974Smav bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count]; 486219974Smav G_RAID_LOGREQ(3, bp, "Queueing rebuild read"); 487219974Smav /* 488219974Smav * If we are crossing stripe boundary, correct affected virtual 489219974Smav * range we should lock. 
490219974Smav */ 491219974Smav if (start + len > vol->v_strip_size) { 492219974Smav P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, ©); 493219974Smav len = vend - virtual; 494219974Smav } 495219974Smav trs->trso_flags |= TR_RAID1E_F_DOING_SOME; 496219974Smav trs->trso_flags |= TR_RAID1E_F_LOCKED; 497219974Smav trs->trso_lock_pos = virtual; 498219974Smav trs->trso_lock_len = len; 499219974Smav /* Lock callback starts I/O */ 500219974Smav g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp); 501219974Smav} 502219974Smav 503219974Smavstatic void 504219974Smavg_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr) 505219974Smav{ 506219974Smav struct g_raid_volume *vol; 507219974Smav struct g_raid_tr_raid1e_object *trs; 508219974Smav struct g_raid_subdisk *sd; 509219974Smav 510219974Smav vol = tr->tro_volume; 511219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 512219974Smav if (trs->trso_failed_sd) { 513219974Smav G_RAID_DEBUG1(1, vol->v_softc, 514219974Smav "Already rebuild in start rebuild. 
pos %jd\n", 515219974Smav (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); 516219974Smav return; 517219974Smav } 518219974Smav sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); 519219974Smav if (sd == NULL) 520219974Smav sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); 521219974Smav if (sd == NULL) { 522219974Smav sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); 523219974Smav if (sd != NULL) { 524219974Smav sd->sd_rebuild_pos = 0; 525219974Smav g_raid_change_subdisk_state(sd, 526219974Smav G_RAID_SUBDISK_S_RESYNC); 527219974Smav g_raid_write_metadata(vol->v_softc, vol, sd, NULL); 528219974Smav } else { 529219974Smav sd = g_raid_get_subdisk(vol, 530219974Smav G_RAID_SUBDISK_S_UNINITIALIZED); 531219974Smav if (sd == NULL) 532219974Smav sd = g_raid_get_subdisk(vol, 533219974Smav G_RAID_SUBDISK_S_NEW); 534219974Smav if (sd != NULL) { 535219974Smav sd->sd_rebuild_pos = 0; 536219974Smav g_raid_change_subdisk_state(sd, 537219974Smav G_RAID_SUBDISK_S_REBUILD); 538219974Smav g_raid_write_metadata(vol->v_softc, 539219974Smav vol, sd, NULL); 540219974Smav } 541219974Smav } 542219974Smav } 543219974Smav if (sd == NULL) { 544219974Smav G_RAID_DEBUG1(1, vol->v_softc, 545219974Smav "No failed disk to rebuild. night night."); 546219974Smav return; 547219974Smav } 548219974Smav trs->trso_failed_sd = sd; 549219974Smav G_RAID_DEBUG1(0, vol->v_softc, 550219974Smav "Subdisk %s:%d-%s rebuild start at %jd.", 551219974Smav sd->sd_volume->v_name, sd->sd_pos, 552219974Smav sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]", 553219974Smav trs->trso_failed_sd->sd_rebuild_pos); 554219974Smav trs->trso_type = TR_RAID1E_REBUILD; 555219974Smav trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK); 556219974Smav trs->trso_meta_update = g_raid1e_rebuild_meta_update; 557219974Smav g_raid_tr_raid1e_rebuild_some(tr); 558219974Smav} 559219974Smav 560219974Smavstatic void 561219974Smavg_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr, 562219974Smav struct g_raid_subdisk *sd) 563219974Smav{ 564219974Smav struct g_raid_volume *vol; 565219974Smav struct g_raid_tr_raid1e_object *trs; 566219974Smav int nr; 567219974Smav 568219974Smav vol = tr->tro_volume; 569219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 570219974Smav if (trs->trso_stopping) 571219974Smav return; 572219974Smav nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + 573219974Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); 574219974Smav switch(trs->trso_type) { 575219974Smav case TR_RAID1E_NONE: 576219974Smav if (vol->v_state < G_RAID_VOLUME_S_DEGRADED) 577219974Smav return; 578219974Smav if (nr == 0) { 579219974Smav nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + 580219974Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + 581219974Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); 582219974Smav if (nr == 0) 583219974Smav return; 584219974Smav } 585219974Smav g_raid_tr_raid1e_rebuild_start(tr); 586219974Smav break; 587219974Smav case TR_RAID1E_REBUILD: 588219974Smav if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 || 589219974Smav trs->trso_failed_sd == sd) 590219974Smav g_raid_tr_raid1e_rebuild_abort(tr); 591219974Smav break; 592219974Smav case TR_RAID1E_RESYNC: 593219974Smav break; 594219974Smav } 595219974Smav} 596219974Smav 597219974Smavstatic int 598219974Smavg_raid_tr_event_raid1e(struct g_raid_tr_object *tr, 599219974Smav struct g_raid_subdisk *sd, u_int event) 600219974Smav{ 601219974Smav 602219974Smav 
g_raid_tr_update_state_raid1e(tr->tro_volume, sd); 603219974Smav return (0); 604219974Smav} 605219974Smav 606219974Smavstatic int 607219974Smavg_raid_tr_start_raid1e(struct g_raid_tr_object *tr) 608219974Smav{ 609219974Smav struct g_raid_tr_raid1e_object *trs; 610219974Smav struct g_raid_volume *vol; 611219974Smav 612219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 613219974Smav vol = tr->tro_volume; 614219974Smav trs->trso_starting = 0; 615219974Smav g_raid_tr_update_state_raid1e(vol, NULL); 616219974Smav return (0); 617219974Smav} 618219974Smav 619219974Smavstatic int 620219974Smavg_raid_tr_stop_raid1e(struct g_raid_tr_object *tr) 621219974Smav{ 622219974Smav struct g_raid_tr_raid1e_object *trs; 623219974Smav struct g_raid_volume *vol; 624219974Smav 625219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 626219974Smav vol = tr->tro_volume; 627219974Smav trs->trso_starting = 0; 628219974Smav trs->trso_stopping = 1; 629219974Smav g_raid_tr_update_state_raid1e(vol, NULL); 630219974Smav return (0); 631219974Smav} 632219974Smav 633219974Smav/* 634219974Smav * Select the disk to read from. Take into account: subdisk state, running 635219974Smav * error recovery, average disk load, head position and possible cache hits. 636219974Smav */ 637219974Smav#define ABS(x) (((x) >= 0) ? 
(x) : (-(x))) 638219974Smavstatic int 639219974Smavg_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, 640219974Smav int no, off_t off, off_t len, u_int mask) 641219974Smav{ 642219974Smav struct g_raid_subdisk *sd; 643219974Smav off_t offset; 644219974Smav int i, best, prio, bestprio; 645219974Smav 646219974Smav best = -1; 647219974Smav bestprio = INT_MAX; 648219974Smav for (i = 0; i < N; i++) { 649219974Smav sd = &vol->v_subdisks[(no + i) % vol->v_disks_count]; 650219974Smav offset = off; 651219974Smav if (no + i >= vol->v_disks_count) 652219974Smav offset += vol->v_strip_size; 653219974Smav 654219974Smav prio = G_RAID_SUBDISK_LOAD(sd); 655219974Smav if ((mask & (1 << sd->sd_pos)) != 0) 656219974Smav continue; 657219974Smav switch (sd->sd_state) { 658219974Smav case G_RAID_SUBDISK_S_ACTIVE: 659219974Smav break; 660219974Smav case G_RAID_SUBDISK_S_RESYNC: 661219974Smav if (offset + off < sd->sd_rebuild_pos) 662219974Smav break; 663219974Smav /* FALLTHROUGH */ 664219974Smav case G_RAID_SUBDISK_S_STALE: 665219974Smav prio += i << 24; 666219974Smav break; 667219974Smav case G_RAID_SUBDISK_S_REBUILD: 668219974Smav if (offset + off < sd->sd_rebuild_pos) 669219974Smav break; 670219974Smav /* FALLTHROUGH */ 671219974Smav default: 672219974Smav continue; 673219974Smav } 674219974Smav prio += min(sd->sd_recovery, 255) << 16; 675219974Smav /* If disk head is precisely in position - highly prefer it. */ 676219974Smav if (G_RAID_SUBDISK_POS(sd) == offset) 677219974Smav prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; 678219974Smav else 679219974Smav /* If disk head is close to position - prefer it. 
*/ 680219974Smav if (ABS(G_RAID_SUBDISK_POS(sd) - offset) < 681219974Smav G_RAID_SUBDISK_TRACK_SIZE) 682219974Smav prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE; 683219974Smav if (prio < bestprio) { 684219974Smav bestprio = prio; 685219974Smav best = i; 686219974Smav } 687219974Smav } 688219974Smav return (best); 689219974Smav} 690219974Smav 691219974Smavstatic void 692219974Smavg_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp) 693219974Smav{ 694219974Smav struct g_raid_volume *vol; 695219974Smav struct g_raid_subdisk *sd; 696219974Smav struct bio_queue_head queue; 697219974Smav struct bio *cbp; 698219974Smav char *addr; 699219974Smav off_t offset, start, length, remain; 700219974Smav u_int no, strip_size; 701219974Smav int best; 702219974Smav 703219974Smav vol = tr->tro_volume; 704219974Smav addr = bp->bio_data; 705219974Smav strip_size = vol->v_strip_size; 706219974Smav V2P(vol, bp->bio_offset, &no, &offset, &start); 707219974Smav remain = bp->bio_length; 708219974Smav bioq_init(&queue); 709219974Smav while (remain > 0) { 710219974Smav length = MIN(strip_size - start, remain); 711219974Smav best = g_raid_tr_raid1e_select_read_disk(vol, 712219974Smav no, offset, length, 0); 713219974Smav KASSERT(best >= 0, ("No readable disk in volume %s!", 714219974Smav vol->v_name)); 715219974Smav no += best; 716219974Smav if (no >= vol->v_disks_count) { 717219974Smav no -= vol->v_disks_count; 718219974Smav offset += strip_size; 719219974Smav } 720219974Smav cbp = g_clone_bio(bp); 721219974Smav if (cbp == NULL) 722219974Smav goto failure; 723219974Smav cbp->bio_offset = offset + start; 724219974Smav cbp->bio_data = addr; 725219974Smav cbp->bio_length = length; 726219974Smav cbp->bio_caller1 = &vol->v_subdisks[no]; 727219974Smav bioq_insert_tail(&queue, cbp); 728219974Smav no += N - best; 729219974Smav if (no >= vol->v_disks_count) { 730219974Smav no -= vol->v_disks_count; 731219974Smav offset += strip_size; 732219974Smav } 733219974Smav remain -= length; 
		addr += length;
		start = 0;
	}
	/* All clones built: hand every collected request to its subdisk. */
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	/* Clone allocation failed: destroy everything queued so far. */
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		g_destroy_bio(cbp);
	}
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

/*
 * Start a volume write: walk the request strip by strip and clone the
 * bio to every subdisk that holds a usable copy of that strip (up to N
 * copies).  Copies on rebuilding disks are written only below the
 * current rebuild position.  All clones are collected first and only
 * dispatched once every allocation has succeeded.
 */
static void
g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int i;

	vol = tr->tro_volume;
	addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	/* Translate volume offset into disk number/offset/in-strip start. */
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		/* Clone the write for each of the N copies of this strip. */
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				/* Skip the not-yet-rebuilt tail of the disk. */
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				goto failure;
			cbp->bio_offset = offset + start;
			cbp->bio_data = addr;
			cbp->bio_length = length;
			/* Remember the target subdisk for the dispatch loop. */
			cbp->bio_caller1 = sd;
			bioq_insert_tail(&queue, cbp);
nextdisk:
			/* Advance to the next disk, wrapping to next stripe. */
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	/* All clones built: hand every collected request to its subdisk. */
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	/* Clone allocation failed: destroy everything queued so far. */
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		g_destroy_bio(cbp);
	}
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

/*
 * Main I/O entry point of the RAID1E transformation: reject I/O unless
 * the volume is in a usable state, give the rebuild its share of the
 * bandwidth, then dispatch by bio command.
 */
static void
g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
		g_raid_iodone(bp, EIO);
		return;
	}
	/*
	 * If we're rebuilding, squeeze in rebuild activity every so often,
	 * even when the disk is busy.
Be sure to only count real I/O 844219974Smav * to the disk. All 'SPECIAL' I/O is traffic generated to the disk 845219974Smav * by this module. 846219974Smav */ 847219974Smav if (trs->trso_failed_sd != NULL && 848219974Smav !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { 849219974Smav /* Make this new or running now round short. */ 850219974Smav trs->trso_recover_slabs = 0; 851219974Smav if (--trs->trso_fair_io <= 0) { 852219974Smav trs->trso_fair_io = g_raid1e_rebuild_fair_io; 853219974Smav g_raid_tr_raid1e_rebuild_some(tr); 854219974Smav } 855219974Smav } 856219974Smav switch (bp->bio_cmd) { 857219974Smav case BIO_READ: 858219974Smav g_raid_tr_iostart_raid1e_read(tr, bp); 859219974Smav break; 860219974Smav case BIO_WRITE: 861219974Smav g_raid_tr_iostart_raid1e_write(tr, bp); 862219974Smav break; 863219974Smav case BIO_DELETE: 864219974Smav g_raid_iodone(bp, EIO); 865219974Smav break; 866219974Smav case BIO_FLUSH: 867219974Smav g_raid_tr_flush_common(tr, bp); 868219974Smav break; 869219974Smav default: 870219974Smav KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", 871219974Smav bp->bio_cmd, vol->v_name)); 872219974Smav break; 873219974Smav } 874219974Smav} 875219974Smav 876219974Smavstatic void 877219974Smavg_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr, 878219974Smav struct g_raid_subdisk *sd, struct bio *bp) 879219974Smav{ 880219974Smav struct bio *cbp; 881219974Smav struct g_raid_subdisk *nsd; 882219974Smav struct g_raid_volume *vol; 883219974Smav struct bio *pbp; 884219974Smav struct g_raid_tr_raid1e_object *trs; 885219974Smav off_t virtual, offset, start; 886219974Smav uintptr_t mask; 887219974Smav int error, do_write, copy, disk, best; 888219974Smav 889219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 890219974Smav vol = tr->tro_volume; 891219974Smav if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { 892219974Smav if (trs->trso_type == TR_RAID1E_REBUILD) { 893219974Smav nsd = trs->trso_failed_sd; 894219974Smav if (bp->bio_cmd == BIO_READ) { 

				/* Immediately abort rebuild, if requested. */
				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				/* On read error, skip and cross fingers. */
				if (bp->bio_error != 0) {
					G_RAID_LOGREQ(0, bp,
					    "Read error during rebuild (%d), "
					    "possible data loss!",
					    bp->bio_error);
					goto rebuild_round_done;
				}

				/*
				 * The read operation finished, queue the
				 * write and get out.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
				    bp->bio_error);
				bp->bio_cmd = BIO_WRITE;
				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
				bp->bio_offset = nsd->sd_rebuild_pos;
				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
				g_raid_subdisk_iostart(nsd, bp);
			} else {
				/*
				 * The write operation just finished.  Do
				 * another.  We keep cloning the master bio
				 * since it has the right buffers allocated to
				 * it.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
				    bp->bio_error);
				if (bp->bio_error != 0 ||
				    trs->trso_flags & TR_RAID1E_F_ABORT) {
					/* Write errors fail the target disk. */
					if ((trs->trso_flags &
					    TR_RAID1E_F_ABORT) == 0) {
						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
						    nsd, nsd->sd_disk);
					}
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}
rebuild_round_done:
				/* Release the range locked for this round. */
				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
				g_raid_unlock_range(tr->tro_volume,
				    trs->trso_lock_pos, trs->trso_lock_len);
				nsd->sd_rebuild_pos += bp->bio_length;
				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
					g_raid_tr_raid1e_rebuild_finish(tr);
					return;
				}

				/* Abort rebuild if we are stopping */
				if (trs->trso_stopping) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				/* Periodically store rebuild position in metadata. */
				if (--trs->trso_meta_update <= 0) {
					g_raid_write_metadata(vol->v_softc,
					    vol, nsd, nsd->sd_disk);
					trs->trso_meta_update =
					    g_raid1e_rebuild_meta_update;
					/* Compensate short rebuild I/Os. */
					if ((vol->v_disks_count % N) != 0 &&
					    vol->v_strip_size <
					     g_raid1e_rebuild_slab) {
						trs->trso_meta_update *=
						    g_raid1e_rebuild_slab;
						trs->trso_meta_update /=
						    vol->v_strip_size;
					}
				}
				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
				if (--trs->trso_recover_slabs <= 0)
					return;
				/* Run next rebuild iteration. */
				g_raid_tr_raid1e_rebuild_some(tr);
			}
		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
			/*
			 * read good sd, read bad sd in parallel.  when both
			 * done, compare the buffers.  write good to the bad
			 * if different.  do the next bit of work.
			 */
			panic("Somehow, we think we're doing a resync");
		}
		return;
	}

	/* From here on: completion of a regular cloned request. */
	pbp = bp->bio_parent;
	pbp->bio_inbed++;
	mask = (intptr_t)bp->bio_caller2;
	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
		/*
		 * Read failed on first drive.  Retry the read error on
		 * another disk drive, if available, before erroring out the
		 * read.
		 */
		sd->sd_disk->d_read_errs++;
		G_RAID_LOGREQ(0, bp,
		    "Read error (%d), %d read errors total",
		    bp->bio_error, sd->sd_disk->d_read_errs);

		/*
		 * If there are too many read errors, we move to degraded.
		 * XXX Do we want to FAIL the drive (eg, make the user redo
		 * everything to get it back in sync), or just degrade the
		 * drive, which kicks off a resync?
		 */
		do_write = 0;
		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		else if (mask == 0)
			do_write = 1;

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find the other disk, and try to do the I/O to it.
		 */
		/* Mark this copy as tried; exclude it from the next pick. */
		mask |= 1 << copy;
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_length = bp->bio_length;
			cbp->bio_data = bp->bio_data;
			g_destroy_bio(bp);
			nsd = &vol->v_subdisks[disk];
			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
			    nsd->sd_pos);
			/* Bit 31 of the mask marks a pending write-back. */
			if (do_write)
				mask |= 1 << 31;
			if ((mask & (1 << 31)) != 0)
				sd->sd_recovery++;
			cbp->bio_caller2 = (void *)mask;
			if (do_write) {
				cbp->bio_caller1 = nsd;
				/* Lock callback starts I/O */
				g_raid_lock_range(sd->sd_volume,
				    virtual, cbp->bio_length, pbp, cbp);
			} else {
				g_raid_subdisk_iostart(nsd, cbp);
			}
			return;
		}
		/*
		 * We can't retry.  Return the original error by falling
		 * through.  This will happen when there's only one good disk.
		 * We don't need to fail the raid, since its actual state is
		 * based on the state of the subdisks.
		 */
		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
	}
	if (bp->bio_cmd == BIO_READ &&
	    bp->bio_error == 0 &&
	    (mask & (1 << 31)) != 0) {
		/* Retried read succeeded: write the data back to a bad copy. */
		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find best disk to write. */
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, ~mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_length = bp->bio_length;
			cbp->bio_data = bp->bio_data;
			cbp->bio_cmd = BIO_WRITE;
			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
			cbp->bio_caller2 = (void *)mask;
			g_destroy_bio(bp);
			G_RAID_LOGREQ(2, cbp,
			    "Attempting bad sector remap on failing drive.");
			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
			return;
		}
	}
	if ((mask & (1 << 31)) != 0) {
		/*
		 * We're done with a recovery, mark the range as unlocked.
		 * For any write errors, we aggressively fail the disk since
		 * there was both a READ and a WRITE error at this location.
		 * Both types of errors generally indicate the drive is on
		 * the verge of total failure anyway.  Better to stop trusting
		 * it now.  However, we need to reset error to 0 in that case
		 * because we're not failing the original I/O which succeeded.
		 */

		/* Restore what we were doing.
*/ 1104219974Smav P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©); 1105219974Smav V2P(vol, virtual, &disk, &offset, &start); 1106219974Smav 1107219974Smav for (copy = 0; copy < N; copy++) { 1108219974Smav if ((mask & (1 << copy) ) != 0) 1109219974Smav vol->v_subdisks[(disk + copy) % 1110219974Smav vol->v_disks_count].sd_recovery--; 1111219974Smav } 1112219974Smav 1113219974Smav if (bp->bio_cmd == BIO_WRITE && bp->bio_error) { 1114219974Smav G_RAID_LOGREQ(0, bp, "Remap write failed: " 1115219974Smav "failing subdisk."); 1116219974Smav g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); 1117219974Smav bp->bio_error = 0; 1118219974Smav } 1119219974Smav G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error); 1120219974Smav g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length); 1121219974Smav } 1122235270Smav if (pbp->bio_cmd != BIO_READ) { 1123235270Smav if (pbp->bio_inbed == 1 || pbp->bio_error != 0) 1124235270Smav pbp->bio_error = bp->bio_error; 1125235270Smav if (bp->bio_error != 0) { 1126235270Smav G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk."); 1127235270Smav g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); 1128235270Smav } 1129235270Smav error = pbp->bio_error; 1130235270Smav } else 1131235270Smav error = bp->bio_error; 1132219974Smav g_destroy_bio(bp); 1133219974Smav if (pbp->bio_children == pbp->bio_inbed) { 1134219974Smav pbp->bio_completed = pbp->bio_length; 1135219974Smav g_raid_iodone(pbp, error); 1136219974Smav } 1137219974Smav} 1138219974Smav 1139219974Smavstatic int 1140219974Smavg_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr, 1141219974Smav void *virtual, vm_offset_t physical, off_t boffset, size_t blength) 1142219974Smav{ 1143219974Smav struct g_raid_volume *vol; 1144219974Smav struct g_raid_subdisk *sd; 1145219974Smav struct bio_queue_head queue; 1146219974Smav char *addr; 1147219974Smav off_t offset, start, length, remain; 1148219974Smav u_int no, strip_size; 1149219974Smav int i, error; 1150219974Smav 
1151219974Smav vol = tr->tro_volume; 1152219974Smav addr = virtual; 1153219974Smav strip_size = vol->v_strip_size; 1154219974Smav V2P(vol, boffset, &no, &offset, &start); 1155219974Smav remain = blength; 1156219974Smav bioq_init(&queue); 1157219974Smav while (remain > 0) { 1158219974Smav length = MIN(strip_size - start, remain); 1159219974Smav for (i = 0; i < N; i++) { 1160219974Smav sd = &vol->v_subdisks[no]; 1161219974Smav switch (sd->sd_state) { 1162219974Smav case G_RAID_SUBDISK_S_ACTIVE: 1163219974Smav case G_RAID_SUBDISK_S_STALE: 1164219974Smav case G_RAID_SUBDISK_S_RESYNC: 1165219974Smav break; 1166219974Smav case G_RAID_SUBDISK_S_REBUILD: 1167219974Smav if (offset + start >= sd->sd_rebuild_pos) 1168219974Smav goto nextdisk; 1169219974Smav break; 1170219974Smav default: 1171219974Smav goto nextdisk; 1172219974Smav } 1173219974Smav error = g_raid_subdisk_kerneldump(sd, 1174219974Smav addr, 0, offset + start, length); 1175219974Smav if (error != 0) 1176219974Smav return (error); 1177219974Smavnextdisk: 1178219974Smav if (++no >= vol->v_disks_count) { 1179219974Smav no = 0; 1180219974Smav offset += strip_size; 1181219974Smav } 1182219974Smav } 1183219974Smav remain -= length; 1184219974Smav addr += length; 1185219974Smav start = 0; 1186219974Smav } 1187219974Smav return (0); 1188219974Smav} 1189219974Smav 1190219974Smavstatic int 1191219974Smavg_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp) 1192219974Smav{ 1193219974Smav struct bio *bp; 1194219974Smav struct g_raid_subdisk *sd; 1195219974Smav 1196219974Smav bp = (struct bio *)argp; 1197219974Smav sd = (struct g_raid_subdisk *)bp->bio_caller1; 1198219974Smav g_raid_subdisk_iostart(sd, bp); 1199219974Smav 1200219974Smav return (0); 1201219974Smav} 1202219974Smav 1203219974Smavstatic int 1204219974Smavg_raid_tr_idle_raid1e(struct g_raid_tr_object *tr) 1205219974Smav{ 1206219974Smav struct g_raid_tr_raid1e_object *trs; 1207219974Smav struct g_raid_volume *vol; 1208219974Smav 1209219974Smav vol = 
tr->tro_volume; 1210219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 1211219974Smav trs->trso_fair_io = g_raid1e_rebuild_fair_io; 1212219974Smav trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle; 1213219974Smav /* Compensate short rebuild I/Os. */ 1214219974Smav if ((vol->v_disks_count % N) != 0 && 1215219974Smav vol->v_strip_size < g_raid1e_rebuild_slab) { 1216219974Smav trs->trso_recover_slabs *= g_raid1e_rebuild_slab; 1217219974Smav trs->trso_recover_slabs /= vol->v_strip_size; 1218219974Smav } 1219219974Smav if (trs->trso_type == TR_RAID1E_REBUILD) 1220219974Smav g_raid_tr_raid1e_rebuild_some(tr); 1221219974Smav return (0); 1222219974Smav} 1223219974Smav 1224219974Smavstatic int 1225219974Smavg_raid_tr_free_raid1e(struct g_raid_tr_object *tr) 1226219974Smav{ 1227219974Smav struct g_raid_tr_raid1e_object *trs; 1228219974Smav 1229219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 1230219974Smav 1231219974Smav if (trs->trso_buffer != NULL) { 1232219974Smav free(trs->trso_buffer, M_TR_RAID1E); 1233219974Smav trs->trso_buffer = NULL; 1234219974Smav } 1235219974Smav return (0); 1236219974Smav} 1237219974Smav 1238240465SmavG_RAID_TR_DECLARE(raid1e, "RAID1E"); 1239