/* tr_raid1e.c revision 256610 */
/*-
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
25219974Smav */ 26219974Smav 27219974Smav#include <sys/cdefs.h> 28219974Smav__FBSDID("$FreeBSD: head/sys/geom/raid/tr_raid1e.c 256610 2013-10-16 09:33:23Z mav $"); 29219974Smav 30219974Smav#include <sys/param.h> 31219974Smav#include <sys/bio.h> 32219974Smav#include <sys/endian.h> 33219974Smav#include <sys/kernel.h> 34219974Smav#include <sys/kobj.h> 35219974Smav#include <sys/limits.h> 36219974Smav#include <sys/lock.h> 37219974Smav#include <sys/malloc.h> 38219974Smav#include <sys/mutex.h> 39219974Smav#include <sys/sysctl.h> 40219974Smav#include <sys/systm.h> 41219974Smav#include <geom/geom.h> 42219974Smav#include "geom/raid/g_raid.h" 43219974Smav#include "g_raid_tr_if.h" 44219974Smav 45219974Smav#define N 2 46219974Smav 47240465SmavSYSCTL_DECL(_kern_geom_raid_raid1e); 48219974Smav 49219974Smav#define RAID1E_REBUILD_SLAB (1 << 20) /* One transation in a rebuild */ 50219974Smavstatic int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB; 51219974SmavTUNABLE_INT("kern.geom.raid.raid1e.rebuild_slab_size", 52219974Smav &g_raid1e_rebuild_slab); 53219974SmavSYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RW, 54219974Smav &g_raid1e_rebuild_slab, 0, 55219974Smav "Amount of the disk to rebuild each read/write cycle of the rebuild."); 56219974Smav 57219974Smav#define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */ 58219974Smavstatic int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO; 59219974SmavTUNABLE_INT("kern.geom.raid.raid1e.rebuild_fair_io", 60219974Smav &g_raid1e_rebuild_fair_io); 61219974SmavSYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RW, 62219974Smav &g_raid1e_rebuild_fair_io, 0, 63219974Smav "Fraction of the I/O bandwidth to use when disk busy for rebuild."); 64219974Smav 65219974Smav#define RAID1E_REBUILD_CLUSTER_IDLE 100 66219974Smavstatic int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE; 67219974SmavTUNABLE_INT("kern.geom.raid.raid1e.rebuild_cluster_idle", 68219974Smav 
&g_raid1e_rebuild_cluster_idle); 69219974SmavSYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW, 70219974Smav &g_raid1e_rebuild_cluster_idle, 0, 71219974Smav "Number of slabs to do each time we trigger a rebuild cycle"); 72219974Smav 73219974Smav#define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */ 74219974Smavstatic int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE; 75219974SmavTUNABLE_INT("kern.geom.raid.raid1e.rebuild_meta_update", 76219974Smav &g_raid1e_rebuild_meta_update); 77219974SmavSYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RW, 78219974Smav &g_raid1e_rebuild_meta_update, 0, 79219974Smav "When to update the meta data."); 80219974Smav 81219974Smavstatic MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data"); 82219974Smav 83219974Smav#define TR_RAID1E_NONE 0 84219974Smav#define TR_RAID1E_REBUILD 1 85219974Smav#define TR_RAID1E_RESYNC 2 86219974Smav 87219974Smav#define TR_RAID1E_F_DOING_SOME 0x1 88219974Smav#define TR_RAID1E_F_LOCKED 0x2 89219974Smav#define TR_RAID1E_F_ABORT 0x4 90219974Smav 91219974Smavstruct g_raid_tr_raid1e_object { 92219974Smav struct g_raid_tr_object trso_base; 93219974Smav int trso_starting; 94219974Smav int trso_stopping; 95219974Smav int trso_type; 96219974Smav int trso_recover_slabs; /* slabs before rest */ 97219974Smav int trso_fair_io; 98219974Smav int trso_meta_update; 99219974Smav int trso_flags; 100219974Smav struct g_raid_subdisk *trso_failed_sd; /* like per volume */ 101219974Smav void *trso_buffer; /* Buffer space */ 102219974Smav off_t trso_lock_pos; /* Locked range start. */ 103219974Smav off_t trso_lock_len; /* Locked range length. 
*/ 104219974Smav struct bio trso_bio; 105219974Smav}; 106219974Smav 107219974Smavstatic g_raid_tr_taste_t g_raid_tr_taste_raid1e; 108219974Smavstatic g_raid_tr_event_t g_raid_tr_event_raid1e; 109219974Smavstatic g_raid_tr_start_t g_raid_tr_start_raid1e; 110219974Smavstatic g_raid_tr_stop_t g_raid_tr_stop_raid1e; 111219974Smavstatic g_raid_tr_iostart_t g_raid_tr_iostart_raid1e; 112219974Smavstatic g_raid_tr_iodone_t g_raid_tr_iodone_raid1e; 113219974Smavstatic g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e; 114219974Smavstatic g_raid_tr_locked_t g_raid_tr_locked_raid1e; 115219974Smavstatic g_raid_tr_idle_t g_raid_tr_idle_raid1e; 116219974Smavstatic g_raid_tr_free_t g_raid_tr_free_raid1e; 117219974Smav 118219974Smavstatic kobj_method_t g_raid_tr_raid1e_methods[] = { 119219974Smav KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1e), 120219974Smav KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1e), 121219974Smav KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1e), 122219974Smav KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1e), 123219974Smav KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1e), 124219974Smav KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1e), 125219974Smav KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e), 126219974Smav KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1e), 127219974Smav KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1e), 128219974Smav KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1e), 129219974Smav { 0, 0 } 130219974Smav}; 131219974Smav 132219974Smavstatic struct g_raid_tr_class g_raid_tr_raid1e_class = { 133219974Smav "RAID1E", 134219974Smav g_raid_tr_raid1e_methods, 135219974Smav sizeof(struct g_raid_tr_raid1e_object), 136240465Smav .trc_enable = 1, 137256610Smav .trc_priority = 200, 138256610Smav .trc_accept_unmapped = 1 139219974Smav}; 140219974Smav 141219974Smavstatic void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr); 142219974Smavstatic void g_raid_tr_raid1e_maybe_rebuild(struct 
g_raid_tr_object *tr, 143219974Smav struct g_raid_subdisk *sd); 144219974Smavstatic int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, 145219974Smav int no, off_t off, off_t len, u_int mask); 146219974Smav 147219974Smavstatic inline void 148219974SmavV2P(struct g_raid_volume *vol, off_t virt, 149219974Smav int *disk, off_t *offset, off_t *start) 150219974Smav{ 151219974Smav off_t nstrip; 152219974Smav u_int strip_size; 153219974Smav 154219974Smav strip_size = vol->v_strip_size; 155219974Smav /* Strip number. */ 156219974Smav nstrip = virt / strip_size; 157219974Smav /* Start position in strip. */ 158219974Smav *start = virt % strip_size; 159219974Smav /* Disk number. */ 160219974Smav *disk = (nstrip * N) % vol->v_disks_count; 161219974Smav /* Strip start position in disk. */ 162219974Smav *offset = ((nstrip * N) / vol->v_disks_count) * strip_size; 163219974Smav} 164219974Smav 165219974Smavstatic inline void 166219974SmavP2V(struct g_raid_volume *vol, int disk, off_t offset, 167219974Smav off_t *virt, int *copy) 168219974Smav{ 169219974Smav off_t nstrip, start; 170219974Smav u_int strip_size; 171219974Smav 172219974Smav strip_size = vol->v_strip_size; 173219974Smav /* Start position in strip. */ 174219974Smav start = offset % strip_size; 175219974Smav /* Physical strip number. */ 176219974Smav nstrip = (offset / strip_size) * vol->v_disks_count + disk; 177219974Smav /* Number of physical strip (copy) inside virtual strip. */ 178219974Smav *copy = nstrip % N; 179219974Smav /* Offset in virtual space. 
*/ 180219974Smav *virt = (nstrip / N) * strip_size + start; 181219974Smav} 182219974Smav 183219974Smavstatic int 184219974Smavg_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol) 185219974Smav{ 186219974Smav struct g_raid_tr_raid1e_object *trs; 187219974Smav 188219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 189219974Smav if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E || 190234603Smav tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA) 191219974Smav return (G_RAID_TR_TASTE_FAIL); 192219974Smav trs->trso_starting = 1; 193219974Smav return (G_RAID_TR_TASTE_SUCCEED); 194219974Smav} 195219974Smav 196219974Smavstatic int 197219974Smavg_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol) 198219974Smav{ 199219974Smav struct g_raid_softc *sc; 200219974Smav struct g_raid_subdisk *sd, *bestsd, *worstsd; 201219974Smav int i, j, state, sstate; 202219974Smav 203219974Smav sc = vol->v_softc; 204219974Smav state = G_RAID_VOLUME_S_OPTIMAL; 205219974Smav for (i = 0; i < vol->v_disks_count / N; i++) { 206219974Smav bestsd = &vol->v_subdisks[i * N]; 207219974Smav for (j = 1; j < N; j++) { 208219974Smav sd = &vol->v_subdisks[i * N + j]; 209219974Smav if (sd->sd_state > bestsd->sd_state) 210219974Smav bestsd = sd; 211219974Smav else if (sd->sd_state == bestsd->sd_state && 212219974Smav (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || 213219974Smav sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && 214219974Smav sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) 215219974Smav bestsd = sd; 216219974Smav } 217219974Smav if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED && 218219974Smav bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) { 219219974Smav /* We found reasonable candidate. 
*/ 220219974Smav G_RAID_DEBUG1(1, sc, 221219974Smav "Promote subdisk %s:%d from %s to ACTIVE.", 222219974Smav vol->v_name, bestsd->sd_pos, 223219974Smav g_raid_subdisk_state2str(bestsd->sd_state)); 224219974Smav g_raid_change_subdisk_state(bestsd, 225219974Smav G_RAID_SUBDISK_S_ACTIVE); 226219974Smav g_raid_write_metadata(sc, 227219974Smav vol, bestsd, bestsd->sd_disk); 228219974Smav } 229219974Smav worstsd = &vol->v_subdisks[i * N]; 230219974Smav for (j = 1; j < N; j++) { 231219974Smav sd = &vol->v_subdisks[i * N + j]; 232219974Smav if (sd->sd_state < worstsd->sd_state) 233219974Smav worstsd = sd; 234219974Smav } 235219974Smav if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) 236219974Smav sstate = G_RAID_VOLUME_S_OPTIMAL; 237219974Smav else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) 238219974Smav sstate = G_RAID_VOLUME_S_SUBOPTIMAL; 239219974Smav else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) 240219974Smav sstate = G_RAID_VOLUME_S_DEGRADED; 241219974Smav else 242219974Smav sstate = G_RAID_VOLUME_S_BROKEN; 243219974Smav if (sstate < state) 244219974Smav state = sstate; 245219974Smav } 246219974Smav return (state); 247219974Smav} 248219974Smav 249219974Smavstatic int 250219974Smavg_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol) 251219974Smav{ 252219974Smav struct g_raid_softc *sc; 253219974Smav struct g_raid_subdisk *sd, *bestsd, *worstsd; 254219974Smav int i, j, state, sstate; 255219974Smav 256219974Smav sc = vol->v_softc; 257219974Smav if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) == 258219974Smav vol->v_disks_count) 259219974Smav return (G_RAID_VOLUME_S_OPTIMAL); 260219974Smav for (i = 0; i < vol->v_disks_count; i++) { 261219974Smav sd = &vol->v_subdisks[i]; 262219974Smav if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) { 263219974Smav /* We found reasonable candidate. 
*/ 264219974Smav G_RAID_DEBUG1(1, sc, 265219974Smav "Promote subdisk %s:%d from %s to STALE.", 266219974Smav vol->v_name, sd->sd_pos, 267219974Smav g_raid_subdisk_state2str(sd->sd_state)); 268219974Smav g_raid_change_subdisk_state(sd, 269219974Smav G_RAID_SUBDISK_S_STALE); 270219974Smav g_raid_write_metadata(sc, vol, sd, sd->sd_disk); 271219974Smav } 272219974Smav } 273219974Smav state = G_RAID_VOLUME_S_OPTIMAL; 274219974Smav for (i = 0; i < vol->v_disks_count; i++) { 275219974Smav bestsd = &vol->v_subdisks[i]; 276219974Smav worstsd = &vol->v_subdisks[i]; 277219974Smav for (j = 1; j < N; j++) { 278219974Smav sd = &vol->v_subdisks[(i + j) % vol->v_disks_count]; 279219974Smav if (sd->sd_state > bestsd->sd_state) 280219974Smav bestsd = sd; 281219974Smav else if (sd->sd_state == bestsd->sd_state && 282219974Smav (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || 283219974Smav sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && 284219974Smav sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) 285219974Smav bestsd = sd; 286219974Smav if (sd->sd_state < worstsd->sd_state) 287219974Smav worstsd = sd; 288219974Smav } 289219974Smav if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) 290219974Smav sstate = G_RAID_VOLUME_S_OPTIMAL; 291219974Smav else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) 292219974Smav sstate = G_RAID_VOLUME_S_SUBOPTIMAL; 293219974Smav else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE) 294219974Smav sstate = G_RAID_VOLUME_S_DEGRADED; 295219974Smav else 296219974Smav sstate = G_RAID_VOLUME_S_BROKEN; 297219974Smav if (sstate < state) 298219974Smav state = sstate; 299219974Smav } 300219974Smav return (state); 301219974Smav} 302219974Smav 303219974Smavstatic int 304219974Smavg_raid_tr_update_state_raid1e(struct g_raid_volume *vol, 305219974Smav struct g_raid_subdisk *sd) 306219974Smav{ 307219974Smav struct g_raid_tr_raid1e_object *trs; 308219974Smav struct g_raid_softc *sc; 309219974Smav u_int s; 310219974Smav 311219974Smav sc = vol->v_softc; 312219974Smav trs = (struct 
g_raid_tr_raid1e_object *)vol->v_tr; 313219974Smav if (trs->trso_stopping && 314219974Smav (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0) 315219974Smav s = G_RAID_VOLUME_S_STOPPED; 316219974Smav else if (trs->trso_starting) 317219974Smav s = G_RAID_VOLUME_S_STARTING; 318219974Smav else { 319219974Smav if ((vol->v_disks_count % N) == 0) 320219974Smav s = g_raid_tr_update_state_raid1e_even(vol); 321219974Smav else 322219974Smav s = g_raid_tr_update_state_raid1e_odd(vol); 323219974Smav } 324219974Smav if (s != vol->v_state) { 325219974Smav g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 326219974Smav G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, 327219974Smav G_RAID_EVENT_VOLUME); 328219974Smav g_raid_change_volume_state(vol, s); 329219974Smav if (!trs->trso_starting && !trs->trso_stopping) 330219974Smav g_raid_write_metadata(sc, vol, NULL, NULL); 331219974Smav } 332219974Smav if (!trs->trso_starting && !trs->trso_stopping) 333219974Smav g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd); 334219974Smav return (0); 335219974Smav} 336219974Smav 337219974Smavstatic void 338219974Smavg_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, 339219974Smav struct g_raid_disk *disk) 340219974Smav{ 341235270Smav struct g_raid_volume *vol; 342235270Smav 343235270Smav vol = sd->sd_volume; 344219974Smav /* 345219974Smav * We don't fail the last disk in the pack, since it still has decent 346219974Smav * data on it and that's better than failing the disk if it is the root 347219974Smav * file system. 348219974Smav * 349219974Smav * XXX should this be controlled via a tunable? It makes sense for 350219974Smav * the volume that has / on it. I can't think of a case where we'd 351219974Smav * want the volume to go away on this kind of event. 
352219974Smav */ 353235270Smav if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) + 354235270Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) + 355235270Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + 356235270Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) < 357235270Smav vol->v_disks_count) && 358235270Smav (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED)) 359219974Smav return; 360219974Smav g_raid_fail_disk(sc, sd, disk); 361219974Smav} 362219974Smav 363219974Smavstatic void 364219974Smavg_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs) 365219974Smav{ 366219974Smav struct g_raid_volume *vol; 367219974Smav struct g_raid_subdisk *sd; 368219974Smav 369219974Smav vol = trs->trso_base.tro_volume; 370219974Smav sd = trs->trso_failed_sd; 371219974Smav g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); 372219974Smav free(trs->trso_buffer, M_TR_RAID1E); 373219974Smav trs->trso_buffer = NULL; 374219974Smav trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; 375219974Smav trs->trso_type = TR_RAID1E_NONE; 376219974Smav trs->trso_recover_slabs = 0; 377219974Smav trs->trso_failed_sd = NULL; 378219974Smav g_raid_tr_update_state_raid1e(vol, NULL); 379219974Smav} 380219974Smav 381219974Smavstatic void 382219974Smavg_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr) 383219974Smav{ 384219974Smav struct g_raid_tr_raid1e_object *trs; 385219974Smav struct g_raid_subdisk *sd; 386219974Smav 387219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 388219974Smav sd = trs->trso_failed_sd; 389219974Smav G_RAID_DEBUG1(0, tr->tro_volume->v_softc, 390219974Smav "Subdisk %s:%d-%s rebuild completed.", 391219974Smav sd->sd_volume->v_name, sd->sd_pos, 392219974Smav sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]"); 393219974Smav g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); 394219974Smav sd->sd_rebuild_pos = 0; 395219974Smav g_raid_tr_raid1e_rebuild_done(trs); 396219974Smav} 397219974Smav 398219974Smavstatic void 399219974Smavg_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr) 400219974Smav{ 401219974Smav struct g_raid_tr_raid1e_object *trs; 402219974Smav struct g_raid_subdisk *sd; 403219974Smav struct g_raid_volume *vol; 404219974Smav 405219974Smav vol = tr->tro_volume; 406219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 407219974Smav sd = trs->trso_failed_sd; 408219974Smav if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) { 409219974Smav G_RAID_DEBUG1(1, vol->v_softc, 410219974Smav "Subdisk %s:%d-%s rebuild is aborting.", 411219974Smav sd->sd_volume->v_name, sd->sd_pos, 412219974Smav sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 413219974Smav trs->trso_flags |= TR_RAID1E_F_ABORT; 414219974Smav } else { 415219974Smav G_RAID_DEBUG1(0, vol->v_softc, 416219974Smav "Subdisk %s:%d-%s rebuild aborted.", 417219974Smav sd->sd_volume->v_name, sd->sd_pos, 418219974Smav sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]"); 419219974Smav trs->trso_flags &= ~TR_RAID1E_F_ABORT; 420219974Smav if (trs->trso_flags & TR_RAID1E_F_LOCKED) { 421219974Smav trs->trso_flags &= ~TR_RAID1E_F_LOCKED; 422219974Smav g_raid_unlock_range(tr->tro_volume, 423219974Smav trs->trso_lock_pos, trs->trso_lock_len); 424219974Smav } 425219974Smav g_raid_tr_raid1e_rebuild_done(trs); 426219974Smav } 427219974Smav} 428219974Smav 429219974Smavstatic void 430219974Smavg_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr) 431219974Smav{ 432219974Smav struct g_raid_tr_raid1e_object *trs; 433219974Smav struct g_raid_softc *sc; 434219974Smav struct g_raid_volume *vol; 435219974Smav struct g_raid_subdisk *sd; 436219974Smav struct bio *bp; 437219974Smav off_t len, virtual, vend, offset, start; 438219974Smav int disk, copy, best; 439219974Smav 440219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 441219974Smav if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) 442219974Smav return; 443219974Smav vol = tr->tro_volume; 444219974Smav sc = vol->v_softc; 445219974Smav sd = trs->trso_failed_sd; 446219974Smav 447219974Smav while (1) { 448219974Smav if (sd->sd_rebuild_pos >= sd->sd_size) { 449219974Smav g_raid_tr_raid1e_rebuild_finish(tr); 450219974Smav return; 451219974Smav } 452219974Smav /* Get virtual offset from physical rebuild position. */ 453219974Smav P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, ©); 454219974Smav /* Get physical offset back to get first stripe position. */ 455219974Smav V2P(vol, virtual, &disk, &offset, &start); 456219974Smav /* Calculate contignous data length. */ 457219974Smav len = MIN(g_raid1e_rebuild_slab, 458219974Smav sd->sd_size - sd->sd_rebuild_pos); 459219974Smav if ((vol->v_disks_count % N) != 0) 460219974Smav len = MIN(len, vol->v_strip_size - start); 461219974Smav /* Find disk with most accurate data. 
*/ 462219974Smav best = g_raid_tr_raid1e_select_read_disk(vol, disk, 463219974Smav offset + start, len, 0); 464219974Smav if (best < 0) { 465219974Smav /* There is no any valid disk. */ 466219974Smav g_raid_tr_raid1e_rebuild_abort(tr); 467219974Smav return; 468219974Smav } else if (best != copy) { 469219974Smav /* Some other disk has better data. */ 470219974Smav break; 471219974Smav } 472219974Smav /* We have the most accurate data. Skip the range. */ 473219974Smav G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju", 474219974Smav sd->sd_rebuild_pos, sd->sd_rebuild_pos + len); 475219974Smav sd->sd_rebuild_pos += len; 476219974Smav } 477219974Smav 478219974Smav bp = &trs->trso_bio; 479219974Smav memset(bp, 0, sizeof(*bp)); 480219974Smav bp->bio_offset = offset + start + 481219974Smav ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0); 482219974Smav bp->bio_length = len; 483219974Smav bp->bio_data = trs->trso_buffer; 484219974Smav bp->bio_cmd = BIO_READ; 485219974Smav bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; 486219974Smav bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count]; 487219974Smav G_RAID_LOGREQ(3, bp, "Queueing rebuild read"); 488219974Smav /* 489219974Smav * If we are crossing stripe boundary, correct affected virtual 490219974Smav * range we should lock. 
491219974Smav */ 492219974Smav if (start + len > vol->v_strip_size) { 493219974Smav P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, ©); 494219974Smav len = vend - virtual; 495219974Smav } 496219974Smav trs->trso_flags |= TR_RAID1E_F_DOING_SOME; 497219974Smav trs->trso_flags |= TR_RAID1E_F_LOCKED; 498219974Smav trs->trso_lock_pos = virtual; 499219974Smav trs->trso_lock_len = len; 500219974Smav /* Lock callback starts I/O */ 501219974Smav g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp); 502219974Smav} 503219974Smav 504219974Smavstatic void 505219974Smavg_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr) 506219974Smav{ 507219974Smav struct g_raid_volume *vol; 508219974Smav struct g_raid_tr_raid1e_object *trs; 509219974Smav struct g_raid_subdisk *sd; 510219974Smav 511219974Smav vol = tr->tro_volume; 512219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 513219974Smav if (trs->trso_failed_sd) { 514219974Smav G_RAID_DEBUG1(1, vol->v_softc, 515219974Smav "Already rebuild in start rebuild. 
pos %jd\n", 516219974Smav (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); 517219974Smav return; 518219974Smav } 519219974Smav sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); 520219974Smav if (sd == NULL) 521219974Smav sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); 522219974Smav if (sd == NULL) { 523219974Smav sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); 524219974Smav if (sd != NULL) { 525219974Smav sd->sd_rebuild_pos = 0; 526219974Smav g_raid_change_subdisk_state(sd, 527219974Smav G_RAID_SUBDISK_S_RESYNC); 528219974Smav g_raid_write_metadata(vol->v_softc, vol, sd, NULL); 529219974Smav } else { 530219974Smav sd = g_raid_get_subdisk(vol, 531219974Smav G_RAID_SUBDISK_S_UNINITIALIZED); 532219974Smav if (sd == NULL) 533219974Smav sd = g_raid_get_subdisk(vol, 534219974Smav G_RAID_SUBDISK_S_NEW); 535219974Smav if (sd != NULL) { 536219974Smav sd->sd_rebuild_pos = 0; 537219974Smav g_raid_change_subdisk_state(sd, 538219974Smav G_RAID_SUBDISK_S_REBUILD); 539219974Smav g_raid_write_metadata(vol->v_softc, 540219974Smav vol, sd, NULL); 541219974Smav } 542219974Smav } 543219974Smav } 544219974Smav if (sd == NULL) { 545219974Smav G_RAID_DEBUG1(1, vol->v_softc, 546219974Smav "No failed disk to rebuild. night night."); 547219974Smav return; 548219974Smav } 549219974Smav trs->trso_failed_sd = sd; 550219974Smav G_RAID_DEBUG1(0, vol->v_softc, 551219974Smav "Subdisk %s:%d-%s rebuild start at %jd.", 552219974Smav sd->sd_volume->v_name, sd->sd_pos, 553219974Smav sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]", 554219974Smav trs->trso_failed_sd->sd_rebuild_pos); 555219974Smav trs->trso_type = TR_RAID1E_REBUILD; 556219974Smav trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK); 557219974Smav trs->trso_meta_update = g_raid1e_rebuild_meta_update; 558219974Smav g_raid_tr_raid1e_rebuild_some(tr); 559219974Smav} 560219974Smav 561219974Smavstatic void 562219974Smavg_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr, 563219974Smav struct g_raid_subdisk *sd) 564219974Smav{ 565219974Smav struct g_raid_volume *vol; 566219974Smav struct g_raid_tr_raid1e_object *trs; 567219974Smav int nr; 568219974Smav 569219974Smav vol = tr->tro_volume; 570219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 571219974Smav if (trs->trso_stopping) 572219974Smav return; 573219974Smav nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + 574219974Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); 575219974Smav switch(trs->trso_type) { 576219974Smav case TR_RAID1E_NONE: 577219974Smav if (vol->v_state < G_RAID_VOLUME_S_DEGRADED) 578219974Smav return; 579219974Smav if (nr == 0) { 580219974Smav nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + 581219974Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + 582219974Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); 583219974Smav if (nr == 0) 584219974Smav return; 585219974Smav } 586219974Smav g_raid_tr_raid1e_rebuild_start(tr); 587219974Smav break; 588219974Smav case TR_RAID1E_REBUILD: 589219974Smav if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 || 590219974Smav trs->trso_failed_sd == sd) 591219974Smav g_raid_tr_raid1e_rebuild_abort(tr); 592219974Smav break; 593219974Smav case TR_RAID1E_RESYNC: 594219974Smav break; 595219974Smav } 596219974Smav} 597219974Smav 598219974Smavstatic int 599219974Smavg_raid_tr_event_raid1e(struct g_raid_tr_object *tr, 600219974Smav struct g_raid_subdisk *sd, u_int event) 601219974Smav{ 602219974Smav 603219974Smav 
g_raid_tr_update_state_raid1e(tr->tro_volume, sd); 604219974Smav return (0); 605219974Smav} 606219974Smav 607219974Smavstatic int 608219974Smavg_raid_tr_start_raid1e(struct g_raid_tr_object *tr) 609219974Smav{ 610219974Smav struct g_raid_tr_raid1e_object *trs; 611219974Smav struct g_raid_volume *vol; 612219974Smav 613219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 614219974Smav vol = tr->tro_volume; 615219974Smav trs->trso_starting = 0; 616219974Smav g_raid_tr_update_state_raid1e(vol, NULL); 617219974Smav return (0); 618219974Smav} 619219974Smav 620219974Smavstatic int 621219974Smavg_raid_tr_stop_raid1e(struct g_raid_tr_object *tr) 622219974Smav{ 623219974Smav struct g_raid_tr_raid1e_object *trs; 624219974Smav struct g_raid_volume *vol; 625219974Smav 626219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 627219974Smav vol = tr->tro_volume; 628219974Smav trs->trso_starting = 0; 629219974Smav trs->trso_stopping = 1; 630219974Smav g_raid_tr_update_state_raid1e(vol, NULL); 631219974Smav return (0); 632219974Smav} 633219974Smav 634219974Smav/* 635219974Smav * Select the disk to read from. Take into account: subdisk state, running 636219974Smav * error recovery, average disk load, head position and possible cache hits. 637219974Smav */ 638219974Smav#define ABS(x) (((x) >= 0) ? 
(x) : (-(x))) 639219974Smavstatic int 640219974Smavg_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, 641219974Smav int no, off_t off, off_t len, u_int mask) 642219974Smav{ 643219974Smav struct g_raid_subdisk *sd; 644219974Smav off_t offset; 645219974Smav int i, best, prio, bestprio; 646219974Smav 647219974Smav best = -1; 648219974Smav bestprio = INT_MAX; 649219974Smav for (i = 0; i < N; i++) { 650219974Smav sd = &vol->v_subdisks[(no + i) % vol->v_disks_count]; 651219974Smav offset = off; 652219974Smav if (no + i >= vol->v_disks_count) 653219974Smav offset += vol->v_strip_size; 654219974Smav 655219974Smav prio = G_RAID_SUBDISK_LOAD(sd); 656219974Smav if ((mask & (1 << sd->sd_pos)) != 0) 657219974Smav continue; 658219974Smav switch (sd->sd_state) { 659219974Smav case G_RAID_SUBDISK_S_ACTIVE: 660219974Smav break; 661219974Smav case G_RAID_SUBDISK_S_RESYNC: 662219974Smav if (offset + off < sd->sd_rebuild_pos) 663219974Smav break; 664219974Smav /* FALLTHROUGH */ 665219974Smav case G_RAID_SUBDISK_S_STALE: 666219974Smav prio += i << 24; 667219974Smav break; 668219974Smav case G_RAID_SUBDISK_S_REBUILD: 669219974Smav if (offset + off < sd->sd_rebuild_pos) 670219974Smav break; 671219974Smav /* FALLTHROUGH */ 672219974Smav default: 673219974Smav continue; 674219974Smav } 675219974Smav prio += min(sd->sd_recovery, 255) << 16; 676219974Smav /* If disk head is precisely in position - highly prefer it. */ 677219974Smav if (G_RAID_SUBDISK_POS(sd) == offset) 678219974Smav prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; 679219974Smav else 680219974Smav /* If disk head is close to position - prefer it. 
 */
		/* Prefer a disk whose head is already near the target offset. */
		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
		    G_RAID_SUBDISK_TRACK_SIZE)
			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
		/* Keep the lowest-priority-value (i.e. best) candidate. */
		if (prio < bestprio) {
			bestprio = prio;
			best = i;
		}
	}
	return (best);
}

/*
 * Start a volume-level read.  The request is split on strip boundaries;
 * for each segment the least-loaded readable copy is chosen via
 * g_raid_tr_raid1e_select_read_disk() and a cloned bio is queued for it.
 * All clones are created first so that an allocation failure can be
 * backed out cleanly before any I/O has been issued.
 */
static void
g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int best;

	vol = tr->tro_volume;
	/*
	 * For unmapped bios there is no kernel VA; 'addr' then starts at
	 * NULL and is only used as a byte offset into the page (bio_ma)
	 * list below.
	 */
	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
		addr = NULL;
	else
		addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	/* Translate the volume offset to (disk number, offset, in-strip start). */
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		/* Never cross a strip boundary in a single clone. */
		length = MIN(strip_size - start, remain);
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    no, offset, length, 0);
		KASSERT(best >= 0, ("No readable disk in volume %s!",
		    vol->v_name));
		/* Advance to the selected copy, wrapping to the next strip. */
		no += best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			goto failure;
		cbp->bio_offset = offset + start;
		cbp->bio_length = length;
		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
			/*
			 * Point the clone's page list at this segment:
			 * fold the running byte offset in, then renormalize
			 * to a page pointer plus in-page offset.
			 */
			cbp->bio_ma_offset += (uintptr_t)addr;
			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
			cbp->bio_ma_offset %= PAGE_SIZE;
			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
			    cbp->bio_length) / PAGE_SIZE;
		} else
			cbp->bio_data = addr;
		cbp->bio_caller1 = &vol->v_subdisks[no];
		bioq_insert_tail(&queue, cbp);
		/* Step past the remaining copies of this stripe unit. */
		no += N - best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	/* All clones allocated; now actually dispatch them. */
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	/* Undo: free every clone queued so far and fail the parent bio. */
	while ((cbp = bioq_takefirst(&queue)) != NULL)
		g_destroy_bio(cbp);
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

/*
 * Start a volume-level write (or BIO_DELETE).  Each strip-sized segment
 * is cloned to every writable copy (up to N per stripe unit).  Copies
 * being rebuilt receive the write only below the rebuild position;
 * otherwise the rebuild pass will bring them up to date.
 */
static void
g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int i;

	vol = tr->tro_volume;
	/* See the read path: NULL 'addr' acts as an offset for unmapped bios. */
	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
		addr = NULL;
	else
		addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		/* Issue this segment to each of the N copies that can take it. */
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				/* Only write the already-rebuilt region. */
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				goto failure;
			cbp->bio_offset = offset + start;
			cbp->bio_length = length;
			/* BIO_DELETE carries no data, so no page-list math. */
			if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
			    bp->bio_cmd != BIO_DELETE) {
				cbp->bio_ma_offset += (uintptr_t)addr;
				cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
				cbp->bio_ma_offset %= PAGE_SIZE;
				cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
				    cbp->bio_length) / PAGE_SIZE;
			} else
				cbp->bio_data = addr;
			cbp->bio_caller1 = sd;
			bioq_insert_tail(&queue, cbp);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		if (bp->bio_cmd != BIO_DELETE)
			addr += length;
		start = 0;
	}
	/* All clones allocated; dispatch them now. */
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	/* Undo: free queued clones and complete the parent with ENOMEM. */
	while ((cbp = bioq_takefirst(&queue)) != NULL)
		g_destroy_bio(cbp);
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

/*
 * Transform-level I/O entry point: validate volume state, opportunistically
 * drive the rebuild, then dispatch by bio command.
 */
static void
g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
842219974Smav{ 843219974Smav struct g_raid_volume *vol; 844219974Smav struct g_raid_tr_raid1e_object *trs; 845219974Smav 846219974Smav vol = tr->tro_volume; 847219974Smav trs = (struct g_raid_tr_raid1e_object *)tr; 848219974Smav if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && 849219974Smav vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL && 850219974Smav vol->v_state != G_RAID_VOLUME_S_DEGRADED) { 851219974Smav g_raid_iodone(bp, EIO); 852219974Smav return; 853219974Smav } 854219974Smav /* 855219974Smav * If we're rebuilding, squeeze in rebuild activity every so often, 856219974Smav * even when the disk is busy. Be sure to only count real I/O 857219974Smav * to the disk. All 'SPECIAL' I/O is traffic generated to the disk 858219974Smav * by this module. 859219974Smav */ 860219974Smav if (trs->trso_failed_sd != NULL && 861219974Smav !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { 862219974Smav /* Make this new or running now round short. */ 863219974Smav trs->trso_recover_slabs = 0; 864219974Smav if (--trs->trso_fair_io <= 0) { 865219974Smav trs->trso_fair_io = g_raid1e_rebuild_fair_io; 866219974Smav g_raid_tr_raid1e_rebuild_some(tr); 867219974Smav } 868219974Smav } 869219974Smav switch (bp->bio_cmd) { 870219974Smav case BIO_READ: 871219974Smav g_raid_tr_iostart_raid1e_read(tr, bp); 872219974Smav break; 873219974Smav case BIO_WRITE: 874242323Smav case BIO_DELETE: 875219974Smav g_raid_tr_iostart_raid1e_write(tr, bp); 876219974Smav break; 877219974Smav case BIO_FLUSH: 878219974Smav g_raid_tr_flush_common(tr, bp); 879219974Smav break; 880219974Smav default: 881219974Smav KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", 882219974Smav bp->bio_cmd, vol->v_name)); 883219974Smav break; 884219974Smav } 885219974Smav} 886219974Smav 887219974Smavstatic void 888219974Smavg_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr, 889219974Smav struct g_raid_subdisk *sd, struct bio *bp) 890219974Smav{ 891219974Smav struct bio *cbp; 892219974Smav struct g_raid_subdisk *nsd; 893219974Smav 
	struct g_raid_volume *vol;
	struct bio *pbp;
	struct g_raid_tr_raid1e_object *trs;
	off_t virtual, offset, start;
	/*
	 * 'mask' is smuggled through bio_caller2: bit i marks copy i as
	 * already tried/failed for a retried read; bit 31 flags an active
	 * recovery (remap) write.
	 * NOTE(review): it is assigned from an (intptr_t) cast although
	 * declared uintptr_t, and '1 << 31' shifts into the sign bit of a
	 * plain int before widening — '1U << 31' would be cleaner; confirm.
	 */
	uintptr_t mask;
	int error, do_write, copy, disk, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	/* SYNC-flagged bios are our own rebuild/resync traffic. */
	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
		if (trs->trso_type == TR_RAID1E_REBUILD) {
			nsd = trs->trso_failed_sd;
			if (bp->bio_cmd == BIO_READ) {

				/* Immediately abort rebuild, if requested. */
				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				/* On read error, skip and cross fingers. */
				if (bp->bio_error != 0) {
					G_RAID_LOGREQ(0, bp,
					    "Read error during rebuild (%d), "
					    "possible data loss!",
					    bp->bio_error);
					goto rebuild_round_done;
				}

				/*
				 * The read operation finished, queue the
				 * write and get out.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
				    bp->bio_error);
				/* Reuse the same bio: flip it into a write. */
				bp->bio_cmd = BIO_WRITE;
				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
				bp->bio_offset = nsd->sd_rebuild_pos;
				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
				g_raid_subdisk_iostart(nsd, bp);
			} else {
				/*
				 * The write operation just finished.  Do
				 * another.  We keep cloning the master bio
				 * since it has the right buffers allocated to
				 * it.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
				    bp->bio_error);
				if (bp->bio_error != 0 ||
				    trs->trso_flags & TR_RAID1E_F_ABORT) {
					/* Only fail the disk on a real error. */
					if ((trs->trso_flags &
					    TR_RAID1E_F_ABORT) == 0) {
						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
						    nsd, nsd->sd_disk);
					}
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}
rebuild_round_done:
				/* Release the range locked for this slab. */
				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
				g_raid_unlock_range(tr->tro_volume,
				    trs->trso_lock_pos, trs->trso_lock_len);
				nsd->sd_rebuild_pos += bp->bio_length;
				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
					g_raid_tr_raid1e_rebuild_finish(tr);
					return;
				}

				/* Abort rebuild if we are stopping */
				if (trs->trso_stopping) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				/* Periodically checkpoint progress to metadata. */
				if (--trs->trso_meta_update <= 0) {
					g_raid_write_metadata(vol->v_softc,
					    vol, nsd, nsd->sd_disk);
					trs->trso_meta_update =
					    g_raid1e_rebuild_meta_update;
					/* Compensate short rebuild I/Os. */
					if ((vol->v_disks_count % N) != 0 &&
					    vol->v_strip_size <
					     g_raid1e_rebuild_slab) {
						trs->trso_meta_update *=
						    g_raid1e_rebuild_slab;
						trs->trso_meta_update /=
						    vol->v_strip_size;
					}
				}
				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
				if (--trs->trso_recover_slabs <= 0)
					return;
				/* Run next rebuild iteration. */
				g_raid_tr_raid1e_rebuild_some(tr);
			}
		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
			/*
			 * read good sd, read bad sd in parallel.  when both
			 * done, compare the buffers.  write good to the bad
			 * if different.  do the next bit of work.
			 */
			panic("Somehow, we think we're doing a resync");
		}
		return;
	}
	/* Regular (non-SYNC) completion path. */
	pbp = bp->bio_parent;
	pbp->bio_inbed++;
	mask = (intptr_t)bp->bio_caller2;
	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
		/*
		 * Read failed on first drive.  Retry the read error on
		 * another disk drive, if available, before erroring out the
		 * read.
		 */
		sd->sd_disk->d_read_errs++;
		G_RAID_LOGREQ(0, bp,
		    "Read error (%d), %d read errors total",
		    bp->bio_error, sd->sd_disk->d_read_errs);

		/*
		 * If there are too many read errors, we move to degraded.
		 * XXX Do we want to FAIL the drive (eg, make the user redo
		 * everything to get it back in sync), or just degrade the
		 * drive, which kicks off a resync?
		 */
		do_write = 0;
		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		else if (mask == 0)
			do_write = 1;

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find the other disk, and try to do the I/O to it. */
		mask |= 1 << copy;
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_length = bp->bio_length;
			/* Reuse the failed bio's buffers for the retry. */
			cbp->bio_data = bp->bio_data;
			cbp->bio_ma = bp->bio_ma;
			cbp->bio_ma_offset = bp->bio_ma_offset;
			cbp->bio_ma_n = bp->bio_ma_n;
			g_destroy_bio(bp);
			nsd = &vol->v_subdisks[disk];
			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
			    nsd->sd_pos);
			/* Bit 31 marks a pending recovery (remap) write. */
			if (do_write)
				mask |= 1 << 31;
			if ((mask & (1 << 31)) != 0)
				sd->sd_recovery++;
			cbp->bio_caller2 = (void *)mask;
			if (do_write) {
				cbp->bio_caller1 = nsd;
				/* Lock callback starts I/O */
				g_raid_lock_range(sd->sd_volume,
				    virtual, cbp->bio_length, pbp, cbp);
			} else {
				g_raid_subdisk_iostart(nsd, cbp);
			}
			return;
		}
		/*
		 * We can't retry.  Return the original error by falling
		 * through.  This will happen when there's only one good disk.
		 * We don't need to fail the raid, since its actual state is
		 * based on the state of the subdisks.
		 */
		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
	}
	/* Retried read succeeded: write the data back over the bad sector. */
	if (bp->bio_cmd == BIO_READ &&
	    bp->bio_error == 0 &&
	    (mask & (1 << 31)) != 0) {
		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find best disk to write. */
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, ~mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_cmd = BIO_WRITE;
			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
			cbp->bio_caller2 = (void *)mask;
			g_destroy_bio(bp);
			G_RAID_LOGREQ(2, cbp,
			    "Attempting bad sector remap on failing drive.");
			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
			return;
		}
	}
	if ((mask & (1 << 31)) != 0) {
		/*
		 * We're done with a recovery, mark the range as unlocked.
		 * For any write errors, we aggressively fail the disk since
		 * there was both a READ and a WRITE error at this location.
		 * Both types of errors generally indicates the drive is on
		 * the verge of total failure anyway.  Better to stop trusting
		 * it now.  However, we need to reset error to 0 in that case
		 * because we're not failing the original I/O which succeeded.
		 */

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Drop the recovery refcount on every copy we touched. */
		for (copy = 0; copy < N; copy++) {
			if ((mask & (1 << copy) ) != 0)
				vol->v_subdisks[(disk + copy) %
				    vol->v_disks_count].sd_recovery--;
		}

		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
			G_RAID_LOGREQ(0, bp, "Remap write failed: "
			    "failing subdisk.");
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
			bp->bio_error = 0;
		}
		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
	}
	/* Aggregate child status into the parent request. */
	if (pbp->bio_cmd != BIO_READ) {
		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
			pbp->bio_error = bp->bio_error;
		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		}
		error = pbp->bio_error;
	} else
		error = bp->bio_error;
	g_destroy_bio(bp);
	/* Complete the parent once all children have come home. */
	if (pbp->bio_children == pbp->bio_inbed) {
		pbp->bio_completed = pbp->bio_length;
		g_raid_iodone(pbp, error);
	}
}

/*
 * Kernel crash-dump path: mirror of the write fan-out, but performed
 * synchronously via g_raid_subdisk_kerneldump() since no interrupts or
 * completion callbacks are available at dump time.
 */
static int
g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	char *addr;
	off_t offset,
	    start, length, remain;
	u_int no, strip_size;
	int i, error;

	vol = tr->tro_volume;
	addr = virtual;
	strip_size = vol->v_strip_size;
	V2P(vol, boffset, &no, &offset, &start);
	remain = blength;
	bioq_init(&queue);
	while (remain > 0) {
		/* Never cross a strip boundary in one transfer. */
		length = MIN(strip_size - start, remain);
		/* Write this segment to every copy that can accept it. */
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				/* Only the already-rebuilt region is writable. */
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			error = g_raid_subdisk_kerneldump(sd,
			    addr, 0, offset + start, length);
			if (error != 0)
				return (error);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	return (0);
}

/*
 * Range-lock acquired callback: the bio stashed in bio_caller1 by the
 * retry path can now be started on its subdisk.
 */
static int
g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
{
	struct bio *bp;
	struct g_raid_subdisk *sd;

	bp = (struct bio *)argp;
	sd = (struct g_raid_subdisk *)bp->bio_caller1;
	g_raid_subdisk_iostart(sd, bp);

	return (0);
}

/*
 * Volume went idle: reset the fairness counters and, if a rebuild is in
 * progress, use the idle time to push it forward.
 */
static int
g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
	/* Compensate short rebuild I/Os. */
	if ((vol->v_disks_count % N) != 0 &&
	    vol->v_strip_size < g_raid1e_rebuild_slab) {
		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
		trs->trso_recover_slabs /= vol->v_strip_size;
	}
	if (trs->trso_type == TR_RAID1E_REBUILD)
		g_raid_tr_raid1e_rebuild_some(tr);
	return (0);
}

/*
 * Transform teardown: release the rebuild bounce buffer, if allocated.
 */
static int
g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;

	if (trs->trso_buffer != NULL) {
		free(trs->trso_buffer, M_TR_RAID1E);
		trs->trso_buffer = NULL;
	}
	return (0);
}

G_RAID_TR_DECLARE(raid1e, "RAID1E");