/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * NAME:	raid.c
 *
 * DESCRIPTION: Main RAID driver source file containing open, close and I/O
 *		operations.
 *
 * ROUTINES PROVIDED FOR EXTERNAL USE:
 *	raid_open()		- open the RAID metadevice for access.
 *	raid_internal_open()	- internal open routine of RAID metadevice.
 *	md_raid_strategy()	- perform normal I/O operations,
 *				  such as read and write.
 *	raid_close()		- close the RAID metadevice.
 *	raid_internal_close()	- internal close routine of RAID metadevice.
 *	raid_snarf()		- initialize and clean up MDD records.
 *	raid_halt()		- reset the RAID metadevice
 *	raid_line()		- return the line # of this segment
 *	raid_dcolumn()		- return the data column # of this segment
 *	raid_pcolumn()		- return the parity column # of this segment
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/lvm/md_raid.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_convert.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>

md_ops_t		raid_md_ops;
#ifndef lint
char			_depends_on[] = "drv/md";
md_ops_t		*md_interface_ops = &raid_md_ops;
#endif	/* lint */

extern unit_t		md_nunits;
extern unit_t		md_nsets;
extern md_set_t		md_set[];
extern int		md_status;
extern major_t		md_major;
extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_mstr_daemon;
extern int		md_sleep_for_test;
extern clock_t		md_hz;
extern md_event_queue_t	*md_event_queue;

int pchunks = 16;
int phigh = 1024;
int plow = 128;
int cchunks = 64;
int chigh = 1024;
int clow = 512;
int bchunks = 32;
int bhigh = 256;
int blow = 128;

int raid_total_io = 0;
int raid_reads = 0;
int raid_writes = 0;
int raid_no_bpmaps = 0;
int raid_512 = 0;
int raid_1024 = 0;
int raid_1024_8192 = 0;
int raid_8192 = 0;
int raid_8192_bigger = 0;
int raid_line_lock_wait = 0;

int data_buffer_waits = 0;
int parity_buffer_waits = 0;

/* writer line locks */
int raid_writer_locks = 0;		/* total writer locks */
int raid_write_waits = 0;		/* total writer locks that waited */
int raid_full_line_writes = 0;		/* total full line writes */
int raid_write_queue_length = 0;	/* wait queue length */
int raid_max_write_q_length = 0;	/* maximum queue length */
int raid_write_locks_active = 0;	/* writer locks at any time */
int raid_max_write_locks = 0;		/* maximum writer locks active */

/* read line locks */
int raid_reader_locks = 0;		/* total reader locks held */
int raid_reader_locks_active = 0;	/* reader locks held */
int raid_max_reader_locks = 0;		/* maximum reader locks held in run */
int raid_read_overlaps = 0;		/* number of times 2 reads hit same line */
int raid_read_waits = 0;		/* times a reader waited on writer */

/* prewrite stats */
int
raid_prewrite_waits = 0; /* number of waits for a pw slot */ int raid_pw = 0; /* number of pw slots in use */ int raid_prewrite_max = 0; /* maximum number of pw slots in use */ int raid_pw_invalidates = 0; static clock_t md_wr_wait = 0; int nv_available = 0; /* presence of nv-ram support in device */ int nv_prewrite = 1; /* mark prewrites with nv_available */ int nv_parity = 1; /* mark parity with nv_available */ kmem_cache_t *raid_parent_cache = NULL; kmem_cache_t *raid_child_cache = NULL; kmem_cache_t *raid_cbuf_cache = NULL; int raid_internal_open(minor_t mnum, int flag, int otyp, int md_oflags); static void freebuffers(md_raidcs_t *cs); static int raid_read(mr_unit_t *un, md_raidcs_t *cs); static void raid_read_io(mr_unit_t *un, md_raidcs_t *cs); static int raid_write(mr_unit_t *un, md_raidcs_t *cs); static void raid_write_io(mr_unit_t *un, md_raidcs_t *cs); static void raid_stage(md_raidcs_t *cs); static void raid_enqueue(md_raidcs_t *cs); static diskaddr_t raid_line(diskaddr_t segment, mr_unit_t *un); uint_t raid_dcolumn(diskaddr_t segment, mr_unit_t *un); static void getpbuffer(md_raidcs_t *cs); static void getdbuffer(md_raidcs_t *cs); static void raid_done(buf_t *bp); static void raid_io_startup(mr_unit_t *un); static rus_state_t raid_col2unit(rcs_state_t state, rus_state_t unitstate) { switch (state) { case RCS_INIT: return (RUS_INIT); case RCS_OKAY: return (RUS_OKAY); case RCS_RESYNC: if (unitstate & RUS_LAST_ERRED) return (RUS_LAST_ERRED); else return (RUS_ERRED); case RCS_ERRED: return (RUS_ERRED); case RCS_LAST_ERRED: return (RUS_ERRED); default: break; } panic("raid_col2unit"); /*NOTREACHED*/ } void raid_set_state(mr_unit_t *un, int col, rcs_state_t newstate, int force) { rus_state_t unitstate, origstate; rcs_state_t colstate; rcs_state_t orig_colstate; int errcnt = 0, okaycnt = 0, resynccnt = 0; int i; char *devname; ASSERT(un); ASSERT(col < un->un_totalcolumncnt); ASSERT(newstate & (RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED | RCS_LAST_ERRED | RCS_REGEN)); ASSERT((newstate & ~(RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED | RCS_LAST_ERRED | RCS_REGEN)) == 0); ASSERT(MDI_UNIT(MD_SID(un)) ? 
UNIT_WRITER_HELD(un) : 1); unitstate = un->un_state; origstate = unitstate; if (force) { un->un_column[col].un_devstate = newstate; un->un_state = raid_col2unit(newstate, unitstate); uniqtime32(&un->un_column[col].un_devtimestamp); uniqtime32(&un->un_timestamp); return; } ASSERT(un->un_state & (RUS_INIT | RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | RUS_REGEN)); ASSERT((un->un_state & ~(RUS_INIT | RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | RUS_REGEN)) == 0); if (un->un_column[col].un_devstate == newstate) return; if (newstate == RCS_REGEN) { if (raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) return; un->un_state = RUS_REGEN; return; } orig_colstate = un->un_column[col].un_devstate; /* * if there is another column in the error state then this * column should go to the last errored state */ for (i = 0; i < un->un_totalcolumncnt; i++) { if (i == col) colstate = newstate; else colstate = un->un_column[i].un_devstate; if (colstate & (RCS_ERRED | RCS_LAST_ERRED | RCS_INIT_ERRED)) errcnt++; if (colstate & RCS_OKAY) okaycnt++; if (colstate & RCS_RESYNC) resynccnt++; } ASSERT(resynccnt < 2); if (okaycnt == un->un_totalcolumncnt) unitstate = RUS_OKAY; else if (errcnt > 1) { unitstate = RUS_LAST_ERRED; if (newstate & RCS_ERRED) newstate = RCS_LAST_ERRED; } else if (errcnt == 1) if (!(unitstate & RUS_LAST_ERRED)) unitstate = RUS_ERRED; if (un->un_state == RUS_DOI) unitstate = RUS_DOI; un->un_column[col].un_devstate = newstate; uniqtime32(&un->un_column[col].un_devtimestamp); /* * if there are last errored column being brought back online * by open or snarf, then be sure to clear the RUS_LAST_ERRED * bit to allow writes. If there is a real error then the * column will go back into last erred. */ if ((raid_state_cnt(un, RCS_LAST_ERRED) == 0) && (raid_state_cnt(un, RCS_ERRED) == 1)) unitstate = RUS_ERRED; un->un_state = unitstate; uniqtime32(&un->un_timestamp); if ((! (origstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) && (unitstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) { devname = md_devname(MD_UN2SET(un), un->un_column[col].un_dev, NULL, 0); cmn_err(CE_WARN, "md: %s: %s needs maintenance", md_shortname(MD_SID(un)), devname); if (unitstate & RUS_LAST_ERRED) { cmn_err(CE_WARN, "md: %s: %s last erred", md_shortname(MD_SID(un)), devname); } else if (un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN) { /* * Close the broken device and clear the open flag on * it. We have to check that the device is open, * otherwise the first open on it has resulted in the * error that is being processed and the actual un_dev * will be NODEV64. */ md_layered_close(un->un_column[col].un_dev, MD_OFLG_NULL); un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN; } } else if (orig_colstate == RCS_LAST_ERRED && newstate == RCS_ERRED && un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN) { /* * Similar to logic above except no log messages since we * are just transitioning from Last Erred to Erred. */ md_layered_close(un->un_column[col].un_dev, MD_OFLG_NULL); un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN; } /* * If a resync has completed, see if there is a Last Erred * component that we can change to the Erred state. */ if ((orig_colstate == RCS_RESYNC) && (newstate == RCS_OKAY)) { for (i = 0; i < un->un_totalcolumncnt; i++) { if (i != col && (un->un_column[i].un_devstate & RCS_LAST_ERRED)) { raid_set_state(un, i, RCS_ERRED, 0); break; } } } } /* * NAME: erred_check_line * * DESCRIPTION: Return the type of write to perform on an erred column based * upon any resync activity. 
 *
 *		If a column is being resynced and the write is above the
 *		resync point, the write may also have to go to the target
 *		being resynced.
 *
 *		Column state may make it impossible to do the write
 *		in which case RCL_EIO or RCL_ENXIO is returned.
 *
 *		If a column cannot be written directly, RCL_ERRED is
 *		returned and processing should proceed accordingly.
 *
 * PARAMETERS:	mr_unit_t *un - pointer to the unit structure
 *		md_raidcs_t *cs - child save structure
 *		mr_column_t *column - pointer to the column being checked
 *
 * RETURNS:	RCL_OKAY, RCL_ERRED
 *
 * LOCKS:	Expects Line Writer Lock and Unit Resource Lock to be held
 *		across call.
 */
static int
erred_check_line(mr_unit_t *un, md_raidcs_t *cs, mr_column_t *column)
{

	ASSERT(un != NULL);
	ASSERT(cs->cs_flags & MD_RCS_LLOCKD);

	if (column->un_devstate & RCS_OKAY)
		return (RCL_OKAY);

	if (column->un_devstate & RCS_ERRED)
		return (RCL_ERRED);	/* do not read from errored disk */

	/*
	 * For the last errored case there are two considerations.
	 * When the last errored column is the only errored column then
	 * treat it like a maintenance column and do no I/O from it.
	 * When there are other failures then just attempt to use it.
	 */
	if (column->un_devstate & RCS_LAST_ERRED)
		return (RCL_ERRED);

	ASSERT(column->un_devstate & RCS_RESYNC);

	/*
	 * When a resync from a hotspare is being done (copy resync)
	 * then always treat it as an OKAY column, since no regen
	 * is required.
	 */
	if (column->un_devflags & MD_RAID_COPY_RESYNC) {
		return (RCL_OKAY);
	}

	mutex_enter(&un->un_mx);
	if (cs->cs_line < un->un_resync_line_index) {
		mutex_exit(&un->un_mx);
		return (RCL_OKAY);
	}
	mutex_exit(&un->un_mx);
	return (RCL_ERRED);
}

/*
 * NAMES:	raid_state_cnt
 *
 * DESCRIPTION: counts the number of columns in a specific state
 *
 * PARAMETERS:	mr_unit_t *un
 *		rcs_state_t state
 */
int
raid_state_cnt(mr_unit_t *un, rcs_state_t state)
{
	int	i, retval = 0;

	for (i = 0; i < un->un_totalcolumncnt; i++)
		if (un->un_column[i].un_devstate & state)
			retval++;
	return (retval);
}

/*
 * NAMES:	raid_io_overlaps
 *
 * DESCRIPTION: checks for overlap of 2 child save structures
 *
 * PARAMETERS:	md_raidcs_t cs1
 *		md_raidcs_t cs2
 *
 * RETURNS:	0 - no overlap
 *		1 - overlap
 */
int
raid_io_overlaps(md_raidcs_t *cs1, md_raidcs_t *cs2)
{
	if (cs1->cs_blkno > cs2->cs_lastblk)
		return (0);
	if (cs1->cs_lastblk < cs2->cs_blkno)
		return (0);
	return (1);
}

/*
 * NAMES:	raid_parent_constructor
 * DESCRIPTION: parent structure constructor routine
 * PARAMETERS:
 */
/*ARGSUSED1*/
static int
raid_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_raidps_t *)p)->ps_mx,
	    NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&((md_raidps_t *)p)->ps_mapin_mx,
	    NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

void
raid_parent_init(md_raidps_t *ps)
{
	bzero(ps, offsetof(md_raidps_t, ps_mx));
	((md_raidps_t *)ps)->ps_flags = MD_RPS_INUSE;
	((md_raidps_t *)ps)->ps_magic = RAID_PSMAGIC;
}

/*ARGSUSED1*/
static void
raid_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_raidps_t *)p)->ps_mx);
	mutex_destroy(&((md_raidps_t *)p)->ps_mapin_mx);
}

/*
 * NAMES:	raid_child_constructor
 * DESCRIPTION: child structure constructor routine
 * PARAMETERS:
 */
/*ARGSUSED1*/
static int
raid_child_constructor(void *p, void *d1, int d2)
{
	md_raidcs_t	*cs = (md_raidcs_t *)p;
	mutex_init(&cs->cs_mx, NULL, MUTEX_DEFAULT, NULL);
	bioinit(&cs->cs_dbuf);
	bioinit(&cs->cs_pbuf);
	bioinit(&cs->cs_hbuf);
	return (0);
}

void
raid_child_init(md_raidcs_t *cs)
{
	bzero(cs, offsetof(md_raidcs_t, cs_mx));

	md_bioreset(&cs->cs_dbuf);
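	/*
	 * Reset the remaining cached buf headers; all three (data,
	 * parity and hotspare) have b_chain pointed back at this child
	 * just below, which is how raid_done() recovers the md_raidcs_t
	 * when one of these buffers completes.
	 */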
md_bioreset(&cs->cs_pbuf); md_bioreset(&cs->cs_hbuf); ((md_raidcs_t *)cs)->cs_dbuf.b_chain = ((md_raidcs_t *)cs)->cs_pbuf.b_chain = ((md_raidcs_t *)cs)->cs_hbuf.b_chain = (struct buf *)(cs); cs->cs_magic = RAID_CSMAGIC; cs->cs_line = MD_DISKADDR_ERROR; cs->cs_dpwslot = -1; cs->cs_ppwslot = -1; } /*ARGSUSED1*/ static void raid_child_destructor(void *p, void *d) { biofini(&((md_raidcs_t *)p)->cs_dbuf); biofini(&((md_raidcs_t *)p)->cs_hbuf); biofini(&((md_raidcs_t *)p)->cs_pbuf); mutex_destroy(&((md_raidcs_t *)p)->cs_mx); } /*ARGSUSED1*/ static int raid_cbuf_constructor(void *p, void *d1, int d2) { bioinit(&((md_raidcbuf_t *)p)->cbuf_bp); return (0); } static void raid_cbuf_init(md_raidcbuf_t *cb) { bzero(cb, offsetof(md_raidcbuf_t, cbuf_bp)); md_bioreset(&cb->cbuf_bp); cb->cbuf_magic = RAID_BUFMAGIC; cb->cbuf_pwslot = -1; cb->cbuf_flags = CBUF_WRITE; } /*ARGSUSED1*/ static void raid_cbuf_destructor(void *p, void *d) { biofini(&((md_raidcbuf_t *)p)->cbuf_bp); } /* * NAMES: raid_run_queue * DESCRIPTION: spawn a backend processing daemon for RAID metadevice. * PARAMETERS: */ /*ARGSUSED*/ static void raid_run_queue(void *d) { if (!(md_status & MD_GBL_DAEMONS_LIVE)) md_daemon(1, &md_done_daemon); } /* * NAME: raid_build_pwslot * DESCRIPTION: builds mr_pw_reserve for the column * PARAMETERS: un is the pointer to the unit structure * colindex is the column to create the structure for */ int raid_build_pw_reservation(mr_unit_t *un, int colindex) { mr_pw_reserve_t *pw; mr_scoreboard_t *sb; int i; pw = (mr_pw_reserve_t *) kmem_zalloc(sizeof (mr_pw_reserve_t) + (sizeof (mr_scoreboard_t) * un->un_pwcnt), KM_SLEEP); pw->pw_magic = RAID_PWMAGIC; pw->pw_column = colindex; pw->pw_free = un->un_pwcnt; sb = &pw->pw_sb[0]; for (i = 0; i < un->un_pwcnt; i++) { sb[i].sb_column = colindex; sb[i].sb_flags = SB_UNUSED; sb[i].sb_start_blk = 0; sb[i].sb_last_blk = 0; sb[i].sb_cs = NULL; } un->un_column_ic[colindex].un_pw_reserve = pw; return (0); } /* * NAME: raid_free_pw_reservation * DESCRIPTION: RAID metadevice pre-write slot structure destroy routine * PARAMETERS: mr_unit_t *un - pointer to a unit structure * int colindex - index of the column whose pre-write slot struct * is to be destroyed. 
*/ void raid_free_pw_reservation(mr_unit_t *un, int colindex) { mr_pw_reserve_t *pw = un->un_column_ic[colindex].un_pw_reserve; kmem_free(pw, sizeof (mr_pw_reserve_t) + (sizeof (mr_scoreboard_t) * un->un_pwcnt)); } /* * NAME: raid_cancel_pwslot * DESCRIPTION: RAID metadevice write routine * PARAMETERS: md_raidcs_t *cs - pointer to a child structure */ static void raid_cancel_pwslot(md_raidcs_t *cs) { mr_unit_t *un = cs->cs_un; mr_pw_reserve_t *pw; mr_scoreboard_t *sb; mr_column_ic_t *col; md_raidcbuf_t *cbuf; int broadcast = 0; if (cs->cs_ps->ps_flags & MD_RPS_READ) return; if (cs->cs_dpwslot != -1) { col = &un->un_column_ic[cs->cs_dcolumn]; pw = col->un_pw_reserve; sb = &pw->pw_sb[cs->cs_dpwslot]; sb->sb_flags = SB_AVAIL; if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) broadcast++; sb->sb_cs = NULL; } if (cs->cs_ppwslot != -1) { col = &un->un_column_ic[cs->cs_pcolumn]; pw = col->un_pw_reserve; sb = &pw->pw_sb[cs->cs_ppwslot]; sb->sb_flags = SB_AVAIL; if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) broadcast++; sb->sb_cs = NULL; } for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { if (cbuf->cbuf_pwslot == -1) continue; col = &un->un_column_ic[cbuf->cbuf_column]; pw = col->un_pw_reserve; sb = &pw->pw_sb[cbuf->cbuf_pwslot]; sb->sb_flags = SB_AVAIL; if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) broadcast++; sb->sb_cs = NULL; } if (broadcast) { cv_broadcast(&un->un_cv); return; } mutex_enter(&un->un_mx); if (un->un_rflags & MD_RFLAG_NEEDPW) cv_broadcast(&un->un_cv); mutex_exit(&un->un_mx); } static void raid_free_pwinvalidate(md_raidcs_t *cs) { md_raidcbuf_t *cbuf; md_raidcbuf_t *cbuf_to_free; mr_unit_t *un = cs->cs_un; mdi_unit_t *ui = MDI_UNIT(MD_SID(un)); mr_pw_reserve_t *pw; mr_scoreboard_t *sb; int broadcast = 0; cbuf = cs->cs_pw_inval_list; ASSERT(cbuf); mutex_enter(&un->un_linlck_mx); while (cbuf) { pw = un->un_column_ic[cbuf->cbuf_column].un_pw_reserve; sb = &pw->pw_sb[0]; ASSERT(sb[cbuf->cbuf_pwslot].sb_flags & SB_INVAL_PEND); sb[cbuf->cbuf_pwslot].sb_flags = SB_UNUSED; sb[cbuf->cbuf_pwslot].sb_cs = NULL; if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) broadcast++; cbuf_to_free = cbuf; cbuf = cbuf->cbuf_next; kmem_free(cbuf_to_free->cbuf_buffer, dbtob(un->un_iosize)); kmem_cache_free(raid_cbuf_cache, cbuf_to_free); } cs->cs_pw_inval_list = (md_raidcbuf_t *)NULL; /* * now that there is a free prewrite slot, check to see if there * are any io operations waiting first wake up the raid_io_startup * then signal the the processes waiting in raid_write. */ if (ui->ui_io_lock->io_list_front) raid_io_startup(un); mutex_exit(&un->un_linlck_mx); if (broadcast) { cv_broadcast(&un->un_cv); return; } mutex_enter(&un->un_mx); if (un->un_rflags & MD_RFLAG_NEEDPW) cv_broadcast(&un->un_cv); mutex_exit(&un->un_mx); } static int raid_get_pwslot(md_raidcs_t *cs, int column) { mr_scoreboard_t *sb; mr_pw_reserve_t *pw; mr_unit_t *un = cs->cs_un; diskaddr_t start_blk = cs->cs_blkno; diskaddr_t last_blk = cs->cs_lastblk; int i; int pwcnt = un->un_pwcnt; int avail = -1; int use = -1; int flags; /* start with the data column */ pw = cs->cs_un->un_column_ic[column].un_pw_reserve; sb = &pw->pw_sb[0]; ASSERT(pw->pw_free > 0); for (i = 0; i < pwcnt; i++) { flags = sb[i].sb_flags; if (flags & SB_INVAL_PEND) continue; if ((avail == -1) && (flags & (SB_AVAIL | SB_UNUSED))) avail = i; if ((start_blk > sb[i].sb_last_blk) || (last_blk < sb[i].sb_start_blk)) continue; /* OVERLAP */ ASSERT(! 
(sb[i].sb_flags & SB_INUSE)); /* * raid_invalidate_pwslot attempts to zero out prewrite entry * in parallel with other disk reads/writes related to current * transaction. however cs_frags accounting for this case is * broken because raid_write_io resets cs_frags i.e. ignoring * that it could have been been set to > 0 value by * raid_invalidate_pwslot. While this can be fixed an * additional problem is that we don't seem to handle * correctly the case of getting a disk error for prewrite * entry invalidation. * It does not look like we really need * to invalidate prewrite slots because raid_replay sorts * prewrite id's in ascending order and during recovery the * latest prewrite entry for the same block will be replay * last. That's why i ifdef'd out the call to * raid_invalidate_pwslot. --aguzovsk@east */ if (use == -1) { use = i; } } ASSERT(avail != -1); pw->pw_free--; if (use == -1) use = avail; ASSERT(! (sb[use].sb_flags & SB_INUSE)); sb[use].sb_flags = SB_INUSE; sb[use].sb_cs = cs; sb[use].sb_start_blk = start_blk; sb[use].sb_last_blk = last_blk; ASSERT((use >= 0) && (use < un->un_pwcnt)); return (use); } static int raid_check_pw(md_raidcs_t *cs) { mr_unit_t *un = cs->cs_un; int i; ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS)); /* * check to be sure there is a prewrite slot available * if not just return. */ if (cs->cs_flags & MD_RCS_LINE) { for (i = 0; i < un->un_totalcolumncnt; i++) if (un->un_column_ic[i].un_pw_reserve->pw_free <= 0) return (1); return (0); } if (un->un_column_ic[cs->cs_dcolumn].un_pw_reserve->pw_free <= 0) return (1); if (un->un_column_ic[cs->cs_pcolumn].un_pw_reserve->pw_free <= 0) return (1); return (0); } static int raid_alloc_pwslot(md_raidcs_t *cs) { mr_unit_t *un = cs->cs_un; md_raidcbuf_t *cbuf; ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS)); if (raid_check_pw(cs)) return (1); mutex_enter(&un->un_mx); un->un_pwid++; cs->cs_pwid = un->un_pwid; mutex_exit(&un->un_mx); cs->cs_dpwslot = raid_get_pwslot(cs, cs->cs_dcolumn); for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { cbuf->cbuf_pwslot = raid_get_pwslot(cs, cbuf->cbuf_column); } cs->cs_ppwslot = raid_get_pwslot(cs, cs->cs_pcolumn); cs->cs_flags |= MD_RCS_HAVE_PW_SLOTS; return (0); } /* * NAMES: raid_build_incore * DESCRIPTION: RAID metadevice incore structure building routine * PARAMETERS: void *p - pointer to a unit structure * int snarfing - a flag to indicate snarfing is required */ int raid_build_incore(void *p, int snarfing) { mr_unit_t *un = (mr_unit_t *)p; minor_t mnum = MD_SID(un); mddb_recid_t hs_recid = 0; int i; int preserve_flags; mr_column_t *column; int iosize; md_dev64_t hs, dev; int resync_cnt = 0, error_cnt = 0; hs = NODEV64; dev = NODEV64; /* clear out bogus pointer incase we return(1) prior to alloc */ un->mr_ic = NULL; if (MD_STATUS(un) & MD_UN_BEING_RESET) { mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN); return (1); } if (MD_UNIT(mnum) != NULL) return (0); if (snarfing) MD_STATUS(un) = 0; un->mr_ic = (mr_unit_ic_t *)kmem_zalloc(sizeof (*un->mr_ic), KM_SLEEP); un->un_column_ic = (mr_column_ic_t *) kmem_zalloc(sizeof (mr_column_ic_t) * un->un_totalcolumncnt, KM_SLEEP); for (i = 0; i < un->un_totalcolumncnt; i++) { column = &un->un_column[i]; preserve_flags = column->un_devflags & (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC); column->un_devflags &= ~(MD_RAID_ALT_ISOPEN | MD_RAID_DEV_ISOPEN | MD_RAID_WRITE_ALT); if (raid_build_pw_reservation(un, i) != 0) { /* could not build pwslot */ return (1); } if (snarfing) { set_t setno = MD_MIN2SET(mnum); dev = 
md_getdevnum(setno, mddb_getsidenum(setno), column->un_orig_key, MD_NOTRUST_DEVT); /* * Comment out instead of remove so we have history * In the pre-SVM releases stored devt is used so * as long as there is one snarf is always happy * even the component is powered off. This is not * the case in current SVM implementation. NODEV64 * can be returned and in this case since we resolve * the devt at 'open' time (first use of metadevice) * we will allow snarf continue. * * if (dev == NODEV64) * return (1); */ /* * Setup un_orig_dev from device id info if the device * is valid (not NODEV64). */ if (dev != NODEV64) column->un_orig_dev = dev; if (column->un_devstate & RCS_RESYNC) resync_cnt++; if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED)) error_cnt++; if (HOTSPARED(un, i)) { (void) md_hot_spare_ifc(HS_MKDEV, 0, 0, 0, &column->un_hs_id, NULL, &hs, NULL); /* * Same here * * if (hs == NODEV64) * return (1); */ } if (HOTSPARED(un, i)) { if (column->un_devstate & (RCS_OKAY | RCS_LAST_ERRED)) { column->un_dev = hs; column->un_pwstart = column->un_hs_pwstart; column->un_devstart = column->un_hs_devstart; preserve_flags &= ~(MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC); } else if (column->un_devstate & RCS_RESYNC) { /* * if previous system was 4.0 set * the direction flags */ if ((preserve_flags & (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC)) == 0) { if (column->un_alt_dev != NODEV64) preserve_flags |= MD_RAID_COPY_RESYNC; else preserve_flags |= /* CSTYLED */ MD_RAID_REGEN_RESYNC; } } } else { /* no hot spares */ column->un_dev = dev; column->un_pwstart = column->un_orig_pwstart; column->un_devstart = column->un_orig_devstart; if (column->un_devstate & RCS_RESYNC) { preserve_flags |= MD_RAID_REGEN_RESYNC; preserve_flags &= ~MD_RAID_COPY_RESYNC; } } if (! (column->un_devstate & RCS_RESYNC)) { preserve_flags &= ~(MD_RAID_REGEN_RESYNC | MD_RAID_COPY_RESYNC); } column->un_devflags = preserve_flags; column->un_alt_dev = NODEV64; column->un_alt_pwstart = 0; column->un_alt_devstart = 0; un->un_resync_line_index = 0; un->un_resync_index = 0; un->un_percent_done = 0; } } if (resync_cnt && error_cnt) { for (i = 0; i < un->un_totalcolumncnt; i++) { column = &un->un_column[i]; if (HOTSPARED(un, i) && (column->un_devstate & RCS_RESYNC) && (column->un_devflags & MD_RAID_COPY_RESYNC)) /* hotspare has data */ continue; if (HOTSPARED(un, i) && (column->un_devstate & RCS_RESYNC)) { /* hotspare does not have data */ raid_hs_release(HS_FREE, un, &hs_recid, i); column->un_dev = column->un_orig_dev; column->un_pwstart = column->un_orig_pwstart; column->un_devstart = column->un_orig_devstart; mddb_setrecprivate(hs_recid, MD_PRV_PENDCOM); } if (column->un_devstate & RCS_ERRED) column->un_devstate = RCS_LAST_ERRED; if (column->un_devstate & RCS_RESYNC) column->un_devstate = RCS_ERRED; } } mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM); un->un_pwid = 1; /* or some other possible value */ un->un_magic = RAID_UNMAGIC; iosize = un->un_iosize; un->un_pbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP); un->un_dbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP); mutex_init(&un->un_linlck_mx, NULL, MUTEX_DEFAULT, NULL); cv_init(&un->un_linlck_cv, NULL, CV_DEFAULT, NULL); un->un_linlck_chn = NULL; /* place various information in the in-core data structures */ md_nblocks_set(mnum, un->c.un_total_blocks); MD_UNIT(mnum) = un; return (0); } /* * NAMES: reset_raid * DESCRIPTION: RAID metadevice reset routine * PARAMETERS: mr_unit_t *un - pointer to a unit structure * minor_t mnum - RAID metadevice minor number * int removing - a flag to 
imply removing device name from * MDDB database. */ void reset_raid(mr_unit_t *un, minor_t mnum, int removing) { int i, n = 0; sv_dev_t *sv; mr_column_t *column; int column_cnt = un->un_totalcolumncnt; mddb_recid_t *recids, vtoc_id; int hserr; ASSERT((MDI_UNIT(mnum)->ui_io_lock->io_list_front == NULL) && (MDI_UNIT(mnum)->ui_io_lock->io_list_back == NULL)); md_destroy_unit_incore(mnum, &raid_md_ops); md_nblocks_set(mnum, -1ULL); MD_UNIT(mnum) = NULL; if (un->un_pbuffer) { kmem_free(un->un_pbuffer, dbtob(un->un_iosize)); un->un_pbuffer = NULL; } if (un->un_dbuffer) { kmem_free(un->un_dbuffer, dbtob(un->un_iosize)); un->un_dbuffer = NULL; } /* free all pre-write slots created during build incore */ for (i = 0; i < un->un_totalcolumncnt; i++) raid_free_pw_reservation(un, i); kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) * un->un_totalcolumncnt); kmem_free(un->mr_ic, sizeof (*un->mr_ic)); /* * Attempt release of its minor node */ md_remove_minor_node(mnum); if (!removing) return; sv = (sv_dev_t *)kmem_zalloc((column_cnt + 1) * sizeof (sv_dev_t), KM_SLEEP); recids = (mddb_recid_t *) kmem_zalloc((column_cnt + 2) * sizeof (mddb_recid_t), KM_SLEEP); for (i = 0; i < column_cnt; i++) { md_unit_t *comp_un; md_dev64_t comp_dev; column = &un->un_column[i]; sv[i].setno = MD_MIN2SET(mnum); sv[i].key = column->un_orig_key; if (HOTSPARED(un, i)) { if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED)) hserr = HS_BAD; else hserr = HS_FREE; raid_hs_release(hserr, un, &recids[n++], i); } /* * deparent any metadevices. * NOTE: currently soft partitions are the only metadevices * allowed in RAID metadevices. */ comp_dev = column->un_dev; if (md_getmajor(comp_dev) == md_major) { comp_un = MD_UNIT(md_getminor(comp_dev)); recids[n++] = MD_RECID(comp_un); md_reset_parent(comp_dev); } } /* decrement the reference count of the old hsp */ if (un->un_hsp_id != -1) (void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0, &recids[n++], NULL, NULL, NULL); recids[n] = 0; MD_STATUS(un) |= MD_UN_BEING_RESET; vtoc_id = un->c.un_vtoc_id; raid_commit(un, recids); /* * Remove self from the namespace */ if (un->c.un_revision & MD_FN_META_DEV) { (void) md_rem_selfname(un->c.un_self_id); } /* Remove the unit structure */ mddb_deleterec_wrapper(un->c.un_record_id); /* Remove the vtoc, if present */ if (vtoc_id) mddb_deleterec_wrapper(vtoc_id); md_rem_names(sv, column_cnt); kmem_free(sv, (column_cnt + 1) * sizeof (sv_dev_t)); kmem_free(recids, (column_cnt + 2) * sizeof (mddb_recid_t)); SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE, MD_MIN2SET(mnum), mnum); } /* * NAMES: raid_error_parent * DESCRIPTION: mark a parent structure in error * PARAMETERS: md_raidcs_t *cs - pointer to child structure * int error - error value to set * NOTE: (TBR) - this routine currently is not in use. */ static void raid_error_parent(md_raidps_t *ps, int error) { mutex_enter(&ps->ps_mx); ps->ps_flags |= MD_RPS_ERROR; ps->ps_error = error; mutex_exit(&ps->ps_mx); } /* * The following defines tell raid_free_parent * RFP_RLS_LOCK release the unit reader lock when done. 
* RFP_DECR_PWFRAGS decrement ps_pwfrags * RFP_DECR_FRAGS decrement ps_frags * RFP_DECR_READFRAGS read keeps FRAGS and PWFRAGS in lockstep */ #define RFP_RLS_LOCK 0x00001 #define RFP_DECR_PWFRAGS 0x00002 #define RFP_DECR_FRAGS 0x00004 #define RFP_DECR_READFRAGS (RFP_DECR_PWFRAGS | RFP_DECR_FRAGS) /* * NAMES: raid_free_parent * DESCRIPTION: free a parent structure * PARAMETERS: md_raidcs_t *cs - pointer to child structure * int todo - indicates what needs to be done */ static void raid_free_parent(md_raidps_t *ps, int todo) { mdi_unit_t *ui = ps->ps_ui; ASSERT(ps->ps_magic == RAID_PSMAGIC); ASSERT(ps->ps_flags & MD_RPS_INUSE); mutex_enter(&ps->ps_mx); if (todo & RFP_DECR_PWFRAGS) { ASSERT(ps->ps_pwfrags); ps->ps_pwfrags--; if (ps->ps_pwfrags == 0 && (! (ps->ps_flags & MD_RPS_IODONE))) { if (ps->ps_flags & MD_RPS_ERROR) { ps->ps_bp->b_flags |= B_ERROR; ps->ps_bp->b_error = ps->ps_error; } md_kstat_done(ui, ps->ps_bp, 0); biodone(ps->ps_bp); ps->ps_flags |= MD_RPS_IODONE; } } if (todo & RFP_DECR_FRAGS) { ASSERT(ps->ps_frags); ps->ps_frags--; } if (ps->ps_frags != 0) { mutex_exit(&ps->ps_mx); return; } ASSERT((ps->ps_frags == 0) && (ps->ps_pwfrags == 0)); mutex_exit(&ps->ps_mx); if (todo & RFP_RLS_LOCK) md_io_readerexit(ui); if (panicstr) { ps->ps_flags |= MD_RPS_DONE; return; } if (ps->ps_flags & MD_RPS_HSREQ) (void) raid_hotspares(); ASSERT(todo & RFP_RLS_LOCK); ps->ps_flags &= ~MD_RPS_INUSE; md_dec_iocount(MD_MIN2SET(ps->ps_un->c.un_self_id)); kmem_cache_free(raid_parent_cache, ps); } /* * NAMES: raid_free_child * DESCRIPTION: free a parent structure * PARAMETERS: md_raidcs_t *cs - pointer to child structure * int drop_locks - 0 for no locks held * NOTE: (TBR) - this routine currently is not in use. */ static void raid_free_child(md_raidcs_t *cs, int drop_locks) { mr_unit_t *un = cs->cs_un; md_raidcbuf_t *cbuf, *cbuf1; if (cs->cs_pw_inval_list) raid_free_pwinvalidate(cs); if (drop_locks) { ASSERT(cs->cs_flags & MD_RCS_LLOCKD && (cs->cs_flags & (MD_RCS_READER | MD_RCS_WRITER))); md_unit_readerexit(MDI_UNIT(MD_SID(un))); raid_line_exit(cs); } else { ASSERT(!(cs->cs_flags & MD_RCS_LLOCKD)); } freebuffers(cs); cbuf = cs->cs_buflist; while (cbuf) { cbuf1 = cbuf->cbuf_next; kmem_cache_free(raid_cbuf_cache, cbuf); cbuf = cbuf1; } if (cs->cs_dbuf.b_flags & B_REMAPPED) bp_mapout(&cs->cs_dbuf); kmem_cache_free(raid_child_cache, cs); } /* * NAME: raid_regen_parity * * DESCRIPTION: This routine is used to regenerate the parity blocks * for the entire raid device. It is called from * both the regen thread and the IO path. * * On error the entire device is marked as in error by * placing the erroring device in error and all other * devices in last_errored. * * PARAMETERS: md_raidcs_t *cs */ void raid_regen_parity(md_raidcs_t *cs) { mr_unit_t *un = cs->cs_un; mdi_unit_t *ui = MDI_UNIT(un->c.un_self_id); caddr_t buffer; caddr_t parity_buffer; buf_t *bp; uint_t *dbuf, *pbuf; uint_t colcnt = un->un_totalcolumncnt; int column; int parity_column = cs->cs_pcolumn; size_t bcount; int j; /* * This routine uses the data and parity buffers allocated to a * write. In the case of a read the buffers are allocated and * freed at the end. 
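 *
 *		Illustrative note: the parity block for a line is just the
 *		XOR of every data block in that line, e.g. with three data
 *		columns P = D0 ^ D1 ^ D2 (example names only).  The loop
 *		below reads each non-parity column into the data buffer,
 *		XORs it into the parity buffer, and then writes the
 *		accumulated parity buffer out to the parity column.  Any
 *		single failed column can later be rebuilt by XORing the
 *		surviving columns with this parity.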
*/ ASSERT(IO_READER_HELD(un)); ASSERT(cs->cs_flags & MD_RCS_LLOCKD); ASSERT(UNIT_READER_HELD(un)); if (raid_state_cnt(un, RCS_OKAY) != colcnt) return; if (cs->cs_flags & MD_RCS_READER) { getpbuffer(cs); getdbuffer(cs); } ASSERT(cs->cs_dbuffer && cs->cs_pbuffer); bcount = cs->cs_bcount; buffer = cs->cs_dbuffer; parity_buffer = cs->cs_pbuffer; bzero(parity_buffer, bcount); bp = &cs->cs_dbuf; for (column = 0; column < colcnt; column++) { if (column == parity_column) continue; reset_buf(bp, B_READ | B_BUSY, bcount); bp->b_un.b_addr = buffer; bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev); bp->b_lblkno = cs->cs_blkno + un->un_column[column].un_devstart; bp->b_bcount = bcount; bp->b_bufsize = bcount; (void) md_call_strategy(bp, MD_STR_NOTTOP, NULL); if (biowait(bp)) goto bail; pbuf = (uint_t *)(void *)parity_buffer; dbuf = (uint_t *)(void *)buffer; for (j = 0; j < (bcount / (sizeof (uint_t))); j++) { *pbuf = *pbuf ^ *dbuf; pbuf++; dbuf++; } } reset_buf(bp, B_WRITE | B_BUSY, cs->cs_bcount); bp->b_un.b_addr = parity_buffer; bp->b_edev = md_dev64_to_dev(un->un_column[parity_column].un_dev); bp->b_lblkno = cs->cs_blkno + un->un_column[parity_column].un_devstart; bp->b_bcount = bcount; bp->b_bufsize = bcount; (void) md_call_strategy(bp, MD_STR_NOTTOP, NULL); if (biowait(bp)) goto bail; if (cs->cs_flags & MD_RCS_READER) { freebuffers(cs); cs->cs_pbuffer = NULL; cs->cs_dbuffer = NULL; } bp->b_chain = (struct buf *)cs; return; bail: if (cs->cs_flags & MD_RCS_READER) { freebuffers(cs); cs->cs_pbuffer = NULL; cs->cs_dbuffer = NULL; } md_unit_readerexit(ui); un = md_unit_writerlock(ui); raid_set_state(un, column, RCS_ERRED, 0); for (column = 0; column < colcnt; column++) raid_set_state(un, column, RCS_ERRED, 0); raid_commit(un, NULL); md_unit_writerexit(ui); un = md_unit_readerlock(ui); bp->b_chain = (struct buf *)cs; } /* * NAMES: raid_error_state * DESCRIPTION: check unit and column states' impact on I/O error * NOTE: the state now may not be the state when the * I/O completed due to race conditions. * PARAMETERS: mr_unit_t *un - pointer to raid unit structure * md_raidcs_t *cs - pointer to child structure * buf_t *bp - pointer to buffer structure */ static int raid_error_state(mr_unit_t *un, buf_t *bp) { int column; int i; ASSERT(IO_READER_HELD(un)); ASSERT(UNIT_WRITER_HELD(un)); column = -1; for (i = 0; i < un->un_totalcolumncnt; i++) { if (un->un_column[i].un_dev == md_expldev(bp->b_edev)) { column = i; break; } if (un->un_column[i].un_alt_dev == md_expldev(bp->b_edev)) { column = i; break; } } /* in case a replace snuck in while waiting on unit writer lock */ if (column == -1) { return (0); } (void) raid_set_state(un, column, RCS_ERRED, 0); ASSERT(un->un_state & (RUS_ERRED | RUS_LAST_ERRED)); raid_commit(un, NULL); if (un->un_state & RUS_ERRED) { SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); } else if (un->un_state & RUS_LAST_ERRED) { SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); } return (EIO); } /* * NAME: raid_mapin_buf * DESCRIPTION: wait for the input buffer header to be maped in * PARAMETERS: md_raidps_t *ps */ static void raid_mapin_buf(md_raidcs_t *cs) { md_raidps_t *ps = cs->cs_ps; /* * check to see if the buffer is maped. If all is ok return the * offset of the data and return. Since it is expensive to grab * a mutex this is only done if the mapin is not complete. * Once the mutex is aquired it is possible that the mapin was * not done so recheck and if necessary do the mapin. 
*/ if (ps->ps_mapin > 0) { cs->cs_addr = ps->ps_addr + cs->cs_offset; return; } mutex_enter(&ps->ps_mapin_mx); if (ps->ps_mapin > 0) { cs->cs_addr = ps->ps_addr + cs->cs_offset; mutex_exit(&ps->ps_mapin_mx); return; } bp_mapin(ps->ps_bp); /* * get the new b_addr out of the parent since bp_mapin just changed it */ ps->ps_addr = ps->ps_bp->b_un.b_addr; cs->cs_addr = ps->ps_addr + cs->cs_offset; ps->ps_mapin++; mutex_exit(&ps->ps_mapin_mx); } /* * NAMES: raid_read_no_retry * DESCRIPTION: I/O retry routine for a RAID metadevice read * read failed attempting to regenerate the data, * no retry possible, error occured in raid_raidregenloop(). * PARAMETERS: mr_unit_t *un - pointer to raid unit structure * md_raidcs_t *cs - pointer to child structure */ /*ARGSUSED*/ static void raid_read_no_retry(mr_unit_t *un, md_raidcs_t *cs) { md_raidps_t *ps = cs->cs_ps; raid_error_parent(ps, EIO); raid_free_child(cs, 1); /* decrement readfrags */ raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); } /* * NAMES: raid_read_retry * DESCRIPTION: I/O retry routine for a RAID metadevice read * PARAMETERS: md_raidcs_t *cs - pointer to child structure */ static void raid_read_retry(mr_unit_t *un, md_raidcs_t *cs) { /* re-initialize the buf_t structure for raid_read() */ cs->cs_dbuf.b_chain = (struct buf *)cs; cs->cs_dbuf.b_back = &cs->cs_dbuf; cs->cs_dbuf.b_forw = &cs->cs_dbuf; cs->cs_dbuf.b_flags = B_BUSY; /* initialize flags */ cs->cs_dbuf.b_error = 0; /* initialize error */ cs->cs_dbuf.b_offset = -1; /* Initialize semaphores */ sema_init(&cs->cs_dbuf.b_io, 0, NULL, SEMA_DEFAULT, NULL); sema_init(&cs->cs_dbuf.b_sem, 0, NULL, SEMA_DEFAULT, NULL); cs->cs_pbuf.b_chain = (struct buf *)cs; cs->cs_pbuf.b_back = &cs->cs_pbuf; cs->cs_pbuf.b_forw = &cs->cs_pbuf; cs->cs_pbuf.b_flags = B_BUSY; /* initialize flags */ cs->cs_pbuf.b_error = 0; /* initialize error */ cs->cs_pbuf.b_offset = -1; sema_init(&cs->cs_pbuf.b_io, 0, NULL, SEMA_DEFAULT, NULL); sema_init(&cs->cs_pbuf.b_sem, 0, NULL, SEMA_DEFAULT, NULL); cs->cs_flags &= ~MD_RCS_ERROR; /* reset child error flag */ cs->cs_flags |= MD_RCS_RECOVERY; /* set RECOVERY flag */ /* * re-scheduling I/O with raid_read_io() is simpler. basically, * raid_read_io() is invoked again with same child structure. * (NOTE: we aren`t supposed to do any error recovery when an I/O * error occured in raid_raidregenloop(). */ raid_mapin_buf(cs); raid_read_io(un, cs); } /* * NAMES: raid_rderr * DESCRIPTION: I/O error handling routine for a RAID metadevice read * PARAMETERS: md_raidcs_t *cs - pointer to child structure * LOCKS: must obtain unit writer lock while calling raid_error_state * since a unit or column state transition may take place. * must obtain unit reader lock to retry I/O. 
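 *
 *		The sequence below is: take the unit writer lock, run
 *		raid_error_state() for each buf that completed with
 *		B_ERROR, drop the writer lock, flag the parent for a
 *		hotspare request, then take the unit reader lock before
 *		calling cs_retry_call to reissue the I/O.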
*/ /*ARGSUSED*/ static void raid_rderr(md_raidcs_t *cs) { md_raidps_t *ps; mdi_unit_t *ui; mr_unit_t *un; int error = 0; ps = cs->cs_ps; ui = ps->ps_ui; un = (mr_unit_t *)md_unit_writerlock(ui); ASSERT(un != 0); if (cs->cs_dbuf.b_flags & B_ERROR) error = raid_error_state(un, &cs->cs_dbuf); if (cs->cs_pbuf.b_flags & B_ERROR) error |= raid_error_state(un, &cs->cs_pbuf); md_unit_writerexit(ui); ps->ps_flags |= MD_RPS_HSREQ; un = (mr_unit_t *)md_unit_readerlock(ui); ASSERT(un != 0); /* now attempt the appropriate retry routine */ (*(cs->cs_retry_call))(un, cs); } /* * NAMES: raid_read_error * DESCRIPTION: I/O error handling routine for a RAID metadevice read * PARAMETERS: md_raidcs_t *cs - pointer to child structure */ /*ARGSUSED*/ static void raid_read_error(md_raidcs_t *cs) { md_raidps_t *ps; mdi_unit_t *ui; mr_unit_t *un; set_t setno; ps = cs->cs_ps; ui = ps->ps_ui; un = cs->cs_un; setno = MD_UN2SET(un); if ((cs->cs_dbuf.b_flags & B_ERROR) && (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) && (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED)) cmn_err(CE_WARN, "md %s: read error on %s", md_shortname(MD_SID(un)), md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0)); if ((cs->cs_pbuf.b_flags & B_ERROR) && (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) && (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED)) cmn_err(CE_WARN, "md %s: read error on %s", md_shortname(MD_SID(un)), md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0)); md_unit_readerexit(ui); ASSERT(cs->cs_frags == 0); /* now schedule processing for possible state change */ daemon_request(&md_mstr_daemon, raid_rderr, (daemon_queue_t *)cs, REQ_OLD); } /* * NAMES: getdbuffer * DESCRIPTION: data buffer allocation for a child structure * PARAMETERS: md_raidcs_t *cs - pointer to child structure * * NOTE: always get dbuffer before pbuffer * and get both buffers before pwslot * otherwise a deadlock could be introduced. */ static void getdbuffer(md_raidcs_t *cs) { mr_unit_t *un; cs->cs_dbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP); if (cs->cs_dbuffer != NULL) return; un = cs->cs_ps->ps_un; mutex_enter(&un->un_mx); while (un->un_dbuffer == NULL) { STAT_INC(data_buffer_waits); un->un_rflags |= MD_RFLAG_NEEDBUF; cv_wait(&un->un_cv, &un->un_mx); } cs->cs_dbuffer = un->un_dbuffer; cs->cs_flags |= MD_RCS_UNDBUF; un->un_dbuffer = NULL; mutex_exit(&un->un_mx); } /* * NAMES: getpbuffer * DESCRIPTION: parity buffer allocation for a child structure * PARAMETERS: md_raidcs_t *cs - pointer to child structure * * NOTE: always get dbuffer before pbuffer * and get both buffers before pwslot * otherwise a deadlock could be introduced. */ static void getpbuffer(md_raidcs_t *cs) { mr_unit_t *un; cs->cs_pbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP); if (cs->cs_pbuffer != NULL) return; un = cs->cs_ps->ps_un; mutex_enter(&un->un_mx); while (un->un_pbuffer == NULL) { STAT_INC(parity_buffer_waits); un->un_rflags |= MD_RFLAG_NEEDBUF; cv_wait(&un->un_cv, &un->un_mx); } cs->cs_pbuffer = un->un_pbuffer; cs->cs_flags |= MD_RCS_UNPBUF; un->un_pbuffer = NULL; mutex_exit(&un->un_mx); } static void getresources(md_raidcs_t *cs) { md_raidcbuf_t *cbuf; /* * NOTE: always get dbuffer before pbuffer * and get both buffers before pwslot * otherwise a deadlock could be introduced. 
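	 *
	 *	Taking the shared resources in one fixed order (data
	 *	buffer, then parity buffer, then pre-write slots) means
	 *	two writers can never each hold one resource while
	 *	sleeping on the other, which is the deadlock the note
	 *	above is guarding against.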
*/ getdbuffer(cs); getpbuffer(cs); for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) cbuf->cbuf_buffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_SLEEP); } /* * NAMES: freebuffers * DESCRIPTION: child structure buffer freeing routine * PARAMETERS: md_raidcs_t *cs - pointer to child structure */ static void freebuffers(md_raidcs_t *cs) { mr_unit_t *un; md_raidcbuf_t *cbuf; /* free buffers used for full line write */ for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { if (cbuf->cbuf_buffer == NULL) continue; kmem_free(cbuf->cbuf_buffer, cbuf->cbuf_bcount + DEV_BSIZE); cbuf->cbuf_buffer = NULL; cbuf->cbuf_bcount = 0; } if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) { un = cs->cs_un; mutex_enter(&un->un_mx); } if (cs->cs_dbuffer) { if (cs->cs_flags & MD_RCS_UNDBUF) un->un_dbuffer = cs->cs_dbuffer; else kmem_free(cs->cs_dbuffer, cs->cs_bcount + DEV_BSIZE); } if (cs->cs_pbuffer) { if (cs->cs_flags & MD_RCS_UNPBUF) un->un_pbuffer = cs->cs_pbuffer; else kmem_free(cs->cs_pbuffer, cs->cs_bcount + DEV_BSIZE); } if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) { un->un_rflags &= ~MD_RFLAG_NEEDBUF; cv_broadcast(&un->un_cv); mutex_exit(&un->un_mx); } } /* * NAMES: raid_line_reader_lock, raid_line_writer_lock * DESCRIPTION: RAID metadevice line reader and writer lock routines * data column # and parity column #. * PARAMETERS: md_raidcs_t *cs - pointer to child structure */ void raid_line_reader_lock(md_raidcs_t *cs, int resync_thread) { mr_unit_t *un; md_raidcs_t *cs1; ASSERT(cs->cs_line != MD_DISKADDR_ERROR); un = cs->cs_un; cs->cs_flags |= MD_RCS_READER; STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); if (!panicstr) mutex_enter(&un->un_linlck_mx); cs1 = un->un_linlck_chn; while (cs1 != NULL) { for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) if (raid_io_overlaps(cs, cs1) == 1) if (cs1->cs_flags & MD_RCS_WRITER) break; if (cs1 != NULL) { if (panicstr) panic("md; raid line write lock held"); un->un_linlck_flg = 1; cv_wait(&un->un_linlck_cv, &un->un_linlck_mx); STAT_INC(raid_read_waits); } } STAT_MAX(raid_max_reader_locks, raid_reader_locks_active); STAT_INC(raid_reader_locks); cs1 = un->un_linlck_chn; if (cs1 != NULL) cs1->cs_linlck_prev = cs; cs->cs_linlck_next = cs1; cs->cs_linlck_prev = NULL; un->un_linlck_chn = cs; cs->cs_flags |= MD_RCS_LLOCKD; if (resync_thread) { diskaddr_t lastblk = cs->cs_blkno + cs->cs_blkcnt - 1; diskaddr_t line = (lastblk + 1) / un->un_segsize; ASSERT(raid_state_cnt(un, RCS_RESYNC)); mutex_enter(&un->un_mx); un->un_resync_line_index = line; mutex_exit(&un->un_mx); } if (!panicstr) mutex_exit(&un->un_linlck_mx); } int raid_line_writer_lock(md_raidcs_t *cs, int lock) { mr_unit_t *un; md_raidcs_t *cs1; ASSERT(cs->cs_line != MD_DISKADDR_ERROR); cs->cs_flags |= MD_RCS_WRITER; un = cs->cs_ps->ps_un; STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); if (lock && !panicstr) mutex_enter(&un->un_linlck_mx); ASSERT(MUTEX_HELD(&un->un_linlck_mx)); cs1 = un->un_linlck_chn; for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) if (raid_io_overlaps(cs, cs1)) break; if (cs1 != NULL) { if (panicstr) panic("md: line writer lock inaccessible"); goto no_lock_exit; } if (raid_alloc_pwslot(cs)) { if (panicstr) panic("md: no prewrite slots"); STAT_INC(raid_prewrite_waits); goto no_lock_exit; } cs1 = un->un_linlck_chn; if (cs1 != NULL) cs1->cs_linlck_prev = cs; cs->cs_linlck_next = cs1; cs->cs_linlck_prev = NULL; un->un_linlck_chn = cs; cs->cs_flags |= MD_RCS_LLOCKD; cs->cs_flags &= ~MD_RCS_WAITING; 
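	/*
	 * At this point the child holds the line writer lock and its
	 * pre-write slots and sits at the head of un_linlck_chn, so
	 * overlapping readers will wait on un_linlck_cv and overlapping
	 * writers will be queued until raid_line_exit() releases the
	 * lock for this child.
	 */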
STAT_INC(raid_writer_locks); STAT_MAX(raid_max_write_locks, raid_write_locks_active); if (lock && !panicstr) mutex_exit(&un->un_linlck_mx); return (0); no_lock_exit: /* if this is already queued then do not requeue it */ ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD)); if (!lock || (cs->cs_flags & MD_RCS_WAITING)) return (1); cs->cs_flags |= MD_RCS_WAITING; cs->cs_un = un; raid_enqueue(cs); if (lock && !panicstr) mutex_exit(&un->un_linlck_mx); return (1); } static void raid_startio(md_raidcs_t *cs) { mdi_unit_t *ui = cs->cs_ps->ps_ui; mr_unit_t *un = cs->cs_un; un = md_unit_readerlock(ui); raid_write_io(un, cs); } void raid_io_startup(mr_unit_t *un) { md_raidcs_t *waiting_list, *cs1; md_raidcs_t *previous = NULL, *next = NULL; mdi_unit_t *ui = MDI_UNIT(un->c.un_self_id); kmutex_t *io_list_mutex = &ui->ui_io_lock->io_list_mutex; ASSERT(MUTEX_HELD(&un->un_linlck_mx)); mutex_enter(io_list_mutex); /* * check to be sure there are no reader locks outstanding. If * there are not then pass on the writer lock. */ waiting_list = ui->ui_io_lock->io_list_front; while (waiting_list) { ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); ASSERT(! (waiting_list->cs_flags & MD_RCS_LLOCKD)); for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) if (raid_io_overlaps(waiting_list, cs1) == 1) break; /* * there was an IOs that overlaps this io so go onto * the next io in the waiting list */ if (cs1) { previous = waiting_list; waiting_list = waiting_list->cs_linlck_next; continue; } /* * There are no IOs that overlap this, so remove it from * the waiting queue, and start it */ if (raid_check_pw(waiting_list)) { ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); previous = waiting_list; waiting_list = waiting_list->cs_linlck_next; continue; } ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); next = waiting_list->cs_linlck_next; if (previous) previous->cs_linlck_next = next; else ui->ui_io_lock->io_list_front = next; if (ui->ui_io_lock->io_list_front == NULL) ui->ui_io_lock->io_list_back = NULL; if (ui->ui_io_lock->io_list_back == waiting_list) ui->ui_io_lock->io_list_back = previous; waiting_list->cs_linlck_next = NULL; waiting_list->cs_flags &= ~MD_RCS_WAITING; STAT_DEC(raid_write_queue_length); if (raid_line_writer_lock(waiting_list, 0)) panic("region locking corrupted"); ASSERT(waiting_list->cs_flags & MD_RCS_LLOCKD); daemon_request(&md_mstr_daemon, raid_startio, (daemon_queue_t *)waiting_list, REQ_OLD); waiting_list = next; } mutex_exit(io_list_mutex); } void raid_line_exit(md_raidcs_t *cs) { mr_unit_t *un; un = cs->cs_ps->ps_un; STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); mutex_enter(&un->un_linlck_mx); if (cs->cs_flags & MD_RCS_READER) STAT_DEC(raid_reader_locks_active); else STAT_DEC(raid_write_locks_active); if (cs->cs_linlck_prev) cs->cs_linlck_prev->cs_linlck_next = cs->cs_linlck_next; else un->un_linlck_chn = cs->cs_linlck_next; if (cs->cs_linlck_next) cs->cs_linlck_next->cs_linlck_prev = cs->cs_linlck_prev; cs->cs_flags &= ~MD_RCS_LLOCKD; if (un->un_linlck_flg) cv_broadcast(&un->un_linlck_cv); un->un_linlck_flg = 0; cs->cs_line = MD_DISKADDR_ERROR; raid_cancel_pwslot(cs); /* * now that the lock is droped go ahead and see if there are any * other writes that can be started up */ raid_io_startup(un); mutex_exit(&un->un_linlck_mx); } /* * NAMES: raid_line, raid_pcolumn, raid_dcolumn * DESCRIPTION: RAID metadevice APIs for mapping segment # to line #, * data column # and parity column #. 
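 *
 *		Worked example (illustrative values only): with
 *		un_origcolumncnt = 4 and un_segsincolumn = 10, the same
 *		geometry as the example layout in the raid_iosetup()
 *		comment below, segment 5 maps to line 5 / 3 = 1, data
 *		column (5 % 3 + 1) % 4 = 3 and parity column
 *		(1 + 3) % 4 = 0.  Segment 32 is past max_orig_segment
 *		(30), so it falls in a concatenated column: line
 *		(32 - 30) % 10 = 2, data column 4 + 0 = 4, parity
 *		column (2 + 3) % 4 = 1.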
* PARAMETERS: int segment - segment number * mr_unit_t *un - pointer to an unit structure * RETURNS: raid_line returns line # * raid_dcolumn returns data column # * raid_pcolumn returns parity column # */ static diskaddr_t raid_line(diskaddr_t segment, mr_unit_t *un) { diskaddr_t adj_seg; diskaddr_t line; diskaddr_t max_orig_segment; max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; if (segment >= max_orig_segment) { adj_seg = segment - max_orig_segment; line = adj_seg % un->un_segsincolumn; } else { line = segment / (un->un_origcolumncnt - 1); } return (line); } uint_t raid_dcolumn(diskaddr_t segment, mr_unit_t *un) { diskaddr_t adj_seg; diskaddr_t line; diskaddr_t max_orig_segment; uint_t column; max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; if (segment >= max_orig_segment) { adj_seg = segment - max_orig_segment; column = un->un_origcolumncnt + (uint_t)(adj_seg / un->un_segsincolumn); } else { line = segment / (un->un_origcolumncnt - 1); column = (uint_t)((segment % (un->un_origcolumncnt - 1) + line) % un->un_origcolumncnt); } return (column); } uint_t raid_pcolumn(diskaddr_t segment, mr_unit_t *un) { diskaddr_t adj_seg; diskaddr_t line; diskaddr_t max_orig_segment; uint_t column; max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; if (segment >= max_orig_segment) { adj_seg = segment - max_orig_segment; line = adj_seg % un->un_segsincolumn; } else { line = segment / (un->un_origcolumncnt - 1); } column = (uint_t)((line + (un->un_origcolumncnt - 1)) % un->un_origcolumncnt); return (column); } /* * Is called in raid_iosetup to probe each column to insure * that all the columns are in 'okay' state and meet the * 'full line' requirement. If any column is in error, * we don't want to enable the 'full line' flag. Previously, * we would do so and disable it only when a error is * detected after the first 'full line' io which is too late * and leads to the potential data corruption. */ static int raid_check_cols(mr_unit_t *un) { buf_t bp; char *buf; mr_column_t *colptr; minor_t mnum = MD_SID(un); int i; int err = 0; buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP); for (i = 0; i < un->un_totalcolumncnt; i++) { md_dev64_t tmpdev; colptr = &un->un_column[i]; tmpdev = colptr->un_dev; /* * Open by device id * If this device is hotspared * use the hotspare key */ tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ? colptr->un_hs_key : colptr->un_orig_key); if (tmpdev == NODEV64) { err = 1; break; } colptr->un_dev = tmpdev; bzero((caddr_t)&bp, sizeof (buf_t)); bp.b_back = &bp; bp.b_forw = &bp; bp.b_flags = (B_READ | B_BUSY); sema_init(&bp.b_io, 0, NULL, SEMA_DEFAULT, NULL); sema_init(&bp.b_sem, 0, NULL, SEMA_DEFAULT, NULL); bp.b_edev = md_dev64_to_dev(colptr->un_dev); bp.b_lblkno = colptr->un_pwstart; bp.b_bcount = DEV_BSIZE; bp.b_bufsize = DEV_BSIZE; bp.b_un.b_addr = (caddr_t)buf; (void) md_call_strategy(&bp, 0, NULL); if (biowait(&bp)) { err = 1; break; } } kmem_free(buf, DEV_BSIZE); return (err); } /* * NAME: raid_iosetup * DESCRIPTION: RAID metadevice specific I/O set up routine which does * all the necessary calculations to determine the location * of the segement for the I/O. 
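 *
 *		If a write exactly covers one full line (segment aligned,
 *		all original columns present and okay, and the block count
 *		spans every data segment of the line), the child is
 *		flagged MD_RCS_LINE and a cbuf is allocated for each
 *		additional data column, so parity can be generated from
 *		the new data alone (genlineparity()) instead of the usual
 *		read-modify-write of one data and one parity segment.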
* PARAMETERS: mr_unit_t *un - unit number of RAID metadevice * diskaddr_t blkno - block number of the I/O attempt * size_t blkcnt - block count for this I/O * md_raidcs_t *cs - child structure for each segmented I/O * * NOTE: The following is an example of a raid disk layer out: * * Total Column = 5 * Original Column = 4 * Segment Per Column = 10 * * Col#0 Col#1 Col#2 Col#3 Col#4 Col#5 Col#6 * ------------------------------------------------------------- * line#0 Seg#0 Seg#1 Seg#2 Parity Seg#30 Seg#40 * line#1 Parity Seg#3 Seg#4 Seg#5 Seg#31 * line#2 Seg#8 Parity Seg#6 Seg#7 Seg#32 * line#3 Seg#10 Seg#11 Parity Seg#9 Seg#33 * line#4 Seg#12 Seg#13 Seg#14 Parity Seg#34 * line#5 Parity Seg#15 Seg#16 Seg#17 Seg#35 * line#6 Seg#20 Parity Seg#18 Seg#19 Seg#36 * line#7 Seg#22 Seg#23 Parity Seg#21 Seg#37 * line#8 Seg#24 Seg#25 Seg#26 Parity Seg#38 * line#9 Parity Seg#27 Seg#28 Seg#29 Seg#39 */ static size_t raid_iosetup( mr_unit_t *un, diskaddr_t blkno, size_t blkcnt, md_raidcs_t *cs ) { diskaddr_t segment; diskaddr_t segstart; diskaddr_t segoff; size_t leftover; diskaddr_t line; uint_t iosize; uint_t colcnt; /* caculate the segment# and offset for the block */ segment = blkno / un->un_segsize; segstart = segment * un->un_segsize; segoff = blkno - segstart; iosize = un->un_iosize - 1; colcnt = un->un_totalcolumncnt - 1; line = raid_line(segment, un); cs->cs_dcolumn = raid_dcolumn(segment, un); cs->cs_pcolumn = raid_pcolumn(segment, un); cs->cs_dflags = un->un_column[cs->cs_dcolumn].un_devflags; cs->cs_pflags = un->un_column[cs->cs_pcolumn].un_devflags; cs->cs_line = line; if ((cs->cs_ps->ps_flags & MD_RPS_WRITE) && (UNIT_STATE(un) & RCS_OKAY) && (segoff == 0) && (un->un_totalcolumncnt == un->un_origcolumncnt) && (un->un_segsize < un->un_iosize) && (un->un_iosize <= un->un_maxio) && (blkno == line * un->un_segsize * colcnt) && (blkcnt >= ((un->un_totalcolumncnt -1) * un->un_segsize)) && (raid_state_cnt(un, RCS_OKAY) == un->un_origcolumncnt) && (raid_check_cols(un) == 0)) { md_raidcbuf_t **cbufp; md_raidcbuf_t *cbuf; int i, j; STAT_INC(raid_full_line_writes); leftover = blkcnt - (un->un_segsize * colcnt); ASSERT(blkcnt >= (un->un_segsize * colcnt)); cs->cs_blkno = line * un->un_segsize; cs->cs_blkcnt = un->un_segsize; cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1; cs->cs_bcount = dbtob(cs->cs_blkcnt); cs->cs_flags |= MD_RCS_LINE; cbufp = &cs->cs_buflist; for (i = 0; i < un->un_totalcolumncnt; i++) { j = cs->cs_dcolumn + i; j = j % un->un_totalcolumncnt; if ((j == cs->cs_dcolumn) || (j == cs->cs_pcolumn)) continue; cbuf = kmem_cache_alloc(raid_cbuf_cache, MD_ALLOCFLAGS); raid_cbuf_init(cbuf); cbuf->cbuf_un = cs->cs_un; cbuf->cbuf_ps = cs->cs_ps; cbuf->cbuf_column = j; cbuf->cbuf_bcount = dbtob(un->un_segsize); *cbufp = cbuf; cbufp = &cbuf->cbuf_next; } return (leftover); } leftover = blkcnt - (un->un_segsize - segoff); if (blkcnt > (un->un_segsize - segoff)) blkcnt -= leftover; else leftover = 0; if (blkcnt > (size_t)iosize) { leftover += (blkcnt - iosize); blkcnt = iosize; } /* calculate the line# and column# for the segment */ cs->cs_flags &= ~MD_RCS_LINE; cs->cs_blkno = line * un->un_segsize + segoff; cs->cs_blkcnt = (uint_t)blkcnt; cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1; cs->cs_bcount = dbtob((uint_t)blkcnt); return (leftover); } /* * NAME: raid_done * DESCRIPTION: RAID metadevice I/O done interrupt routine * PARAMETERS: struct buf *bp - pointer to a buffer structure */ static void raid_done(struct buf *bp) { md_raidcs_t *cs; int flags, frags; sema_v(&bp->b_io); cs = (md_raidcs_t 
*)bp->b_chain; ASSERT(cs != NULL); mutex_enter(&cs->cs_mx); if (bp->b_flags & B_ERROR) { cs->cs_flags |= MD_RCS_ERROR; cs->cs_flags &= ~(MD_RCS_ISCALL); } flags = cs->cs_flags; frags = --cs->cs_frags; mutex_exit(&cs->cs_mx); if (frags != 0) { return; } if (flags & MD_RCS_ERROR) { if (cs->cs_error_call) { daemon_request(&md_done_daemon, cs->cs_error_call, (daemon_queue_t *)cs, REQ_OLD); } return; } if (flags & MD_RCS_ISCALL) { cs->cs_flags &= ~(MD_RCS_ISCALL); (*(cs->cs_call))(cs); return; } daemon_request(&md_done_daemon, cs->cs_call, (daemon_queue_t *)cs, REQ_OLD); } /* * the flag RIO_EXTRA is used when dealing with a column in the process * of being resynced. During the resync, writes may have to take place * on both the original component and a hotspare component. */ #define RIO_DATA 0x00100 /* use data buffer & data column */ #define RIO_PARITY 0x00200 /* use parity buffer & parity column */ #define RIO_WRITE 0x00400 /* issue a write */ #define RIO_READ 0x00800 /* issue a read */ #define RIO_PWIO 0x01000 /* do the I/O to the prewrite entry */ #define RIO_ALT 0x02000 /* do write to alternate device */ #define RIO_EXTRA 0x04000 /* use extra buffer */ #define RIO_COLMASK 0x000ff #define RIO_PREWRITE RIO_WRITE | RIO_PWIO /* * NAME: raidio * DESCRIPTION: RAID metadevice write routine * PARAMETERS: md_raidcs_t *cs - pointer to a child structure */ static void raidio(md_raidcs_t *cs, int flags) { buf_t *bp; int column; int flag; void *private; mr_unit_t *un; int iosize; diskaddr_t pwstart; diskaddr_t devstart; md_dev64_t dev; un = cs->cs_un; ASSERT(IO_READER_HELD(un)); ASSERT(UNIT_READER_HELD(un)); if (flags & RIO_DATA) { if (flags & RIO_EXTRA) bp = &cs->cs_hbuf; else bp = &cs->cs_dbuf; bp->b_un.b_addr = cs->cs_dbuffer; column = cs->cs_dcolumn; } else { if (flags & RIO_EXTRA) bp = &cs->cs_hbuf; else bp = &cs->cs_pbuf; bp->b_un.b_addr = cs->cs_pbuffer; column = cs->cs_pcolumn; } if (flags & RIO_COLMASK) column = (flags & RIO_COLMASK) - 1; bp->b_bcount = cs->cs_bcount; bp->b_bufsize = cs->cs_bcount; iosize = un->un_iosize; /* check if the hotspared device will be used */ if (flags & RIO_ALT && (flags & RIO_WRITE)) { pwstart = un->un_column[column].un_alt_pwstart; devstart = un->un_column[column].un_alt_devstart; dev = un->un_column[column].un_alt_dev; } else { pwstart = un->un_column[column].un_pwstart; devstart = un->un_column[column].un_devstart; dev = un->un_column[column].un_dev; } /* if not writing to log skip log header */ if ((flags & RIO_PWIO) == 0) { bp->b_lblkno = devstart + cs->cs_blkno; bp->b_un.b_addr += DEV_BSIZE; } else { bp->b_bcount += DEV_BSIZE; bp->b_bufsize = bp->b_bcount; if (flags & RIO_DATA) { bp->b_lblkno = cs->cs_dpwslot * iosize + pwstart; } else { /* not DATA -> PARITY */ bp->b_lblkno = cs->cs_ppwslot * iosize + pwstart; } } bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR | nv_available); bp->b_flags |= B_BUSY; if (flags & RIO_READ) { bp->b_flags |= B_READ; } else { bp->b_flags |= B_WRITE; if ((nv_available && nv_parity && (flags & RIO_PARITY)) || (nv_available && nv_prewrite && (flags & RIO_PWIO))) bp->b_flags |= nv_available; } bp->b_iodone = (int (*)())raid_done; bp->b_edev = md_dev64_to_dev(dev); ASSERT((bp->b_edev != 0) && (bp->b_edev != NODEV)); private = cs->cs_strategy_private; flag = cs->cs_strategy_flag; md_call_strategy(bp, flag, private); } /* * NAME: genstandardparity * DESCRIPTION: This routine * PARAMETERS: md_raidcs_t *cs - pointer to a child structure */ static void genstandardparity(md_raidcs_t *cs) { uint_t *dbuf, *pbuf; size_t wordcnt; uint_t dsum = 0; 
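	/*
	 * Read-modify-write parity update: cs_dbuffer and cs_pbuffer
	 * hold the old data and old parity previously read in by the
	 * write path, so each new parity word below is computed as
	 * old_parity ^ old_data ^ new_data.  dsum and psum accumulate
	 * XOR checksums of the new data and new parity blocks for the
	 * pre-write headers filled in by RAID_FILLIN_RPW.
	 */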
uint_t psum = 0; ASSERT((cs->cs_bcount & 0x3) == 0); wordcnt = cs->cs_bcount / sizeof (uint_t); dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); /* Word aligned */ if (((uintptr_t)cs->cs_addr & 0x3) == 0) { uint_t *uwbuf = (uint_t *)(void *)(cs->cs_addr); uint_t uval; while (wordcnt--) { uval = *uwbuf++; psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ uval)); ++pbuf; *dbuf = uval; dsum ^= uval; ++dbuf; } } else { uchar_t *ubbuf = (uchar_t *)(cs->cs_addr); union { uint_t wb; uchar_t bb[4]; } cb; while (wordcnt--) { cb.bb[0] = *ubbuf++; cb.bb[1] = *ubbuf++; cb.bb[2] = *ubbuf++; cb.bb[3] = *ubbuf++; psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ cb.wb)); ++pbuf; *dbuf = cb.wb; dsum ^= cb.wb; ++dbuf; } } RAID_FILLIN_RPW(cs->cs_dbuffer, cs->cs_un, dsum, cs->cs_pcolumn, cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2, cs->cs_dcolumn, RAID_PWMAGIC); RAID_FILLIN_RPW(cs->cs_pbuffer, cs->cs_un, psum, cs->cs_dcolumn, cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 2, cs->cs_pcolumn, RAID_PWMAGIC); } static void genlineparity(md_raidcs_t *cs) { mr_unit_t *un = cs->cs_un; md_raidcbuf_t *cbuf; uint_t *pbuf, *dbuf; uint_t *uwbuf; uchar_t *ubbuf; size_t wordcnt; uint_t psum = 0, dsum = 0; size_t count = un->un_segsize * DEV_BSIZE; uint_t col; buf_t *bp; ASSERT((cs->cs_bcount & 0x3) == 0); pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); uwbuf = (uint_t *)(void *)(cs->cs_addr); ubbuf = (uchar_t *)(void *)(cs->cs_addr); wordcnt = count / sizeof (uint_t); /* Word aligned */ if (((uintptr_t)cs->cs_addr & 0x3) == 0) { uint_t uval; while (wordcnt--) { uval = *uwbuf++; *dbuf = uval; *pbuf = uval; dsum ^= uval; ++pbuf; ++dbuf; } } else { union { uint_t wb; uchar_t bb[4]; } cb; while (wordcnt--) { cb.bb[0] = *ubbuf++; cb.bb[1] = *ubbuf++; cb.bb[2] = *ubbuf++; cb.bb[3] = *ubbuf++; *dbuf = cb.wb; *pbuf = cb.wb; dsum ^= cb.wb; ++pbuf; ++dbuf; } } RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, cs->cs_pcolumn, cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, un->un_totalcolumncnt, cs->cs_dcolumn, RAID_PWMAGIC); raidio(cs, RIO_PREWRITE | RIO_DATA); for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { dsum = 0; pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); dbuf = (uint_t *)(void *)(cbuf->cbuf_buffer + DEV_BSIZE); wordcnt = count / sizeof (uint_t); col = cbuf->cbuf_column; /* Word aligned */ if (((uintptr_t)cs->cs_addr & 0x3) == 0) { uint_t uval; /* * Only calculate psum when working on the last * data buffer. */ if (cbuf->cbuf_next == NULL) { psum = 0; while (wordcnt--) { uval = *uwbuf++; *dbuf = uval; psum ^= (*pbuf ^= uval); dsum ^= uval; ++dbuf; ++pbuf; } } else { while (wordcnt--) { uval = *uwbuf++; *dbuf = uval; *pbuf ^= uval; dsum ^= uval; ++dbuf; ++pbuf; } } } else { union { uint_t wb; uchar_t bb[4]; } cb; /* * Only calculate psum when working on the last * data buffer. 
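 * By that point the parity buffer already holds the XOR of all the
 * previously processed columns, so a single pass can both fold in
 * the final column and accumulate the checksum of the finished
 * parity for its pre-write header.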
*/ if (cbuf->cbuf_next == NULL) { psum = 0; while (wordcnt--) { cb.bb[0] = *ubbuf++; cb.bb[1] = *ubbuf++; cb.bb[2] = *ubbuf++; cb.bb[3] = *ubbuf++; *dbuf = cb.wb; psum ^= (*pbuf ^= cb.wb); dsum ^= cb.wb; ++dbuf; ++pbuf; } } else { while (wordcnt--) { cb.bb[0] = *ubbuf++; cb.bb[1] = *ubbuf++; cb.bb[2] = *ubbuf++; cb.bb[3] = *ubbuf++; *dbuf = cb.wb; *pbuf ^= cb.wb; dsum ^= cb.wb; ++dbuf; ++pbuf; } } } RAID_FILLIN_RPW(cbuf->cbuf_buffer, un, dsum, cs->cs_pcolumn, cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, un->un_totalcolumncnt, col, RAID_PWMAGIC); /* * fill in buffer for write to prewrite area */ bp = &cbuf->cbuf_bp; bp->b_un.b_addr = cbuf->cbuf_buffer; bp->b_bcount = cbuf->cbuf_bcount + DEV_BSIZE; bp->b_bufsize = bp->b_bcount; bp->b_lblkno = (cbuf->cbuf_pwslot * un->un_iosize) + un->un_column[col].un_pwstart; bp->b_flags = B_WRITE | B_BUSY; if (nv_available && nv_prewrite) bp->b_flags |= nv_available; bp->b_iodone = (int (*)())raid_done; bp->b_edev = md_dev64_to_dev(un->un_column[col].un_dev); bp->b_chain = (struct buf *)cs; md_call_strategy(bp, cs->cs_strategy_flag, cs->cs_strategy_private); } RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, cs->cs_dcolumn, cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, un->un_totalcolumncnt, cs->cs_pcolumn, RAID_PWMAGIC); raidio(cs, RIO_PREWRITE | RIO_PARITY); } /* * NAME: raid_readregenloop * DESCRIPTION: RAID metadevice read regeneration routine * PARAMETERS: md_raidcs_t *cs - pointer to a child structure */ static void raid_readregenloop(md_raidcs_t *cs) { mr_unit_t *un; md_raidps_t *ps; uint_t *dbuf; uint_t *pbuf; size_t wordcnt; un = cs->cs_un; /* * XOR the parity with data bytes, must skip the * pre-write entry header in all data/parity buffers */ wordcnt = cs->cs_bcount / sizeof (uint_t); dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); while (wordcnt--) *dbuf++ ^= *pbuf++; /* bump up the loop count */ cs->cs_loop++; /* skip the errored component */ if (cs->cs_loop == cs->cs_dcolumn) cs->cs_loop++; if (cs->cs_loop != un->un_totalcolumncnt) { cs->cs_frags = 1; raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1)); return; } /* reaching the end of the loop */ ps = cs->cs_ps; bcopy(cs->cs_dbuffer + DEV_BSIZE, cs->cs_addr, cs->cs_bcount); raid_free_child(cs, 1); /* decrement readfrags */ raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); } /* * NAME: raid_read_io * DESCRIPTION: RAID metadevice read I/O routine * PARAMETERS: mr_unit_t *un - pointer to a unit structure * md_raidcs_t *cs - pointer to a child structure */ static void raid_read_io(mr_unit_t *un, md_raidcs_t *cs) { int flag; void *private; buf_t *bp; buf_t *pb = cs->cs_ps->ps_bp; mr_column_t *column; flag = cs->cs_strategy_flag; private = cs->cs_strategy_private; column = &un->un_column[cs->cs_dcolumn]; /* * The component to be read is good, simply set up bp structure * and call low level md routine doing the read.
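 * If the column is errored instead, the code further below regenerates
 * the requested data by XOR-ing together the corresponding blocks of
 * every other column, data and parity alike (see raid_readregenloop()
 * above).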
*/ if (COLUMN_ISOKAY(un, cs->cs_dcolumn) || (COLUMN_ISLASTERR(un, cs->cs_dcolumn) && (cs->cs_flags & MD_RCS_RECOVERY) == 0)) { dev_t ddi_dev; /* needed for bioclone, so not md_dev64_t */ ddi_dev = md_dev64_to_dev(column->un_dev); bp = &cs->cs_dbuf; bp = md_bioclone(pb, cs->cs_offset, cs->cs_bcount, ddi_dev, column->un_devstart + cs->cs_blkno, (int (*)())raid_done, bp, KM_NOSLEEP); bp->b_chain = (buf_t *)cs; cs->cs_frags = 1; cs->cs_error_call = raid_read_error; cs->cs_retry_call = raid_read_retry; cs->cs_flags |= MD_RCS_ISCALL; cs->cs_stage = RAID_READ_DONE; cs->cs_call = raid_stage; ASSERT(bp->b_edev != 0); md_call_strategy(bp, flag, private); return; } /* * The component to be read is bad, have to go through * raid specific method to read data from other members. */ cs->cs_loop = 0; /* * NOTE: always get dbuffer before pbuffer * and get both buffers before pwslot * otherwise a deadlock could be introduced. */ raid_mapin_buf(cs); getdbuffer(cs); getpbuffer(cs); if (cs->cs_loop == cs->cs_dcolumn) cs->cs_loop++; /* zero out data buffer for use as a data sink */ bzero(cs->cs_dbuffer + DEV_BSIZE, cs->cs_bcount); cs->cs_stage = RAID_NONE; cs->cs_call = raid_readregenloop; cs->cs_error_call = raid_read_error; cs->cs_retry_call = raid_read_no_retry; cs->cs_frags = 1; /* use parity buffer to read other columns */ raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1)); } /* * NAME: raid_read * DESCRIPTION: RAID metadevice read routine * PARAMETERS: mr_unit_t *un - pointer to a unit structure * md_raidcs_t *cs - pointer to a child structure */ static int raid_read(mr_unit_t *un, md_raidcs_t *cs) { int error = 0; md_raidps_t *ps; mdi_unit_t *ui; minor_t mnum; ASSERT(IO_READER_HELD(un)); ps = cs->cs_ps; ui = ps->ps_ui; raid_line_reader_lock(cs, 0); un = (mr_unit_t *)md_unit_readerlock(ui); ASSERT(UNIT_STATE(un) != RUS_INIT); mnum = MD_SID(un); cs->cs_un = un; /* make sure the read doesn't go beyond the end of the column */ if (cs->cs_blkno + cs->cs_blkcnt > un->un_segsize * un->un_segsincolumn) { error = ENXIO; } if (error) goto rerror; if (un->un_state & RUS_REGEN) { raid_regen_parity(cs); un = MD_UNIT(mnum); cs->cs_un = un; } raid_read_io(un, cs); return (0); rerror: raid_error_parent(ps, error); raid_free_child(cs, 1); /* decrement readfrags */ raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); return (0); } /* * NAME: raid_write_err_retry * DESCRIPTION: RAID metadevice write retry routine * write was for parity or data only; * complete write with error, no recovery possible * PARAMETERS: mr_unit_t *un - pointer to a unit structure * md_raidcs_t *cs - pointer to a child structure */ /*ARGSUSED*/ static void raid_write_err_retry(mr_unit_t *un, md_raidcs_t *cs) { md_raidps_t *ps = cs->cs_ps; int flags = RFP_DECR_FRAGS | RFP_RLS_LOCK; /* decrement pwfrags if needed, and frags */ if (!(cs->cs_flags & MD_RCS_PWDONE)) flags |= RFP_DECR_PWFRAGS; raid_error_parent(ps, EIO); raid_free_child(cs, 1); raid_free_parent(ps, flags); } /* * NAME: raid_write_no_retry * DESCRIPTION: RAID metadevice write retry routine * write is too far along to retry and parent * has already been signaled with iodone.
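 * Unlike raid_write_err_retry(), no error is propagated to the
 * parent here; the outstanding fragment counts are simply released.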
* PARAMETERS: mr_unit_t *un - pointer to a unit structure * md_raidcs_t *cs - pointer to a child structure */ /*ARGSUSED*/ static void raid_write_no_retry(mr_unit_t *un, md_raidcs_t *cs) { md_raidps_t *ps = cs->cs_ps; int flags = RFP_DECR_FRAGS | RFP_RLS_LOCK; /* decrement pwfrags if needed, and frags */ if (!(cs->cs_flags & MD_RCS_PWDONE)) flags |= RFP_DECR_PWFRAGS; raid_free_child(cs, 1); raid_free_parent(ps, flags); } /* * NAME: raid_write_retry * DESCRIPTION: RAID metadevice write retry routine * PARAMETERS: mr_unit_t *un - pointer to a unit structure * md_raidcs_t *cs - pointer to a child structure */ static void raid_write_retry(mr_unit_t *un, md_raidcs_t *cs) { md_raidps_t *ps; ps = cs->cs_ps; /* re-initialize the buf_t structure for raid_write() */ cs->cs_dbuf.b_chain = (struct buf *)cs; cs->cs_dbuf.b_back = &cs->cs_dbuf; cs->cs_dbuf.b_forw = &cs->cs_dbuf; cs->cs_dbuf.b_flags = B_BUSY; /* initialize flags */ cs->cs_dbuf.b_error = 0; /* initialize error */ cs->cs_dbuf.b_offset = -1; /* Initialize semaphores */ sema_init(&cs->cs_dbuf.b_io, 0, NULL, SEMA_DEFAULT, NULL); sema_init(&cs->cs_dbuf.b_sem, 0, NULL, SEMA_DEFAULT, NULL); cs->cs_pbuf.b_chain = (struct buf *)cs; cs->cs_pbuf.b_back = &cs->cs_pbuf; cs->cs_pbuf.b_forw = &cs->cs_pbuf; cs->cs_pbuf.b_flags = B_BUSY; /* initialize flags */ cs->cs_pbuf.b_error = 0; /* initialize error */ cs->cs_pbuf.b_offset = -1; sema_init(&cs->cs_pbuf.b_io, 0, NULL, SEMA_DEFAULT, NULL); sema_init(&cs->cs_pbuf.b_sem, 0, NULL, SEMA_DEFAULT, NULL); cs->cs_hbuf.b_chain = (struct buf *)cs; cs->cs_hbuf.b_back = &cs->cs_hbuf; cs->cs_hbuf.b_forw = &cs->cs_hbuf; cs->cs_hbuf.b_flags = B_BUSY; /* initialize flags */ cs->cs_hbuf.b_error = 0; /* initialize error */ cs->cs_hbuf.b_offset = -1; sema_init(&cs->cs_hbuf.b_io, 0, NULL, SEMA_DEFAULT, NULL); sema_init(&cs->cs_hbuf.b_sem, 0, NULL, SEMA_DEFAULT, NULL); cs->cs_flags &= ~(MD_RCS_ERROR); /* * If we have already done'ed the i/o but have done prewrite * on this child, then reset PWDONE flag and bump pwfrags before * restarting i/o. * If pwfrags is zero, we have already 'iodone'd the i/o so * leave things alone. We don't want to re-'done' it. */ mutex_enter(&ps->ps_mx); if (cs->cs_flags & MD_RCS_PWDONE) { cs->cs_flags &= ~MD_RCS_PWDONE; ps->ps_pwfrags++; } mutex_exit(&ps->ps_mx); raid_write_io(un, cs); } /* * NAME: raid_wrerr * DESCRIPTION: RAID metadevice write routine * PARAMETERS: md_raidcs_t *cs - pointer to a child structure * LOCKS: must obtain unit writer lock while calling raid_error_state * since a unit or column state transition may take place. * must obtain unit reader lock to retry I/O. 
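 * The writer lock is dropped again before the retry routine is
 * invoked, so the retry itself runs with only the unit reader
 * lock held.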
*/ static void raid_wrerr(md_raidcs_t *cs) { md_raidps_t *ps; mdi_unit_t *ui; mr_unit_t *un; md_raidcbuf_t *cbuf; ps = cs->cs_ps; ui = ps->ps_ui; un = (mr_unit_t *)md_unit_writerlock(ui); ASSERT(un != 0); if (cs->cs_dbuf.b_flags & B_ERROR) (void) raid_error_state(un, &cs->cs_dbuf); if (cs->cs_pbuf.b_flags & B_ERROR) (void) raid_error_state(un, &cs->cs_pbuf); if (cs->cs_hbuf.b_flags & B_ERROR) (void) raid_error_state(un, &cs->cs_hbuf); for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) if (cbuf->cbuf_bp.b_flags & B_ERROR) (void) raid_error_state(un, &cbuf->cbuf_bp); md_unit_writerexit(ui); ps->ps_flags |= MD_RPS_HSREQ; un = (mr_unit_t *)md_unit_readerlock(ui); /* now attempt the appropriate retry routine */ (*(cs->cs_retry_call))(un, cs); } /* * NAMES: raid_write_error * DESCRIPTION: I/O error handling routine for a RAID metadevice write * PARAMETERS: md_raidcs_t *cs - pointer to child structure */ /*ARGSUSED*/ static void raid_write_error(md_raidcs_t *cs) { md_raidps_t *ps; mdi_unit_t *ui; mr_unit_t *un; md_raidcbuf_t *cbuf; set_t setno; ps = cs->cs_ps; ui = ps->ps_ui; un = cs->cs_un; setno = MD_UN2SET(un); /* * locate each buf that is in error on this io and then * output an error message */ if ((cs->cs_dbuf.b_flags & B_ERROR) && (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) && (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED)) cmn_err(CE_WARN, "md %s: write error on %s", md_shortname(MD_SID(un)), md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0)); if ((cs->cs_pbuf.b_flags & B_ERROR) && (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) && (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED)) cmn_err(CE_WARN, "md %s: write error on %s", md_shortname(MD_SID(un)), md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0)); for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) if ((cbuf->cbuf_bp.b_flags & B_ERROR) && (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_ERRED) && (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_LAST_ERRED)) cmn_err(CE_WARN, "md %s: write error on %s", md_shortname(MD_SID(un)), md_devname(setno, md_expldev(cbuf->cbuf_bp.b_edev), NULL, 0)); md_unit_readerexit(ui); ASSERT(cs->cs_frags == 0); /* now schedule processing for possible state change */ daemon_request(&md_mstr_daemon, raid_wrerr, (daemon_queue_t *)cs, REQ_OLD); } /* * NAME: raid_write_ponly * DESCRIPTION: RAID metadevice write routine * in the case where only the parity column can be written * PARAMETERS: md_raidcs_t *cs - pointer to a child structure */ static void raid_write_ponly(md_raidcs_t *cs) { md_raidps_t *ps; mr_unit_t *un = cs->cs_un; ps = cs->cs_ps; /* decrement pwfrags if needed, but not frags */ ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); raid_free_parent(ps, RFP_DECR_PWFRAGS); cs->cs_flags |= MD_RCS_PWDONE; cs->cs_frags = 1; cs->cs_stage = RAID_WRITE_PONLY_DONE; cs->cs_call = raid_stage; cs->cs_error_call = raid_write_error; cs->cs_retry_call = raid_write_no_retry; if (WRITE_ALT(un, cs->cs_pcolumn)) { cs->cs_frags++; raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_WRITE); } raidio(cs, RIO_PARITY | RIO_WRITE); } /* * NAME: raid_write_ploop * DESCRIPTION: RAID metadevice write routine, constructs parity from * data in other columns. 
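 * The parity buffer starts out holding the new data and is then
 * XOR-ed with the data read back from each remaining column in
 * turn, which yields the new parity without reading the errored
 * data column.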
* PARAMETERS: md_raidcs_t *cs - pointer to a child structure */ static void raid_write_ploop(md_raidcs_t *cs) { mr_unit_t *un = cs->cs_un; uint_t *dbuf; uint_t *pbuf; size_t wordcnt; uint_t psum = 0; wordcnt = cs->cs_bcount / sizeof (uint_t); dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); while (wordcnt--) *pbuf++ ^= *dbuf++; cs->cs_loop++; /* * build parity from scratch using new data, * skip reading the data and parity columns. */ while (cs->cs_loop == cs->cs_dcolumn || cs->cs_loop == cs->cs_pcolumn) cs->cs_loop++; if (cs->cs_loop != un->un_totalcolumncnt) { cs->cs_frags = 1; raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1)); return; } /* construct checksum for parity buffer */ wordcnt = cs->cs_bcount / sizeof (uint_t); pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); while (wordcnt--) { psum ^= *pbuf; pbuf++; } RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, -1, cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 1, cs->cs_pcolumn, RAID_PWMAGIC); cs->cs_stage = RAID_NONE; cs->cs_call = raid_write_ponly; cs->cs_error_call = raid_write_error; cs->cs_retry_call = raid_write_err_retry; cs->cs_frags = 1; if (WRITE_ALT(un, cs->cs_pcolumn)) { cs->cs_frags++; raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE); } raidio(cs, RIO_PARITY | RIO_PREWRITE); } /* * NAME: raid_write_donly * DESCRIPTION: RAID metadevice write routine * Completed writing data to prewrite entry * in the case where only the data column can be written * PARAMETERS: md_raidcs_t *cs - pointer to a child structure */ static void raid_write_donly(md_raidcs_t *cs) { md_raidps_t *ps; mr_unit_t *un = cs->cs_un; ps = cs->cs_ps; /* WARNING: don't release unit reader lock here... */ /* decrement pwfrags if needed, but not frags */ ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); raid_free_parent(ps, RFP_DECR_PWFRAGS); cs->cs_flags |= MD_RCS_PWDONE; cs->cs_frags = 1; cs->cs_stage = RAID_WRITE_DONLY_DONE; cs->cs_call = raid_stage; cs->cs_error_call = raid_write_error; cs->cs_retry_call = raid_write_err_retry; if (WRITE_ALT(un, cs->cs_dcolumn)) { cs->cs_frags++; raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE); } raidio(cs, RIO_DATA | RIO_WRITE); } /* * NAME: raid_write_got_old * DESCRIPTION: RAID metadevice write routine * completed read of old data and old parity * PARAMETERS: md_raidcs_t *cs - pointer to a child structure */ static void raid_write_got_old(md_raidcs_t *cs) { mr_unit_t *un = cs->cs_un; ASSERT(IO_READER_HELD(cs->cs_un)); ASSERT(UNIT_READER_HELD(cs->cs_un)); raid_mapin_buf(cs); genstandardparity(cs); cs->cs_frags = 2; cs->cs_call = raid_stage; cs->cs_stage = RAID_PREWRITE_DONE; cs->cs_error_call = raid_write_error; cs->cs_retry_call = raid_write_retry; if (WRITE_ALT(un, cs->cs_dcolumn)) { cs->cs_frags++; raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_PREWRITE); } if (WRITE_ALT(un, cs->cs_pcolumn)) { cs->cs_frags++; raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE); } ASSERT(cs->cs_frags < 4); raidio(cs, RIO_DATA | RIO_PREWRITE); raidio(cs, RIO_PARITY | RIO_PREWRITE); } /* * NAME: raid_write_io * DESCRIPTION: RAID metadevice write I/O routine * PARAMETERS: mr_unit_t *un - pointer to a unit structure * md_raidcs_t *cs - pointer to a child structure */ /*ARGSUSED*/ static void raid_write_io(mr_unit_t *un, md_raidcs_t *cs) { md_raidps_t *ps = cs->cs_ps; uint_t *dbuf; uint_t *ubuf; size_t wordcnt; uint_t dsum = 0; int pcheck; int dcheck; ASSERT((un->un_column[cs->cs_pcolumn].un_devstate & RCS_INIT) == 0); ASSERT((un->un_column[cs->cs_dcolumn].un_devstate 
& RCS_INIT) == 0); ASSERT(IO_READER_HELD(un)); ASSERT(UNIT_READER_HELD(un)); ASSERT(cs->cs_flags & MD_RCS_HAVE_PW_SLOTS); if (cs->cs_flags & MD_RCS_LINE) { mr_unit_t *un = cs->cs_un; ASSERT(un->un_origcolumncnt == un->un_totalcolumncnt); raid_mapin_buf(cs); cs->cs_frags = un->un_origcolumncnt; cs->cs_call = raid_stage; cs->cs_error_call = raid_write_error; cs->cs_retry_call = raid_write_no_retry; cs->cs_stage = RAID_LINE_PWDONE; genlineparity(cs); return; } pcheck = erred_check_line(un, cs, &un->un_column[cs->cs_pcolumn]); dcheck = erred_check_line(un, cs, &un->un_column[cs->cs_dcolumn]); cs->cs_resync_check = (pcheck << RCL_PARITY_OFFSET) | dcheck; if (pcheck == RCL_ERRED && dcheck == RCL_ERRED) { int err = EIO; if ((un->un_column[cs->cs_pcolumn].un_devstate == RCS_LAST_ERRED) || (un->un_column[cs->cs_dcolumn].un_devstate == RCS_LAST_ERRED)) err = ENXIO; raid_error_parent(ps, err); ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); raid_free_child(cs, 1); raid_free_parent(ps, RFP_DECR_FRAGS | RFP_RLS_LOCK | RFP_DECR_PWFRAGS); return; } if (pcheck & RCL_ERRED) { /* * handle case of only having data drive */ raid_mapin_buf(cs); wordcnt = cs->cs_bcount / sizeof (uint_t); dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); ubuf = (uint_t *)(void *)(cs->cs_addr); while (wordcnt--) { *dbuf = *ubuf; dsum ^= *ubuf; dbuf++; ubuf++; } RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, -1, cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, 1, cs->cs_dcolumn, RAID_PWMAGIC); cs->cs_frags = 1; cs->cs_stage = RAID_NONE; cs->cs_call = raid_write_donly; cs->cs_error_call = raid_write_error; cs->cs_retry_call = raid_write_err_retry; if (WRITE_ALT(un, cs->cs_dcolumn)) { cs->cs_frags++; raidio(cs, RIO_DATA | RIO_ALT | RIO_EXTRA | RIO_PREWRITE); } raidio(cs, RIO_DATA | RIO_PREWRITE); return; } if (dcheck & RCL_ERRED) { /* * handle case of only having parity drive * build parity from scratch using new data, * skip reading the data and parity columns. */ raid_mapin_buf(cs); cs->cs_loop = 0; while (cs->cs_loop == cs->cs_dcolumn || cs->cs_loop == cs->cs_pcolumn) cs->cs_loop++; /* copy new data in to begin building parity */ bcopy(cs->cs_addr, cs->cs_pbuffer + DEV_BSIZE, cs->cs_bcount); cs->cs_stage = RAID_NONE; cs->cs_call = raid_write_ploop; cs->cs_error_call = raid_write_error; cs->cs_retry_call = raid_write_err_retry; cs->cs_frags = 1; raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1)); return; } /* * handle normal cases * read old data and old parity */ cs->cs_frags = 2; cs->cs_stage = RAID_NONE; cs->cs_call = raid_write_got_old; cs->cs_error_call = raid_write_error; cs->cs_retry_call = raid_write_retry; ASSERT(ps->ps_magic == RAID_PSMAGIC); raidio(cs, RIO_DATA | RIO_READ); raidio(cs, RIO_PARITY | RIO_READ); } static void raid_enqueue(md_raidcs_t *cs) { mdi_unit_t *ui = cs->cs_ps->ps_ui; kmutex_t *io_list_mutex = &ui->ui_io_lock->io_list_mutex; md_raidcs_t *cs1; mutex_enter(io_list_mutex); ASSERT(!
(cs->cs_flags & MD_RCS_LLOCKD)); if (ui->ui_io_lock->io_list_front == NULL) { ui->ui_io_lock->io_list_front = cs; ui->ui_io_lock->io_list_back = cs; } else { cs1 = ui->ui_io_lock->io_list_back; cs1->cs_linlck_next = cs; ui->ui_io_lock->io_list_back = cs; } STAT_INC(raid_write_waits); STAT_MAX(raid_max_write_q_length, raid_write_queue_length); cs->cs_linlck_next = NULL; mutex_exit(io_list_mutex); } /* * NAME: raid_write * DESCRIPTION: RAID metadevice write routine * PARAMETERS: mr_unit_t *un - pointer to a unit structure * md_raidcs_t *cs - pointer to a child structure */ /*ARGSUSED*/ static int raid_write(mr_unit_t *un, md_raidcs_t *cs) { int error = 0; md_raidps_t *ps; mdi_unit_t *ui; minor_t mnum; ASSERT(IO_READER_HELD(un)); ps = cs->cs_ps; ui = ps->ps_ui; ASSERT(UNIT_STATE(un) != RUS_INIT); if (UNIT_STATE(un) == RUS_LAST_ERRED) error = EIO; /* make sure the write doesn't go beyond the column */ if (cs->cs_blkno + cs->cs_blkcnt > un->un_segsize * un->un_segsincolumn) error = ENXIO; if (error) goto werror; getresources(cs); /* * this is an advisory loop that keeps the waiting lists short * to reduce cpu time. Since there is a race introduced by not * aquiring all the correct mutexes, use a cv_timedwait to be * sure the write always will wake up and start. */ while (raid_check_pw(cs)) { mutex_enter(&un->un_mx); un->un_rflags |= MD_RFLAG_NEEDPW; STAT_INC(raid_prewrite_waits); (void) cv_reltimedwait(&un->un_cv, &un->un_mx, md_wr_wait, TR_CLOCK_TICK); un->un_rflags &= ~MD_RFLAG_NEEDPW; mutex_exit(&un->un_mx); } if (raid_line_writer_lock(cs, 1)) return (0); un = (mr_unit_t *)md_unit_readerlock(ui); cs->cs_un = un; mnum = MD_SID(un); if (un->un_state & RUS_REGEN) { raid_regen_parity(cs); un = MD_UNIT(mnum); cs->cs_un = un; } raid_write_io(un, cs); return (0); werror: /* aquire unit reader lock sinc raid_free_child always drops it */ raid_error_parent(ps, error); raid_free_child(cs, 0); /* decrement both pwfrags and frags */ raid_free_parent(ps, RFP_DECR_PWFRAGS | RFP_DECR_FRAGS | RFP_RLS_LOCK); return (0); } /* * NAMES: raid_stage * DESCRIPTION: post-processing routine for a RAID metadevice * PARAMETERS: md_raidcs_t *cs - pointer to child structure */ static void raid_stage(md_raidcs_t *cs) { md_raidps_t *ps = cs->cs_ps; mr_unit_t *un = cs->cs_un; md_raidcbuf_t *cbuf; buf_t *bp; void *private; int flag; switch (cs->cs_stage) { case RAID_READ_DONE: raid_free_child(cs, 1); /* decrement readfrags */ raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); return; case RAID_WRITE_DONE: case RAID_WRITE_PONLY_DONE: case RAID_WRITE_DONLY_DONE: /* * Completed writing real parity and/or data. */ ASSERT(cs->cs_flags & MD_RCS_PWDONE); raid_free_child(cs, 1); /* decrement frags but not pwfrags */ raid_free_parent(ps, RFP_DECR_FRAGS | RFP_RLS_LOCK); return; case RAID_PREWRITE_DONE: /* * completed writing data and parity to prewrite entries */ /* * WARNING: don't release unit reader lock here.. 
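 * (the pre-write copies are now on stable storage, so the real
 * data and parity writes issued below can, if necessary, be
 * recovered from the pre-write area after a crash; see
 * raid_replay())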
* decrement pwfrags but not frags */ raid_free_parent(ps, RFP_DECR_PWFRAGS); cs->cs_flags |= MD_RCS_PWDONE; cs->cs_frags = 2; cs->cs_stage = RAID_WRITE_DONE; cs->cs_call = raid_stage; cs->cs_error_call = raid_write_error; cs->cs_retry_call = raid_write_no_retry; if (WRITE_ALT(un, cs->cs_pcolumn)) { cs->cs_frags++; raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_WRITE); } if (WRITE_ALT(un, cs->cs_dcolumn)) { cs->cs_frags++; raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE); } ASSERT(cs->cs_frags < 4); raidio(cs, RIO_DATA | RIO_WRITE); raidio(cs, RIO_PARITY | RIO_WRITE); if (cs->cs_pw_inval_list) { raid_free_pwinvalidate(cs); } return; case RAID_LINE_PWDONE: ASSERT(cs->cs_frags == 0); raid_free_parent(ps, RFP_DECR_PWFRAGS); cs->cs_flags |= MD_RCS_PWDONE; cs->cs_frags = un->un_origcolumncnt; cs->cs_call = raid_stage; cs->cs_error_call = raid_write_error; cs->cs_retry_call = raid_write_no_retry; cs->cs_stage = RAID_WRITE_DONE; for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { /* * fill in buffer for write to prewrite area */ bp = &cbuf->cbuf_bp; bp->b_back = bp; bp->b_forw = bp; bp->b_un.b_addr = cbuf->cbuf_buffer + DEV_BSIZE; bp->b_bcount = cbuf->cbuf_bcount; bp->b_bufsize = cbuf->cbuf_bcount; bp->b_lblkno = un->un_column[cbuf->cbuf_column].un_devstart + cs->cs_blkno; bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR); bp->b_flags &= ~nv_available; bp->b_flags |= B_WRITE | B_BUSY; bp->b_iodone = (int (*)())raid_done; bp->b_edev = md_dev64_to_dev( un->un_column[cbuf->cbuf_column].un_dev); bp->b_chain = (struct buf *)cs; private = cs->cs_strategy_private; flag = cs->cs_strategy_flag; md_call_strategy(bp, flag, private); } raidio(cs, RIO_DATA | RIO_WRITE); raidio(cs, RIO_PARITY | RIO_WRITE); if (cs->cs_pw_inval_list) { raid_free_pwinvalidate(cs); } return; default: ASSERT(0); break; } } /* * NAME: md_raid_strategy * DESCRIPTION: RAID metadevice I/O oprations entry point. * PARAMETERS: buf_t *pb - pointer to a user I/O buffer * int flag - metadevice specific flag * void *private - carry over flag ?? * */ void md_raid_strategy(buf_t *pb, int flag, void *private) { md_raidps_t *ps; md_raidcs_t *cs; int doing_writes; int err; mr_unit_t *un; mdi_unit_t *ui; size_t count; diskaddr_t blkno; caddr_t addr; off_t offset; int colcnt; minor_t mnum; set_t setno; ui = MDI_UNIT(getminor(pb->b_edev)); md_kstat_waitq_enter(ui); un = (mr_unit_t *)md_io_readerlock(ui); setno = MD_MIN2SET(getminor(pb->b_edev)); if ((flag & MD_NOBLOCK) == 0) { if (md_inc_iocount(setno) != 0) { pb->b_flags |= B_ERROR; pb->b_error = ENXIO; pb->b_resid = pb->b_bcount; md_kstat_waitq_exit(ui); md_io_readerexit(ui); biodone(pb); return; } } else { md_inc_iocount_noblock(setno); } mnum = MD_SID(un); colcnt = un->un_totalcolumncnt - 1; count = pb->b_bcount; STAT_CHECK(raid_512, count == 512); STAT_CHECK(raid_1024, count == 1024); STAT_CHECK(raid_1024_8192, count > 1024 && count < 8192); STAT_CHECK(raid_8192, count == 8192); STAT_CHECK(raid_8192_bigger, count > 8192); (void *) md_unit_readerlock(ui); if (!(flag & MD_STR_NOTTOP)) { err = md_checkbuf(ui, (md_unit_t *)un, pb); /* check and map */ if (err != 0) { md_kstat_waitq_exit(ui); md_io_readerexit(ui); return; } } md_unit_readerexit(ui); STAT_INC(raid_total_io); /* allocate a parent structure for the user I/O */ ps = kmem_cache_alloc(raid_parent_cache, MD_ALLOCFLAGS); raid_parent_init(ps); /* * Save essential information from the original buffhdr * in the md_save structure. 
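 * One parent structure tracks the original buf; the loop below
 * allocates one child structure per line (or partial line) of the
 * RAID device spanned by the request and issues each child as a
 * separate read or write.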
*/ ps->ps_un = un; ps->ps_ui = ui; ps->ps_bp = pb; ps->ps_addr = pb->b_un.b_addr; if ((pb->b_flags & B_READ) == 0) { ps->ps_flags |= MD_RPS_WRITE; doing_writes = 1; STAT_INC(raid_writes); } else { ps->ps_flags |= MD_RPS_READ; doing_writes = 0; STAT_INC(raid_reads); } count = lbtodb(pb->b_bcount); /* transfer count (in blocks) */ blkno = pb->b_lblkno; /* block number on device */ addr = 0; offset = 0; ps->ps_pwfrags = 1; ps->ps_frags = 1; md_kstat_waitq_to_runq(ui); do { cs = kmem_cache_alloc(raid_child_cache, MD_ALLOCFLAGS); raid_child_init(cs); cs->cs_ps = ps; cs->cs_un = un; cs->cs_mdunit = mnum; cs->cs_strategy_flag = flag; cs->cs_strategy_private = private; cs->cs_addr = addr; cs->cs_offset = offset; count = raid_iosetup(un, blkno, count, cs); if (cs->cs_flags & MD_RCS_LINE) { blkno += (cs->cs_blkcnt * colcnt); offset += (cs->cs_bcount * colcnt); } else { blkno += cs->cs_blkcnt; offset += cs->cs_bcount; } /* for each cs bump up the ps_pwfrags and ps_frags fields */ if (count) { mutex_enter(&ps->ps_mx); ps->ps_pwfrags++; ps->ps_frags++; mutex_exit(&ps->ps_mx); if (doing_writes) (void) raid_write(un, cs); else (void) raid_read(un, cs); } } while (count); if (doing_writes) { (void) raid_write(un, cs); } else (void) raid_read(un, cs); if (! (flag & MD_STR_NOTTOP) && panicstr) { while (! (ps->ps_flags & MD_RPS_DONE)) { md_daemon(1, &md_done_daemon); drv_usecwait(10); } kmem_cache_free(raid_parent_cache, ps); } } /* * NAMES: raid_snarf * DESCRIPTION: RAID metadevice SNARF entry point * PARAMETERS: md_snarfcmd_t cmd, * set_t setno * RETURNS: */ static int raid_snarf(md_snarfcmd_t cmd, set_t setno) { mr_unit_t *un; mddb_recid_t recid; int gotsomething; int all_raid_gotten; mddb_type_t typ1; uint_t ncol; mddb_de_ic_t *dep; mddb_rb32_t *rbp; size_t newreqsize; mr_unit_t *big_un; mr_unit32_od_t *small_un; if (cmd == MD_SNARF_CLEANUP) return (0); all_raid_gotten = 1; gotsomething = 0; typ1 = (mddb_type_t)md_getshared_key(setno, raid_md_ops.md_driver.md_drivername); recid = mddb_makerecid(setno, 0); while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) { if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) { continue; } dep = mddb_getrecdep(recid); dep->de_flags = MDDB_F_RAID; rbp = dep->de_rb; switch (rbp->rb_revision) { case MDDB_REV_RB: case MDDB_REV_RBFN: if ((rbp->rb_private & MD_PRV_CONVD) == 0) { /* * This means, we have an old and small record * and this record hasn't already been * converted. Before we create an incore * metadevice from this we have to convert it to * a big record. */ small_un = (mr_unit32_od_t *)mddb_getrecaddr(recid); ncol = small_un->un_totalcolumncnt; newreqsize = sizeof (mr_unit_t) + ((ncol - 1) * sizeof (mr_column_t)); big_un = (mr_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP); raid_convert((caddr_t)small_un, (caddr_t)big_un, SMALL_2_BIG); kmem_free(small_un, dep->de_reqsize); dep->de_rb_userdata = big_un; dep->de_reqsize = newreqsize; un = big_un; rbp->rb_private |= MD_PRV_CONVD; } else { /* * Record has already been converted. Just * get its address. */ un = (mr_unit_t *)mddb_getrecaddr(recid); } un->c.un_revision &= ~MD_64BIT_META_DEV; break; case MDDB_REV_RB64: case MDDB_REV_RB64FN: /* Big device */ un = (mr_unit_t *)mddb_getrecaddr(recid); un->c.un_revision |= MD_64BIT_META_DEV; un->c.un_flag |= MD_EFILABEL; break; } MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision); /* * Create minor device node for snarfed entry. 
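 * (records whose unit already exists in-core are marked pending
 * delete rather than being snarfed twice)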
*/ (void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un)); if (MD_UNIT(MD_SID(un)) != NULL) { mddb_setrecprivate(recid, MD_PRV_PENDDEL); continue; } all_raid_gotten = 0; if (raid_build_incore((void *)un, 1) == 0) { mddb_setrecprivate(recid, MD_PRV_GOTIT); md_create_unit_incore(MD_SID(un), &raid_md_ops, 1); gotsomething = 1; } else if (un->mr_ic) { kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) * un->un_totalcolumncnt); kmem_free(un->mr_ic, sizeof (*un->mr_ic)); } } if (!all_raid_gotten) { return (gotsomething); } recid = mddb_makerecid(setno, 0); while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) mddb_setrecprivate(recid, MD_PRV_PENDDEL); return (0); } /* * NAMES: raid_halt * DESCRIPTION: RAID metadevice HALT entry point * PARAMETERS: md_haltcmd_t cmd - * set_t setno - * RETURNS: */ static int raid_halt(md_haltcmd_t cmd, set_t setno) { set_t i; mdi_unit_t *ui; minor_t mnum; if (cmd == MD_HALT_CLOSE) return (0); if (cmd == MD_HALT_OPEN) return (0); if (cmd == MD_HALT_UNLOAD) return (0); if (cmd == MD_HALT_CHECK) { for (i = 0; i < md_nunits; i++) { mnum = MD_MKMIN(setno, i); if ((ui = MDI_UNIT(mnum)) == NULL) continue; if (ui->ui_opsindex != raid_md_ops.md_selfindex) continue; if (md_unit_isopen(ui)) return (1); } return (0); } if (cmd != MD_HALT_DOIT) return (1); for (i = 0; i < md_nunits; i++) { mnum = MD_MKMIN(setno, i); if ((ui = MDI_UNIT(mnum)) == NULL) continue; if (ui->ui_opsindex != raid_md_ops.md_selfindex) continue; reset_raid((mr_unit_t *)MD_UNIT(mnum), mnum, 0); } return (0); } /* * NAMES: raid_close_all_devs * DESCRIPTION: Close all the devices of the unit. * PARAMETERS: mr_unit_t *un - pointer to unit structure * RETURNS: */ void raid_close_all_devs(mr_unit_t *un, int init_pw, int md_cflags) { int i; mr_column_t *device; for (i = 0; i < un->un_totalcolumncnt; i++) { device = &un->un_column[i]; if (device->un_devflags & MD_RAID_DEV_ISOPEN) { ASSERT((device->un_dev != (md_dev64_t)0) && (device->un_dev != NODEV64)); if ((device->un_devstate & RCS_OKAY) && init_pw) (void) init_pw_area(un, device->un_dev, device->un_pwstart, i); md_layered_close(device->un_dev, md_cflags); device->un_devflags &= ~MD_RAID_DEV_ISOPEN; } } } /* * NAMES: raid_open_all_devs * DESCRIPTION: Open all the components (columns) of the device unit. * PARAMETERS: mr_unit_t *un - pointer to unit structure * RETURNS: */ static int raid_open_all_devs(mr_unit_t *un, int md_oflags) { minor_t mnum = MD_SID(un); int i; int not_opened = 0; int commit = 0; int col = -1; mr_column_t *device; set_t setno = MD_MIN2SET(MD_SID(un)); side_t side = mddb_getsidenum(setno); mdkey_t key; mdi_unit_t *ui = MDI_UNIT(mnum); ui->ui_tstate &= ~MD_INACCESSIBLE; for (i = 0; i < un->un_totalcolumncnt; i++) { md_dev64_t tmpdev; device = &un->un_column[i]; if (COLUMN_STATE(un, i) & RCS_ERRED) { not_opened++; continue; } if (device->un_devflags & MD_RAID_DEV_ISOPEN) continue; tmpdev = device->un_dev; /* * Open by device id */ key = HOTSPARED(un, i) ? device->un_hs_key : device->un_orig_key; if ((md_getmajor(tmpdev) != md_major) && md_devid_found(setno, side, key) == 1) { tmpdev = md_resolve_bydevid(mnum, tmpdev, key); } if (md_layered_open(mnum, &tmpdev, md_oflags)) { device->un_dev = tmpdev; not_opened++; continue; } device->un_dev = tmpdev; device->un_devflags |= MD_RAID_DEV_ISOPEN; } /* if open errors and errored devices are 1 then device can run */ if (not_opened > 1) { cmn_err(CE_WARN, "md: %s failed to open. 
open error on %s\n", md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un), device->un_orig_dev, NULL, 0)); ui->ui_tstate |= MD_INACCESSIBLE; SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); return (not_opened > 1); } for (i = 0; i < un->un_totalcolumncnt; i++) { device = &un->un_column[i]; if (device->un_devflags & MD_RAID_DEV_ISOPEN) { if (device->un_devstate & RCS_LAST_ERRED) { /* * At this point in time there is a possibility * that errors were the result of a controller * failure with more than a single column on it * so clear out last errored columns and let errors * re-occur is necessary. */ raid_set_state(un, i, RCS_OKAY, 0); commit++; } continue; } ASSERT(col == -1); col = i; } if (col != -1) { raid_set_state(un, col, RCS_ERRED, 0); commit++; } if (commit) raid_commit(un, NULL); if (col != -1) { if (COLUMN_STATE(un, col) & RCS_ERRED) { SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); } else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) { SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); } } return (0); } /* * NAMES: raid_internal_open * DESCRIPTION: Do the actual RAID open * PARAMETERS: minor_t mnum - minor number of the RAID device * int flag - * int otyp - * int md_oflags - RAID open flags * RETURNS: 0 if successful, nonzero otherwise */ int raid_internal_open(minor_t mnum, int flag, int otyp, int md_oflags) { mr_unit_t *un; mdi_unit_t *ui; int err = 0; int replay_error = 0; ui = MDI_UNIT(mnum); ASSERT(ui != NULL); un = (mr_unit_t *)md_unit_openclose_enter(ui); /* * this MUST be checked before md_unit_isopen is checked. * raid_init_columns sets md_unit_isopen to block reset, halt. */ if ((UNIT_STATE(un) & (RUS_INIT | RUS_DOI)) && !(md_oflags & MD_OFLG_ISINIT)) { md_unit_openclose_exit(ui); return (EAGAIN); } if ((md_oflags & MD_OFLG_ISINIT) || md_unit_isopen(ui)) { err = md_unit_incopen(mnum, flag, otyp); goto out; } md_unit_readerexit(ui); un = (mr_unit_t *)md_unit_writerlock(ui); if (raid_open_all_devs(un, md_oflags) == 0) { if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) { md_unit_writerexit(ui); un = (mr_unit_t *)md_unit_readerlock(ui); raid_close_all_devs(un, 0, md_oflags); goto out; } } else { /* * if this unit contains more than two errored components * should return error and close all opened devices */ md_unit_writerexit(ui); un = (mr_unit_t *)md_unit_readerlock(ui); raid_close_all_devs(un, 0, md_oflags); md_unit_openclose_exit(ui); SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); return (ENXIO); } if (!(MD_STATUS(un) & MD_UN_REPLAYED)) { replay_error = raid_replay(un); MD_STATUS(un) |= MD_UN_REPLAYED; } md_unit_writerexit(ui); un = (mr_unit_t *)md_unit_readerlock(ui); if ((replay_error == RAID_RPLY_READONLY) && ((flag & (FREAD | FWRITE)) == FREAD)) { md_unit_openclose_exit(ui); return (0); } /* allocate hotspare if possible */ (void) raid_hotspares(); out: md_unit_openclose_exit(ui); return (err); } /* * NAMES: raid_open * DESCRIPTION: RAID metadevice OPEN entry point * PARAMETERS: dev_t dev - * int flag - * int otyp - * cred_t * cred_p - * int md_oflags - * RETURNS: */ /*ARGSUSED1*/ static int raid_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags) { int error = 0; if (error = raid_internal_open(getminor(*dev), flag, otyp, md_oflags)) { return (error); } return (0); } /* * NAMES: raid_internal_close * DESCRIPTION: RAID metadevice CLOSE actual implementation * PARAMETERS: minor_t - minor 
number of the RAID device * int otyp - * int init_pw - * int md_cflags - RAID close flags * RETURNS: 0 if successful, nonzero otherwise */ /*ARGSUSED*/ int raid_internal_close(minor_t mnum, int otyp, int init_pw, int md_cflags) { mdi_unit_t *ui = MDI_UNIT(mnum); mr_unit_t *un; int err = 0; /* single thread */ un = (mr_unit_t *)md_unit_openclose_enter(ui); /* count closed */ if ((err = md_unit_decopen(mnum, otyp)) != 0) goto out; /* close devices, if necessary */ if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { raid_close_all_devs(un, init_pw, md_cflags); } /* unlock, return success */ out: md_unit_openclose_exit(ui); return (err); } /* * NAMES: raid_close * DESCRIPTION: RAID metadevice close entry point * PARAMETERS: dev_t dev - * int flag - * int otyp - * cred_t * cred_p - * int md_oflags - * RETURNS: */ /*ARGSUSED1*/ static int raid_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags) { int retval; (void) md_io_writerlock(MDI_UNIT(getminor(dev))); retval = raid_internal_close(getminor(dev), otyp, 1, md_cflags); (void) md_io_writerexit(MDI_UNIT(getminor(dev))); return (retval); } /* * raid_probe_close_all_devs */ void raid_probe_close_all_devs(mr_unit_t *un) { int i; mr_column_t *device; for (i = 0; i < un->un_totalcolumncnt; i++) { device = &un->un_column[i]; if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) { md_layered_close(device->un_dev, MD_OFLG_PROBEDEV); device->un_devflags &= ~MD_RAID_DEV_PROBEOPEN; } } } /* * Raid_probe_dev: * * On entry the unit writerlock is held */ static int raid_probe_dev(mdi_unit_t *ui, minor_t mnum) { mr_unit_t *un; int i; int not_opened = 0; int commit = 0; int col = -1; mr_column_t *device; int md_devopen = 0; if (md_unit_isopen(ui)) md_devopen++; un = MD_UNIT(mnum); /* * If the state has been set to LAST_ERRED because * of an error when the raid device was open at some * point in the past, don't probe. We really don't want * to reset the state in this case. */ if (UNIT_STATE(un) == RUS_LAST_ERRED) return (0); ui->ui_tstate &= ~MD_INACCESSIBLE; for (i = 0; i < un->un_totalcolumncnt; i++) { md_dev64_t tmpdev; device = &un->un_column[i]; if (COLUMN_STATE(un, i) & RCS_ERRED) { not_opened++; continue; } tmpdev = device->un_dev; /* * Currently the flags passed are not needed since * there cannot be an underlying metadevice. However * they are kept here for consistency. * * Open by device id */ tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i)? device->un_hs_key : device->un_orig_key); if (md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV)) { device->un_dev = tmpdev; not_opened++; continue; } device->un_dev = tmpdev; device->un_devflags |= MD_RAID_DEV_PROBEOPEN; } /* * The code below is careful on setting the LAST_ERRED state. * * If open errors and exactly one device has failed we can run. * If more then one device fails we have to figure out when to set * LAST_ERRED state. The rationale is to avoid unnecessary resyncs * since they are painful and time consuming. * * When more than one component/column fails there are 2 scenerios. * * 1. Metadevice has NOT been opened: In this case, the behavior * mimics the open symantics. ie. Only the first failed device * is ERRED and LAST_ERRED is not set. * * 2. Metadevice has been opened: Here the read/write sematics are * followed. The first failed devicce is ERRED and on the next * failed device LAST_ERRED is set. */ if (not_opened > 1 && !md_devopen) { cmn_err(CE_WARN, "md: %s failed to open. 
open error on %s\n", md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un), device->un_orig_dev, NULL, 0)); SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); raid_probe_close_all_devs(un); ui->ui_tstate |= MD_INACCESSIBLE; return (not_opened > 1); } if (!md_devopen) { for (i = 0; i < un->un_totalcolumncnt; i++) { device = &un->un_column[i]; if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) { if (device->un_devstate & RCS_LAST_ERRED) { /* * At this point in time there is a * possibility that errors were the * result of a controller failure with * more than a single column on it so * clear out last errored columns and * let errors re-occur is necessary. */ raid_set_state(un, i, RCS_OKAY, 0); commit++; } continue; } ASSERT(col == -1); /* * note if multiple devices are failing then only * the last one is marked as error */ col = i; } if (col != -1) { raid_set_state(un, col, RCS_ERRED, 0); commit++; } } else { for (i = 0; i < un->un_totalcolumncnt; i++) { device = &un->un_column[i]; /* if we have LAST_ERRED go ahead and commit. */ if (un->un_state & RUS_LAST_ERRED) break; /* * could not open the component */ if (!(device->un_devflags & MD_RAID_DEV_PROBEOPEN)) { col = i; raid_set_state(un, col, RCS_ERRED, 0); commit++; } } } if (commit) raid_commit(un, NULL); if (col != -1) { if (COLUMN_STATE(un, col) & RCS_ERRED) { SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); } else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) { SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); } } raid_probe_close_all_devs(un); return (0); } static int raid_imp_set( set_t setno ) { mddb_recid_t recid; int i, gotsomething; mddb_type_t typ1; mddb_de_ic_t *dep; mddb_rb32_t *rbp; mr_unit_t *un64; mr_unit32_od_t *un32; md_dev64_t self_devt; minor_t *self_id; /* minor needs to be updated */ md_parent_t *parent_id; /* parent needs to be updated */ mddb_recid_t *record_id; /* record id needs to be updated */ hsp_t *hsp_id; gotsomething = 0; typ1 = (mddb_type_t)md_getshared_key(setno, raid_md_ops.md_driver.md_drivername); recid = mddb_makerecid(setno, 0); while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) { if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) continue; dep = mddb_getrecdep(recid); rbp = dep->de_rb; switch (rbp->rb_revision) { case MDDB_REV_RB: case MDDB_REV_RBFN: /* * Small device */ un32 = (mr_unit32_od_t *)mddb_getrecaddr(recid); self_id = &(un32->c.un_self_id); parent_id = &(un32->c.un_parent); record_id = &(un32->c.un_record_id); hsp_id = &(un32->un_hsp_id); for (i = 0; i < un32->un_totalcolumncnt; i++) { mr_column32_od_t *device; device = &un32->un_column[i]; if (!md_update_minor(setno, mddb_getsidenum (setno), device->un_orig_key)) goto out; if (device->un_hs_id != 0) device->un_hs_id = MAKERECID(setno, device->un_hs_id); } break; case MDDB_REV_RB64: case MDDB_REV_RB64FN: un64 = (mr_unit_t *)mddb_getrecaddr(recid); self_id = &(un64->c.un_self_id); parent_id = &(un64->c.un_parent); record_id = &(un64->c.un_record_id); hsp_id = &(un64->un_hsp_id); for (i = 0; i < un64->un_totalcolumncnt; i++) { mr_column_t *device; device = &un64->un_column[i]; if (!md_update_minor(setno, mddb_getsidenum (setno), device->un_orig_key)) goto out; if (device->un_hs_id != 0) device->un_hs_id = MAKERECID(setno, device->un_hs_id); } break; } /* * If this is a top level and a friendly name metadevice, * update its minor in the namespace. 
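 * Every imported record also has its self, parent, hotspare-pool
 * and record ids re-based to the importing set number below.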
*/ if ((*parent_id == MD_NO_PARENT) && ((rbp->rb_revision == MDDB_REV_RBFN) || (rbp->rb_revision == MDDB_REV_RB64FN))) { self_devt = md_makedevice(md_major, *self_id); if (!md_update_top_device_minor(setno, mddb_getsidenum(setno), self_devt)) goto out; } /* * Update unit with the imported setno */ mddb_setrecprivate(recid, MD_PRV_GOTIT); *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id)); if (*hsp_id != -1) *hsp_id = MAKERECID(setno, DBID(*hsp_id)); if (*parent_id != MD_NO_PARENT) *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id)); *record_id = MAKERECID(setno, DBID(*record_id)); gotsomething = 1; } out: return (gotsomething); } static md_named_services_t raid_named_services[] = { {raid_hotspares, "poke hotspares" }, {raid_rename_check, MDRNM_CHECK }, {raid_rename_lock, MDRNM_LOCK }, {(intptr_t (*)()) raid_rename_unlock, MDRNM_UNLOCK }, {(intptr_t (*)()) raid_probe_dev, "probe open test" }, {NULL, 0 } }; md_ops_t raid_md_ops = { raid_open, /* open */ raid_close, /* close */ md_raid_strategy, /* strategy */ NULL, /* print */ NULL, /* dump */ NULL, /* read */ NULL, /* write */ md_raid_ioctl, /* ioctl, */ raid_snarf, /* raid_snarf */ raid_halt, /* raid_halt */ NULL, /* aread */ NULL, /* awrite */ raid_imp_set, /* import set */ raid_named_services }; static void init_init() { /* default to a second */ if (md_wr_wait == 0) md_wr_wait = md_hz >> 1; raid_parent_cache = kmem_cache_create("md_raid_parent", sizeof (md_raidps_t), 0, raid_parent_constructor, raid_parent_destructor, raid_run_queue, NULL, NULL, 0); raid_child_cache = kmem_cache_create("md_raid_child", sizeof (md_raidcs_t) - sizeof (buf_t) + biosize(), 0, raid_child_constructor, raid_child_destructor, raid_run_queue, NULL, NULL, 0); raid_cbuf_cache = kmem_cache_create("md_raid_cbufs", sizeof (md_raidcbuf_t), 0, raid_cbuf_constructor, raid_cbuf_destructor, raid_run_queue, NULL, NULL, 0); } static void fini_uninit() { kmem_cache_destroy(raid_parent_cache); kmem_cache_destroy(raid_child_cache); kmem_cache_destroy(raid_cbuf_cache); raid_parent_cache = raid_child_cache = raid_cbuf_cache = NULL; } /* define the module linkage */ MD_PLUGIN_MISC_MODULE("raid module", init_init(), fini_uninit())