lvm/mirror/mirror.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/dklabel.h>
#include <vm/hat.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_mirror.h>
#include <sys/lvm/md_convert.h>
#include <sys/lvm/md_mddb.h>
#include <sys/esunddi.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>
#include <sys/lvm/mdmn_commd.h>
#include <sys/avl.h>

md_ops_t		mirror_md_ops;
#ifndef	lint
char			_depends_on[] = "drv/md";
md_ops_t		*md_interface_ops = &mirror_md_ops;
#endif

extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_mstr_daemon;
extern mdq_anchor_t	md_mirror_daemon;
extern mdq_anchor_t	md_mirror_io_daemon;
extern mdq_anchor_t	md_mirror_rs_daemon;
extern mdq_anchor_t	md_mhs_daemon;

extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];

extern int		md_status;
extern clock_t		md_hz;

extern md_krwlock_t	md_unit_array_rw;
extern kmutex_t		md_mx;
extern kcondvar_t	md_cv;
extern int		md_mtioctl_cnt;

daemon_request_t	mirror_timeout;
static daemon_request_t	hotspare_request;
static daemon_request_t	mn_hs_request[MD_MAXSETS];	/* Multinode hs req */

int	md_mirror_mcs_buf_off;

/* Flags for mdmn_ksend_message to allow debugging */
int	md_mirror_msg_flags;

#ifdef DEBUG
/* Flag to switch on debug messages */
int	mirror_debug_flag = 0;
#endif

/*
 * Struct used to hold count of DMR reads and the timestamp of last DMR read
 * It is used to verify, using a debugger, that the DMR read ioctl has been
 * executed.
 */
dmr_stats_t	mirror_dmr_stats = {0, 0};

/*
 * Mutex protecting list of non-failfast drivers.
 */
static kmutex_t	non_ff_drv_mutex;
extern char	**non_ff_drivers;

extern major_t	md_major;

/*
 * Write-On-Write memory pool.
 */
static void		copy_write_cont(wowhdr_t *wowhdr);
static kmem_cache_t	*mirror_wowblk_cache = NULL;
static int		md_wowbuf_size = 16384;
static size_t		md_wowblk_size;

/*
 * This is a flag that allows:
 *	- disabling the write-on-write mechanism.
 *	- logging occurrences of write-on-write
 *	- switching wow handling procedure processing
 * Counter for occurences of WOW.
 */
static uint_t	md_mirror_wow_flg = 0;
static int	md_mirror_wow_cnt = 0;

/*
 * Tunable to enable/disable dirty region
 * processing when closing down a mirror.
 */
static int	new_resync = 1;
kmem_cache_t	*mirror_parent_cache = NULL;
kmem_cache_t	*mirror_child_cache = NULL;

extern int	md_ff_disable;		/* disable failfast */

static int	mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
static void	mirror_read_strategy(buf_t *, int, void *);
static void	mirror_write_strategy(buf_t *, int, void *);
static void	become_owner(daemon_queue_t *);
static int	mirror_done(struct buf *cb);
static int	mirror_done_common(struct buf *cb);
static void	clear_retry_error(struct buf *cb);

/*
 * patchables
 */
int	md_min_rr_size	= 200;	/* 2000 blocks, or 100k */
int	md_def_num_rr	= 1000;	/* Default number of dirty regions */

/*
 * patchable to change delay before rescheduling mirror ownership request.
 * Value is clock ticks, default 0.5 seconds
 */
clock_t	md_mirror_owner_to = 500000;

/*ARGSUSED1*/
static int
mirror_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

static void
mirror_parent_init(md_mps_t *ps)
{
	bzero(ps, offsetof(md_mps_t, ps_mx));
	bzero(&ps->ps_overlap_node, sizeof (avl_node_t));
}

/*ARGSUSED1*/
static void
mirror_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_mps_t *)p)->ps_mx);
}

/*ARGSUSED1*/
static int
mirror_child_constructor(void *p, void *d1, int d2)
{
	bioinit(&((md_mcs_t *)p)->cs_buf);
	return (0);
}

void
mirror_child_init(md_mcs_t *cs)
{
	cs->cs_ps = NULL;
	cs->cs_mdunit = 0;
	md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
mirror_child_destructor(void *p, void *d)
{
	biofini(&((md_mcs_t *)p)->cs_buf);
}

static void
mirror_wowblk_init(wowhdr_t *p)
{
	bzero(p, md_wowblk_size);
}

static void
send_poke_hotspares_msg(daemon_request_t *drq)
{
	int			rval;
	md_mn_msg_pokehsp_t	pokehsp;
	md_mn_kresult_t		*kresult;
	set_t			setno = (set_t)drq->dq.qlen;

	pokehsp.pokehsp_setno = setno;

	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
	    sizeof (pokehsp), kresult);

	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
		/* If we're shutting down already, pause things here. */
		if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
			while (!md_mn_is_commd_present()) {
				delay(md_hz);
			}
		}
		cmn_err(CE_PANIC,
		    "ksend_message failure: POKE_HOTSPARES");
	}
	kmem_free(kresult, sizeof (md_mn_kresult_t));

	/* Allow further requests to use this set's queue structure */
	mutex_enter(&drq->dr_mx);
	drq->dr_pending = 0;
	mutex_exit(&drq->dr_mx);
}

/*
 * Send a poke_hotspares message to the master node. To avoid swamping the
 * commd handler with requests we only send a message if there is not one
 * already outstanding. We punt the request to a separate thread context as
 * cannot afford to block waiting on the request to be serviced. This is
 * essential when a reconfig cycle is in progress as any open() of a multinode
 * metadevice may result in a livelock.
 */
static void
send_poke_hotspares(set_t setno)
{
	daemon_request_t	*drq = &mn_hs_request[setno];

	mutex_enter(&drq->dr_mx);
	if (drq->dr_pending == 0) {
		drq->dr_pending = 1;
		drq->dq.qlen = (int)setno;
		daemon_request(&md_mhs_daemon,
		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
	}
	mutex_exit(&drq->dr_mx);
}

void
mirror_set_sm_state(
	mm_submirror_t		*sm,
	mm_submirror_ic_t	*smic,
	sm_state_t		newstate,
	int			force)
{
	int			compcnt;
	int			i;
	int			errcnt;
	sm_state_t		origstate;
	md_m_shared_t		*shared;

	if (force) {
		sm->sm_state = newstate;
		uniqtime32(&sm->sm_timestamp);
		return;
	}

	origstate = newstate;

	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
	for (i = 0, errcnt = 0; i < compcnt; i++) {
		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
		    (sm->sm_dev, sm, i);
		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
			newstate |= SMS_COMP_ERRED;
		if (shared->ms_state & (CS_RESYNC))
			newstate |= SMS_COMP_RESYNC;
		if (shared->ms_state & CS_ERRED)
			errcnt++;
	}

	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
		newstate &= ~origstate;

	if (errcnt == compcnt)
		newstate |= SMS_ALL_ERRED;
	else
		newstate &= ~SMS_ALL_ERRED;

	sm->sm_state = newstate;
	uniqtime32(&sm->sm_timestamp);
}

static int
mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
							int frm_probe)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	int			ci;
	int			i;
	int			compcnt;
	int			open_comp; /* flag for open component */

	for (i = *smi; i < NMIRROR; i++) {
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];

		if (!SMS_IS(sm, SMS_INUSE))
			continue;

		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
		for (ci = *cip; ci < compcnt; ci++) {
			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
			    (sm->sm_dev, sm, ci);
			/*
			 * if called from any routine but probe, we check for
			 * MDM_S_ISOPEN flag. Since probe does a pseduo open,
			 * it sets MDM_S_PROBEOPEN flag and we test for this
			 * flag. They are both exclusive tests.
			 */
			open_comp = (frm_probe) ?
			    (shared->ms_flags & MDM_S_PROBEOPEN):
			    (shared->ms_flags & MDM_S_ISOPEN);
			if ((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
			    ((shared->ms_state == CS_OKAY) ||
			    (shared->ms_state == CS_RESYNC))) {
				if (clr_error) {
					shared->ms_flags &= ~MDM_S_IOERR;
				}
				*cip = ci;
				*smi = i;
				return (1);
			}

			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
				shared->ms_flags &= ~MDM_S_IOERR;
			}
		}

		*cip = 0;
	}
	return (0);
}

/*ARGSUSED*/
static void
mirror_run_queue(void *d)
{
	if (!(md_status & MD_GBL_DAEMONS_LIVE))
		md_daemon(1, &md_done_daemon);
}
/*
 * check_comp_4_hotspares
 *
 * This function attempts to allocate a hotspare for this component if the
 * component is in error. In a MN set, the function can be called in 2 modes.
 * It can be called either when a component error has been detected or when a
 * new hotspare has been allocated. In this case, MD_HOTSPARE_XMIT is set
 * in flags and the request is sent to all nodes.
 * The handler on each of the nodes then calls this function with
 * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
 *
 * For non-MN sets the function simply attempts to allocate a hotspare.
 *
 * On entry, the following locks are held
 *	mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
 *	md_unit_writerlock
 *
 * Returns	0 if ok
 *		1 if the unit containing the component has been cleared while
 *		  the mdmn_ksend_message() was being executed
 */
extern int
check_comp_4_hotspares(
	mm_unit_t	*un,
	int		smi,
	int		ci,
	uint_t		flags,
	mddb_recid_t	hs_id,	/* Only used by MN disksets */
	IOLOCK		*lockp	/* can be NULL */
)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	mddb_recid_t		recids[6];
	minor_t			mnum;
	intptr_t		(*hs_dev)();
	void			(*hs_done)();
	void			*hs_data;
	md_error_t		mde = mdnullerror;
	set_t			setno;
	md_mn_msg_allochsp_t	allochspmsg;
	md_mn_kresult_t		*kresult;
	mm_unit_t		*new_un;
	int			rval;

	mnum = MD_SID(un);
	setno = MD_UN2SET(un);
	sm = &un->un_sm[smi];
	smic = &un->un_smic[smi];
	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
	    (sm->sm_dev, sm, ci);

	if (shared->ms_state != CS_ERRED)
		return (0);

	/* Don't start a new component resync if a resync is already running. */
	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
		return (0);

	if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
		uint_t		msgflags;
		md_mn_msgtype_t	msgtype;

		/* Send allocate hotspare message to all nodes */

		allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
		allochspmsg.msg_allochsp_sm = smi;
		allochspmsg.msg_allochsp_comp = ci;
		allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;

		/*
		 * Before calling mdmn_ksend_message(), release locks
		 * Can never be in the context of an ioctl.
		 */
		md_unit_writerexit(MDI_UNIT(mnum));
		if (flags & MD_HOTSPARE_LINKHELD)
			rw_exit(&mirror_md_ops.md_link_rw.lock);
#ifdef DEBUG
		if (mirror_debug_flag)
			printf("send alloc hotspare, flags="
			    "0x%x %x, %x, %x, %x\n", flags,
			    allochspmsg.msg_allochsp_mnum,
			    allochspmsg.msg_allochsp_sm,
			    allochspmsg.msg_allochsp_comp,
			    allochspmsg.msg_allochsp_hs_id);
#endif
		if (flags & MD_HOTSPARE_WMUPDATE) {
			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE2;
			/*
			 * When coming from an update of watermarks, there
			 * must already be a message logged that triggered
			 * this action. So, no need to log this message, too.
			 */
			msgflags = MD_MSGF_NO_LOG;
		} else {
			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE;
			msgflags = MD_MSGF_DEFAULT_FLAGS;
		}

		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
		    (char *)&allochspmsg, sizeof (allochspmsg),
		    kresult);

		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
#ifdef DEBUG
			if (mirror_debug_flag)
				mdmn_ksend_show_error(rval, kresult,
				    "ALLOCATE HOTSPARE");
#endif
			/*
			 * If message is sent ok but exitval indicates an error
			 * it must be because the mirror has been cleared. In
			 * this case re-obtain lock and return an error
			 */
			if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
				if (flags & MD_HOTSPARE_LINKHELD) {
					rw_enter(&mirror_md_ops.md_link_rw.lock,
					    RW_READER);
				}
				kmem_free(kresult, sizeof (md_mn_kresult_t));
				return (1);
			}
			/* If we're shutting down already, pause things here. */
			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
				while (!md_mn_is_commd_present()) {
					delay(md_hz);
				}
			}
			cmn_err(CE_PANIC,
			    "ksend_message failure: ALLOCATE_HOTSPARE");
		}
		kmem_free(kresult, sizeof (md_mn_kresult_t));

		/*
		 * re-obtain the locks
		 */
		if (flags & MD_HOTSPARE_LINKHELD)
			rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
		new_un = md_unit_writerlock(MDI_UNIT(mnum));

		/*
		 * As we had to release the locks in order to send the
		 * message to all nodes, we need to check to see if the
		 * unit has changed. If it has we release the writerlock
		 * and return fail.
		 */
		if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
			md_unit_writerexit(MDI_UNIT(mnum));
			return (1);
		}
	} else {
		if (MD_MNSET_SETNO(setno)) {
			/*
			 * If 2 or more nodes simultaneously see a
			 * component failure, these nodes will each
			 * send an ALLOCATE_HOTSPARE[2] message.
			 * The first message will allocate the hotspare
			 * and the subsequent messages should do nothing.
			 *
			 * If a slave node doesn't have a hotspare allocated
			 * at the time the message is initiated, then the
			 * passed in hs_id will be 0.  If the node
			 * executing this routine has a component shared
			 * ms_hs_id of non-zero, but the message shows a
			 * hs_id of 0, then just return since a hotspare
			 * has already been allocated for this failing
			 * component.  When the slave node returns from
			 * the ksend_message the hotspare will have
			 * already been allocated.
			 *
			 * If the slave node does send an hs_id of non-zero,
			 * and the slave node's hs_id matches this node's
			 * ms_hs_id, then the hotspare has error'd and
			 * should be replaced.
			 *
			 * If the slave node sends an hs_id of non-zero and
			 * this node has a different shared ms_hs_id, then
			 * just return since this hotspare has already
			 * been hotspared.
			 */
			if (shared->ms_hs_id != 0) {
				if (hs_id == 0) {
#ifdef DEBUG
					if (mirror_debug_flag) {
						printf("check_comp_4_hotspares"
						    "(NOXMIT), short circuit "
						    "hs_id=0x%x, "
						    "ms_hs_id=0x%x\n",
						    hs_id, shared->ms_hs_id);
					}
#endif
					return (0);
				}
				if (hs_id != shared->ms_hs_id) {
#ifdef DEBUG
					if (mirror_debug_flag) {
						printf("check_comp_4_hotspares"
						    "(NOXMIT), short circuit2 "
						    "hs_id=0x%x, "
						    "ms_hs_id=0x%x\n",
						    hs_id, shared->ms_hs_id);
					}
#endif
					return (0);
				}
			}
		}

		sm = &un->un_sm[smi];
		hs_dev = md_get_named_service(sm->sm_dev, 0,
		    "hotspare device", 0);
		if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
		    &hs_data) != 0)
			return (0);

		/*
		 * set_sm_comp_state() commits the modified records.
		 * As we don't transmit the changes, no need to drop the lock.
		 */
		set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
		    MD_STATE_NO_XMIT, (IOLOCK *)NULL);

		(*hs_done)(sm->sm_dev, hs_data);

		mirror_check_failfast(mnum);

		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
		    setno, MD_SID(un));

		/*
		 * For a multi-node set we need to reset the un_rs_type,
		 * un_rs_resync_done and un_rs_resync_2_do fields as the
		 * hot-spare resync must copy all applicable data.
		 */
		if (MD_MNSET_SETNO(setno)) {
			un->un_rs_type = MD_RS_NONE;
			un->un_rs_resync_done = 0;
			un->un_rs_resync_2_do = 0;
		}

		/*
		 * Must drop writer lock since mirror_resync_unit will
		 * open devices and must be able to grab readerlock.
		 * Don't need to drop IOLOCK since any descendent routines
		 * calling ksend_messages will drop the IOLOCK as needed.
		 *
		 */
		if (lockp) {
			md_ioctl_writerexit(lockp);
		} else {
			md_unit_writerexit(MDI_UNIT(mnum));
		}

		/* start resync */
		(void) mirror_resync_unit(mnum, NULL, &mde, lockp);

		if (lockp) {
			new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
		} else {
			new_un = md_unit_writerlock(MDI_UNIT(mnum));
		}
	}
	return (0);
}

/*
 * check_unit_4_hotspares
 *
 * For a given mirror, allocate hotspares, if available for any components
 * that are in error
 *
 * Returns	0 if ok
 *		1 if check_comp_4_hotspares returns non-zero. This will only
 *		  happen for a MN unit where the unit has been cleared while
 *		  the allocate hotspare message is sent to all nodes.
 */
static int
check_unit_4_hotspares(mm_unit_t *un, int flags)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	int			ci;
	int			i;
	int			compcnt;

	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
		return (0);

	for (i = 0; i < NMIRROR; i++) {
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];
		if (!SMS_IS(sm, SMS_INUSE))
			continue;
		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
		for (ci = 0; ci < compcnt; ci++) {
			md_m_shared_t		*shared;

			shared = (md_m_shared_t *)
			    (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
			/*
			 * Never called from ioctl context, so pass in
			 * (IOLOCK *)NULL.  Pass through flags from calling
			 * routine, also setting XMIT flag.
			 */
			if (check_comp_4_hotspares(un, i, ci,
			    (MD_HOTSPARE_XMIT | flags),
			    shared->ms_hs_id, (IOLOCK *)NULL) != 0)
				return (1);
		}
	}
	return (0);
}

static void
check_4_hotspares(daemon_request_t *drq)
{
	mdi_unit_t	*ui;
	mm_unit_t	*un;
	md_link_t	*next;
	int		x;

	mutex_enter(&drq->dr_mx);	/* clear up front so can poke */
	drq->dr_pending = 0;		/* again in low level routine if */
	mutex_exit(&drq->dr_mx);	/* something found to do	*/

	/*
	 * Used to have a problem here. The disksets weren't marked as being
	 * MNHOLD. This opened a window where we could be searching for
	 * hotspares and have the disk set unloaded (released) from under
	 * us causing a panic in stripe_component_count().
	 * The way to prevent that is to mark the set MNHOLD which prevents
	 * any diskset from being released while we are scanning the mirrors,
	 * submirrors and components.
	 */

	for (x = 0; x < md_nsets; x++)
		md_holdset_enter(x);

	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
		ui = MDI_UNIT(next->ln_id);

		un = (mm_unit_t *)md_unit_readerlock(ui);

		/*
		 * Only check the unit if we are the master for this set
		 * For an MN set, poke_hotspares() is only effective on the
		 * master
		 */
		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
			md_unit_readerexit(ui);
			continue;
		}
		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
			md_unit_readerexit(ui);
			continue;
		}
		md_unit_readerexit(ui);

		un = (mm_unit_t *)md_unit_writerlock(ui);
		/*
		 * check_unit_4_hotspares will exit 1 if the unit has been
		 * removed during the process of allocating the hotspare.
		 * This can only happen for a MN metadevice. If unit no longer
		 * exists, no need to release writerlock
		 */
		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
			md_unit_writerexit(ui);
		else {
			/*
			 * If check_unit_4_hotspares failed, queue another
			 * request and break out of this one
			 */
			(void) poke_hotspares();
			break;
		}
	}
	rw_exit(&mirror_md_ops.md_link_rw.lock);

	for (x = 0; x < md_nsets; x++)
		md_holdset_exit(x);
}

/*
 * poke_hotspares
 *
 * If there is not a pending poke_hotspares request pending, queue a requent
 * to call check_4_hotspares(). This will scan all mirrors and attempt to
 * allocate hotspares for all components in error.
 */
int
poke_hotspares()
{
	mutex_enter(&hotspare_request.dr_mx);
	if (hotspare_request.dr_pending == 0) {
		hotspare_request.dr_pending = 1;
		daemon_request(&md_mhs_daemon,
		    check_4_hotspares, (daemon_queue_t *)&hotspare_request,
		    REQ_OLD);
	}
	mutex_exit(&hotspare_request.dr_mx);
	return (0);
}

static void
free_all_ecomps(err_comp_t *ecomp)
{
	err_comp_t	*d;

	while (ecomp != NULL) {
		d = ecomp;
		ecomp = ecomp->ec_next;
		kmem_free(d, sizeof (err_comp_t));
	}
}

/*
 * NAME: mirror_openfail_console_info
 *
 * DESCRIPTION: Prints a informative message to the console when mirror
 *		cannot be opened.
 *
 * PARAMETERS: mm_unit_t	un - pointer to mirror unit structure
 *	       int		smi - submirror index
 *	       int		ci - component index
 */

void
mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
{
	void (*get_dev)();
	ms_cd_info_t cd;
	md_dev64_t tmpdev;

	tmpdev = un->un_sm[smi].sm_dev;
	get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
	if (get_dev != NULL) {
		(void) (*get_dev)(tmpdev, smi, ci, &cd);
		cmn_err(CE_WARN, "md %s: open error on %s",
		    md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un),
		    cd.cd_dev, NULL, 0));
	} else {
		cmn_err(CE_WARN, "md %s: open error",
		    md_shortname(MD_SID(un)));
	}
}

static int
mirror_close_all_devs(mm_unit_t *un, int md_cflags)
{
	int i;
	md_dev64_t dev;

	for (i = 0; i < NMIRROR; i++) {
		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;
		dev = un->un_sm[i].sm_dev;
		md_layered_close(dev, md_cflags);
	}
	return (0);
}

/*
 * Keep track of drivers that don't support failfast.  We use this so that
 * we only log one diagnostic message for each of these drivers, no matter
 * how many times we run the mirror_check_failfast function.
 * Return 1 if this is a new driver that does not support failfast,
 * return 0 if we have already seen this non-failfast driver.
 */
static int
new_non_ff_driver(const char *s)
{
	mutex_enter(&non_ff_drv_mutex);
	if (non_ff_drivers == NULL) {
		non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
		    KM_NOSLEEP);
		if (non_ff_drivers == NULL) {
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1,
		    KM_NOSLEEP);
		if (non_ff_drivers[0] == NULL) {
			kmem_free(non_ff_drivers, 2 * sizeof (char *));
			non_ff_drivers = NULL;
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		(void) strcpy(non_ff_drivers[0], s);
		non_ff_drivers[1] = NULL;

	} else {
		int i;
		char **tnames;
		char **tmp;

		for (i = 0; non_ff_drivers[i] != NULL; i++) {
			if (strcmp(s, non_ff_drivers[i]) == 0) {
				mutex_exit(&non_ff_drv_mutex);
				return (0);
			}
		}

		/* allow for new element and null */
		i += 2;
		tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
		if (tnames == NULL) {
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		for (i = 0; non_ff_drivers[i] != NULL; i++)
			tnames[i] = non_ff_drivers[i];

		tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
		if (tnames[i] == NULL) {
			/* adjust i so that it is the right count to free */
			kmem_free(tnames, (i + 2) * sizeof (char *));
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		(void) strcpy(tnames[i++], s);
		tnames[i] = NULL;

		tmp = non_ff_drivers;
		non_ff_drivers = tnames;
		/* i now represents the count we previously alloced */
		kmem_free(tmp, i * sizeof (char *));
	}
	mutex_exit(&non_ff_drv_mutex);

	return (1);
}

/*
 * Check for the "ddi-failfast-supported" devtree property on each submirror
 * component to indicate if we should do I/O to that submirror with the
 * B_FAILFAST flag set or not.  This check is made at various state transitions
 * in the mirror code (e.g. open, enable, hotspare, etc.).  Sometimes we
 * only need to check one drive (e.g. hotspare) but since the check is
 * fast and infrequent and sometimes needs to be done on all components we
 * just check all components on each call.
 */
void
mirror_check_failfast(minor_t mnum)
{
	int		i;
	mm_unit_t	*un;

	if (md_ff_disable)
		return;

	un = MD_UNIT(mnum);

	for (i = 0; i < NMIRROR; i++) {
		int			ci;
		int			cnt;
		int			ff = 1;
		mm_submirror_t		*sm;
		mm_submirror_ic_t	*smic;
		void			(*get_dev)();

		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;

		sm = &un->un_sm[i];
		smic = &un->un_smic[i];

		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
		    "get device", 0);

		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
		for (ci = 0; ci < cnt; ci++) {
			int		found = 0;
			dev_t		ci_dev;
			major_t		major;
			dev_info_t	*devi;
			ms_cd_info_t	cd;

			/*
			 * this already returns the hs
			 * dev if the device is spared
			 */
			(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);

			ci_dev = md_dev64_to_dev(cd.cd_dev);
			major = getmajor(ci_dev);

			if (major == md_major) {
				/*
				 * this component must be a soft
				 * partition; get the real dev
				 */
				minor_t	dev_mnum;
				mdi_unit_t	*ui;
				mp_unit_t	*un;
				set_t	setno;
				side_t	side;
				md_dev64_t	tmpdev;

				ui = MDI_UNIT(getminor(ci_dev));

				/* grab necessary lock */
				un = (mp_unit_t *)md_unit_readerlock(ui);

				dev_mnum = MD_SID(un);
				setno = MD_MIN2SET(dev_mnum);
				side = mddb_getsidenum(setno);

				tmpdev = un->un_dev;

				/* Get dev by device id */
				if (md_devid_found(setno, side,
				    un->un_key) == 1) {
					tmpdev = md_resolve_bydevid(dev_mnum,
					    tmpdev, un->un_key);
				}

				md_unit_readerexit(ui);

				ci_dev = md_dev64_to_dev(tmpdev);
				major = getmajor(ci_dev);
			}

			if (ci_dev != NODEV32 &&
			    (devi = e_ddi_hold_devi_by_dev(ci_dev, 0))
			    != NULL) {
				ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
				int		propvalue = 0;
				int		proplength = sizeof (int);
				int		error;
				struct cb_ops	*cb;

				if ((cb = devopsp[major]->devo_cb_ops) !=
				    NULL) {
					error = (*cb->cb_prop_op)
					    (DDI_DEV_T_ANY, devi, prop_op,
					    DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
					    "ddi-failfast-supported",
					    (caddr_t)&propvalue, &proplength);

					if (error == DDI_PROP_SUCCESS)
						found = 1;
				}

				if (!found && new_non_ff_driver(
				    ddi_driver_name(devi))) {
					cmn_err(CE_NOTE, "!md: B_FAILFAST I/O"
					    "disabled on %s",
					    ddi_driver_name(devi));
				}

				ddi_release_devi(devi);
			}

			/*
			 * All components must support
			 * failfast in the submirror.
			 */
			if (!found) {
				ff = 0;
				break;
			}
		}

		if (ff) {
			sm->sm_flags |= MD_SM_FAILFAST;
		} else {
			sm->sm_flags &= ~MD_SM_FAILFAST;
		}
	}
}

/*
 * Return true if the submirror is unavailable.
 * If any of the submirror components are opened then the submirror cannot
 * be unavailable (MD_INACCESSIBLE).
 * If any of the components are already in the errored state, then the submirror
 * cannot be unavailable (MD_INACCESSIBLE).
 */
static bool_t
submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	int			ci;
	int			compcnt;

	sm = &un->un_sm[smi];
	smic = &un->un_smic[smi];

	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
	for (ci = 0; ci < compcnt; ci++) {
		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
		    (sm->sm_dev, sm, ci);
		if (from_probe) {
			if (shared->ms_flags & MDM_S_PROBEOPEN)
				return (B_FALSE);
		} else {
			if (shared->ms_flags & MDM_S_ISOPEN)
				return (B_FALSE);
		}
		if (shared->ms_state == CS_ERRED ||
		    shared->ms_state == CS_LAST_ERRED)
			return (B_FALSE);
	}

	return (B_TRUE);
}

static int
mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
{
	int		i;
	mm_unit_t	*un;
	mdi_unit_t	*ui;
	int		err;
	int		smi;
	int		ci;
	err_comp_t	*c;
	err_comp_t	*ecomps = NULL;
	int		smmask = 0;
	set_t		setno;
	int		sm_cnt;
	int		sm_unavail_cnt;

	mirror_check_failfast(mnum);

	un = MD_UNIT(mnum);
	ui = MDI_UNIT(mnum);
	setno = MD_UN2SET(un);

	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t tmpdev = un->un_sm[i].sm_dev;

		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;
		if (md_layered_open(mnum, &tmpdev, md_oflags))
			smmask |= SMI2BIT(i);
		un->un_sm[i].sm_dev = tmpdev;
	}

	/*
	 * If smmask is clear, all submirrors are accessible. Clear the
	 * MD_INACCESSIBLE bit in this case.  This bit is also cleared for the
	 * mirror device.   If smmask is set, we have to determine which of the
	 * submirrors are in error. If no submirror is accessible we mark the
	 * whole mirror as MD_INACCESSIBLE.
	 */
	if (smmask == 0) {
		if (lockp) {
			md_ioctl_readerexit(lockp);
			(void) md_ioctl_writerlock(lockp, ui);
		} else {
			md_unit_readerexit(ui);
			(void) md_unit_writerlock(ui);
		}
		ui->ui_tstate &= ~MD_INACCESSIBLE;
		if (lockp) {
			md_ioctl_writerexit(lockp);
			(void) md_ioctl_readerlock(lockp, ui);
		} else {
			md_unit_writerexit(ui);
			(void) md_unit_readerlock(ui);
		}

		for (i = 0; i < NMIRROR; i++) {
			md_dev64_t	tmpdev;
			mdi_unit_t	*sm_ui;

			if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
				continue;

			tmpdev = un->un_sm[i].sm_dev;
			sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
			(void) md_unit_writerlock(sm_ui);
			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
			md_unit_writerexit(sm_ui);
		}

		return (0);
	}

	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t tmpdev;

		if (!(smmask & SMI2BIT(i)))
			continue;

		tmpdev = un->un_sm[i].sm_dev;
		err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
		un->un_sm[i].sm_dev = tmpdev;
		ASSERT(err == 0);
	}

	if (lockp) {
		md_ioctl_readerexit(lockp);
		un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
	} else {
		md_unit_readerexit(ui);
		un = (mm_unit_t *)md_unit_writerlock(ui);
	}

	/*
	 * We want to make sure the unavailable flag is not masking a real
	 * error on the submirror.
	 * For each submirror,
	 *    if all of the submirror components couldn't be opened and there
	 *    are no errors on the submirror, then set the unavailable flag
	 *    otherwise, clear unavailable.
	 */
	sm_cnt = 0;
	sm_unavail_cnt = 0;
	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t	tmpdev;
		mdi_unit_t	*sm_ui;

		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;

		sm_cnt++;
		tmpdev = un->un_sm[i].sm_dev;
		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));

		(void) md_unit_writerlock(sm_ui);
		if (submirror_unavailable(un, i, 0)) {
			sm_ui->ui_tstate |= MD_INACCESSIBLE;
			sm_unavail_cnt++;
		} else {
			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
		}
		md_unit_writerexit(sm_ui);
	}

	/*
	 * If all of the submirrors are unavailable, the mirror is also
	 * unavailable.
	 */
	if (sm_cnt == sm_unavail_cnt) {
		ui->ui_tstate |= MD_INACCESSIBLE;
	} else {
		ui->ui_tstate &= ~MD_INACCESSIBLE;
	}

	smi = 0;
	ci = 0;
	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
		if (mirror_other_sources(un, smi, ci, 1) == 1) {

			free_all_ecomps(ecomps);
			(void) mirror_close_all_devs(un, md_oflags);
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
			    SVM_TAG_METADEVICE, setno, MD_SID(un));
			mirror_openfail_console_info(un, smi, ci);
			if (lockp) {
				md_ioctl_writerexit(lockp);
				(void) md_ioctl_readerlock(lockp, ui);
			} else {
				md_unit_writerexit(ui);
				(void) md_unit_readerlock(ui);
			}
			return (ENXIO);
		}

		/* track all component states that need changing */
		c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
		c->ec_next = ecomps;
		c->ec_smi = smi;
		c->ec_ci = ci;
		ecomps = c;
		ci++;
	}

	/* Make all state changes and commit them */
	for (c = ecomps; c != NULL; c = c->ec_next) {
		/*
		 * If lockp is set, then entering kernel through ioctl.
		 * For a MN set, the only ioctl path is via a commd message
		 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already
		 * being sent to each node.
		 * In this case, set NO_XMIT so that set_sm_comp_state
		 * won't attempt to send a message on a message.
		 *
		 * In !MN sets, the xmit flag is ignored, so it doesn't matter
		 * which flag is passed.
		 */
		if (lockp) {
			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
			    MD_STATE_NO_XMIT, lockp);
		} else {
			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
			    (MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
		}
		/*
		 * For a MN set, the NOTIFY is done when the state change is
		 * processed on each node
		 */
		if (!MD_MNSET_SETNO(setno)) {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
			    SVM_TAG_METADEVICE, setno, MD_SID(un));
		}
	}

	if (lockp) {
		md_ioctl_writerexit(lockp);
		(void) md_ioctl_readerlock(lockp, ui);
	} else {
		md_unit_writerexit(ui);
		(void) md_unit_readerlock(ui);
	}

	free_all_ecomps(ecomps);

	/* allocate hotspares for all errored components */
	if (MD_MNSET_SETNO(setno)) {
		/*
		 * If we're called from an ioctl (lockp set) then we cannot
		 * directly call send_poke_hotspares as this will block until
		 * the message gets despatched to all nodes. If the cluster is
		 * going through a reconfig cycle then the message will block
		 * until the cycle is complete, and as we originate from a
		 * service call from commd we will livelock.
		 */
		if (lockp == NULL) {
			md_unit_readerexit(ui);
			send_poke_hotspares(setno);
			(void) md_unit_readerlock(ui);
		}
	} else {
		(void) poke_hotspares();
	}
	return (0);
}

void
mirror_overlap_tree_remove(md_mps_t *ps)
{
	mm_unit_t	*un;

	if (panicstr)
		return;

	VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP);
	un = ps->ps_un;

	mutex_enter(&un->un_overlap_tree_mx);
	avl_remove(&un->un_overlap_root, ps);
	ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
	if (un->un_overlap_tree_flag != 0) {
		un->un_overlap_tree_flag = 0;
		cv_broadcast(&un->un_overlap_tree_cv);
	}
	mutex_exit(&un->un_overlap_tree_mx);
}


/*
 * wait_for_overlaps:
 * -----------------
 * Check that given i/o request does not cause an overlap with already pending
 * i/o. If it does, block until the overlapped i/o completes.
 *
 * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
 * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if
 * it must not already be in the tree.
 */
static void
wait_for_overlaps(md_mps_t *ps, int flags)
{
	mm_unit_t	*un;
	avl_index_t	where;
	md_mps_t	*ps1;

	if (panicstr)
		return;

	un = ps->ps_un;
	mutex_enter(&un->un_overlap_tree_mx);
	if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
	    (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
		mutex_exit(&un->un_overlap_tree_mx);
		return;
	}

	VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP));

	do {
		ps1 = avl_find(&un->un_overlap_root, ps, &where);
		if (ps1 == NULL) {
			/*
			 * The candidate range does not overlap with any
			 * range in the tree.  Insert it and be done.
			 */
			avl_insert(&un->un_overlap_root, ps, where);
			ps->ps_flags |= MD_MPS_ON_OVERLAP;
		} else {
			/*
			 * The candidate range would overlap.  Set the flag
			 * indicating we need to be woken up, and sleep
			 * until another thread removes a range.  If upon
			 * waking up we find this mps was put on the tree
			 * by another thread, the loop terminates.
			 */
			un->un_overlap_tree_flag = 1;
			cv_wait(&un->un_overlap_tree_cv,
			    &un->un_overlap_tree_mx);
		}
	} while (!(ps->ps_flags & MD_MPS_ON_OVERLAP));
	mutex_exit(&un->un_overlap_tree_mx);
}

/*
 * This function is called from mirror_done to check whether any pages have
 * been modified while a mirrored write was in progress.  Returns 0 if
 * all pages associated with bp are clean, 1 otherwise.
 */
static int
any_pages_dirty(struct buf *bp)
{
	int	rval;

	rval = biomodified(bp);
	if (rval == -1)
		rval = 0;

	return (rval);
}

#define	MAX_EXTRAS 10

void
mirror_commit(
	mm_unit_t	*un,
	int		smmask,
	mddb_recid_t	*extras
)
{
	mm_submirror_t		*sm;
	md_unit_t		*su;
	int			i;

	/* 2=mirror,null id */
	mddb_recid_t		recids[NMIRROR+2+MAX_EXTRAS];

	int			ri = 0;

	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
		return;

	/* Add two, this includes the mirror unit and the null recid */
	if (extras != NULL) {
		int	nrecids = 0;
		while (extras[nrecids] != 0) {
			nrecids++;
		}
		ASSERT(nrecids <= MAX_EXTRAS);
	}

	if (un != NULL)
		recids[ri++] = un->c.un_record_id;
	for (i = 0;  i < NMIRROR; i++) {
		if (!(smmask & SMI2BIT(i)))
			continue;
		sm = &un->un_sm[i];
		if (!SMS_IS(sm, SMS_INUSE))
			continue;
		if (md_getmajor(sm->sm_dev) != md_major)
			continue;
		su =  MD_UNIT(md_getminor(sm->sm_dev));
		recids[ri++] = su->c.un_record_id;
	}

	if (extras != NULL)
		while (*extras != 0) {
			recids[ri++] = *extras;
			extras++;
		}

	if (ri == 0)
		return;
	recids[ri] = 0;

	/*
	 * Ok to hold ioctl lock across record commit to mddb as
	 * long as the record(s) being committed aren't resync records.
	 */
	mddb_commitrecs_wrapper(recids);
}


/*
 * This routine is used to set a bit in the writable_bm bitmap
 * which represents each submirror in a metamirror which
 * is writable. The first writable submirror index is assigned
 * to the sm_index.  The number of writable submirrors are returned in nunits.
 *
 * This routine returns the submirror's unit number.
 */

static void
select_write_units(struct mm_unit *un, md_mps_t *ps)
{

	int		i;
	unsigned	writable_bm = 0;
	unsigned	nunits = 0;

	for (i = 0; i < NMIRROR; i++) {
		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
			/* set bit of all writable units */
			writable_bm |= SMI2BIT(i);
			nunits++;
		}
	}
	ps->ps_writable_sm = writable_bm;
	ps->ps_active_cnt = nunits;
	ps->ps_current_sm = 0;
}

static
unsigned
select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
{

	int		i;
	unsigned	writable_bm = 0;
	unsigned	nunits = 0;

	for (i = 0; i < NMIRROR; i++) {
		if (SUBMIRROR_IS_WRITEABLE(un, i) &&
		    un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
			writable_bm |= SMI2BIT(i);
			nunits++;
		}
	}
	if ((writable_bm & ps->ps_allfrom_sm) != 0) {
		writable_bm &= ~ps->ps_allfrom_sm;
		nunits--;
	}
	ps->ps_writable_sm = writable_bm;
	ps->ps_active_cnt = nunits;
	ps->ps_current_sm = 0;
	return (nunits);
}

static md_dev64_t
select_read_unit(
	mm_unit_t	*un,
	diskaddr_t	blkno,
	u_longlong_t	reqcount,
	u_longlong_t	*cando,
	int		must_be_opened,
	md_m_shared_t	**shared,
	md_mcs_t	*cs)
{
	int			i;
	md_m_shared_t		*s;
	uint_t			lasterrcnt = 0;
	md_dev64_t		dev = 0;
	u_longlong_t		cnt;
	u_longlong_t		mincnt;
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	mdi_unit_t		*ui;

	mincnt = reqcount;
	for (i = 0; i < NMIRROR; i++) {
		if (!SUBMIRROR_IS_READABLE(un, i))
			continue;
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];
		cnt = reqcount;

		/*
		 * If the current submirror is marked as inaccessible, do not
		 * try to access it.
		 */
		ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
		(void) md_unit_readerlock(ui);
		if (ui->ui_tstate & MD_INACCESSIBLE) {
			md_unit_readerexit(ui);
			continue;
		}
		md_unit_readerexit(ui);

		s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
		    (sm->sm_dev, sm, blkno, &cnt);

		if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
			continue;
		if (s->ms_state == CS_OKAY) {
			*cando = cnt;
			if (shared != NULL)
				*shared = s;

			if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
			    cs != NULL) {
				cs->cs_buf.b_flags |= B_FAILFAST;
			}

			return (un->un_sm[i].sm_dev);
		}
		if (s->ms_state != CS_LAST_ERRED)
			continue;

		/* don't use B_FAILFAST since we're Last Erred */

		if (mincnt > cnt)
			mincnt = cnt;
		if (s->ms_lasterrcnt > lasterrcnt) {
			lasterrcnt = s->ms_lasterrcnt;
			if (shared != NULL)
				*shared = s;
			dev = un->un_sm[i].sm_dev;
		}
	}
	*cando = mincnt;
	return (dev);
}

/*
 * Given a 32-bit bitmap, this routine will return the bit number
 * of the nth bit set.	The nth bit set is passed via the index integer.
 *
 * This routine is used to run through the writable submirror bitmap
 * and starting all of the writes.  See the value returned is the
 * index to appropriate submirror structure, in the md_sm
 * array for metamirrors.
 */
static int
md_find_nth_unit(uint_t mask, int index)
{
	int	bit, nfound;

	for (bit = -1, nfound = -1; nfound != index; bit++) {
		ASSERT(mask != 0);
		nfound += (mask & 1);
		mask >>= 1;
	}
	return (bit);
}

static int
fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
{
	mm_unit_t	*un;
	buf_t		*bp;
	int		i;
	unsigned	nunits = 0;
	int		iunit;
	uint_t		running_bm = 0;
	uint_t		sm_index;

	bp = &cs->cs_buf;
	un = ps->ps_un;

	for (i = 0; i < NMIRROR; i++) {
		if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
			continue;
		running_bm |= SMI2BIT(i);
		nunits++;
	}
	if (nunits == 0)
		return (1);

	/*
	 * For directed mirror read (DMR) we only use the specified side and
	 * do not compute the source of the read.
	 * If we're running with MD_MPS_DIRTY_RD set we always return the
	 * first mirror side (this prevents unnecessary ownership switching).
	 * Otherwise we return the submirror according to the mirror read option
	 */
	if (ps->ps_flags & MD_MPS_DMR) {
		sm_index = un->un_dmr_last_read;
	} else if (ps->ps_flags & MD_MPS_DIRTY_RD) {
		sm_index = md_find_nth_unit(running_bm, 0);
	} else {
		/* Normal (non-DMR) operation */
		switch (un->un_read_option) {
		case RD_GEOMETRY:
			iunit = (int)(bp->b_lblkno /
			    howmany(un->c.un_total_blocks, nunits));
			sm_index = md_find_nth_unit(running_bm, iunit);
			break;
		case RD_FIRST:
			sm_index = md_find_nth_unit(running_bm, 0);
			break;
		case RD_LOAD_BAL:
			/* this is intentional to fall into the default */
		default:
			un->un_last_read = (un->un_last_read + 1) % nunits;
			sm_index = md_find_nth_unit(running_bm,
			    un->un_last_read);
			break;
		}
	}
	bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
	ps->ps_allfrom_sm = SMI2BIT(sm_index);

	if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
		bp->b_flags |= B_FAILFAST;
	}

	return (0);
}

static
int
mirror_are_submirrors_available(mm_unit_t *un)
{
	int i;
	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t tmpdev = un->un_sm[i].sm_dev;

		if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
		    md_getmajor(tmpdev) != md_major)
			continue;

		if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
		    (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
			return (0);

		if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
			return (0);
	}
	return (1);
}

void
build_submirror(mm_unit_t *un, int i, int snarfing)
{
	struct mm_submirror	*sm;
	struct mm_submirror_ic	*smic;
	md_unit_t		*su;
	set_t			setno;

	sm = &un->un_sm[i];
	smic = &un->un_smic[i];

	sm->sm_flags = 0; /* sometime we may need to do more here */

	setno = MD_UN2SET(un);

	if (!SMS_IS(sm, SMS_INUSE))
		return;
	if (snarfing) {
		sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
		    sm->sm_key, MD_NOTRUST_DEVT);
	} else {
		if (md_getmajor(sm->sm_dev) == md_major) {
			su = MD_UNIT(md_getminor(sm->sm_dev));
			un->c.un_flag |= (su->c.un_flag & MD_LABELED);
			/* submirror can no longer be soft partitioned */
			MD_CAPAB(su) &= (~MD_CAN_SP);
		}
	}
	smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
	    0, "shared by blk", 0);
	smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
	    0, "shared by indx", 0);
	smic->sm_get_component_count = (int (*)())md_get_named_service(
	    sm->sm_dev, 0, "get component count", 0);
	smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0,
	    "get block count skip size", 0);
	sm->sm_state &= ~SMS_IGNORE;
	if (SMS_IS(sm, SMS_OFFLINE))
		MD_STATUS(un) |= MD_UN_OFFLINE_SM;
	md_set_parent(sm->sm_dev, MD_SID(un));
}

static void
mirror_cleanup(mm_unit_t *un)
{
	mddb_recid_t	recid;
	int		smi;
	sv_dev_t	sv[NMIRROR];
	int		nsv = 0;

	/*
	 * If a MN diskset and this node is not the master, do
	 * not delete any records on snarf of the mirror records.
	 */
	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
	    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
		return;
	}

	for (smi = 0; smi < NMIRROR; smi++) {
		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
			continue;
		sv[nsv].setno = MD_UN2SET(un);
		sv[nsv++].key = un->un_sm[smi].sm_key;
	}

	recid = un->un_rr_dirty_recid;
	mddb_deleterec_wrapper(un->c.un_record_id);
	if (recid > 0)
		mddb_deleterec_wrapper(recid);

	md_rem_names(sv, nsv);
}

/*
 * Comparison function for the avl tree which tracks
 * outstanding writes on submirrors.
 *
 * Returns:
 *	-1: ps1 < ps2
 *	 0: ps1 and ps2 overlap
 *	 1: ps1 > ps2
 */
static int
mirror_overlap_compare(const void *p1, const void *p2)
{
	const md_mps_t *ps1 = (md_mps_t *)p1;
	const md_mps_t *ps2 = (md_mps_t *)p2;

	if (ps1->ps_firstblk < ps2->ps_firstblk) {
		if (ps1->ps_lastblk >= ps2->ps_firstblk)
			return (0);
		return (-1);
	}

	if (ps1->ps_firstblk > ps2->ps_firstblk) {
		if (ps1->ps_firstblk <= ps2->ps_lastblk)
			return (0);
		return (1);
	}

	return (0);
}

/* Return a -1 if optimized record unavailable and set should be released */
int
mirror_build_incore(mm_unit_t *un, int snarfing)
{
	int		i;

	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
		return (1);
	}

	if (mirror_are_submirrors_available(un) == 0)
		return (1);

	if (MD_UNIT(MD_SID(un)) != NULL)
		return (0);

	MD_STATUS(un) = 0;

	/* pre-4.1 didn't define CAN_META_CHILD capability */
	MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;

	un->un_overlap_tree_flag = 0;
	avl_create(&un->un_overlap_root, mirror_overlap_compare,
	    sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node));

	for (i = 0; i < NMIRROR; i++)
		build_submirror(un, i, snarfing);

	if (unit_setup_resync(un, snarfing) != 0) {
		if (snarfing) {
			mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
			/*
			 * If a MN set and set is not stale, then return -1
			 * which will force the caller to unload the set.
			 * The MN diskset nodes will return failure if
			 * unit_setup_resync fails so that nodes won't
			 * get out of sync.
			 *
			 * If set is STALE, the master node can't allocate
			 * a resync record (if needed), but node needs to
			 * join the set so that user can delete broken mddbs.
			 * So, if set is STALE, just continue on.
			 */
			if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
			    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
				return (-1);
			}
		} else
			return (1);
	}

	mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL);

	un->un_suspend_wr_flag = 0;
	mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * Allocate mutexes for mirror-owner and resync-owner changes.
	 * All references to the owner message state field must be guarded
	 * by this mutex.
	 */
	mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate mutex and condvar for resync thread manipulation. These
	 * will be used by mirror_resync_unit/mirror_ioctl_resync
	 */
	mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * Allocate mutex and condvar for resync progress thread manipulation.
	 * This allows resyncs to be continued across an intervening reboot.
	 */
	mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This
	 * provides synchronization between a user-ioctl and the resulting
	 * strategy() call that performs the read().
	 */
	mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * Allocate rwlocks for un_pernode_dirty_bm accessing.
	 */
	for (i = 0; i < MD_MNMAXSIDES; i++) {
		rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL);
	}

	/* place various information in the in-core data structures */
	md_nblocks_set(MD_SID(un), un->c.un_total_blocks);
	MD_UNIT(MD_SID(un)) = un;

	return (0);
}


void
reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
{
	mddb_recid_t	recid, vtoc_id;
	size_t		bitcnt;
	size_t		shortcnt;
	int		smi;
	sv_dev_t	sv[NMIRROR];
	int		nsv = 0;
	uint_t		bits = 0;
	minor_t		selfid;
	md_unit_t	*su;
	int		i;

	md_destroy_unit_incore(mnum, &mirror_md_ops);

	shortcnt = un->un_rrd_num * sizeof (short);
	bitcnt = howmany(un->un_rrd_num, NBBY);

	if (un->un_outstanding_writes)
		kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
	if (un->un_goingclean_bm)
		kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
	if (un->un_goingdirty_bm)
		kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
	if (un->un_resync_bm)
		kmem_free((caddr_t)un->un_resync_bm, bitcnt);
	if (un->un_pernode_dirty_sum)
		kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num);

	/*
	 * Destroy the taskq for deferred processing of DRL clean requests.
	 * This taskq will only be present for Multi Owner mirrors.
	 */
	if (un->un_drl_task != NULL)
		ddi_taskq_destroy(un->un_drl_task);

	md_nblocks_set(mnum, -1ULL);
	MD_UNIT(mnum) = NULL;

	/*
	 * Attempt release of its minor node
	 */
	md_remove_minor_node(mnum);

	if (!removing)
		return;

	for (smi = 0; smi < NMIRROR; smi++) {
		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
			continue;
		/* reallow soft partitioning of submirror and reset parent */
		su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
		MD_CAPAB(su) |= MD_CAN_SP;
		md_reset_parent(un->un_sm[smi].sm_dev);
		reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);

		sv[nsv].setno = MD_MIN2SET(mnum);
		sv[nsv++].key = un->un_sm[smi].sm_key;
		bits |= SMI2BIT(smi);
	}

	MD_STATUS(un) |= MD_UN_BEING_RESET;
	recid = un->un_rr_dirty_recid;
	vtoc_id = un->c.un_vtoc_id;
	selfid = MD_SID(un);

	mirror_commit(un, bits, 0);

	avl_destroy(&un->un_overlap_root);

	/* Destroy all mutexes and condvars before returning. */
	mutex_destroy(&un->un_suspend_wr_mx);
	cv_destroy(&un->un_suspend_wr_cv);
	mutex_destroy(&un->un_overlap_tree_mx);
	cv_destroy(&un->un_overlap_tree_cv);
	mutex_destroy(&un->un_owner_mx);
	mutex_destroy(&un->un_rs_thread_mx);
	cv_destroy(&un->un_rs_thread_cv);
	mutex_destroy(&un->un_rs_progress_mx);
	cv_destroy(&un->un_rs_progress_cv);
	mutex_destroy(&un->un_dmr_mx);
	cv_destroy(&un->un_dmr_cv);

	for (i = 0; i < MD_MNMAXSIDES; i++) {
		rw_destroy(&un->un_pernode_dirty_mx[i]);
		if (un->un_pernode_dirty_bm[i])
			kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt);
	}

	/*
	 * Remove self from the namespace
	 */
	if (un->c.un_revision & MD_FN_META_DEV) {
		(void) md_rem_selfname(un->c.un_self_id);
	}

	/* This frees the unit structure. */
	mddb_deleterec_wrapper(un->c.un_record_id);

	if (recid != 0)
		mddb_deleterec_wrapper(recid);

	/* Remove the vtoc, if present */
	if (vtoc_id)
		mddb_deleterec_wrapper(vtoc_id);

	md_rem_names(sv, nsv);

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
	    MD_MIN2SET(selfid), selfid);
}

int
mirror_internal_open(
	minor_t		mnum,
	int		flag,
	int		otyp,
	int		md_oflags,
	IOLOCK		*lockp		/* can be NULL */
)
{
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	int		err = 0;

tryagain:
	/* single thread */
	if (lockp) {
		/*
		 * If ioctl lock is held, use openclose_enter
		 * routine that will set the ioctl flag when
		 * grabbing the readerlock.
		 */
		(void) md_ioctl_openclose_enter(lockp, ui);
	} else {
		(void) md_unit_openclose_enter(ui);
	}

	/*
	 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE
	 * message in a MN diskset and this requires that the openclose
	 * lock is dropped in order to send this message.  So, another
	 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from
	 * attempting an open while this thread has an open in progress.
	 * Call the *_lh version of the lock exit routines since the ui_mx
	 * mutex must be held from checking for OPENINPROGRESS until
	 * after the cv_wait call.
	 */
	mutex_enter(&ui->ui_mx);
	if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
		if (lockp) {
			(void) md_ioctl_openclose_exit_lh(lockp);
		} else {
			md_unit_openclose_exit_lh(ui);
		}
		cv_wait(&ui->ui_cv, &ui->ui_mx);
		mutex_exit(&ui->ui_mx);
		goto tryagain;
	}

	ui->ui_lock |= MD_UL_OPENINPROGRESS;
	mutex_exit(&ui->ui_mx);

	/* open devices, if necessary */
	if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) {
		if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0)
			goto out;
	}

	/* count open */
	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
		goto out;

	/* unlock, return success */
out:
	mutex_enter(&ui->ui_mx);
	ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
	mutex_exit(&ui->ui_mx);

	if (lockp) {
		/*
		 * If ioctl lock is held, use openclose_exit
		 * routine that will clear the lockp reader flag.
		 */
		(void) md_ioctl_openclose_exit(lockp);
	} else {
		md_unit_openclose_exit(ui);
	}
	return (err);
}

int
mirror_internal_close(
	minor_t		mnum,
	int		otyp,
	int		md_cflags,
	IOLOCK		*lockp		/* can be NULL */
)
{
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mm_unit_t	*un;
	int		err = 0;

	/* single thread */
	if (lockp) {
		/*
		 * If ioctl lock is held, use openclose_enter
		 * routine that will set the ioctl flag when
		 * grabbing the readerlock.
		 */
		un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui);
	} else {
		un = (mm_unit_t *)md_unit_openclose_enter(ui);
	}

	/* count closed */
	if ((err = md_unit_decopen(mnum, otyp)) != 0)
		goto out;

	/* close devices, if necessary */
	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
		/*
		 * Clean up dirty bitmap for this unit. Do this
		 * before closing the underlying devices to avoid
		 * race conditions with reset_mirror() as a
		 * result of a 'metaset -r' command running in
		 * parallel. This might cause deallocation of
		 * dirty region bitmaps; with underlying metadevices
		 * in place this can't happen.
		 * Don't do this if a MN set and ABR not set
		 */
		if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) {
			if (!MD_MNSET_SETNO(MD_UN2SET(un)) ||
			    !(ui->ui_tstate & MD_ABR_CAP))
				mirror_process_unit_resync(un);
		}
		(void) mirror_close_all_devs(un, md_cflags);

		/*
		 * For a MN set with transient capabilities (eg ABR/DMR) set,
		 * clear these capabilities on the last open in the cluster.
		 * To do this we send a message to all nodes to see of the
		 * device is open.
		 */
		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
		    (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) {
			if (lockp) {
				(void) md_ioctl_openclose_exit(lockp);
			} else {
				md_unit_openclose_exit(ui);
			}

			/*
			 * if we are in the context of an ioctl, drop the
			 * ioctl lock.
			 * Otherwise, no other locks should be held.
			 */
			if (lockp) {
				IOLOCK_RETURN_RELEASE(0, lockp);
			}

			mdmn_clear_all_capabilities(mnum);

			/* if dropped the lock previously, regain it */
			if (lockp) {
				IOLOCK_RETURN_REACQUIRE(lockp);
			}
			return (0);
		}
		/* unlock and return success */
	}
out:
	/* Call whether lockp is NULL or not. */
	if (lockp) {
		md_ioctl_openclose_exit(lockp);
	} else {
		md_unit_openclose_exit(ui);
	}
	return (err);
}

/*
 * When a component has completed resyncing and is now ok, check if the
 * corresponding component in the other submirrors is in the Last Erred
 * state.  If it is, we want to change that to the Erred state so we stop
 * using that component and start using this good component instead.
 *
 * This is called from set_sm_comp_state and recursively calls
 * set_sm_comp_state if it needs to change the Last Erred state.
 */
static void
reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
	IOLOCK *lockp)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	int			ci;
	int			i;
	int			compcnt;
	int			changed = 0;

	for (i = 0; i < NMIRROR; i++) {
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];

		if (!SMS_IS(sm, SMS_INUSE))
			continue;

		/* ignore the submirror that we just made ok */
		if (i == smi)
			continue;

		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
		for (ci = 0; ci < compcnt; ci++) {
			md_m_shared_t	*shared;

			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
			    (sm->sm_dev, sm, ci);

			if ((shared->ms_state & CS_LAST_ERRED) &&
			    !mirror_other_sources(un, i, ci, 1)) {

				set_sm_comp_state(un, i, ci, CS_ERRED, extras,
				    flags, lockp);
				changed = 1;
			}
		}
	}

	/* maybe there is a hotspare for this newly erred component */
	if (changed) {
		set_t	setno;

		setno = MD_UN2SET(un);
		if (MD_MNSET_SETNO(setno)) {
			send_poke_hotspares(setno);
		} else {
			(void) poke_hotspares();
		}
	}
}

/*
 * set_sm_comp_state
 *
 * Set the state of a submirror component to the specified new state.
 * If the mirror is in a multi-node set, send messages to all nodes to
 * block all writes to the mirror and then update the state and release the
 * writes. These messages are only sent if MD_STATE_XMIT is set in flags.
 * MD_STATE_XMIT will be unset in 2 cases:
 * 1. When the state is changed to CS_RESYNC as this state change
 * will already have been updated on each node by the processing of the
 * distributed metasync command, hence no need to xmit.
 * 2. When the state is change to CS_OKAY after a resync has completed. Again
 * the resync completion will already have been processed on each node by
 * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
 * resync, hence no need to xmit.
 *
 * In case we are called from the updates of a watermark,
 * (then MD_STATE_WMUPDATE will be set in the ps->flags) this is due to
 * a metainit or similar. In this case the message that we sent to propagate
 * the state change must not be a class1 message as that would deadlock with
 * the metainit command that is still being processed.
 * This we achieve by creating a class2 message MD_MN_MSG_STATE_UPDATE2
 * instead. This also makes the submessage generator to create a class2
 * submessage rather than a class1 (which would also block)
 *
 * On entry, unit_writerlock is held
 * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is
 * also held.
 */
void
set_sm_comp_state(
	mm_unit_t	*un,
	int		smi,
	int		ci,
	int		newstate,
	mddb_recid_t	*extras,
	uint_t		flags,
	IOLOCK		*lockp
)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	int			origstate;
	void			(*get_dev)();
	ms_cd_info_t		cd;
	char			devname[MD_MAX_CTDLEN];
	int			err;
	set_t			setno = MD_UN2SET(un);
	md_mn_msg_stch_t	stchmsg;
	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
	md_mn_kresult_t		*kresult;
	int			rval;
	uint_t			msgflags;
	md_mn_msgtype_t		msgtype;
	int			save_lock = 0;
	mdi_unit_t		*ui_sm;

	sm = &un->un_sm[smi];
	smic = &un->un_smic[smi];

	/* If we have a real error status then turn off MD_INACCESSIBLE. */
	ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
	if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) &&
	    ui_sm->ui_tstate & MD_INACCESSIBLE) {
		ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
	}

	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
	    (sm->sm_dev, sm, ci);
	origstate = shared->ms_state;

	/*
	 * If the new state is an error and the old one wasn't, generate
	 * a console message. We do this before we send the state to other
	 * nodes in a MN set because the state change may change the component
	 * name  if a hotspare is allocated.
	 */
	if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) &&
	    (newstate & (CS_ERRED|CS_LAST_ERRED))) {

		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
		    "get device", 0);
		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);

		err = md_getdevname(setno, mddb_getsidenum(setno), 0,
		    cd.cd_dev, devname, sizeof (devname));

		if (err == ENOENT) {
			(void) md_devname(setno, cd.cd_dev, devname,
			    sizeof (devname));
		}

		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
		    md_shortname(md_getminor(sm->sm_dev)), devname);

		if (newstate & CS_LAST_ERRED) {
			cmn_err(CE_WARN, "md: %s: %s last erred",
			    md_shortname(md_getminor(sm->sm_dev)),
			    devname);

		} else if (shared->ms_flags & MDM_S_ISOPEN) {
			/*
			 * Close the broken device and clear the open flag on
			 * it.  Closing the device means the RCM framework will
			 * be able to unconfigure the device if required.
			 *
			 * We have to check that the device is open, otherwise
			 * the first open on it has resulted in the error that
			 * is being processed and the actual cd.cd_dev will be
			 * NODEV64.
			 *
			 * If this is a multi-node mirror, then the multinode
			 * state checks following this code will cause the
			 * slave nodes to close the mirror in the function
			 * mirror_set_state().
			 */
			md_layered_close(cd.cd_dev, MD_OFLG_NULL);
			shared->ms_flags &= ~MDM_S_ISOPEN;
		}

	} else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
	    (shared->ms_flags & MDM_S_ISOPEN)) {
		/*
		 * Similar to logic above except no log messages since we
		 * are just transitioning from Last Erred to Erred.
		 */
		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
		    "get device", 0);
		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);

		md_layered_close(cd.cd_dev, MD_OFLG_NULL);
		shared->ms_flags &= ~MDM_S_ISOPEN;
	}

	if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
	    (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
		/*
		 * For a multi-node mirror, send the state change to the
		 * master, which broadcasts to all nodes, including this
		 * one. Once the message is received, the state is set
		 * in-core and the master commits the change to disk.
		 * There is a case, comp_replace,  where this function
		 * can be called from within an ioctl and therefore in this
		 * case, as the ioctl will already be called on each node,
		 * there is no need to xmit the state change to the master for
		 * distribution to the other nodes. MD_STATE_XMIT flag is used
		 * to indicate whether a xmit is required. The mirror's
		 * transient state is set to MD_ERR_PENDING to avoid sending
		 * multiple messages.
		 */
		if (newstate & (CS_ERRED|CS_LAST_ERRED))
			ui->ui_tstate |= MD_ERR_PENDING;

		/*
		 * Send a state update message to all nodes. This message
		 * will generate 2 submessages, the first one to suspend
		 * all writes to the mirror and the second to update the
		 * state and resume writes.
		 */
		stchmsg.msg_stch_mnum = un->c.un_self_id;
		stchmsg.msg_stch_sm = smi;
		stchmsg.msg_stch_comp = ci;
		stchmsg.msg_stch_new_state = newstate;
		stchmsg.msg_stch_hs_id = shared->ms_hs_id;
#ifdef DEBUG
		if (mirror_debug_flag)
			printf("send set state, %x, %x, %x, %x, %x\n",
			    stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
			    stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
			    stchmsg.msg_stch_hs_id);
#endif
		if (flags & MD_STATE_WMUPDATE) {
			msgtype  = MD_MN_MSG_STATE_UPDATE2;
			/*
			 * When coming from an update of watermarks, there
			 * must already be a message logged that triggered
			 * this action. So, no need to log this message, too.
			 */
			msgflags = MD_MSGF_NO_LOG;
		} else {
			msgtype  = MD_MN_MSG_STATE_UPDATE;
			msgflags = MD_MSGF_DEFAULT_FLAGS;
		}

		/*
		 * If we are in the context of an ioctl, drop the ioctl lock.
		 * lockp holds the list of locks held.
		 *
		 * Otherwise, increment the appropriate reacquire counters.
		 * If openclose lock is *held, then must reacquire reader
		 * lock before releasing the openclose lock.
		 * Do not drop the ARRAY_WRITER lock as we may not be able
		 * to reacquire it.
		 */
		if (lockp) {
			if (lockp->l_flags & MD_ARRAY_WRITER) {
				save_lock = MD_ARRAY_WRITER;
				lockp->l_flags &= ~MD_ARRAY_WRITER;
			} else if (lockp->l_flags & MD_ARRAY_READER) {
				save_lock = MD_ARRAY_READER;
				lockp->l_flags &= ~MD_ARRAY_READER;
			}
			IOLOCK_RETURN_RELEASE(0, lockp);
		} else {
			if (flags & MD_STATE_OCHELD) {
				md_unit_writerexit(ui);
				(void) md_unit_readerlock(ui);
				md_unit_openclose_exit(ui);
			} else {
				md_unit_writerexit(ui);
			}
		}

		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
		    (char *)&stchmsg, sizeof (stchmsg), kresult);

		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
			mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
			/* If we're shutting down already, pause things here. */
			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
				while (!md_mn_is_commd_present()) {
					delay(md_hz);
				}
			}
			cmn_err(CE_PANIC,
			    "ksend_message failure: STATE_UPDATE");
		}
		kmem_free(kresult, sizeof (md_mn_kresult_t));

		/* if dropped the lock previously, regain it */
		if (lockp) {
			IOLOCK_RETURN_REACQUIRE(lockp);
			lockp->l_flags |= save_lock;
		} else {
			/*
			 * Reacquire dropped locks and update acquirecnts
			 * appropriately.
			 */
			if (flags & MD_STATE_OCHELD) {
				/*
				 * openclose also grabs readerlock.
				 */
				(void) md_unit_openclose_enter(ui);
				md_unit_readerexit(ui);
				(void) md_unit_writerlock(ui);
			} else {
				(void) md_unit_writerlock(ui);
			}
		}

		ui->ui_tstate &= ~MD_ERR_PENDING;
	} else {
		shared->ms_state = newstate;
		uniqtime32(&shared->ms_timestamp);

		if (newstate == CS_ERRED)
			shared->ms_flags |= MDM_S_NOWRITE;
		else
			shared->ms_flags &= ~MDM_S_NOWRITE;

		shared->ms_flags &= ~MDM_S_IOERR;
		un->un_changecnt++;
		shared->ms_lasterrcnt = un->un_changecnt;

		mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
		mirror_commit(un, SMI2BIT(smi), extras);
	}

	if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
		/*
		 * Resetting the Last Erred state will recursively call back
		 * into this function (set_sm_comp_state) to update the state.
		 */
		reset_lasterred(un, smi, extras, flags, lockp);
	}
}

static int
find_another_logical(
	mm_unit_t		*un,
	mm_submirror_t		*esm,
	diskaddr_t		blk,
	u_longlong_t		cnt,
	int			must_be_open,
	int			state,
	int			err_cnt)
{
	u_longlong_t	cando;
	md_dev64_t	dev;
	md_m_shared_t	*s;

	esm->sm_state |= SMS_IGNORE;
	while (cnt != 0) {
		u_longlong_t	 mcnt;

		mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024));	/* 1 Gig Blks */

		dev = select_read_unit(un, blk, mcnt, &cando,
		    must_be_open, &s, NULL);
		if (dev == (md_dev64_t)0)
			break;

		if ((state == CS_LAST_ERRED) &&
		    (s->ms_state == CS_LAST_ERRED) &&
		    (err_cnt > s->ms_lasterrcnt))
			break;

		cnt -= cando;
		blk += cando;
	}
	esm->sm_state &= ~SMS_IGNORE;
	return (cnt != 0);
}

int
mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	size_t			count;
	diskaddr_t		block;
	u_longlong_t		skip;
	u_longlong_t		size;
	md_dev64_t		dev;
	int			cnt;
	md_m_shared_t		*s;
	int			not_found;

	sm = &un->un_sm[smi];
	smic = &un->un_smic[smi];
	dev = sm->sm_dev;

	/*
	 * Make sure every component of the submirror
	 * has other sources.
	 */
	if (ci < 0) {
		/* Find the highest lasterrcnt */
		cnt = (*(smic->sm_get_component_count))(dev, sm);
		for (ci = 0; ci < cnt; ci++) {
			not_found = mirror_other_sources(un, smi, ci,
			    must_be_open);
			if (not_found)
				return (1);
		}
		return (0);
	}

	/*
	 * Make sure this component has other sources
	 */
	(void) (*(smic->sm_get_bcss))
	    (dev, sm, ci, &block, &count, &skip, &size);

	if (count == 0)
		return (1);

	s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci);

	while (count--) {
		if (block >= un->c.un_total_blocks)
			return (0);

		if ((block + size) > un->c.un_total_blocks)
			size = un->c.un_total_blocks - block;

		not_found = find_another_logical(un, sm, block, size,
		    must_be_open, s->ms_state, s->ms_lasterrcnt);
		if (not_found)
			return (1);

		block += size + skip;
	}
	return (0);
}

static void
finish_error(md_mps_t *ps)
{
	struct buf	*pb;
	mm_unit_t	*un;
	mdi_unit_t	*ui;
	uint_t		new_str_flags;

	pb = ps->ps_bp;
	un = ps->ps_un;
	ui = ps->ps_ui;

	/*
	 * Must flag any error to the resync originator if we're performing
	 * a Write-after-Read. This corresponds to an i/o error on a resync
	 * target device and in this case we ought to abort the resync as there
	 * is nothing that can be done to recover from this without operator
	 * intervention. If we don't set the B_ERROR flag we will continue
	 * reading from the mirror but won't write to the target (as it will
	 * have been placed into an errored state).
	 * To handle the case of multiple components within a submirror we only
	 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR.
	 * The originator of the resync read will cause this bit to be set if
	 * the underlying component count is one for a submirror resync. All
	 * other resync types will have the flag set as there is no underlying
	 * resync which can be performed on a contained metadevice for these
	 * resync types (optimized or component).
	 */

	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) {
		if (ps->ps_flags & MD_MPS_FLAG_ERROR)
			pb->b_flags |= B_ERROR;
		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
		MPS_FREE(mirror_parent_cache, ps);
		md_unit_readerexit(ui);
		md_biodone(pb);
		return;
	}
	/*
	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
	 * operation therefore this I/O request has already been counted,
	 * the I/O count variable will be decremented by mirror_done()'s
	 * call to md_biodone().
	 */
	if (ps->ps_changecnt != un->un_changecnt) {
		new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
		if (ps->ps_flags & MD_MPS_WOW)
			new_str_flags |= MD_STR_WOW;
		if (ps->ps_flags & MD_MPS_MAPPED)
			new_str_flags |= MD_STR_MAPPED;
		/*
		 * If this I/O request was a read that was part of a resync,
		 * set MD_STR_WAR for the retried read to ensure that the
		 * resync write (i.e. write-after-read) will be performed
		 */
		if (ps->ps_flags & MD_MPS_RESYNC_READ)
			new_str_flags |= MD_STR_WAR;
		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
		MPS_FREE(mirror_parent_cache, ps);
		md_unit_readerexit(ui);
		(void) md_mirror_strategy(pb, new_str_flags, NULL);
		return;
	}

	pb->b_flags |= B_ERROR;
	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
	MPS_FREE(mirror_parent_cache, ps);
	md_unit_readerexit(ui);
	md_biodone(pb);
}

static void
error_update_unit(md_mps_t *ps)
{
	mm_unit_t		*un;
	mdi_unit_t		*ui;
	int			smi;	/* sub mirror index */
	int			ci;	/* errored component */
	set_t			setno;
	uint_t			flags;	/* for set_sm_comp_state() */
	uint_t			hspflags; /* for check_comp_4_hotspares() */

	ui = ps->ps_ui;
	un = (mm_unit_t *)md_unit_writerlock(ui);
	setno = MD_UN2SET(un);

	/* All of these updates have to propagated in case of MN set */
	flags = MD_STATE_XMIT;
	hspflags = MD_HOTSPARE_XMIT;

	/* special treatment if we are called during updating watermarks */
	if (ps->ps_flags & MD_MPS_WMUPDATE) {
		flags |= MD_STATE_WMUPDATE;
		hspflags |= MD_HOTSPARE_WMUPDATE;
	}
	smi = 0;
	ci = 0;
	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
		if (mirror_other_sources(un, smi, ci, 0) == 1) {

			/* Never called from ioctl context, so (IOLOCK *)NULL */
			set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
			    (IOLOCK *)NULL);
			/*
			 * For a MN set, the NOTIFY is done when the state
			 * change is processed on each node
			 */
			if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
				    SVM_TAG_METADEVICE, setno, MD_SID(un));
			}
			continue;
		}
		/* Never called from ioctl context, so (IOLOCK *)NULL */
		set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
		    (IOLOCK *)NULL);
		/*
		 * For a MN set, the NOTIFY is done when the state
		 * change is processed on each node
		 */
		if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
			    SVM_TAG_METADEVICE, setno, MD_SID(un));
		}
		smi = 0;
		ci = 0;
	}

	md_unit_writerexit(ui);
	if (MD_MNSET_SETNO(setno)) {
		send_poke_hotspares(setno);
	} else {
		(void) poke_hotspares();
	}
	(void) md_unit_readerlock(ui);

	finish_error(ps);
}

/*
 * When we have a B_FAILFAST IO error on a Last Erred component we need to
 * retry the IO without B_FAILFAST set so that we try to ensure that the
 * component "sees" each IO.
 */
static void
last_err_retry(md_mcs_t *cs)
{
	struct buf	*cb;
	md_mps_t	*ps;
	uint_t		flags;

	cb = &cs->cs_buf;
	cb->b_flags &= ~B_FAILFAST;

	/* if we're panicing just let this I/O error out */
	if (panicstr) {
		(void) mirror_done(cb);
		return;
	}

	/* reissue the I/O */

	ps = cs->cs_ps;

	bioerror(cb, 0);

	mutex_enter(&ps->ps_mx);

	flags = MD_STR_NOTTOP;
	if (ps->ps_flags & MD_MPS_MAPPED)
		flags |= MD_STR_MAPPED;
	if (ps->ps_flags & MD_MPS_NOBLOCK)
		flags |= MD_NOBLOCK;

	mutex_exit(&ps->ps_mx);

	clear_retry_error(cb);

	cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
	    md_shortname(getminor(cb->b_edev)));

	md_call_strategy(cb, flags, NULL);
}

static void
mirror_error(md_mps_t *ps)
{
	int		smi;	/* sub mirror index */
	int		ci;	/* errored component */

	if (panicstr) {
		finish_error(ps);
		return;
	}

	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
		mirror_overlap_tree_remove(ps);

	smi = 0;
	ci = 0;
	if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
		md_unit_readerexit(ps->ps_ui);
		daemon_request(&md_mstr_daemon, error_update_unit,
		    (daemon_queue_t *)ps, REQ_OLD);
		return;
	}

	finish_error(ps);
}

static int
copy_write_done(struct buf *cb)
{
	md_mps_t	*ps;
	buf_t		*pb;
	char		*wowbuf;
	wowhdr_t	*wowhdr;
	ssize_t		wow_resid;

	/* get wowbuf ans save structure */
	wowbuf = cb->b_un.b_addr;
	wowhdr = WOWBUF_HDR(wowbuf);
	ps = wowhdr->wow_ps;
	pb = ps->ps_bp;

	/* Save error information, then free cb */
	if (cb->b_flags & B_ERROR)
		pb->b_flags |= B_ERROR;

	if (cb->b_flags & B_REMAPPED)
		bp_mapout(cb);

	freerbuf(cb);

	/* update residual and continue if needed */
	if ((pb->b_flags & B_ERROR) == 0) {
		wow_resid = pb->b_bcount - wowhdr->wow_offset;
		pb->b_resid = wow_resid;
		if (wow_resid > 0)  {
			daemon_request(&md_mstr_daemon, copy_write_cont,
			    (daemon_queue_t *)wowhdr, REQ_OLD);
			return (1);
		}
	}

	/* Write is complete, release resources. */
	kmem_cache_free(mirror_wowblk_cache, wowhdr);
	ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
	MPS_FREE(mirror_parent_cache, ps);
	md_biodone(pb);
	return (0);
}

static void
copy_write_cont(wowhdr_t *wowhdr)
{
	buf_t		*pb;
	buf_t		*cb;
	char		*wowbuf;
	int		wow_offset;
	size_t		wow_resid;
	diskaddr_t	wow_blkno;

	wowbuf = WOWHDR_BUF(wowhdr);
	pb = wowhdr->wow_ps->ps_bp;

	/* get data on current location */
	wow_offset = wowhdr->wow_offset;
	wow_resid = pb->b_bcount - wow_offset;
	wow_blkno = pb->b_lblkno + lbtodb(wow_offset);

	/* setup child buffer */
	cb = getrbuf(KM_SLEEP);
	cb->b_flags = B_WRITE;
	cb->b_edev = pb->b_edev;
	cb->b_un.b_addr = wowbuf;	/* change to point at WOWBUF */
	cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
	cb->b_iodone = copy_write_done;
	cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
	cb->b_lblkno = wow_blkno;

	/* move offset to next section */
	wowhdr->wow_offset += cb->b_bcount;

	/* copy and setup write for current section */
	bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);

	/* do it */
	/*
	 * Do not set the MD_IO_COUNTED flag as this is a new I/O request
	 * that handles the WOW condition. The resultant increment on the
	 * I/O count variable is cleared by copy_write_done()'s call to
	 * md_biodone().
	 */
	(void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
	    | MD_STR_MAPPED, NULL);
}

static void
md_mirror_copy_write(md_mps_t *ps)
{
	wowhdr_t	*wowhdr;

	wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS);
	mirror_wowblk_init(wowhdr);
	wowhdr->wow_ps = ps;
	wowhdr->wow_offset = 0;
	copy_write_cont(wowhdr);
}

static void
handle_wow(md_mps_t *ps)
{
	buf_t		*pb;

	pb = ps->ps_bp;

	bp_mapin(pb);

	md_mirror_wow_cnt++;
	if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
		cmn_err(CE_NOTE,
		    "md: %s, blk %lld, cnt %ld: Write on write %d occurred",
		    md_shortname(getminor(pb->b_edev)),
		    (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
	}

	/*
	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
	 * operation therefore this I/O request has already been counted,
	 * the I/O count variable will be decremented by mirror_done()'s
	 * call to md_biodone().
	 */
	if (md_mirror_wow_flg & WOW_NOCOPY)
		(void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
		    MD_STR_MAPPED | MD_IO_COUNTED, ps);
	else
		md_mirror_copy_write(ps);
}

/*
 * Return true if the specified submirror is either in the Last Erred
 * state or is transitioning into the Last Erred state.
 */
static bool_t
submirror_is_lasterred(mm_unit_t *un, int smi)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	int			ci;
	int			compcnt;

	sm = &un->un_sm[smi];
	smic = &un->un_smic[smi];

	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
	for (ci = 0; ci < compcnt; ci++) {
		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
		    (sm->sm_dev, sm, ci);

		if (shared->ms_state == CS_LAST_ERRED)
			return (B_TRUE);

		/*
		 * It is not currently Last Erred, check if entering Last Erred.
		 */
		if ((shared->ms_flags & MDM_S_IOERR) &&
		    ((shared->ms_state == CS_OKAY) ||
		    (shared->ms_state == CS_RESYNC))) {
			if (mirror_other_sources(un, smi, ci, 0) == 1)
				return (B_TRUE);
		}
	}

	return (B_FALSE);
}


static int
mirror_done(struct buf *cb)
{
	md_mps_t	*ps;
	md_mcs_t	*cs;

	/*LINTED*/
	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
	ps = cs->cs_ps;

	mutex_enter(&ps->ps_mx);

	/* check if we need to retry an errored failfast I/O */
	if (cb->b_flags & B_ERROR) {
		struct buf *pb = ps->ps_bp;

		if (cb->b_flags & B_FAILFAST) {
			int		i;
			mm_unit_t	*un = ps->ps_un;

			for (i = 0; i < NMIRROR; i++) {
				if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
					continue;

				if (cb->b_edev ==
				    md_dev64_to_dev(un->un_sm[i].sm_dev)) {

					/*
					 * This is the submirror that had the
					 * error.  Check if it is Last Erred.
					 */
					if (submirror_is_lasterred(un, i)) {
						daemon_queue_t *dqp;

						mutex_exit(&ps->ps_mx);
						dqp = (daemon_queue_t *)cs;
						dqp->dq_prev = NULL;
						dqp->dq_next = NULL;
						daemon_request(&md_done_daemon,
						    last_err_retry, dqp,
						    REQ_OLD);
						return (1);
					}
					break;
				}
			}
		}

		/* continue to process the buf without doing a retry */
		ps->ps_flags |= MD_MPS_ERROR;
		pb->b_error = cb->b_error;
	}

	return (mirror_done_common(cb));
}

/*
 * Split from the original mirror_done function so we can handle bufs after a
 * retry.
 * ps->ps_mx is already held in the caller of this function and the cb error
 * has already been checked and handled in the caller.
 */
static int
mirror_done_common(struct buf *cb)
{
	struct buf	*pb;
	mm_unit_t	*un;
	mdi_unit_t	*ui;
	md_mps_t	*ps;
	md_mcs_t	*cs;
	size_t		end_rr, start_rr, current_rr;

	/*LINTED*/
	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
	ps = cs->cs_ps;
	pb = ps->ps_bp;

	if (cb->b_flags & B_REMAPPED)
		bp_mapout(cb);

	ps->ps_frags--;
	if (ps->ps_frags != 0) {
		mutex_exit(&ps->ps_mx);
		kmem_cache_free(mirror_child_cache, cs);
		return (1);
	}
	un = ps->ps_un;
	ui = ps->ps_ui;

	/*
	 * Do not update outstanding_writes if we're running with ABR
	 * set for this mirror or the write() was issued with MD_STR_ABR set.
	 * Also a resync initiated write() has no outstanding_writes update
	 * either.
	 */
	if (((cb->b_flags & B_READ) == 0) &&
	    (un->un_nsm >= 2) &&
	    (ps->ps_call == NULL) &&
	    !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
	    !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
		BLK_TO_RR(end_rr, ps->ps_lastblk, un);
		BLK_TO_RR(start_rr, ps->ps_firstblk, un);
		mutex_enter(&un->un_resync_mx);
		for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
			un->un_outstanding_writes[current_rr]--;
		mutex_exit(&un->un_resync_mx);
	}
	kmem_cache_free(mirror_child_cache, cs);
	mutex_exit(&ps->ps_mx);

	if (ps->ps_call != NULL) {
		daemon_request(&md_done_daemon, ps->ps_call,
		    (daemon_queue_t *)ps, REQ_OLD);
		return (1);
	}

	if ((ps->ps_flags & MD_MPS_ERROR)) {
		daemon_request(&md_done_daemon, mirror_error,
		    (daemon_queue_t *)ps, REQ_OLD);
		return (1);
	}

	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
		mirror_overlap_tree_remove(ps);

	/*
	 * Handle Write-on-Write problem.
	 * Skip In case of Raw and Direct I/O as they are
	 * handled earlier.
	 *
	 */
	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
	    !(pb->b_flags & B_READ) &&
	    !(ps->ps_flags & MD_MPS_WOW) &&
	    !(pb->b_flags & B_PHYS) &&
	    any_pages_dirty(pb)) {
		md_unit_readerexit(ps->ps_ui);
		daemon_request(&md_mstr_daemon, handle_wow,
		    (daemon_queue_t *)ps, REQ_OLD);
		return (1);
	}

	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
	MPS_FREE(mirror_parent_cache, ps);
	md_unit_readerexit(ui);
	md_biodone(pb);
	return (0);
}

/*
 * Clear error state in submirror component if the retry worked after
 * a failfast error.
 */
static void
clear_retry_error(struct buf *cb)
{
	int			smi;
	md_mcs_t		*cs;
	mm_unit_t		*un;
	mdi_unit_t		*ui_sm;
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	u_longlong_t		cnt;
	md_m_shared_t		*shared;

	/*LINTED*/
	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
	un = cs->cs_ps->ps_un;

	for (smi = 0; smi < NMIRROR; smi++) {
		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
			continue;

		if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev))
			break;
	}

	if (smi >= NMIRROR)
		return;

	sm = &un->un_sm[smi];
	smic = &un->un_smic[smi];
	cnt = cb->b_bcount;

	ui_sm = MDI_UNIT(getminor(cb->b_edev));
	(void) md_unit_writerlock(ui_sm);

	shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm,
	    cb->b_blkno, &cnt);

	if (shared->ms_flags & MDM_S_IOERR) {
		shared->ms_flags &= ~MDM_S_IOERR;

	} else {
		/* the buf spans components and the first one is not erred */
		int	cnt;
		int	i;

		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
		for (i = 0; i < cnt; i++) {
			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
			    (sm->sm_dev, sm, i);

			if (shared->ms_flags & MDM_S_IOERR &&
			    shared->ms_state == CS_OKAY) {

				shared->ms_flags &= ~MDM_S_IOERR;
				break;
			}
		}
	}

	md_unit_writerexit(ui_sm);
}

static size_t
mirror_map_read(
	md_mps_t *ps,
	md_mcs_t *cs,
	diskaddr_t blkno,
	u_longlong_t	count
)
{
	mm_unit_t	*un;
	buf_t		*bp;
	u_longlong_t	cando;

	bp = &cs->cs_buf;
	un = ps->ps_un;

	bp->b_lblkno = blkno;
	if (fast_select_read_unit(ps, cs) == 0) {
		bp->b_bcount = ldbtob(count);
		return (0);
	}
	bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno,
	    count, &cando, 0, NULL, cs));
	bp->b_bcount = ldbtob(cando);
	if (count != cando)
		return (cando);
	return (0);
}

static void
write_after_read(md_mps_t *ps)
{
	struct buf	*pb;
	int		flags;

	if (ps->ps_flags & MD_MPS_ERROR) {
		mirror_error(ps);
		return;
	}

	pb = ps->ps_bp;
	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
	ps->ps_call = NULL;
	ps->ps_flags |= MD_MPS_WRITE_AFTER_READ;
	flags = MD_STR_NOTTOP | MD_STR_WAR;
	if (ps->ps_flags & MD_MPS_MAPPED)
		flags |= MD_STR_MAPPED;
	if (ps->ps_flags & MD_MPS_NOBLOCK)
		flags |= MD_NOBLOCK;
	if (ps->ps_flags & MD_MPS_DIRTY_RD)
		flags |= MD_STR_DIRTY_RD;
	(void) mirror_write_strategy(pb, flags, ps);
}

static void
continue_serial(md_mps_t *ps)
{
	md_mcs_t	*cs;
	buf_t		*cb;
	mm_unit_t	*un;
	int		flags;

	un = ps->ps_un;
	cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
	mirror_child_init(cs);
	cb = &cs->cs_buf;
	ps->ps_call = NULL;
	ps->ps_frags = 1;
	(void) mirror_map_write(un, cs, ps, 0);
	flags = MD_STR_NOTTOP;
	if (ps->ps_flags & MD_MPS_MAPPED)
		flags |= MD_STR_MAPPED;
	md_call_strategy(cb, flags, NULL);
}

static int
mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
{
	int i;
	dev_t		dev;	/* needed for bioclone, so not md_dev64_t */
	buf_t		*cb;
	buf_t		*pb;
	diskaddr_t	blkno;
	size_t		bcount;
	off_t		offset;

	pb = ps->ps_bp;
	cb = &cs->cs_buf;
	cs->cs_ps = ps;

	i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);

	dev = md_dev64_to_dev(un->un_sm[i].sm_dev);

	blkno = pb->b_lblkno;
	bcount = pb->b_bcount;
	offset = 0;
	if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
		blkno = DK_LABEL_LOC + 1;
		/*
		 * This handles the case where we're requesting
		 * a write to block 0 on a label partition
		 * and the request size was smaller than the
		 * size of the label.  If this is the case
		 * then we'll return -1.  Failure to do so will
		 * either cause the calling thread to hang due to
		 * an ssd bug, or worse if the bcount were allowed
		 * to go negative (ie large).
		 */
		if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
			return (-1);
		bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
		offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
	}

	cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
	    cb, KM_NOSLEEP);
	if (war)
		cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;

	/*
	 * If the submirror is in the erred stated, check if any component is
	 * in the Last Erred state.  If so, we don't want to use the B_FAILFAST
	 * flag on the IO.
	 *
	 * Provide a fast path for the non-erred case (which should be the
	 * normal case).
	 */
	if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
		if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
			mm_submirror_t		*sm;
			mm_submirror_ic_t	*smic;
			int			ci;
			int			compcnt;

			sm = &un->un_sm[i];
			smic = &un->un_smic[i];

			compcnt = (*(smic->sm_get_component_count))
			    (sm->sm_dev, un);
			for (ci = 0; ci < compcnt; ci++) {
				md_m_shared_t	*shared;

				shared = (md_m_shared_t *)
				    (*(smic->sm_shared_by_indx))(sm->sm_dev,
				    sm, ci);

				if (shared->ms_state == CS_LAST_ERRED)
					break;
			}
			if (ci >= compcnt)
				cb->b_flags |= B_FAILFAST;

		} else {
			cb->b_flags |= B_FAILFAST;
		}
	}

	ps->ps_current_sm++;
	if (ps->ps_current_sm != ps->ps_active_cnt) {
		if (un->un_write_option == WR_SERIAL) {
			ps->ps_call = continue_serial;
			return (0);
		}
		return (1);
	}
	return (0);
}

/*
 * directed_read_done:
 * ------------------
 * Completion routine called when a DMR request has been returned from the
 * underlying driver. Wake-up the original ioctl() and return the data to
 * the user.
 */
static void
directed_read_done(md_mps_t *ps)
{
	mm_unit_t	*un;
	mdi_unit_t	*ui;

	un = ps->ps_un;
	ui = ps->ps_ui;

	md_unit_readerexit(ui);
	md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
	ps->ps_call = NULL;

	mutex_enter(&un->un_dmr_mx);
	cv_signal(&un->un_dmr_cv);
	mutex_exit(&un->un_dmr_mx);

	/* release the parent structure */
	kmem_cache_free(mirror_parent_cache, ps);
}

/*
 * daemon_io:
 * ------------
 * Called to issue a mirror_write_strategy() or mirror_read_strategy
 * call from a blockable context. NOTE: no mutex can be held on entry to this
 * routine
 */
static void
daemon_io(daemon_queue_t *dq)
{
	md_mps_t	*ps = (md_mps_t *)dq;
	int		flag = MD_STR_NOTTOP;
	buf_t		*pb = ps->ps_bp;

	if (ps->ps_flags & MD_MPS_MAPPED)
		flag |= MD_STR_MAPPED;
	if (ps->ps_flags & MD_MPS_WOW)
		flag |= MD_STR_WOW;
	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)
		flag |= MD_STR_WAR;
	if (ps->ps_flags & MD_MPS_ABR)
		flag |= MD_STR_ABR;
	if (ps->ps_flags & MD_MPS_BLOCKABLE_IO)
		flag |= MD_STR_BLOCK_OK;

	/*
	 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set
	 * MD_STR_WAR before calling mirror_read_strategy
	 */
	if (pb->b_flags & B_READ) {
		if (!(ps->ps_flags & MD_MPS_DIRTY_RD))
			flag |= MD_STR_WAR;
		mirror_read_strategy(pb, flag, ps);
	} else
		mirror_write_strategy(pb, flag, ps);
}

/*
 * update_resync:
 * -------------
 * Called to update the in-core version of the resync record with the latest
 * version that was committed to disk when the previous mirror owner
 * relinquished ownership. This call is likely to block as we must hold-off
 * any current resync processing that may be occurring.
 * On completion of the resync record update we issue the mirror_write_strategy
 * call to complete the i/o that first started this sequence. To remove a race
 * condition between a new write() request which is submitted and the resync
 * record update we acquire the writerlock. This will hold off all i/o to the
 * mirror until the resync update has completed.
 * NOTE: no mutex can be held on entry to this routine
 */
static void
update_resync(daemon_queue_t *dq)
{
	md_mps_t	*ps = (md_mps_t *)dq;
	buf_t		*pb = ps->ps_bp;
	mdi_unit_t	*ui = ps->ps_ui;
	mm_unit_t	*un = MD_UNIT(ui->ui_link.ln_id);
	set_t		setno;
	int		restart_resync;

	mutex_enter(&un->un_rrp_inflight_mx);
	(void) md_unit_writerlock(ui);
	ps->ps_un = un;
	setno = MD_MIN2SET(getminor(pb->b_edev));
	if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
		/*
		 * Synchronize our in-core view of what regions need to be
		 * resync'd with the on-disk version.
		 */
		mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
		    un->un_dirty_bm);

		/* Region dirty map is now up to date */
	}
	restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
	md_unit_writerexit(ui);
	mutex_exit(&un->un_rrp_inflight_mx);

	/* Restart the resync thread if it was previously blocked */
	if (restart_resync) {
		mutex_enter(&un->un_rs_thread_mx);
		un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
		cv_signal(&un->un_rs_thread_cv);
		mutex_exit(&un->un_rs_thread_mx);
	}
	/* Continue with original deferred i/o */
	daemon_io(dq);
}

/*
 * owner_timeout:
 * -------------
 * Called if the original mdmn_ksend_message() failed and the request is to be
 * retried. Reattempt the original ownership change.
 *
 * NOTE: called at interrupt context (see timeout(9f)).
 */
static void
owner_timeout(void *arg)
{
	daemon_queue_t	*dq = (daemon_queue_t *)arg;

	daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD);
}

/*
 * become_owner:
 * ------------
 * Called to issue RPC request to become the owner of the mirror
 * associated with this i/o request. We assume that the ownership request
 * is synchronous, so if it succeeds we will issue the request via
 * mirror_write_strategy().
 * If multiple i/o's are outstanding we will be called from the mirror_daemon
 * service thread.
 * NOTE: no mutex should be held on entry to this routine.
 */
static void
become_owner(daemon_queue_t *dq)
{
	md_mps_t	*ps = (md_mps_t *)dq;
	mm_unit_t	*un = ps->ps_un;
	buf_t		*pb = ps->ps_bp;
	set_t		setno;
	md_mn_kresult_t	*kres;
	int		msg_flags = md_mirror_msg_flags;
	md_mps_t	*ps1;

	ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);

	/*
	 * If we're already the mirror owner we do not need to send a message
	 * but can simply process the i/o request immediately.
	 * If we've already sent the request to become owner we requeue the
	 * request as we're waiting for the synchronous ownership message to
	 * be processed.
	 */
	if (MD_MN_MIRROR_OWNER(un)) {
		/*
		 * As the strategy() call will potentially block we need to
		 * punt this to a separate thread and complete this request
		 * as quickly as possible. Note: if we're a read request
		 * this must be a resync, we cannot afford to be queued
		 * behind any intervening i/o requests. In this case we put the
		 * request on the md_mirror_rs_daemon queue.
		 */
		if (pb->b_flags & B_READ) {
			daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
			    REQ_OLD);
		} else {
			daemon_request(&md_mirror_io_daemon, daemon_io, dq,
			    REQ_OLD);
		}
	} else {
		mutex_enter(&un->un_owner_mx);
		if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
			md_mn_req_owner_t	*msg;
			int			rval = 0;

			/*
			 * Check to see that we haven't exceeded the maximum
			 * retry count. If we have we fail the i/o as the
			 * comms mechanism has become wedged beyond recovery.
			 */
			if (dq->qlen++ >= MD_OWNER_RETRIES) {
				mutex_exit(&un->un_owner_mx);
				cmn_err(CE_WARN,
				    "md_mirror: Request exhausted ownership "
				    "retry limit of %d attempts", dq->qlen);
				pb->b_error = EIO;
				pb->b_flags |= B_ERROR;
				pb->b_resid = pb->b_bcount;
				kmem_cache_free(mirror_parent_cache, ps);
				md_biodone(pb);
				return;
			}

			/*
			 * Issue request to change ownership. The call is
			 * synchronous so when it returns we can complete the
			 * i/o (if successful), or enqueue it again so that
			 * the operation will be retried.
			 */
			un->un_owner_state |= MM_MN_OWNER_SENT;
			mutex_exit(&un->un_owner_mx);

			msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
			setno = MD_MIN2SET(getminor(pb->b_edev));
			msg->mnum = MD_SID(un);
			msg->owner = md_mn_mynode_id;
			msg_flags |= MD_MSGF_NO_LOG;
			/*
			 * If this IO is triggered by updating a watermark,
			 * it might be issued by the creation of a softpartition
			 * while the commd subsystem is suspended.
			 * We don't want this message to block.
			 */
			if (ps->ps_flags & MD_MPS_WMUPDATE) {
				msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
			}

			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
			rval = mdmn_ksend_message(setno,
			    MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0,
			    (char *)msg, sizeof (md_mn_req_owner_t), kres);

			kmem_free(msg, sizeof (md_mn_req_owner_t));

			if (MDMN_KSEND_MSG_OK(rval, kres)) {
				dq->qlen = 0;
				/*
				 * Successfully changed owner, reread the
				 * resync record so that we have a valid idea of
				 * any previously committed incomplete write()s.
				 * NOTE: As we need to acquire the resync mutex
				 * this may block, so we defer it to a separate
				 * thread handler. This makes us (effectively)
				 * non-blocking once the ownership message
				 * handling has completed.
				 */
				mutex_enter(&un->un_owner_mx);
				if (un->un_owner_state & MM_MN_BECOME_OWNER) {
					un->un_mirror_owner = md_mn_mynode_id;
					/* Sets owner of un_rr_dirty record */
					if (un->un_rr_dirty_recid)
						(void) mddb_setowner(
						    un->un_rr_dirty_recid,
						    md_mn_mynode_id);
					un->un_owner_state &=
					    ~MM_MN_BECOME_OWNER;
					/*
					 * Release the block on the current
					 * resync region if it is blocked
					 */
					ps1 = un->un_rs_prev_overlap;
					if ((ps1 != NULL) &&
					    (ps1->ps_flags & MD_MPS_ON_OVERLAP))
						mirror_overlap_tree_remove(ps1);
					mutex_exit(&un->un_owner_mx);

					/*
					 * If we're a read, this must be a
					 * resync request, issue
					 * the i/o request on the
					 * md_mirror_rs_daemon queue. This is
					 * to avoid a deadlock between the
					 * resync_unit thread and
					 * subsequent i/o requests that may
					 * block on the resync region.
					 */
					if (pb->b_flags & B_READ) {
						daemon_request(
						    &md_mirror_rs_daemon,
						    update_resync, dq, REQ_OLD);
					} else {
						daemon_request(
						    &md_mirror_io_daemon,
						    update_resync, dq, REQ_OLD);
					}
					kmem_free(kres,
					    sizeof (md_mn_kresult_t));
					return;
				} else {
					/*
					 * Some other node has beaten us to
					 * obtain ownership. We need to
					 * reschedule our ownership request
					 */
					mutex_exit(&un->un_owner_mx);
				}
			} else {
				mdmn_ksend_show_error(rval, kres,
				    "MD_MN_MSG_REQUIRE_OWNER");
				/*
				 * Message transport failure is handled by the
				 * comms layer. If the ownership change request
				 * does not succeed we need to flag the error to
				 * the initiator of the i/o. This is handled by
				 * the retry logic above. As the request failed
				 * we do not know _who_ the owner of the mirror
				 * currently is. We reset our idea of the owner
				 * to None so that any further write()s will
				 * attempt to become the owner again. This stops
				 * multiple nodes writing to the same mirror
				 * simultaneously.
				 */
				mutex_enter(&un->un_owner_mx);
				un->un_owner_state &=
				    ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
				un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
				mutex_exit(&un->un_owner_mx);
			}
			kmem_free(kres, sizeof (md_mn_kresult_t));
		} else
			mutex_exit(&un->un_owner_mx);

		/*
		 * Re-enqueue this request on the deferred i/o list. Delay the
		 * request for md_mirror_owner_to usecs to stop thrashing.
		 */
		(void) timeout(owner_timeout, dq,
		    drv_usectohz(md_mirror_owner_to));
	}
}

static void
mirror_write_strategy(buf_t *pb, int flag, void *private)
{
	md_mps_t	*ps;
	md_mcs_t	*cs;
	int		more;
	mm_unit_t	*un;
	mdi_unit_t	*ui;
	buf_t		*cb;		/* child buf pointer */
	set_t		setno;
	int		rs_on_overlap = 0;

	ui = MDI_UNIT(getminor(pb->b_edev));
	un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));


	md_kstat_waitq_enter(ui);

	/*
	 * If a state change is in progress for this mirror in a MN set,
	 * suspend all non-resync writes until the state change is complete.
	 * The objective of this suspend is to ensure that it is not
	 * possible for one node to read data from a submirror that another node
	 * has not written to because of the state change. Therefore we
	 * suspend all writes until the state change has been made. As it is
	 * not possible to read from the target of a resync, there is no need
	 * to suspend resync writes.
	 * Note that we only block here if the caller can handle a busy-wait.
	 * The MD_STR_BLOCK_OK flag is set for daemon_io originated i/o only.
	 */

	if (!(flag & MD_STR_WAR)) {
		if (flag & MD_STR_BLOCK_OK) {
			mutex_enter(&un->un_suspend_wr_mx);
			while (un->un_suspend_wr_flag) {
				cv_wait(&un->un_suspend_wr_cv,
				    &un->un_suspend_wr_mx);
			}
			mutex_exit(&un->un_suspend_wr_mx);
		}
		(void) md_unit_readerlock(ui);
	}

	if (!(flag & MD_STR_NOTTOP)) {
		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
			md_kstat_waitq_exit(ui);
			return;
		}
	}

	setno = MD_MIN2SET(getminor(pb->b_edev));

	/* If an ABR write has been requested, set MD_STR_ABR flag */
	if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
		flag |= MD_STR_ABR;

	if (private == NULL) {
		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
		mirror_parent_init(ps);
	} else {
		ps = private;
		private = NULL;
	}
	if (flag & MD_STR_MAPPED)
		ps->ps_flags |= MD_MPS_MAPPED;

	if (flag & MD_STR_WOW)
		ps->ps_flags |= MD_MPS_WOW;

	if (flag & MD_STR_ABR)
		ps->ps_flags |= MD_MPS_ABR;

	if (flag & MD_STR_WMUPDATE)
		ps->ps_flags |= MD_MPS_WMUPDATE;

	/*
	 * Save essential information from the original buffhdr
	 * in the md_save structure.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = pb;
	ps->ps_addr = pb->b_un.b_addr;
	ps->ps_firstblk = pb->b_lblkno;
	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
	ps->ps_changecnt = un->un_changecnt;

	/*
	 * Check for suspended writes here. This is where we can defer the
	 * write request to the daemon_io queue which will then call us with
	 * the MD_STR_BLOCK_OK flag set and we'll busy-wait (if necessary) at
	 * the top of this routine.
	 */
	if (!(flag & MD_STR_WAR) && !(flag & MD_STR_BLOCK_OK)) {
		mutex_enter(&un->un_suspend_wr_mx);
		if (un->un_suspend_wr_flag) {
			ps->ps_flags |= MD_MPS_BLOCKABLE_IO;
			mutex_exit(&un->un_suspend_wr_mx);
			md_unit_readerexit(ui);
			daemon_request(&md_mirror_daemon, daemon_io,
			    (daemon_queue_t *)ps, REQ_OLD);
			return;
		}
		mutex_exit(&un->un_suspend_wr_mx);
	}

	/*
	 * If not MN owner and this is an ABR write, make sure the current
	 * resync region is in the overlaps tree
	 */
	mutex_enter(&un->un_owner_mx);
	if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
	    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
		md_mps_t	*ps1;
		/* Block the current resync region, if not already blocked */
		ps1 = un->un_rs_prev_overlap;

		if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
		    (ps1->ps_lastblk != 0))) {
			/* Drop locks to avoid deadlock */
			mutex_exit(&un->un_owner_mx);
			md_unit_readerexit(ui);
			wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
			rs_on_overlap = 1;
			(void) md_unit_readerlock(ui);
			mutex_enter(&un->un_owner_mx);
			/*
			 * Check to see if we have obtained ownership
			 * while waiting for overlaps. If we have, remove
			 * the resync_region entry from the overlap tree
			 */
			if (MD_MN_MIRROR_OWNER(un) &&
			    (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
				mirror_overlap_tree_remove(ps1);
				rs_on_overlap = 0;
			}
		}
	}
	mutex_exit(&un->un_owner_mx);


	/*
	 * following keep write after read from writing to the
	 * source in the case where it all came from one place
	 */
	if (flag & MD_STR_WAR) {
		int	abort_write = 0;
		/*
		 * We are perfoming a write-after-read. This is either as a
		 * result of a resync read or as a result of a read in a
		 * dirty resync region when the optimized resync is not
		 * complete. If in a MN set and a resync generated i/o,
		 * if the current block is not in the current
		 * resync region terminate the write as another node must have
		 * completed this resync region
		 */
		if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
		    (!flag & MD_STR_DIRTY_RD)) {
			if (!IN_RESYNC_REGION(un, ps))
				abort_write = 1;
		}
		if ((select_write_after_read_units(un, ps) == 0) ||
		    (abort_write)) {
#ifdef DEBUG
			if (mirror_debug_flag)
				printf("Abort resync write on %x, block %lld\n",
				    MD_SID(un), ps->ps_firstblk);
#endif
			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
				mirror_overlap_tree_remove(ps);
			kmem_cache_free(mirror_parent_cache, ps);
			md_kstat_waitq_exit(ui);
			md_unit_readerexit(ui);
			md_biodone(pb);
			return;
		}
	} else {
		select_write_units(un, ps);

		/* Drop readerlock to avoid deadlock */
		md_unit_readerexit(ui);
		wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
		un = md_unit_readerlock(ui);
		/*
		 * For a MN set with an ABR write, if we are now the
		 * owner and we have a resync region in the overlap
		 * tree, remove the entry from overlaps and retry the write.
		 */

		if (MD_MNSET_SETNO(setno) &&
		    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
			mutex_enter(&un->un_owner_mx);
			if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
				mirror_overlap_tree_remove(ps);
				md_kstat_waitq_exit(ui);
				mutex_exit(&un->un_owner_mx);
				md_unit_readerexit(ui);
				daemon_request(&md_mirror_daemon, daemon_io,
				    (daemon_queue_t *)ps, REQ_OLD);
				return;
			}
			mutex_exit(&un->un_owner_mx);
		}
	}

	/*
	 * For Multinode mirrors with no owner and a Resync Region (not ABR)
	 * we need to become the mirror owner before continuing with the
	 * write(). For ABR mirrors we check that we 'own' the resync if
	 * we're in write-after-read mode. We do this _after_ ensuring that
	 * there are no overlaps to ensure that once we know that we are
	 * the owner, the readerlock will not be released until the write is
	 * complete. As a change of ownership in a MN set requires the
	 * writerlock, this ensures that ownership cannot be changed until
	 * the write is complete.
	 */
	if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
	    (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
		if (MD_MN_NO_MIRROR_OWNER(un))  {
			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
				mirror_overlap_tree_remove(ps);
			md_kstat_waitq_exit(ui);
			ASSERT(!(flag & MD_STR_WAR));
			md_unit_readerexit(ui);
			daemon_request(&md_mirror_daemon, become_owner,
			    (daemon_queue_t *)ps, REQ_OLD);
			return;
		}
	}

	/*
	 * Mark resync region if mirror has a Resync Region _and_ we are not
	 * a resync initiated write(). Don't mark region if we're flagged as
	 * an ABR write.
	 */
	if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
	    !(flag & MD_STR_WAR)) {
		if (mirror_mark_resync_region(un, ps->ps_firstblk,
		    ps->ps_lastblk, md_mn_mynode_id)) {
			pb->b_flags |= B_ERROR;
			pb->b_resid = pb->b_bcount;
			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
				mirror_overlap_tree_remove(ps);
			kmem_cache_free(mirror_parent_cache, ps);
			md_kstat_waitq_exit(ui);
			md_unit_readerexit(ui);
			md_biodone(pb);
			return;
		}
	}

	ps->ps_childbflags = pb->b_flags | B_WRITE;
	ps->ps_childbflags &= ~B_READ;
	if (flag & MD_STR_MAPPED)
		ps->ps_childbflags &= ~B_PAGEIO;

	if (!(flag & MD_STR_NOTTOP) && panicstr)
		/* Disable WOW and don't free ps */
		ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);

	md_kstat_waitq_to_runq(ui);

	/*
	 * Treat Raw and Direct I/O as Write-on-Write always
	 */

	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
	    (md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
	    (pb->b_flags & B_PHYS) &&
	    !(ps->ps_flags & MD_MPS_WOW)) {
		if (ps->ps_flags & MD_MPS_ON_OVERLAP)
			mirror_overlap_tree_remove(ps);
		md_unit_readerexit(ui);
		daemon_request(&md_mstr_daemon, handle_wow,
		    (daemon_queue_t *)ps, REQ_OLD);
		return;
	}

	ps->ps_frags = 1;
	do {
		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
		mirror_child_init(cs);
		cb = &cs->cs_buf;
		more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));

		/*
		 * This handles the case where we're requesting
		 * a write to block 0 on a label partition.  (more < 0)
		 * means that the request size was smaller than the
		 * size of the label.  If so this request is done.
		 */
		if (more < 0) {
			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
				mirror_overlap_tree_remove(ps);
			md_kstat_runq_exit(ui);
			kmem_cache_free(mirror_child_cache, cs);
			kmem_cache_free(mirror_parent_cache, ps);
			md_unit_readerexit(ui);
			md_biodone(pb);
			return;
		}
		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}
		md_call_strategy(cb, flag, private);
	} while (more);

	if (!(flag & MD_STR_NOTTOP) && panicstr) {
		while (!(ps->ps_flags & MD_MPS_DONE)) {
			md_daemon(1, &md_done_daemon);
			drv_usecwait(10);
		}
		kmem_cache_free(mirror_parent_cache, ps);
	}
}

static void
mirror_read_strategy(buf_t *pb, int flag, void *private)
{
	md_mps_t	*ps;
	md_mcs_t	*cs;
	size_t		more;
	mm_unit_t	*un;
	mdi_unit_t	*ui;
	size_t		current_count;
	diskaddr_t	current_blkno;
	off_t		current_offset;
	buf_t		*cb;		/* child buf pointer */
	set_t		setno;

	ui = MDI_UNIT(getminor(pb->b_edev));

	md_kstat_waitq_enter(ui);

	un = (mm_unit_t *)md_unit_readerlock(ui);

	if (!(flag & MD_STR_NOTTOP)) {
		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
			md_kstat_waitq_exit(ui);
			return;
		}
	}

	if (private == NULL) {
		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
		mirror_parent_init(ps);
	} else {
		ps = private;
		private = NULL;
	}

	if (flag & MD_STR_MAPPED)
		ps->ps_flags |= MD_MPS_MAPPED;
	if (flag & MD_NOBLOCK)
		ps->ps_flags |= MD_MPS_NOBLOCK;
	if (flag & MD_STR_WMUPDATE)
		ps->ps_flags |= MD_MPS_WMUPDATE;

	/*
	 * Check to see if this is a DMR driven read. If so we need to use the
	 * specified side (in un->un_dmr_last_read) for the source of the data.
	 */
	if (flag & MD_STR_DMR)
		ps->ps_flags |= MD_MPS_DMR;

	/*
	 * Save essential information from the original buffhdr
	 * in the md_save structure.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = pb;
	ps->ps_addr = pb->b_un.b_addr;
	ps->ps_firstblk = pb->b_lblkno;
	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
	ps->ps_changecnt = un->un_changecnt;

	current_count = btodb(pb->b_bcount);
	current_blkno = pb->b_lblkno;
	current_offset = 0;

	/*
	 * If flag has MD_STR_WAR set this means that the read is issued by a
	 * resync thread which may or may not be an optimised resync.
	 *
	 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
	 * code has not completed; either a resync has not started since snarf,
	 * or there is an optimized resync in progress.
	 *
	 * We need to generate a write after this read in the following two
	 * cases,
	 *
	 * 1. Any Resync-Generated read
	 *
	 * 2. Any read to a DIRTY REGION if there is an optimized resync
	 *    pending or in progress.
	 *
	 * The write after read is done in these cases to ensure that all sides
	 * of the mirror are in sync with the read data and that it is not
	 * possible for an application to read the same block multiple times
	 * and get different data.
	 *
	 * This would be possible if the block was in a dirty region.
	 *
	 * If we're performing a directed read we don't write the data out as
	 * the application is responsible for restoring the mirror to a known
	 * state.
	 */
	if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
	    !(flag & MD_STR_DMR)) {
		size_t	start_rr, i, end_rr;
		int	region_dirty = 1;

		/*
		 * We enter here under three circumstances,
		 *
		 * MD_UN_OPT_NOT_DONE	MD_STR_WAR
		 * 0			1
		 * 1			0
		 * 1			1
		 *
		 * To be optimal we only care to explicitly check for dirty
		 * regions in the second case since if MD_STR_WAR is set we
		 * always do the write after read.
		 */
		if (!(flag & MD_STR_WAR)) {
			BLK_TO_RR(end_rr, ps->ps_lastblk, un);
			BLK_TO_RR(start_rr, ps->ps_firstblk, un);

			for (i = start_rr; i <= end_rr; i++)
				if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
					break;
		}

		if ((region_dirty) &&
		    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
			ps->ps_call = write_after_read;
			/*
			 * Mark this as a RESYNC_READ in ps_flags.
			 * This is used if the read fails during a
			 * resync of a 3-way mirror to ensure that
			 * the retried read to the remaining
			 * good submirror has MD_STR_WAR set. This
			 * is needed to ensure that the resync write
			 * (write-after-read) takes place.
			 */
			ps->ps_flags |= MD_MPS_RESYNC_READ;

			/*
			 * If MD_STR_FLAG_ERR is set in the flags we
			 * set MD_MPS_FLAG_ERROR so that an error on the resync
			 * write (issued by write_after_read) will be flagged
			 * to the biowait'ing resync thread. This allows us to
			 * avoid issuing further resync requests to a device
			 * that has had a write failure.
			 */
			if (flag & MD_STR_FLAG_ERR)
				ps->ps_flags |= MD_MPS_FLAG_ERROR;

			setno = MD_UN2SET(un);
			/*
			 * Drop the readerlock to avoid
			 * deadlock
			 */
			md_unit_readerexit(ui);
			wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
			un = md_unit_readerlock(ui);
			/*
			 * Ensure that we are owner
			 */
			if (MD_MNSET_SETNO(setno)) {
				/*
				 * For a non-resync read that requires a
				 * write-after-read to be done, set a flag
				 * in the parent structure, so that the
				 * write_strategy routine can omit the
				 * test that the write is still within the
				 * resync region
				 */
				if (!(flag & MD_STR_WAR))
					ps->ps_flags |= MD_MPS_DIRTY_RD;

				/*
				 * Before reading the buffer, see if
				 * there is an owner.
				 */
				if (MD_MN_NO_MIRROR_OWNER(un))  {
					ps->ps_call = NULL;
					mirror_overlap_tree_remove(ps);
					md_kstat_waitq_exit(ui);
					md_unit_readerexit(ui);
					daemon_request(
					    &md_mirror_daemon,
					    become_owner,
					    (daemon_queue_t *)ps,
					    REQ_OLD);
					return;
				}
				/*
				 * For a resync read, check to see if I/O is
				 * outside of the current resync region, or
				 * the resync has finished. If so
				 * just terminate the I/O
				 */
				if ((flag & MD_STR_WAR) &&
				    (!(un->c.un_status & MD_UN_WAR) ||
				    (!IN_RESYNC_REGION(un, ps)))) {
#ifdef DEBUG
					if (mirror_debug_flag)
						printf("Abort resync read "
						    "%x: %lld\n",
						    MD_SID(un),
						    ps->ps_firstblk);
#endif
					mirror_overlap_tree_remove(ps);
					kmem_cache_free(mirror_parent_cache,
					    ps);
					md_kstat_waitq_exit(ui);
					md_unit_readerexit(ui);
					md_biodone(pb);
					return;
				}
			}
		}
	}

	if (flag & MD_STR_DMR) {
		ps->ps_call = directed_read_done;
	}

	if (!(flag & MD_STR_NOTTOP) && panicstr)
		ps->ps_flags |= MD_MPS_DONTFREE;

	md_kstat_waitq_to_runq(ui);

	ps->ps_frags++;
	do {
		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
		mirror_child_init(cs);
		cb = &cs->cs_buf;
		cs->cs_ps = ps;

		cb = md_bioclone(pb, current_offset, current_count, NODEV,
		    current_blkno, mirror_done, cb, KM_NOSLEEP);

		more = mirror_map_read(ps, cs, current_blkno,
		    (u_longlong_t)current_count);
		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		/*
		 * Do these calculations now,
		 *  so that we pickup a valid b_bcount from the chld_bp.
		 */
		current_count -= more;
		current_offset += cb->b_bcount;
		current_blkno +=  more;
		md_call_strategy(cb, flag, private);
	} while (more);

	if (!(flag & MD_STR_NOTTOP) && panicstr) {
		while (!(ps->ps_flags & MD_MPS_DONE)) {
			md_daemon(1, &md_done_daemon);
			drv_usecwait(10);
		}
		kmem_cache_free(mirror_parent_cache, ps);
	}
}

void
md_mirror_strategy(buf_t *bp, int flag, void *private)
{
	set_t	setno = MD_MIN2SET(getminor(bp->b_edev));

	/*
	 * When doing IO to a multi owner meta device, check if set is halted.
	 * We do this check without the needed lock held, for performance
	 * reasons.
	 * If an IO just slips through while the set is locked via an
	 * MD_MN_SUSPEND_SET, we don't care about it.
	 * Only check for suspension if we are a top-level i/o request
	 * (MD_STR_NOTTOP is cleared in 'flag').
	 */
	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
	    (MD_SET_HALTED | MD_SET_MNSET)) {
		if ((flag & MD_STR_NOTTOP) == 0) {
			mutex_enter(&md_mx);
			/* Here we loop until the set is no longer halted */
			while (md_set[setno].s_status & MD_SET_HALTED) {
				cv_wait(&md_cv, &md_mx);
			}
			mutex_exit(&md_mx);
		}
	}

	if ((flag & MD_IO_COUNTED) == 0) {
		if ((flag & MD_NOBLOCK) == 0) {
			if (md_inc_iocount(setno) != 0) {
				bp->b_flags |= B_ERROR;
				bp->b_error = ENXIO;
				bp->b_resid = bp->b_bcount;
				biodone(bp);
				return;
			}
		} else {
			md_inc_iocount_noblock(setno);
		}
	}

	if (bp->b_flags & B_READ)
		mirror_read_strategy(bp, flag, private);
	else
		mirror_write_strategy(bp, flag, private);
}

/*
 * mirror_directed_read:
 * --------------------
 * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror
 * so that the application can determine what (if any) resync needs to be
 * performed. The data is copied out to the user-supplied buffer.
 *
 * Parameters:
 *	mdev	- dev_t for the mirror device
 *	vdr	- directed read parameters specifying location and submirror
 *		  to perform the read from
 *	mode	- used to ddi_copyout() any resulting data from the read
 *
 * Returns:
 *	0	success
 *	!0	error code
 *		EINVAL - invalid request format
 */
int
mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode)
{
	buf_t		*bp;
	minor_t		mnum = getminor(mdev);
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mm_unit_t	*un;
	mm_submirror_t	*sm;
	char		*sm_nm;
	uint_t		next_side;
	void		*kbuffer;

	if (ui == NULL)
		return (ENXIO);

	if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) {
		return (EINVAL);
	}

	/* Check for aligned block access. We disallow non-aligned requests. */
	if (vdr->vdr_offset % DEV_BSIZE) {
		return (EINVAL);
	}

	/*
	 * Allocate kernel buffer for target of read(). If we had a reliable
	 * (sorry functional) DDI this wouldn't be needed.
	 */
	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
	if (kbuffer == NULL) {
		cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx"
		    " bytes\n", vdr->vdr_nbytes);
		return (ENOMEM);
	}

	bp = getrbuf(KM_SLEEP);

	bp->b_un.b_addr = kbuffer;
	bp->b_flags = B_READ;
	bp->b_bcount = vdr->vdr_nbytes;
	bp->b_lblkno = lbtodb(vdr->vdr_offset);
	bp->b_edev = mdev;

	un = md_unit_readerlock(ui);

	/*
	 * If DKV_SIDE_INIT is set we need to determine the first available
	 * side to start reading from. If it isn't set we increment to the
	 * next readable submirror.
	 * If there are no readable submirrors we error out with DKV_DMR_ERROR.
	 * Note: we check for a readable submirror on completion of the i/o so
	 * we should _always_ have one available. If this becomes unavailable
	 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if
	 * a metadetach is made between the completion of one DKIOCDMR ioctl
	 * and the start of the next (i.e. a sys-admin 'accident' occurred).
	 * The chance of this is small, but not non-existent.
	 */
	if (vdr->vdr_side == DKV_SIDE_INIT) {
		next_side = 0;
	} else {
		next_side = vdr->vdr_side + 1;
	}
	while ((next_side < NMIRROR) &&
	    !SUBMIRROR_IS_READABLE(un, next_side))
		next_side++;
	if (next_side >= NMIRROR) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
		freerbuf(bp);
		vdr->vdr_bytesread = 0;
		md_unit_readerexit(ui);
		return (0);
	}

	/* Set the side to read from */
	un->un_dmr_last_read = next_side;

	md_unit_readerexit(ui);

	/*
	 * Save timestamp for verification purposes. Can be read by debugger
	 * to verify that this ioctl has been executed and to find the number
	 * of DMR reads and the time of the last DMR read.
	 */
	uniqtime(&mirror_dmr_stats.dmr_timestamp);
	mirror_dmr_stats.dmr_count++;

	/* Issue READ request and wait for completion */
	mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL);

	mutex_enter(&un->un_dmr_mx);
	cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
	mutex_exit(&un->un_dmr_mx);

	/*
	 * Check to see if we encountered an error during the read. If so we
	 * can make no guarantee about any possibly returned data.
	 */
	if ((bp->b_flags & B_ERROR) == 0) {
		vdr->vdr_flags &= ~DKV_DMR_ERROR;
		if (bp->b_resid) {
			vdr->vdr_flags |= DKV_DMR_SHORT;
			vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid;
		} else {
			vdr->vdr_flags |= DKV_DMR_SUCCESS;
			vdr->vdr_bytesread = vdr->vdr_nbytes;
		}
		/* Copy the data read back out to the user supplied buffer */
		if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread,
		    mode)) {
			kmem_free(kbuffer, vdr->vdr_nbytes);
			return (EFAULT);
		}

	} else {
		/* Error out with DKV_DMR_ERROR */
		vdr->vdr_flags |= DKV_DMR_ERROR;
		vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE);
	}
	/*
	 * Update the DMR parameters with the side and name of submirror that
	 * we have just read from (un->un_dmr_last_read)
	 */
	un = md_unit_readerlock(ui);

	vdr->vdr_side = un->un_dmr_last_read;
	sm = &un->un_sm[un->un_dmr_last_read];
	sm_nm = md_shortname(md_getminor(sm->sm_dev));

	(void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name));

	/*
	 * Determine if we've completed the read cycle. This is true iff the
	 * next computed submirror (side) equals or exceeds NMIRROR. We cannot
	 * use un_nsm as we need to handle a sparse array of submirrors (which
	 * can occur if a submirror is metadetached).
	 */
	next_side = un->un_dmr_last_read + 1;
	while ((next_side < NMIRROR) &&
	    !SUBMIRROR_IS_READABLE(un, next_side))
		next_side++;
	if (next_side >= NMIRROR) {
		/* We've finished */
		vdr->vdr_flags |= DKV_DMR_DONE;
	}

	md_unit_readerexit(ui);
	freerbuf(bp);
	kmem_free(kbuffer, vdr->vdr_nbytes);

	return (0);
}

/*
 * mirror_resync_message:
 * ---------------------
 * Handle the multi-node resync messages that keep all nodes within a given
 * disk-set in sync with their view of a mirror's resync status.
 *
 * The message types dealt with are:
 * MD_MN_MSG_RESYNC_STARTING	- start a resync thread for a unit
 * MD_MN_MSG_RESYNC_NEXT	- specified next region to be resynced
 * MD_MN_MSG_RESYNC_FINISH	- stop the resync thread for a unit
 * MD_MN_MSG_RESYNC_PHASE_DONE	- end of a resync phase, opt, submirror or comp
 *
 * Returns:
 *	0	Success
 *	>0	Failure error number
 */
int
mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
{
	mdi_unit_t		*ui;
	mm_unit_t		*un;
	set_t			setno;
	int			is_ABR;
	int			smi;
	int			ci;
	sm_state_t		state;
	int			broke_out;
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	md_error_t		mde = mdnullerror;
	md_mps_t		*ps;
	int			rs_active;
	int			rr, rr_start, rr_end;

	/* Check that the given device is part of a multi-node set */
	setno = MD_MIN2SET(p->mnum);
	if (setno >= md_nsets) {
		return (ENXIO);
	}
	if (!MD_MNSET_SETNO(setno)) {
		return (EINVAL);
	}

	if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
		return (EINVAL);
	if ((ui = MDI_UNIT(p->mnum)) == NULL)
		return (EINVAL);
	is_ABR = (ui->ui_tstate & MD_ABR_CAP);

	/* Obtain the current resync status */
	(void) md_ioctl_readerlock(lockp, ui);
	rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0;
	md_ioctl_readerexit(lockp);

	switch ((md_mn_msgtype_t)p->msg_type) {
	case MD_MN_MSG_RESYNC_STARTING:
		/* Start the resync thread for the mirror */
		(void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp);
		break;

	case MD_MN_MSG_RESYNC_NEXT:
		/*
		 * We have to release any previously marked overlap regions
		 * so that i/o can resume. Then we need to block the region
		 * from [rs_start..rs_start+rs_size) * so that no i/o is issued.
		 * Update un_rs_resync_done and un_rs_resync_2_do.
		 */
		(void) md_ioctl_readerlock(lockp, ui);
		/*
		 * Ignore the message if there is no active resync thread or
		 * if it is for a resync type that we have already completed.
		 * un_resync_completed is set to the last resync completed
		 * when processing a PHASE_DONE message.
		 */
		if (!rs_active || (p->rs_type == un->un_resync_completed))
			break;
		/*
		 * If this message is for the same resync and is for an earlier
		 * resync region, just ignore it. This can only occur if this
		 * node has progressed on to the next resync region before
		 * we receive this message. This can occur if the class for
		 * this message is busy and the originator has to retry thus
		 * allowing this node to move onto the next resync_region.
		 */
		if ((p->rs_type == un->un_rs_type) &&
		    (p->rs_start < un->un_resync_startbl))
			break;
		ps = un->un_rs_prev_overlap;

		/* Allocate previous overlap reference if needed */
		if (ps == NULL) {
			ps = kmem_cache_alloc(mirror_parent_cache,
			    MD_ALLOCFLAGS);
			ps->ps_un = un;
			ps->ps_ui = ui;
			ps->ps_firstblk = 0;
			ps->ps_lastblk = 0;
			ps->ps_flags = 0;
			md_ioctl_readerexit(lockp);
			(void) md_ioctl_writerlock(lockp, ui);
			un->un_rs_prev_overlap = ps;
			md_ioctl_writerexit(lockp);
		} else
			md_ioctl_readerexit(lockp);

		if (p->rs_originator != md_mn_mynode_id) {
			/*
			 * Clear our un_resync_bm for the regions completed.
			 * The owner (originator) will take care of itself.
			 */
			BLK_TO_RR(rr_end, ps->ps_lastblk, un);
			BLK_TO_RR(rr_start, p->rs_start, un);
			if (ps->ps_lastblk && rr_end < rr_start) {
				BLK_TO_RR(rr_start, ps->ps_firstblk, un);
				mutex_enter(&un->un_resync_mx);
				/*
				 * Update our resync bitmap to reflect that
				 * another node has synchronized this range.
				 */
				for (rr = rr_start; rr <= rr_end; rr++) {
					CLR_KEEPDIRTY(rr, un);
				}
				mutex_exit(&un->un_resync_mx);
			}

			/*
			 * On all but the originating node, first update
			 * the resync state, then unblock the previous
			 * region and block the next one. No need
			 * to do this if the region is already blocked.
			 * Update the submirror state and flags from the
			 * originator. This keeps the cluster in sync with
			 * regards to the resync status.
			 */

			(void) md_ioctl_writerlock(lockp, ui);
			un->un_rs_resync_done = p->rs_done;
			un->un_rs_resync_2_do = p->rs_2_do;
			un->un_rs_type = p->rs_type;
			un->un_resync_startbl = p->rs_start;
			md_ioctl_writerexit(lockp);
			/*
			 * Use un_owner_mx to ensure that an ownership change
			 * cannot happen at the same time as this message
			 */
			mutex_enter(&un->un_owner_mx);
			if (MD_MN_MIRROR_OWNER(un)) {
				ps->ps_firstblk = p->rs_start;
				ps->ps_lastblk = ps->ps_firstblk +
				    p->rs_size - 1;
			} else {
				if ((ps->ps_firstblk != p->rs_start) ||
				    (ps->ps_lastblk != p->rs_start +
				    p->rs_size - 1)) {
					/* Remove previous overlap range */
					if (ps->ps_flags & MD_MPS_ON_OVERLAP)
						mirror_overlap_tree_remove(ps);

					ps->ps_firstblk = p->rs_start;
					ps->ps_lastblk = ps->ps_firstblk +
					    p->rs_size - 1;

					mutex_exit(&un->un_owner_mx);
					/* Block this range from all i/o. */
					if (ps->ps_firstblk != 0 ||
					    ps->ps_lastblk != 0)
						wait_for_overlaps(ps,
						    MD_OVERLAP_ALLOW_REPEAT);
					mutex_enter(&un->un_owner_mx);
					/*
					 * Check to see if we have obtained
					 * ownership while waiting for
					 * overlaps. If we have, remove
					 * the resync_region entry from the
					 * overlap tree
					 */
					if (MD_MN_MIRROR_OWNER(un) &&
					    (ps->ps_flags & MD_MPS_ON_OVERLAP))
						mirror_overlap_tree_remove(ps);
				}
			}
			mutex_exit(&un->un_owner_mx);

			/*
			 * If this is the first RESYNC_NEXT message (i.e.
			 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
			 * issue RESYNC_START NOTIFY event
			 */
			if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
				    SVM_TAG_METADEVICE, MD_UN2SET(un),
				    MD_SID(un));
			}

			/* Ensure that our local resync thread is running */
			if (un->un_rs_thread == NULL) {
				(void) mirror_resync_unit(p->mnum, NULL,
				    &p->mde, lockp);
			}
		}

		break;
	case MD_MN_MSG_RESYNC_FINISH:
		/*
		 * Complete the resync by stopping the resync thread.
		 * Also release the previous overlap region field.
		 * Update the resync_progress_thread by cv_signal'ing it so
		 * that we mark the end of the resync as soon as possible. This
		 * stops an unnecessary delay should be panic after resync
		 * completion.
		 */
#ifdef DEBUG
		if (!rs_active) {
			if (mirror_debug_flag)
				printf("RESYNC_FINISH (mnum = %x), "
				    "Resync *NOT* active",
				    p->mnum);
		}
#endif

		if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
		    (p->rs_originator != md_mn_mynode_id)) {
			mutex_enter(&un->un_rs_thread_mx);
			un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
			un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
			un->un_rs_thread_flags &=
			    ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
			cv_signal(&un->un_rs_thread_cv);
			mutex_exit(&un->un_rs_thread_mx);
		}
		if (is_ABR) {
			/* Resync finished, if ABR set owner to NULL */
			mutex_enter(&un->un_owner_mx);
			un->un_mirror_owner = 0;
			mutex_exit(&un->un_owner_mx);
		}
		(void) md_ioctl_writerlock(lockp, ui);
		ps = un->un_rs_prev_overlap;
		if (ps != NULL) {
			/* Remove previous overlap range */
			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
				mirror_overlap_tree_remove(ps);
			/*
			 * Release the overlap range reference
			 */
			un->un_rs_prev_overlap = NULL;
			kmem_cache_free(mirror_parent_cache,
			    ps);
		}
		md_ioctl_writerexit(lockp);

		/* Mark the resync as complete in the metadb */
		un->un_rs_resync_done = p->rs_done;
		un->un_rs_resync_2_do = p->rs_2_do;
		un->un_rs_type = p->rs_type;
		mutex_enter(&un->un_rs_progress_mx);
		cv_signal(&un->un_rs_progress_cv);
		mutex_exit(&un->un_rs_progress_mx);

		un = md_ioctl_writerlock(lockp, ui);
		un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
		/* Deal with any pending grow_unit */
		if (un->c.un_status & MD_UN_GROW_PENDING) {
			if ((mirror_grow_unit(un, &mde) != 0) ||
			    (! mdismderror(&mde, MDE_GROW_DELAYED))) {
				un->c.un_status &= ~MD_UN_GROW_PENDING;
			}
		}
		md_ioctl_writerexit(lockp);
		break;

	case MD_MN_MSG_RESYNC_PHASE_DONE:
		/*
		 * A phase of the resync, optimized. component or
		 * submirror is complete. Update mirror status.
		 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the
		 * mirror owner is peforming a resync. If we have just snarfed
		 * this set, then we must clear any of the flags set at snarf
		 * time by unit_setup_resync().
		 * Note that unit_setup_resync() sets up these flags to
		 * indicate that an optimized resync is required. These flags
		 * need to be reset because if we get here,  the mirror owner
		 * will have handled the optimized resync.
		 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and
		 * MD_UN_WAR. In addition, for each submirror,
		 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
		 * set to SMS_OFFLINE.
		 */
#ifdef DEBUG
		if (mirror_debug_flag)
			printf("phase done mess received from %d, mnum=%x,"
			    "type=%x, flags=%x\n", p->rs_originator, p->mnum,
			    p->rs_type, p->rs_flags);
#endif
		/*
		 * Ignore the message if there is no active resync thread.
		 */
		if (!rs_active)
			break;

		broke_out = p->rs_flags & MD_MN_RS_ERR;
		switch (RS_TYPE(p->rs_type)) {
		case MD_RS_OPTIMIZED:
			un = md_ioctl_writerlock(lockp, ui);
			if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) {
				/* If we are originator, just clear rs_type */
				if (p->rs_originator == md_mn_mynode_id) {
					SET_RS_TYPE_NONE(un->un_rs_type);
					md_ioctl_writerexit(lockp);
					break;
				}
				/*
				 * If CLEAR_OPT_NOT_DONE is set, only clear the
				 * flags if OPT_NOT_DONE is set *and* rs_type
				 * is MD_RS_NONE.
				 */
				if ((un->c.un_status & MD_UN_OPT_NOT_DONE) &&
				    (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) {
					/* No resync in progress */
					un->c.un_status &= ~MD_UN_OPT_NOT_DONE;
					un->c.un_status &= ~MD_UN_WAR;
				} else {
					/*
					 * We are in the middle of an
					 * optimized resync and this message
					 * should be ignored.
					 */
					md_ioctl_writerexit(lockp);
					break;
				}
			} else {
				/*
				 * This is the end of an optimized resync,
				 * clear the OPT_NOT_DONE and OFFLINE_SM flags
				 */

				un->c.un_status &= ~MD_UN_KEEP_DIRTY;
				if (!broke_out)
					un->c.un_status &= ~MD_UN_WAR;

				/*
				 * Clear our un_resync_bm for the regions
				 * completed.  The owner (originator) will
				 * take care of itself.
				 */
				if (p->rs_originator != md_mn_mynode_id &&
				    (ps = un->un_rs_prev_overlap) != NULL) {
					BLK_TO_RR(rr_start, ps->ps_firstblk,
					    un);
					BLK_TO_RR(rr_end, ps->ps_lastblk, un);
					mutex_enter(&un->un_resync_mx);
					for (rr = rr_start; rr <= rr_end;
					    rr++) {
						CLR_KEEPDIRTY(rr, un);
					}
					mutex_exit(&un->un_resync_mx);
				}
			}

			/*
			 * Set resync_completed to last resync type and then
			 * clear resync_type to indicate no resync in progress
			 */
			un->un_resync_completed = un->un_rs_type;
			SET_RS_TYPE_NONE(un->un_rs_type);

			/*
			 * If resync is as a result of a submirror ONLINE,
			 * reset the submirror state to SMS_RUNNING if the
			 * resync was ok else set back to SMS_OFFLINE.
			 */
			for (smi = 0; smi < NMIRROR; smi++) {
				un->un_sm[smi].sm_flags &=
				    ~MD_SM_RESYNC_TARGET;
				if (SMS_BY_INDEX_IS(un, smi,
				    SMS_OFFLINE_RESYNC)) {
					if (p->rs_flags &
					    MD_MN_RS_CLEAR_OPT_NOT_DONE) {
						state = SMS_OFFLINE;
					} else {
						state = (broke_out ?
						    SMS_OFFLINE : SMS_RUNNING);
					}
					mirror_set_sm_state(
					    &un->un_sm[smi],
					    &un->un_smic[smi], state,
					    broke_out);
					mirror_commit(un, NO_SUBMIRRORS,
					    0);
				}
				/*
				 * If we still have an offline submirror, reset
				 * the OFFLINE_SM flag in the mirror status
				 */
				if (SMS_BY_INDEX_IS(un, smi,
				    SMS_OFFLINE))
					un->c.un_status |=
					    MD_UN_OFFLINE_SM;
			}
			md_ioctl_writerexit(lockp);
			break;
		case MD_RS_SUBMIRROR:
			un = md_ioctl_writerlock(lockp, ui);
			smi = RS_SMI(p->rs_type);
			sm = &un->un_sm[smi];
			smic = &un->un_smic[smi];
			/* Clear RESYNC target */
			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
			/*
			 * Set resync_completed to last resync type and then
			 * clear resync_type to indicate no resync in progress
			 */
			un->un_resync_completed = un->un_rs_type;
			SET_RS_TYPE_NONE(un->un_rs_type);
			/*
			 * If the resync completed ok reset the submirror
			 * state to SMS_RUNNING else reset it to SMS_ATTACHED
			 */
			state = (broke_out ?
			    SMS_ATTACHED : SMS_RUNNING);
			mirror_set_sm_state(sm, smic, state, broke_out);
			un->c.un_status &= ~MD_UN_WAR;
			mirror_commit(un, SMI2BIT(smi), 0);
			md_ioctl_writerexit(lockp);
			break;
		case MD_RS_COMPONENT:
			un = md_ioctl_writerlock(lockp, ui);
			smi = RS_SMI(p->rs_type);
			ci = RS_CI(p->rs_type);
			sm = &un->un_sm[smi];
			smic = &un->un_smic[smi];
			shared = (md_m_shared_t *)
			    (*(smic->sm_shared_by_indx))
			    (sm->sm_dev, sm, ci);
			un->c.un_status &= ~MD_UN_WAR;
			/* Clear RESYNC target */
			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
			/*
			 * Set resync_completed to last resync type and then
			 * clear resync_type to indicate no resync in progress
			 */
			un->un_resync_completed = un->un_rs_type;
			SET_RS_TYPE_NONE(un->un_rs_type);

			/*
			 * If the resync completed ok, set the component state
			 * to CS_OKAY.
			 */
			if (broke_out)
				shared->ms_flags |= MDM_S_RS_TRIED;
			else {
				/*
				 * As we don't transmit the changes,
				 * no need to drop the lock.
				 */
				set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
				    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
			}
			md_ioctl_writerexit(lockp);
		default:
			break;
		}
		/*
		 * If the purpose of this PHASE_DONE message is just to
		 * indicate to all other nodes that the optimized resync
		 * required (OPT_NOT_DONE) flag is to be cleared, there is
		 * no need to generate a notify event as there has not
		 * actually been a resync.
		 */
		if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) {
			if (broke_out) {
				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
				    SVM_TAG_METADEVICE, MD_UN2SET(un),
				    MD_SID(un));
			} else {
				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
				    SVM_TAG_METADEVICE, MD_UN2SET(un),
				    MD_SID(un));
			}
		}
		break;

	default:
#ifdef DEBUG
		cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type"
		    " %x\n", p->msg_type);
#endif
		return (EINVAL);
	}
	return (0);
}

/* Return a -1 if snarf of optimized record failed and set should be released */
static int
mirror_snarf(md_snarfcmd_t cmd, set_t setno)
{
	mddb_recid_t	recid;
	int		gotsomething;
	int		all_mirrors_gotten;
	mm_unit_t	*un;
	mddb_type_t	typ1;
	mddb_de_ic_t    *dep;
	mddb_rb32_t	*rbp;
	size_t		newreqsize;
	mm_unit_t	*big_un;
	mm_unit32_od_t	*small_un;
	int		retval;
	mdi_unit_t	*ui;

	if (cmd == MD_SNARF_CLEANUP) {
		if (md_get_setstatus(setno) & MD_SET_STALE)
			return (0);

		recid = mddb_makerecid(setno, 0);
		typ1 = (mddb_type_t)md_getshared_key(setno,
		    mirror_md_ops.md_driver.md_drivername);
		while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
			if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
				un = (mm_unit_t *)mddb_getrecaddr(recid);
				mirror_cleanup(un);
				recid = mddb_makerecid(setno, 0);
			}
		}
		return (0);
	}

	all_mirrors_gotten = 1;
	gotsomething = 0;

	recid = mddb_makerecid(setno, 0);
	typ1 = (mddb_type_t)md_getshared_key(setno,
	    mirror_md_ops.md_driver.md_drivername);

	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		dep->de_flags = MDDB_F_MIRROR;
		rbp = dep->de_rb;

		switch (rbp->rb_revision) {
		case MDDB_REV_RB:
		case MDDB_REV_RBFN:
			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
				/*
				 * This means, we have an old and small
				 * record and this record hasn't already
				 * been converted.  Before we create an
				 * incore metadevice from this we have to
				 * convert it to a big record.
				 */
				small_un =
				    (mm_unit32_od_t *)mddb_getrecaddr(recid);
				newreqsize = sizeof (mm_unit_t);
				big_un = (mm_unit_t *)kmem_zalloc(newreqsize,
				    KM_SLEEP);
				mirror_convert((caddr_t)small_un,
				    (caddr_t)big_un, SMALL_2_BIG);
				kmem_free(small_un, dep->de_reqsize);

				/*
				 * Update userdata and incore userdata
				 * incores are at the end of un
				 */
				dep->de_rb_userdata_ic = big_un;
				dep->de_rb_userdata = big_un;
				dep->de_icreqsize = newreqsize;
				un = big_un;
				rbp->rb_private |= MD_PRV_CONVD;
			} else {
				/*
				 * Unit already converted, just get the
				 * record address.
				 */
				un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
				    sizeof (*un), 0);
			}
			un->c.un_revision &= ~MD_64BIT_META_DEV;
			break;
		case MDDB_REV_RB64:
		case MDDB_REV_RB64FN:
			/* Big device */
			un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
			    sizeof (*un), 0);
			un->c.un_revision |= MD_64BIT_META_DEV;
			un->c.un_flag |= MD_EFILABEL;
			break;
		}
		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);

		/*
		 * Create minor device node for snarfed entry.
		 */
		(void) md_create_minor_node(setno, MD_SID(un));

		if (MD_UNIT(MD_SID(un)) != NULL) {
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		all_mirrors_gotten = 0;
		retval = mirror_build_incore(un, 1);
		if (retval == 0) {
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
			md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
			resync_start_timeout(setno);
			gotsomething = 1;
		} else {
			return (retval);
		}
		/*
		 * Set flag to indicate that the mirror has not yet
		 * been through a reconfig. This flag is used for MN sets
		 * when determining whether to update the mirror state from
		 * the Master node.
		 */
		if (MD_MNSET_SETNO(setno)) {
			ui = MDI_UNIT(MD_SID(un));
			ui->ui_tstate |= MD_RESYNC_NOT_DONE;
		}
	}

	if (!all_mirrors_gotten)
		return (gotsomething);

	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);

	return (0);
}

static int
mirror_halt(md_haltcmd_t cmd, set_t setno)
{
	unit_t		i;
	mdi_unit_t	*ui;
	minor_t		mnum;
	int		reset_mirror_flag = 0;

	if (cmd == MD_HALT_CLOSE)
		return (0);

	if (cmd == MD_HALT_OPEN)
		return (0);

	if (cmd == MD_HALT_UNLOAD)
		return (0);

	if (cmd == MD_HALT_CHECK) {
		for (i = 0; i < md_nunits; i++) {
			mnum = MD_MKMIN(setno, i);
			if ((ui = MDI_UNIT(mnum)) == NULL)
				continue;
			if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
				continue;
			if (md_unit_isopen(ui))
				return (1);
		}
		return (0);
	}

	if (cmd != MD_HALT_DOIT)
		return (1);

	for (i = 0; i < md_nunits; i++) {
		mnum = MD_MKMIN(setno, i);
		if ((ui = MDI_UNIT(mnum)) == NULL)
			continue;
		if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
			continue;
		reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0);

		/* Set a flag if there is at least one mirror metadevice. */
		reset_mirror_flag = 1;
	}

	/*
	 * Only wait for the global dr_timeout to finish
	 *  - if there are mirror metadevices in this diskset or
	 *  - if this is the local set since an unload of the md_mirror
	 *    driver could follow a successful mirror halt in the local set.
	 */
	if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) {
		while ((mirror_md_ops.md_head == NULL) &&
		    (mirror_timeout.dr_timeout_id != 0))
			delay(md_hz);
	}

	return (0);
}

/*ARGSUSED3*/
static int
mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
{
	IOLOCK	lock;
	minor_t		mnum = getminor(*dev);
	set_t		setno;

	/*
	 * When doing an open of a multi owner metadevice, check to see if this
	 * node is a starting node and if a reconfig cycle is underway.
	 * If so, the system isn't sufficiently set up enough to handle the
	 * open (which involves I/O during sp_validate), so fail with ENXIO.
	 */
	setno = MD_MIN2SET(mnum);
	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
			return (ENXIO);
	}

	if (md_oflags & MD_OFLG_FROMIOCTL) {
		/*
		 * This indicates that the caller is an ioctl service routine.
		 * In this case we initialise our stack-based IOLOCK and pass
		 * this into the internal open routine. This allows multi-owner
		 * metadevices to avoid deadlocking if an error is encountered
		 * during the open() attempt. The failure case is:
		 * s-p -> mirror -> s-p (with error). Attempting to metaclear
		 * this configuration would deadlock as the mirror code has to
		 * send a state-update to the other nodes when it detects the
		 * failure of the underlying submirror with an errored soft-part
		 * on it. As there is a class1 message in progress (metaclear)
		 * set_sm_comp_state() cannot send another class1 message;
		 * instead we do not send a state_update message as the
		 * metaclear is distributed and the failed submirror will be
		 * cleared from the configuration by the metaclear.
		 */
		IOLOCK_INIT(&lock);
		return (mirror_internal_open(getminor(*dev), flag, otyp,
		    md_oflags, &lock));
	} else {
		return (mirror_internal_open(getminor(*dev), flag, otyp,
		    md_oflags, (IOLOCK *)NULL));
	}
}


/*ARGSUSED1*/
static int
mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
{
	return (mirror_internal_close(getminor(dev), otyp, md_cflags,
	    (IOLOCK *)NULL));
}


/*
 * This routine dumps memory to the disk.  It assumes that the memory has
 * already been mapped into mainbus space.  It is called at disk interrupt
 * priority when the system is in trouble.
 *
 */
static int
mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	mm_unit_t	*un;
	dev_t		mapdev;
	int		result;
	int		smi;
	int		any_succeed = 0;
	int		save_result = 0;

	/*
	 * Don't need to grab the unit lock.
	 * Cause nothing else is suppose to be happenning.
	 * Also dump is not suppose to sleep.
	 */
	un = (mm_unit_t *)MD_UNIT(getminor(dev));

	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
		return (EINVAL);

	if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
		return (EINVAL);

	for (smi = 0; smi < NMIRROR; smi++) {
		if (!SUBMIRROR_IS_WRITEABLE(un, smi))
			continue;
		mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
		result = bdev_dump(mapdev, addr, blkno, nblk);
		if (result)
			save_result = result;

		if (result == 0)
			any_succeed++;
	}

	if (any_succeed)
		return (0);

	return (save_result);
}

/*
 * NAME: mirror_probe_dev
 *
 * DESCRITPION: force opens every component of a mirror.
 *
 * On entry the unit writerlock is held
 */
static int
mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
{
	int		i;
	int		smi;
	int		ci;
	mm_unit_t	*un;
	int		md_devopen = 0;
	set_t		setno;
	int		sm_cnt;
	int		sm_unavail_cnt;

	if (md_unit_isopen(ui))
		md_devopen++;

	un = MD_UNIT(mnum);
	setno = MD_UN2SET(un);

	sm_cnt = 0;
	sm_unavail_cnt = 0;
	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t tmpdev;
		mdi_unit_t	*sm_ui;

		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
			continue;
		}

		sm_cnt++;
		tmpdev = un->un_sm[i].sm_dev;
		(void) md_layered_open(mnum, &tmpdev,
		    MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
		un->un_sm[i].sm_dev = tmpdev;

		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));

		/*
		 * Logic similar to that in mirror_open_all_devs.  We set or
		 * clear the submirror Unavailable bit.
		 */
		(void) md_unit_writerlock(sm_ui);
		if (submirror_unavailable(un, i, 1)) {
			sm_ui->ui_tstate |= MD_INACCESSIBLE;
			sm_unavail_cnt++;
		} else {
			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
		}
		md_unit_writerexit(sm_ui);
	}

	/*
	 * If all of the submirrors are unavailable, the mirror is also
	 * unavailable.
	 */
	if (sm_cnt == sm_unavail_cnt) {
		ui->ui_tstate |= MD_INACCESSIBLE;
	} else {
		ui->ui_tstate &= ~MD_INACCESSIBLE;
	}

	/*
	 * Start checking from probe failures. If failures occur we
	 * set the appropriate erred state only if the metadevice is in
	 * use. This is specifically to prevent unnecessary resyncs.
	 * For instance if the disks were accidentally disconnected when
	 * the system booted up then until the metadevice is accessed
	 * (like file system mount) the user can shutdown, recable and
	 * reboot w/o incurring a potentially huge resync.
	 */

	smi = 0;
	ci = 0;
	while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {

		if (mirror_other_sources(un, smi, ci, 0) == 1) {
			/*
			 * Note that for a MN set, there is no need to call
			 * SE_NOTIFY as that is done when processing the
			 * state change
			 */
			if (md_devopen) {
				/*
				 * Never called from ioctl context,
				 * so (IOLOCK *)NULL
				 */
				set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
				    0, MD_STATE_XMIT, (IOLOCK *)NULL);
				if (!MD_MNSET_SETNO(setno)) {
					SE_NOTIFY(EC_SVM_STATE,
					    ESC_SVM_LASTERRED,
					    SVM_TAG_METADEVICE, setno,
					    MD_SID(un));
				}
				continue;
			} else {
				(void) mirror_close_all_devs(un,
				    MD_OFLG_PROBEDEV);
				if (!MD_MNSET_SETNO(setno)) {
					SE_NOTIFY(EC_SVM_STATE,
					    ESC_SVM_OPEN_FAIL,
					    SVM_TAG_METADEVICE, setno,
					    MD_SID(un));
				}
				mirror_openfail_console_info(un, smi, ci);
				return (ENXIO);
			}
		}

		/*
		 * Note that for a MN set, there is no need to call
		 * SE_NOTIFY as that is done when processing the
		 * state change
		 */
		if (md_devopen) {
			/* Never called from ioctl context, so (IOLOCK *)NULL */
			set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
			    MD_STATE_XMIT, (IOLOCK *)NULL);
			if (!MD_MNSET_SETNO(setno)) {
				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
				    SVM_TAG_METADEVICE, setno,
				    MD_SID(un));
			}
		}
		mirror_openfail_console_info(un, smi, ci);
		ci++;
	}

	if (MD_MNSET_SETNO(setno)) {
		send_poke_hotspares(setno);
	} else {
		(void) poke_hotspares();
	}
	(void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);

	return (0);
}


static int
mirror_imp_set(
	set_t	setno
)
{

	mddb_recid_t	recid;
	int		gotsomething, i;
	mddb_type_t	typ1;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	mm_unit32_od_t	*un32;
	mm_unit_t	*un64;
	md_dev64_t	self_devt;
	minor_t		*self_id;	/* minor needs to be updated */
	md_parent_t	*parent_id;	/* parent needs to be updated */
	mddb_recid_t	*record_id;	/* record id needs to be updated */
	mddb_recid_t	*optrec_id;
	md_dev64_t	tmpdev;


	gotsomething = 0;

	typ1 = (mddb_type_t)md_getshared_key(setno,
	    mirror_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		rbp = dep->de_rb;

		switch (rbp->rb_revision) {
		case MDDB_REV_RB:
		case MDDB_REV_RBFN:
			/*
			 * Small device
			 */
			un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
			self_id = &(un32->c.un_self_id);
			parent_id = &(un32->c.un_parent);
			record_id = &(un32->c.un_record_id);
			optrec_id = &(un32->un_rr_dirty_recid);

			for (i = 0; i < un32->un_nsm; i++) {
				tmpdev = md_expldev(un32->un_sm[i].sm_dev);
				un32->un_sm[i].sm_dev = md_cmpldev
				    (md_makedevice(md_major, MD_MKMIN(setno,
				    MD_MIN2UNIT(md_getminor(tmpdev)))));

				if (!md_update_minor(setno, mddb_getsidenum
				    (setno), un32->un_sm[i].sm_key))
				goto out;
			}
			break;
		case MDDB_REV_RB64:
		case MDDB_REV_RB64FN:
			un64 = (mm_unit_t *)mddb_getrecaddr(recid);
			self_id = &(un64->c.un_self_id);
			parent_id = &(un64->c.un_parent);
			record_id = &(un64->c.un_record_id);
			optrec_id = &(un64->un_rr_dirty_recid);

			for (i = 0; i < un64->un_nsm; i++) {
				tmpdev = un64->un_sm[i].sm_dev;
				un64->un_sm[i].sm_dev = md_makedevice
				    (md_major, MD_MKMIN(setno, MD_MIN2UNIT
				    (md_getminor(tmpdev))));

				if (!md_update_minor(setno, mddb_getsidenum
				    (setno), un64->un_sm[i].sm_key))
				goto out;
			}
			break;
		}

		/*
		 * If this is a top level and a friendly name metadevice,
		 * update its minor in the namespace.
		 */
		if ((*parent_id == MD_NO_PARENT) &&
		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
		    (rbp->rb_revision == MDDB_REV_RB64FN))) {

			self_devt = md_makedevice(md_major, *self_id);
			if (!md_update_top_device_minor(setno,
			    mddb_getsidenum(setno), self_devt))
				goto out;
		}

		/*
		 * Update unit with the imported setno
		 *
		 */
		mddb_setrecprivate(recid, MD_PRV_GOTIT);

		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
		if (*parent_id != MD_NO_PARENT)
			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
		*record_id = MAKERECID(setno, DBID(*record_id));
		*optrec_id = MAKERECID(setno, DBID(*optrec_id));

		gotsomething = 1;
	}

out:
	return (gotsomething);
}

/*
 * NAME: mirror_check_offline
 *
 * DESCRIPTION: return offline_status = 1 if any submirrors are offline
 *
 * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is
 * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE
 * ioctl.
 */
int
mirror_check_offline(md_dev64_t dev, int *offline_status)
{
	mm_unit_t		*un;
	md_error_t		mde = mdnullerror;

	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
		return (EINVAL);
	*offline_status = 0;
	if (un->c.un_status & MD_UN_OFFLINE_SM)
		*offline_status = 1;
	return (0);
}

/*
 * NAME: mirror_inc_abr_count
 *
 * DESCRIPTION: increment the count of layered soft parts with ABR set
 *
 * Called from ioctl, so access to un_abr_count is protected by the global
 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
 */
int
mirror_inc_abr_count(md_dev64_t dev)
{
	mm_unit_t		*un;
	md_error_t		mde = mdnullerror;

	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
		return (EINVAL);
	un->un_abr_count++;
	return (0);
}

/*
 * NAME: mirror_dec_abr_count
 *
 * DESCRIPTION: decrement the count of layered soft parts with ABR set
 *
 * Called from ioctl, so access to un_abr_count is protected by the global
 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
 */
int
mirror_dec_abr_count(md_dev64_t dev)
{
	mm_unit_t		*un;
	md_error_t		mde = mdnullerror;

	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
		return (EINVAL);
	un->un_abr_count--;
	return (0);
}

static md_named_services_t mirror_named_services[] = {
	{(intptr_t (*)()) poke_hotspares,		"poke hotspares"    },
	{(intptr_t (*)()) mirror_rename_listkids,	MDRNM_LIST_URKIDS   },
	{mirror_rename_check,				MDRNM_CHECK	    },
	{(intptr_t (*)()) mirror_renexch_update_kids,	MDRNM_UPDATE_KIDS   },
	{(intptr_t (*)()) mirror_exchange_parent_update_to,
			MDRNM_PARENT_UPDATE_TO},
	{(intptr_t (*)()) mirror_exchange_self_update_from_down,
			MDRNM_SELF_UPDATE_FROM_DOWN },
	{(intptr_t (*)())mirror_probe_dev,		"probe open test" },
	{(intptr_t (*)())mirror_check_offline,		MD_CHECK_OFFLINE },
	{(intptr_t (*)())mirror_inc_abr_count,		MD_INC_ABR_COUNT },
	{(intptr_t (*)())mirror_dec_abr_count,		MD_DEC_ABR_COUNT },
	{ NULL,						0		    }
};

md_ops_t mirror_md_ops = {
	mirror_open,		/* open */
	mirror_close,		/* close */
	md_mirror_strategy,	/* strategy */
	NULL,			/* print */
	mirror_dump,		/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_mirror_ioctl,	/* mirror_ioctl, */
	mirror_snarf,		/* mirror_snarf */
	mirror_halt,		/* mirror_halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	mirror_imp_set,		/* import set */
	mirror_named_services
};

/* module specific initilization */
static void
init_init()
{
	md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);

	/* Initialize the parent and child save memory pools */
	mirror_parent_cache = kmem_cache_create("md_mirror_parent",
	    sizeof (md_mps_t), 0, mirror_parent_constructor,
	    mirror_parent_destructor, mirror_run_queue, NULL, NULL,
	    0);

	mirror_child_cache = kmem_cache_create("md_mirror_child",
	    sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
	    mirror_child_constructor, mirror_child_destructor,
	    mirror_run_queue, NULL, NULL, 0);

	/*
	 * Insure wowbuf_size is a multiple of DEV_BSIZE,
	 * then initialize wowbuf memory pool.
	 */
	md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
	if (md_wowbuf_size <= 0)
		md_wowbuf_size = 2 * DEV_BSIZE;
	if (md_wowbuf_size > (32 * DEV_BSIZE))
		md_wowbuf_size = (32 * DEV_BSIZE);

	md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
	mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
	    md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);

	mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);

	mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
}

/* module specific uninitilization (undo init_init()) */
static void
fini_uninit()
{
	kmem_cache_destroy(mirror_parent_cache);
	kmem_cache_destroy(mirror_child_cache);
	kmem_cache_destroy(mirror_wowblk_cache);
	mirror_parent_cache = mirror_child_cache =
	    mirror_wowblk_cache = NULL;

	mutex_destroy(&mirror_timeout.dr_mx);
	mutex_destroy(&hotspare_request.dr_mx);
	mutex_destroy(&non_ff_drv_mutex);
}

/* define the module linkage */
MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit())