mirror.c revision 8452:89d32dfdae6e
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <sys/param.h>
28#include <sys/systm.h>
29#include <sys/conf.h>
30#include <sys/file.h>
31#include <sys/user.h>
32#include <sys/uio.h>
33#include <sys/t_lock.h>
34#include <sys/buf.h>
35#include <sys/dkio.h>
36#include <sys/vtoc.h>
37#include <sys/kmem.h>
38#include <vm/page.h>
39#include <sys/cmn_err.h>
40#include <sys/sysmacros.h>
41#include <sys/types.h>
42#include <sys/mkdev.h>
43#include <sys/stat.h>
44#include <sys/open.h>
45#include <sys/modctl.h>
46#include <sys/ddi.h>
47#include <sys/sunddi.h>
48#include <sys/debug.h>
49#include <sys/dklabel.h>
50#include <vm/hat.h>
51#include <sys/lvm/mdvar.h>
52#include <sys/lvm/md_mirror.h>
53#include <sys/lvm/md_convert.h>
54#include <sys/lvm/md_mddb.h>
55#include <sys/esunddi.h>
56
57#include <sys/sysevent/eventdefs.h>
58#include <sys/sysevent/svm.h>
59#include <sys/lvm/mdmn_commd.h>
60#include <sys/avl.h>
61
62md_ops_t		mirror_md_ops;
63#ifndef	lint
64char			_depends_on[] = "drv/md";
65md_ops_t		*md_interface_ops = &mirror_md_ops;
66#endif
67
68extern mdq_anchor_t	md_done_daemon;
69extern mdq_anchor_t	md_mstr_daemon;
70extern mdq_anchor_t	md_mirror_daemon;
71extern mdq_anchor_t	md_mirror_io_daemon;
72extern mdq_anchor_t	md_mirror_rs_daemon;
73extern mdq_anchor_t	md_mhs_daemon;
74
75extern unit_t		md_nunits;
76extern set_t		md_nsets;
77extern md_set_t		md_set[];
78
79extern int		md_status;
80extern clock_t		md_hz;
81
82extern md_krwlock_t	md_unit_array_rw;
83extern kmutex_t		md_mx;
84extern kcondvar_t	md_cv;
85extern int		md_mtioctl_cnt;
86
87daemon_request_t	mirror_timeout;
88static daemon_request_t	hotspare_request;
89static daemon_request_t	mn_hs_request[MD_MAXSETS];	/* Multinode hs req */
90
91int	md_mirror_mcs_buf_off;
92
93/* Flags for mdmn_ksend_message to allow debugging */
94int	md_mirror_msg_flags;
95
96#ifdef DEBUG
97/* Flag to switch on debug messages */
98int	mirror_debug_flag = 0;
99#endif
100
101/*
102 * Struct used to hold the count of DMR reads and the timestamp of the last DMR read.
103 * It is used to verify, using a debugger, that the DMR read ioctl has been
104 * executed.
105 */
106dmr_stats_t	mirror_dmr_stats = {0, 0};
107
108/*
109 * Mutex protecting list of non-failfast drivers.
110 */
111static kmutex_t	non_ff_drv_mutex;
112extern char	**non_ff_drivers;
113
114extern major_t	md_major;
115
116/*
117 * Write-On-Write memory pool.
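 * Used by the mirror write path to handle writes whose source pages are
 * modified while the mirrored write is still outstanding (see
 * any_pages_dirty() and the md_mirror_wow_flg tunable below).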
118 */
119static void		copy_write_cont(wowhdr_t *wowhdr);
120static kmem_cache_t	*mirror_wowblk_cache = NULL;
121static int		md_wowbuf_size = 16384;
122static size_t		md_wowblk_size;
123
124/*
125 * md_mirror_wow_flg is a flag that allows:
126 *	- disabling the write-on-write mechanism.
127 *	- logging occurrences of write-on-write
128 *	- switching the write-on-write handling procedure
129 * md_mirror_wow_cnt counts the occurrences of write-on-write.
130 */
131static uint_t	md_mirror_wow_flg = 0;
132static int	md_mirror_wow_cnt = 0;
133
134/*
135 * Tunable to enable/disable dirty region
136 * processing when closing down a mirror.
137 */
138static int	new_resync = 1;
139kmem_cache_t	*mirror_parent_cache = NULL;
140kmem_cache_t	*mirror_child_cache = NULL;
141
142extern int	md_ff_disable;		/* disable failfast */
143
144static int	mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
145static void	mirror_read_strategy(buf_t *, int, void *);
146static void	mirror_write_strategy(buf_t *, int, void *);
147static void	become_owner(daemon_queue_t *);
148static int	mirror_done(struct buf *cb);
149static int	mirror_done_common(struct buf *cb);
150static void	clear_retry_error(struct buf *cb);
151
152/*
153 * patchables
154 */
155int	md_min_rr_size	= 200;	/* 200 blocks, or 100k */
156int	md_def_num_rr	= 1000;	/* Default number of dirty regions */
157
158/*
159 * patchable to change delay before rescheduling mirror ownership request.
160 * Value is in microseconds; the default is 0.5 seconds.
161 */
162clock_t	md_mirror_owner_to = 500000;
163
164/*ARGSUSED1*/
165static int
166mirror_parent_constructor(void *p, void *d1, int d2)
167{
168	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
169	return (0);
170}
171
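/*
 * Reinitialise a parent save structure obtained from the cache: clear the
 * fields that precede the mutex (the mutex itself persists for the lifetime
 * of the cached object) and reset the overlap-tree AVL linkage separately.
 */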
172static void
173mirror_parent_init(md_mps_t *ps)
174{
175	bzero(ps, offsetof(md_mps_t, ps_mx));
176	bzero(&ps->ps_overlap_node, sizeof (avl_node_t));
177}
178
179/*ARGSUSED1*/
180static void
181mirror_parent_destructor(void *p, void *d)
182{
183	mutex_destroy(&((md_mps_t *)p)->ps_mx);
184}
185
186/*ARGSUSED1*/
187static int
188mirror_child_constructor(void *p, void *d1, int d2)
189{
190	bioinit(&((md_mcs_t *)p)->cs_buf);
191	return (0);
192}
193
194void
195mirror_child_init(md_mcs_t *cs)
196{
197	cs->cs_ps = NULL;
198	cs->cs_mdunit = 0;
199	md_bioreset(&cs->cs_buf);
200}
201
202/*ARGSUSED1*/
203static void
204mirror_child_destructor(void *p, void *d)
205{
206	biofini(&((md_mcs_t *)p)->cs_buf);
207}
208
209static void
210mirror_wowblk_init(wowhdr_t *p)
211{
212	bzero(p, md_wowblk_size);
213}
214
215static void
216send_poke_hotspares_msg(daemon_request_t *drq)
217{
218	int			rval;
219	md_mn_msg_pokehsp_t	pokehsp;
220	md_mn_kresult_t		*kresult;
221	set_t			setno = (set_t)drq->dq.qlen;
222
223	pokehsp.pokehsp_setno = setno;
224
225	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
226	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
227	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
228	    sizeof (pokehsp), kresult);
229
230	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
231		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
232		/* If we're shutting down already, pause things here. */
233		if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
234			while (!md_mn_is_commd_present()) {
235				delay(md_hz);
236			}
237		}
238		cmn_err(CE_PANIC,
239		    "ksend_message failure: POKE_HOTSPARES");
240	}
241	kmem_free(kresult, sizeof (md_mn_kresult_t));
242
243	/* Allow further requests to use this set's queue structure */
244	mutex_enter(&drq->dr_mx);
245	drq->dr_pending = 0;
246	mutex_exit(&drq->dr_mx);
247}
248
249/*
250 * Send a poke_hotspares message to the master node. To avoid swamping the
251 * commd handler with requests we only send a message if there is not one
252 * already outstanding. We punt the request to a separate thread context as
253 * we cannot afford to block waiting on the request to be serviced. This is
254 * essential when a reconfig cycle is in progress as any open() of a multinode
255 * metadevice may result in a livelock.
256 */
257static void
258send_poke_hotspares(set_t setno)
259{
260	daemon_request_t	*drq = &mn_hs_request[setno];
261
262	mutex_enter(&drq->dr_mx);
263	if (drq->dr_pending == 0) {
264		drq->dr_pending = 1;
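		/*
		 * The set number is passed to send_poke_hotspares_msg()
		 * via the qlen field of the daemon request.
		 */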
265		drq->dq.qlen = (int)setno;
266		daemon_request(&md_mhs_daemon,
267		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
268	}
269	mutex_exit(&drq->dr_mx);
270}
271
272void
273mirror_set_sm_state(
274	mm_submirror_t		*sm,
275	mm_submirror_ic_t	*smic,
276	sm_state_t		newstate,
277	int			force)
278{
279	int			compcnt;
280	int			i;
281	int			errcnt;
282	sm_state_t		origstate;
283	md_m_shared_t		*shared;
284
285	if (force) {
286		sm->sm_state = newstate;
287		uniqtime32(&sm->sm_timestamp);
288		return;
289	}
290
291	origstate = newstate;
292
293	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
294	for (i = 0, errcnt = 0; i < compcnt; i++) {
295		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
296		    (sm->sm_dev, sm, i);
297		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
298			newstate |= SMS_COMP_ERRED;
299		if (shared->ms_state & (CS_RESYNC))
300			newstate |= SMS_COMP_RESYNC;
301		if (shared->ms_state & CS_ERRED)
302			errcnt++;
303	}
304
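	/*
	 * If the components show errors or resyncs, let the component-derived
	 * state override the requested state: drop the originally requested
	 * bits and keep only the component-derived ones.
	 */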
305	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
306		newstate &= ~origstate;
307
308	if (errcnt == compcnt)
309		newstate |= SMS_ALL_ERRED;
310	else
311		newstate &= ~SMS_ALL_ERRED;
312
313	sm->sm_state = newstate;
314	uniqtime32(&sm->sm_timestamp);
315}
316
317static int
318mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
319							int frm_probe)
320{
321	mm_submirror_t		*sm;
322	mm_submirror_ic_t	*smic;
323	md_m_shared_t		*shared;
324	int			ci;
325	int			i;
326	int			compcnt;
327	int			open_comp; /* flag for open component */
328
329	for (i = *smi; i < NMIRROR; i++) {
330		sm = &un->un_sm[i];
331		smic = &un->un_smic[i];
332
333		if (!SMS_IS(sm, SMS_INUSE))
334			continue;
335
336		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
337		for (ci = *cip; ci < compcnt; ci++) {
338			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
339			    (sm->sm_dev, sm, ci);
340			/*
341			 * If called from any routine but probe, we check for
342			 * the MDM_S_ISOPEN flag. Since probe does a pseudo open,
343			 * it sets the MDM_S_PROBEOPEN flag and we test for that
344			 * flag instead. The two tests are mutually exclusive.
345			 */
346			open_comp = (frm_probe) ?
347			    (shared->ms_flags & MDM_S_PROBEOPEN):
348			    (shared->ms_flags & MDM_S_ISOPEN);
349			if ((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
350			    ((shared->ms_state == CS_OKAY) ||
351			    (shared->ms_state == CS_RESYNC))) {
352				if (clr_error) {
353					shared->ms_flags &= ~MDM_S_IOERR;
354				}
355				*cip = ci;
356				*smi = i;
357				return (1);
358			}
359
360			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
361				shared->ms_flags &= ~MDM_S_IOERR;
362			}
363		}
364
365		*cip = 0;
366	}
367	return (0);
368}
369
370/*ARGSUSED*/
371static void
372mirror_run_queue(void *d)
373{
374	if (!(md_status & MD_GBL_DAEMONS_LIVE))
375		md_daemon(1, &md_done_daemon);
376}
377/*
378 * check_comp_4_hotspares
379 *
380 * This function attempts to allocate a hotspare for this component if the
381 * component is in error. In a MN set, the function can be called in 2 modes.
382 * It can be called either when a component error has been detected or when a
383 * new hotspare has been allocated. In either case, MD_HOTSPARE_XMIT is set
384 * in flags and the request is sent to all nodes.
385 * The handler on each of the nodes then calls this function with
386 * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
387 *
388 * For non-MN sets the function simply attempts to allocate a hotspare.
389 *
390 * On entry, the following locks are held
391 *	mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
392 *	md_unit_writerlock
393 *
394 * Returns	0 if ok
395 *		1 if the unit containing the component has been cleared while
396 *		  the mdmn_ksend_message() was being executed
397 */
398extern int
399check_comp_4_hotspares(
400	mm_unit_t	*un,
401	int		smi,
402	int		ci,
403	uint_t		flags,
404	mddb_recid_t	hs_id,	/* Only used by MN disksets */
405	IOLOCK		*lockp	/* can be NULL */
406)
407{
408	mm_submirror_t		*sm;
409	mm_submirror_ic_t	*smic;
410	md_m_shared_t		*shared;
411	mddb_recid_t		recids[6];
412	minor_t			mnum;
413	intptr_t		(*hs_dev)();
414	void			(*hs_done)();
415	void			*hs_data;
416	md_error_t		mde = mdnullerror;
417	set_t			setno;
418	md_mn_msg_allochsp_t	allochspmsg;
419	md_mn_kresult_t		*kresult;
420	mm_unit_t		*new_un;
421	int			rval;
422
423	mnum = MD_SID(un);
424	setno = MD_UN2SET(un);
425	sm = &un->un_sm[smi];
426	smic = &un->un_smic[smi];
427	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
428	    (sm->sm_dev, sm, ci);
429
430	if (shared->ms_state != CS_ERRED)
431		return (0);
432
433	/* Don't start a new component resync if a resync is already running. */
434	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
435		return (0);
436
437	if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
438		uint_t		msgflags;
439		md_mn_msgtype_t	msgtype;
440
441		/* Send allocate hotspare message to all nodes */
442
443		allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
444		allochspmsg.msg_allochsp_sm = smi;
445		allochspmsg.msg_allochsp_comp = ci;
446		allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;
447
448		/*
449		 * Before calling mdmn_ksend_message(), release the locks.
450		 * Can never be in the context of an ioctl.
451		 */
452		md_unit_writerexit(MDI_UNIT(mnum));
453		if (flags & MD_HOTSPARE_LINKHELD)
454			rw_exit(&mirror_md_ops.md_link_rw.lock);
455#ifdef DEBUG
456		if (mirror_debug_flag)
457			printf("send alloc hotspare, flags="
458			    "0x%x %x, %x, %x, %x\n", flags,
459			    allochspmsg.msg_allochsp_mnum,
460			    allochspmsg.msg_allochsp_sm,
461			    allochspmsg.msg_allochsp_comp,
462			    allochspmsg.msg_allochsp_hs_id);
463#endif
464		if (flags & MD_HOTSPARE_WMUPDATE) {
465			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE2;
466			/*
467			 * When coming from an update of watermarks, there
468			 * must already be a message logged that triggered
469			 * this action. So, no need to log this message, too.
470			 */
471			msgflags = MD_MSGF_NO_LOG;
472		} else {
473			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE;
474			msgflags = MD_MSGF_DEFAULT_FLAGS;
475		}
476
477		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
478		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
479		    (char *)&allochspmsg, sizeof (allochspmsg),
480		    kresult);
481
482		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
483#ifdef DEBUG
484			if (mirror_debug_flag)
485				mdmn_ksend_show_error(rval, kresult,
486				    "ALLOCATE HOTSPARE");
487#endif
488			/*
489			 * If the message is sent ok but the exitval indicates
490			 * an error, it must be because the mirror has been
491			 * cleared. In this case, re-obtain the lock and return
491			 * an error.
492			 */
493			if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
494				if (flags & MD_HOTSPARE_LINKHELD) {
495					rw_enter(&mirror_md_ops.md_link_rw.lock,
496					    RW_READER);
497				}
498				kmem_free(kresult, sizeof (md_mn_kresult_t));
499				return (1);
500			}
501			/* If we're shutting down already, pause things here. */
502			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
503				while (!md_mn_is_commd_present()) {
504					delay(md_hz);
505				}
506			}
507			cmn_err(CE_PANIC,
508			    "ksend_message failure: ALLOCATE_HOTSPARE");
509		}
510		kmem_free(kresult, sizeof (md_mn_kresult_t));
511
512		/*
513		 * re-obtain the locks
514		 */
515		if (flags & MD_HOTSPARE_LINKHELD)
516			rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
517		new_un = md_unit_writerlock(MDI_UNIT(mnum));
518
519		/*
520		 * As we had to release the locks in order to send the
521		 * message to all nodes, we need to check to see if the
522		 * unit has changed. If it has, we release the writerlock
523		 * and return failure.
524		 */
525		if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
526			md_unit_writerexit(MDI_UNIT(mnum));
527			return (1);
528		}
529	} else {
530		if (MD_MNSET_SETNO(setno)) {
531			/*
532			 * If 2 or more nodes simultaneously see a
533			 * component failure, these nodes will each
534			 * send an ALLOCATE_HOTSPARE[2] message.
535			 * The first message will allocate the hotspare
536			 * and the subsequent messages should do nothing.
537			 *
538			 * If a slave node doesn't have a hotspare allocated
539			 * at the time the message is initiated, then the
540			 * passed in hs_id will be 0.  If the node
541			 * executing this routine has a component shared
542			 * ms_hs_id of non-zero, but the message shows a
543			 * hs_id of 0, then just return since a hotspare
544			 * has already been allocated for this failing
545			 * component.  When the slave node returns from
546			 * the ksend_message the hotspare will have
547			 * already been allocated.
548			 *
549			 * If the slave node does send an hs_id of non-zero,
550			 * and the slave node's hs_id matches this node's
551			 * ms_hs_id, then the hotspare has error'd and
552			 * should be replaced.
553			 *
554			 * If the slave node sends an hs_id of non-zero and
555			 * this node has a different shared ms_hs_id, then
556			 * just return since this hotspare has already
557			 * been hotspared.
558			 */
559			if (shared->ms_hs_id != 0) {
560				if (hs_id == 0) {
561#ifdef DEBUG
562					if (mirror_debug_flag) {
563						printf("check_comp_4_hotspares"
564						    "(NOXMIT), short circuit "
565						    "hs_id=0x%x, "
566						    "ms_hs_id=0x%x\n",
567						    hs_id, shared->ms_hs_id);
568					}
569#endif
570					return (0);
571				}
572				if (hs_id != shared->ms_hs_id) {
573#ifdef DEBUG
574					if (mirror_debug_flag) {
575						printf("check_comp_4_hotspares"
576						    "(NOXMIT), short circuit2 "
577						    "hs_id=0x%x, "
578						    "ms_hs_id=0x%x\n",
579						    hs_id, shared->ms_hs_id);
580					}
581#endif
582					return (0);
583				}
584			}
585		}
586
587		sm = &un->un_sm[smi];
588		hs_dev = md_get_named_service(sm->sm_dev, 0,
589		    "hotspare device", 0);
590		if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
591		    &hs_data) != 0)
592			return (0);
593
594		/*
595		 * set_sm_comp_state() commits the modified records.
596		 * As we don't transmit the changes, no need to drop the lock.
597		 */
598		set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
599		    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
600
601		(*hs_done)(sm->sm_dev, hs_data);
602
603		mirror_check_failfast(mnum);
604
605		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
606		    setno, MD_SID(un));
607
608		/*
609		 * For a multi-node set we need to reset the un_rs_type,
610		 * un_rs_resync_done and un_rs_resync_2_do fields as the
611		 * hot-spare resync must copy all applicable data.
612		 */
613		if (MD_MNSET_SETNO(setno)) {
614			un->un_rs_type = MD_RS_NONE;
615			un->un_rs_resync_done = 0;
616			un->un_rs_resync_2_do = 0;
617		}
618
619		/*
620		 * Must drop writer lock since mirror_resync_unit will
621		 * open devices and must be able to grab readerlock.
622		 * Don't need to drop the IOLOCK since any descendant routines
623		 * calling ksend_message will drop the IOLOCK as needed.
624		 *
625		 */
626		if (lockp) {
627			md_ioctl_writerexit(lockp);
628		} else {
629			md_unit_writerexit(MDI_UNIT(mnum));
630		}
631
632		/* start resync */
633		(void) mirror_resync_unit(mnum, NULL, &mde, lockp);
634
635		if (lockp) {
636			new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
637		} else {
638			new_un = md_unit_writerlock(MDI_UNIT(mnum));
639		}
640	}
641	return (0);
642}
643
644/*
645 * check_unit_4_hotspares
646 *
647 * For a given mirror, allocate hotspares, if available, for any components
648 * that are in error.
649 *
650 * Returns	0 if ok
651 *		1 if check_comp_4_hotspares returns non-zero. This will only
652 *		  happen for a MN unit where the unit has been cleared while
653 *		  the allocate hotspare message is sent to all nodes.
654 */
655static int
656check_unit_4_hotspares(mm_unit_t *un, int flags)
657{
658	mm_submirror_t		*sm;
659	mm_submirror_ic_t	*smic;
660	int			ci;
661	int			i;
662	int			compcnt;
663
664	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
665		return (0);
666
667	for (i = 0; i < NMIRROR; i++) {
668		sm = &un->un_sm[i];
669		smic = &un->un_smic[i];
670		if (!SMS_IS(sm, SMS_INUSE))
671			continue;
672		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
673		for (ci = 0; ci < compcnt; ci++) {
674			md_m_shared_t		*shared;
675
676			shared = (md_m_shared_t *)
677			    (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
678			/*
679			 * Never called from ioctl context, so pass in
680			 * (IOLOCK *)NULL.  Pass through flags from calling
681			 * routine, also setting XMIT flag.
682			 */
683			if (check_comp_4_hotspares(un, i, ci,
684			    (MD_HOTSPARE_XMIT | flags),
685			    shared->ms_hs_id, (IOLOCK *)NULL) != 0)
686				return (1);
687		}
688	}
689	return (0);
690}
691
692static void
693check_4_hotspares(daemon_request_t *drq)
694{
695	mdi_unit_t	*ui;
696	mm_unit_t	*un;
697	md_link_t	*next;
698	int		x;
699
700	mutex_enter(&drq->dr_mx);	/* clear up front so can poke */
701	drq->dr_pending = 0;		/* again in low level routine if */
702	mutex_exit(&drq->dr_mx);	/* something found to do	*/
703
704	/*
705	 * Used to have a problem here. The disksets weren't marked as being
706	 * MNHOLD. This opened a window where we could be searching for
707	 * hotspares and have the disk set unloaded (released) from under
708	 * us causing a panic in stripe_component_count().
709	 * The way to prevent that is to mark the set MNHOLD which prevents
710	 * any diskset from being released while we are scanning the mirrors,
711	 * submirrors and components.
712	 */
713
714	for (x = 0; x < md_nsets; x++)
715		md_holdset_enter(x);
716
717	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
718	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
719		ui = MDI_UNIT(next->ln_id);
720
721		un = (mm_unit_t *)md_unit_readerlock(ui);
722
723		/*
724		 * Only check the unit if we are the master for this set.
725		 * For an MN set, poke_hotspares() is only effective on the
726		 * master.
727		 */
728		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
729		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
730			md_unit_readerexit(ui);
731			continue;
732		}
733		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
734			md_unit_readerexit(ui);
735			continue;
736		}
737		md_unit_readerexit(ui);
738
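		/*
		 * Re-acquire the unit as writer: the hotspare allocation path
		 * (check_comp_4_hotspares()) expects md_unit_writerlock held.
		 */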
739		un = (mm_unit_t *)md_unit_writerlock(ui);
740		/*
741		 * check_unit_4_hotspares will return 1 if the unit has been
742		 * removed during the process of allocating the hotspare.
743		 * This can only happen for a MN metadevice. If the unit no
744		 * longer exists, there is no need to release the writerlock.
745		 */
746		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
747			md_unit_writerexit(ui);
748		else {
749			/*
750			 * If check_unit_4_hotspares failed, queue another
751			 * request and break out of this one
752			 */
753			(void) poke_hotspares();
754			break;
755		}
756	}
757	rw_exit(&mirror_md_ops.md_link_rw.lock);
758
759	for (x = 0; x < md_nsets; x++)
760		md_holdset_exit(x);
761}
762
763/*
764 * poke_hotspares
765 *
766 * If there is not already a poke_hotspares request pending, queue a request
767 * to call check_4_hotspares(). This will scan all mirrors and attempt to
768 * allocate hotspares for all components in error.
769 */
770int
771poke_hotspares()
772{
773	mutex_enter(&hotspare_request.dr_mx);
774	if (hotspare_request.dr_pending == 0) {
775		hotspare_request.dr_pending = 1;
776		daemon_request(&md_mhs_daemon,
777		    check_4_hotspares, (daemon_queue_t *)&hotspare_request,
778		    REQ_OLD);
779	}
780	mutex_exit(&hotspare_request.dr_mx);
781	return (0);
782}
783
784static void
785free_all_ecomps(err_comp_t *ecomp)
786{
787	err_comp_t	*d;
788
789	while (ecomp != NULL) {
790		d = ecomp;
791		ecomp = ecomp->ec_next;
792		kmem_free(d, sizeof (err_comp_t));
793	}
794}
795
796/*
797 * NAME: mirror_openfail_console_info
798 *
799 * DESCRIPTION: Prints an informative message to the console when the mirror
800 *		cannot be opened.
801 *
802 * PARAMETERS: mm_unit_t	un - pointer to mirror unit structure
803 *	       int		smi - submirror index
804 *	       int		ci - component index
805 */
806
807void
808mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
809{
810	void (*get_dev)();
811	ms_cd_info_t cd;
812	md_dev64_t tmpdev;
813
814	tmpdev = un->un_sm[smi].sm_dev;
815	get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
816	if (get_dev != NULL) {
817		(void) (*get_dev)(tmpdev, smi, ci, &cd);
818		cmn_err(CE_WARN, "md %s: open error on %s",
819		    md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un),
820		    cd.cd_dev, NULL, 0));
821	} else {
822		cmn_err(CE_WARN, "md %s: open error",
823		    md_shortname(MD_SID(un)));
824	}
825}
826
827static int
828mirror_close_all_devs(mm_unit_t *un, int md_cflags)
829{
830	int i;
831	md_dev64_t dev;
832
833	for (i = 0; i < NMIRROR; i++) {
834		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
835			continue;
836		dev = un->un_sm[i].sm_dev;
837		md_layered_close(dev, md_cflags);
838	}
839	return (0);
840}
841
842/*
843 * Keep track of drivers that don't support failfast.  We use this so that
844 * we only log one diagnostic message for each of these drivers, no matter
845 * how many times we run the mirror_check_failfast function.
846 * Return 1 if this is a new driver that does not support failfast,
847 * return 0 if we have already seen this non-failfast driver.
848 */
849static int
850new_non_ff_driver(const char *s)
851{
852	mutex_enter(&non_ff_drv_mutex);
853	if (non_ff_drivers == NULL) {
854		non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
855		    KM_NOSLEEP);
856		if (non_ff_drivers == NULL) {
857			mutex_exit(&non_ff_drv_mutex);
858			return (1);
859		}
860
861		non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1,
862		    KM_NOSLEEP);
863		if (non_ff_drivers[0] == NULL) {
864			kmem_free(non_ff_drivers, 2 * sizeof (char *));
865			non_ff_drivers = NULL;
866			mutex_exit(&non_ff_drv_mutex);
867			return (1);
868		}
869
870		(void) strcpy(non_ff_drivers[0], s);
871		non_ff_drivers[1] = NULL;
872
873	} else {
874		int i;
875		char **tnames;
876		char **tmp;
877
878		for (i = 0; non_ff_drivers[i] != NULL; i++) {
879			if (strcmp(s, non_ff_drivers[i]) == 0) {
880				mutex_exit(&non_ff_drv_mutex);
881				return (0);
882			}
883		}
884
885		/* allow for new element and null */
886		i += 2;
887		tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
888		if (tnames == NULL) {
889			mutex_exit(&non_ff_drv_mutex);
890			return (1);
891		}
892
893		for (i = 0; non_ff_drivers[i] != NULL; i++)
894			tnames[i] = non_ff_drivers[i];
895
896		tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
897		if (tnames[i] == NULL) {
898			/* adjust i so that it is the right count to free */
899			kmem_free(tnames, (i + 2) * sizeof (char *));
900			mutex_exit(&non_ff_drv_mutex);
901			return (1);
902		}
903
904		(void) strcpy(tnames[i++], s);
905		tnames[i] = NULL;
906
907		tmp = non_ff_drivers;
908		non_ff_drivers = tnames;
909		/* i now represents the count we previously alloced */
910		kmem_free(tmp, i * sizeof (char *));
911	}
912	mutex_exit(&non_ff_drv_mutex);
913
914	return (1);
915}
916
917/*
918 * Check for the "ddi-failfast-supported" devtree property on each submirror
919 * component to indicate if we should do I/O to that submirror with the
920 * B_FAILFAST flag set or not.  This check is made at various state transitions
921 * in the mirror code (e.g. open, enable, hotspare, etc.).  Sometimes we
922 * only need to check one drive (e.g. hotspare) but since the check is
923 * fast and infrequent and sometimes needs to be done on all components we
924 * just check all components on each call.
925 */
926void
927mirror_check_failfast(minor_t mnum)
928{
929	int		i;
930	mm_unit_t	*un;
931
932	if (md_ff_disable)
933		return;
934
935	un = MD_UNIT(mnum);
936
937	for (i = 0; i < NMIRROR; i++) {
938		int			ci;
939		int			cnt;
940		int			ff = 1;
941		mm_submirror_t		*sm;
942		mm_submirror_ic_t	*smic;
943		void			(*get_dev)();
944
945		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
946			continue;
947
948		sm = &un->un_sm[i];
949		smic = &un->un_smic[i];
950
951		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
952		    "get device", 0);
953
954		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
955		for (ci = 0; ci < cnt; ci++) {
956			int		found = 0;
957			dev_t		ci_dev;
958			major_t		major;
959			dev_info_t	*devi;
960			ms_cd_info_t	cd;
961
962			/*
963			 * this already returns the hs
964			 * dev if the device is spared
965			 */
966			(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
967
968			ci_dev = md_dev64_to_dev(cd.cd_dev);
969			major = getmajor(ci_dev);
970
971			if (major == md_major) {
972				/*
973				 * this component must be a soft
974				 * partition; get the real dev
975				 */
976				minor_t	dev_mnum;
977				mdi_unit_t	*ui;
978				mp_unit_t	*un;
979				set_t	setno;
980				side_t	side;
981				md_dev64_t	tmpdev;
982
983				ui = MDI_UNIT(getminor(ci_dev));
984
985				/* grab necessary lock */
986				un = (mp_unit_t *)md_unit_readerlock(ui);
987
988				dev_mnum = MD_SID(un);
989				setno = MD_MIN2SET(dev_mnum);
990				side = mddb_getsidenum(setno);
991
992				tmpdev = un->un_dev;
993
994				/* Get dev by device id */
995				if (md_devid_found(setno, side,
996				    un->un_key) == 1) {
997					tmpdev = md_resolve_bydevid(dev_mnum,
998					    tmpdev, un->un_key);
999				}
1000
1001				md_unit_readerexit(ui);
1002
1003				ci_dev = md_dev64_to_dev(tmpdev);
1004				major = getmajor(ci_dev);
1005			}
1006
1007			if (ci_dev != NODEV32 &&
1008			    (devi = e_ddi_hold_devi_by_dev(ci_dev, 0))
1009			    != NULL) {
1010				ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
1011				int		propvalue = 0;
1012				int		proplength = sizeof (int);
1013				int		error;
1014				struct cb_ops	*cb;
1015
1016				if ((cb = devopsp[major]->devo_cb_ops) !=
1017				    NULL) {
1018					error = (*cb->cb_prop_op)
1019					    (DDI_DEV_T_ANY, devi, prop_op,
1020					    DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
1021					    "ddi-failfast-supported",
1022					    (caddr_t)&propvalue, &proplength);
1023
1024					if (error == DDI_PROP_SUCCESS)
1025						found = 1;
1026				}
1027
1028				if (!found && new_non_ff_driver(
1029				    ddi_driver_name(devi))) {
1030					cmn_err(CE_NOTE, "!md: B_FAILFAST I/O "
1031					    "disabled on %s",
1032					    ddi_driver_name(devi));
1033				}
1034
1035				ddi_release_devi(devi);
1036			}
1037
1038			/*
1039			 * All components must support
1040			 * failfast in the submirror.
1041			 */
1042			if (!found) {
1043				ff = 0;
1044				break;
1045			}
1046		}
1047
1048		if (ff) {
1049			sm->sm_flags |= MD_SM_FAILFAST;
1050		} else {
1051			sm->sm_flags &= ~MD_SM_FAILFAST;
1052		}
1053	}
1054}
1055
1056/*
1057 * Return true if the submirror is unavailable.
1058 * If any of the submirror components are opened then the submirror cannot
1059 * be unavailable (MD_INACCESSIBLE).
1060 * If any of the components are already in the errored state, then the submirror
1061 * cannot be unavailable (MD_INACCESSIBLE).
1062 */
1063static bool_t
1064submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
1065{
1066	mm_submirror_t		*sm;
1067	mm_submirror_ic_t	*smic;
1068	md_m_shared_t		*shared;
1069	int			ci;
1070	int			compcnt;
1071
1072	sm = &un->un_sm[smi];
1073	smic = &un->un_smic[smi];
1074
1075	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
1076	for (ci = 0; ci < compcnt; ci++) {
1077		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
1078		    (sm->sm_dev, sm, ci);
1079		if (from_probe) {
1080			if (shared->ms_flags & MDM_S_PROBEOPEN)
1081				return (B_FALSE);
1082		} else {
1083			if (shared->ms_flags & MDM_S_ISOPEN)
1084				return (B_FALSE);
1085		}
1086		if (shared->ms_state == CS_ERRED ||
1087		    shared->ms_state == CS_LAST_ERRED)
1088			return (B_FALSE);
1089	}
1090
1091	return (B_TRUE);
1092}
1093
1094static int
1095mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
1096{
1097	int		i;
1098	mm_unit_t	*un;
1099	mdi_unit_t	*ui;
1100	int		err;
1101	int		smi;
1102	int		ci;
1103	err_comp_t	*c;
1104	err_comp_t	*ecomps = NULL;
1105	int		smmask = 0;
1106	set_t		setno;
1107	int		sm_cnt;
1108	int		sm_unavail_cnt;
1109
1110	mirror_check_failfast(mnum);
1111
1112	un = MD_UNIT(mnum);
1113	ui = MDI_UNIT(mnum);
1114	setno = MD_UN2SET(un);
1115
1116	for (i = 0; i < NMIRROR; i++) {
1117		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1118
1119		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1120			continue;
1121		if (md_layered_open(mnum, &tmpdev, md_oflags))
1122			smmask |= SMI2BIT(i);
1123		un->un_sm[i].sm_dev = tmpdev;
1124	}
1125
1126	/*
1127	 * If smmask is clear, all submirrors are accessible. Clear the
1128	 * MD_INACCESSIBLE bit in this case.  This bit is also cleared for the
1129	 * mirror device.   If smmask is set, we have to determine which of the
1130	 * submirrors are in error. If no submirror is accessible we mark the
1131	 * whole mirror as MD_INACCESSIBLE.
1132	 */
1133	if (smmask == 0) {
1134		if (lockp) {
1135			md_ioctl_readerexit(lockp);
1136			(void) md_ioctl_writerlock(lockp, ui);
1137		} else {
1138			md_unit_readerexit(ui);
1139			(void) md_unit_writerlock(ui);
1140		}
1141		ui->ui_tstate &= ~MD_INACCESSIBLE;
1142		if (lockp) {
1143			md_ioctl_writerexit(lockp);
1144			(void) md_ioctl_readerlock(lockp, ui);
1145		} else {
1146			md_unit_writerexit(ui);
1147			(void) md_unit_readerlock(ui);
1148		}
1149
1150		for (i = 0; i < NMIRROR; i++) {
1151			md_dev64_t	tmpdev;
1152			mdi_unit_t	*sm_ui;
1153
1154			if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1155				continue;
1156
1157			tmpdev = un->un_sm[i].sm_dev;
1158			sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1159			(void) md_unit_writerlock(sm_ui);
1160			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1161			md_unit_writerexit(sm_ui);
1162		}
1163
1164		return (0);
1165	}
1166
1167	for (i = 0; i < NMIRROR; i++) {
1168		md_dev64_t tmpdev;
1169
1170		if (!(smmask & SMI2BIT(i)))
1171			continue;
1172
1173		tmpdev = un->un_sm[i].sm_dev;
1174		err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
1175		un->un_sm[i].sm_dev = tmpdev;
1176		ASSERT(err == 0);
1177	}
1178
1179	if (lockp) {
1180		md_ioctl_readerexit(lockp);
1181		un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
1182	} else {
1183		md_unit_readerexit(ui);
1184		un = (mm_unit_t *)md_unit_writerlock(ui);
1185	}
1186
1187	/*
1188	 * We want to make sure the unavailable flag is not masking a real
1189	 * error on the submirror.
1190	 * For each submirror,
1191	 *    if all of the submirror components couldn't be opened and there
1192	 *    are no errors on the submirror, then set the unavailable flag;
1193	 *    otherwise, clear it.
1194	 */
1195	sm_cnt = 0;
1196	sm_unavail_cnt = 0;
1197	for (i = 0; i < NMIRROR; i++) {
1198		md_dev64_t	tmpdev;
1199		mdi_unit_t	*sm_ui;
1200
1201		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1202			continue;
1203
1204		sm_cnt++;
1205		tmpdev = un->un_sm[i].sm_dev;
1206		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1207
1208		(void) md_unit_writerlock(sm_ui);
1209		if (submirror_unavailable(un, i, 0)) {
1210			sm_ui->ui_tstate |= MD_INACCESSIBLE;
1211			sm_unavail_cnt++;
1212		} else {
1213			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1214		}
1215		md_unit_writerexit(sm_ui);
1216	}
1217
1218	/*
1219	 * If all of the submirrors are unavailable, the mirror is also
1220	 * unavailable.
1221	 */
1222	if (sm_cnt == sm_unavail_cnt) {
1223		ui->ui_tstate |= MD_INACCESSIBLE;
1224	} else {
1225		ui->ui_tstate &= ~MD_INACCESSIBLE;
1226	}
1227
1228	smi = 0;
1229	ci = 0;
1230	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
1231		if (mirror_other_sources(un, smi, ci, 1) == 1) {
1232
1233			free_all_ecomps(ecomps);
1234			(void) mirror_close_all_devs(un, md_oflags);
1235			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
1236			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1237			mirror_openfail_console_info(un, smi, ci);
1238			if (lockp) {
1239				md_ioctl_writerexit(lockp);
1240				(void) md_ioctl_readerlock(lockp, ui);
1241			} else {
1242				md_unit_writerexit(ui);
1243				(void) md_unit_readerlock(ui);
1244			}
1245			return (ENXIO);
1246		}
1247
1248		/* track all component states that need changing */
1249		c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
1250		c->ec_next = ecomps;
1251		c->ec_smi = smi;
1252		c->ec_ci = ci;
1253		ecomps = c;
1254		ci++;
1255	}
1256
1257	/* Make all state changes and commit them */
1258	for (c = ecomps; c != NULL; c = c->ec_next) {
1259		/*
1260		 * If lockp is set, then entering kernel through ioctl.
1261		 * For a MN set, the only ioctl path is via a commd message
1262		 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already
1263		 * being sent to each node.
1264		 * In this case, set NO_XMIT so that set_sm_comp_state
1265		 * won't attempt to send a message while processing a message.
1266		 *
1267		 * In !MN sets, the xmit flag is ignored, so it doesn't matter
1268		 * which flag is passed.
1269		 */
1270		if (lockp) {
1271			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1272			    MD_STATE_NO_XMIT, lockp);
1273		} else {
1274			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1275			    (MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
1276		}
1277		/*
1278		 * For a MN set, the NOTIFY is done when the state change is
1279		 * processed on each node
1280		 */
1281		if (!MD_MNSET_SETNO(setno)) {
1282			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
1283			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1284		}
1285	}
1286
1287	if (lockp) {
1288		md_ioctl_writerexit(lockp);
1289		(void) md_ioctl_readerlock(lockp, ui);
1290	} else {
1291		md_unit_writerexit(ui);
1292		(void) md_unit_readerlock(ui);
1293	}
1294
1295	free_all_ecomps(ecomps);
1296
1297	/* allocate hotspares for all errored components */
1298	if (MD_MNSET_SETNO(setno)) {
1299		/*
1300		 * If we're called from an ioctl (lockp set) then we cannot
1301		 * directly call send_poke_hotspares as this will block until
1302		 * the message gets despatched to all nodes. If the cluster is
1303		 * going through a reconfig cycle then the message will block
1304		 * until the cycle is complete, and as we originate from a
1305		 * service call from commd we will livelock.
1306		 */
1307		if (lockp == NULL) {
1308			md_unit_readerexit(ui);
1309			send_poke_hotspares(setno);
1310			(void) md_unit_readerlock(ui);
1311		}
1312	} else {
1313		(void) poke_hotspares();
1314	}
1315	return (0);
1316}
1317
1318void
1319mirror_overlap_tree_remove(md_mps_t *ps)
1320{
1321	mm_unit_t	*un;
1322
1323	if (panicstr)
1324		return;
1325
1326	VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP);
1327	un = ps->ps_un;
1328
1329	mutex_enter(&un->un_overlap_tree_mx);
1330	avl_remove(&un->un_overlap_root, ps);
1331	ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
1332	if (un->un_overlap_tree_flag != 0) {
1333		un->un_overlap_tree_flag = 0;
1334		cv_broadcast(&un->un_overlap_tree_cv);
1335	}
1336	mutex_exit(&un->un_overlap_tree_mx);
1337}
1338
1339
1340/*
1341 * wait_for_overlaps:
1342 * -----------------
1343 * Check that the given i/o request does not overlap with already-pending
1344 * i/o. If it does, block until the overlapped i/o completes.
1345 *
1346 * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
1347 * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if
1348 * it must not already be in the tree.
1349 */
1350static void
1351wait_for_overlaps(md_mps_t *ps, int flags)
1352{
1353	mm_unit_t	*un;
1354	avl_index_t	where;
1355	md_mps_t	*ps1;
1356
1357	if (panicstr)
1358		return;
1359
1360	un = ps->ps_un;
1361	mutex_enter(&un->un_overlap_tree_mx);
1362	if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
1363	    (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
1364		mutex_exit(&un->un_overlap_tree_mx);
1365		return;
1366	}
1367
1368	VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1369
1370	do {
1371		ps1 = avl_find(&un->un_overlap_root, ps, &where);
1372		if (ps1 == NULL) {
1373			/*
1374			 * The candidate range does not overlap with any
1375			 * range in the tree.  Insert it and be done.
1376			 */
1377			avl_insert(&un->un_overlap_root, ps, where);
1378			ps->ps_flags |= MD_MPS_ON_OVERLAP;
1379		} else {
1380			/*
1381			 * The candidate range would overlap.  Set the flag
1382			 * indicating we need to be woken up, and sleep
1383			 * until another thread removes a range.  If upon
1384			 * waking up we find this mps was put on the tree
1385			 * by another thread, the loop terminates.
1386			 */
1387			un->un_overlap_tree_flag = 1;
1388			cv_wait(&un->un_overlap_tree_cv,
1389			    &un->un_overlap_tree_mx);
1390		}
1391	} while (!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1392	mutex_exit(&un->un_overlap_tree_mx);
1393}
1394
1395/*
1396 * This function is called from mirror_done to check whether any pages have
1397 * been modified while a mirrored write was in progress.  Returns 0 if
1398 * all pages associated with bp are clean, 1 otherwise.
1399 */
1400static int
1401any_pages_dirty(struct buf *bp)
1402{
1403	int	rval;
1404
1405	rval = biomodified(bp);
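	/*
	 * biomodified() returns -1 if the buffer's pages are not mapped in;
	 * treat that case as "no pages were modified".
	 */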
1406	if (rval == -1)
1407		rval = 0;
1408
1409	return (rval);
1410}
1411
1412#define	MAX_EXTRAS 10
1413
1414void
1415mirror_commit(
1416	mm_unit_t	*un,
1417	int		smmask,
1418	mddb_recid_t	*extras
1419)
1420{
1421	mm_submirror_t		*sm;
1422	md_unit_t		*su;
1423	int			i;
1424
1425	/* 2=mirror,null id */
1426	mddb_recid_t		recids[NMIRROR+2+MAX_EXTRAS];
1427
1428	int			ri = 0;
1429
1430	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
1431		return;
1432
1433	/* Add two, this includes the mirror unit and the null recid */
1434	if (extras != NULL) {
1435		int	nrecids = 0;
1436		while (extras[nrecids] != 0) {
1437			nrecids++;
1438		}
1439		ASSERT(nrecids <= MAX_EXTRAS);
1440	}
1441
1442	if (un != NULL)
1443		recids[ri++] = un->c.un_record_id;
1444	for (i = 0;  i < NMIRROR; i++) {
1445		if (!(smmask & SMI2BIT(i)))
1446			continue;
1447		sm = &un->un_sm[i];
1448		if (!SMS_IS(sm, SMS_INUSE))
1449			continue;
1450		if (md_getmajor(sm->sm_dev) != md_major)
1451			continue;
1452		su =  MD_UNIT(md_getminor(sm->sm_dev));
1453		recids[ri++] = su->c.un_record_id;
1454	}
1455
1456	if (extras != NULL)
1457		while (*extras != 0) {
1458			recids[ri++] = *extras;
1459			extras++;
1460		}
1461
1462	if (ri == 0)
1463		return;
1464	recids[ri] = 0;
1465
1466	/*
1467	 * Ok to hold ioctl lock across record commit to mddb as
1468	 * long as the record(s) being committed aren't resync records.
1469	 */
1470	mddb_commitrecs_wrapper(recids);
1471}
1472
1473
1474/*
1475 * This routine builds a bitmap with a bit set for each writable
1476 * submirror of the mirror and stores it in ps->ps_writable_sm.
1477 * The number of writable submirrors is stored in ps->ps_active_cnt
1478 * and the current submirror index (ps->ps_current_sm) is reset to
1479 * zero, ready for the write to be issued to each writable submirror
1480 * in turn.
1481 */
1482
1483static void
1484select_write_units(struct mm_unit *un, md_mps_t *ps)
1485{
1486
1487	int		i;
1488	unsigned	writable_bm = 0;
1489	unsigned	nunits = 0;
1490
1491	for (i = 0; i < NMIRROR; i++) {
1492		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
1493			/* set bit of all writable units */
1494			writable_bm |= SMI2BIT(i);
1495			nunits++;
1496		}
1497	}
1498	ps->ps_writable_sm = writable_bm;
1499	ps->ps_active_cnt = nunits;
1500	ps->ps_current_sm = 0;
1501}
1502
1503static
1504unsigned
1505select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
1506{
1507
1508	int		i;
1509	unsigned	writable_bm = 0;
1510	unsigned	nunits = 0;
1511
1512	for (i = 0; i < NMIRROR; i++) {
1513		if (SUBMIRROR_IS_WRITEABLE(un, i) &&
1514		    un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
1515			writable_bm |= SMI2BIT(i);
1516			nunits++;
1517		}
1518	}
1519	if ((writable_bm & ps->ps_allfrom_sm) != 0) {
1520		writable_bm &= ~ps->ps_allfrom_sm;
1521		nunits--;
1522	}
1523	ps->ps_writable_sm = writable_bm;
1524	ps->ps_active_cnt = nunits;
1525	ps->ps_current_sm = 0;
1526	return (nunits);
1527}
1528
1529static md_dev64_t
1530select_read_unit(
1531	mm_unit_t	*un,
1532	diskaddr_t	blkno,
1533	u_longlong_t	reqcount,
1534	u_longlong_t	*cando,
1535	int		must_be_opened,
1536	md_m_shared_t	**shared,
1537	md_mcs_t	*cs)
1538{
1539	int			i;
1540	md_m_shared_t		*s;
1541	uint_t			lasterrcnt = 0;
1542	md_dev64_t		dev = 0;
1543	u_longlong_t		cnt;
1544	u_longlong_t		mincnt;
1545	mm_submirror_t		*sm;
1546	mm_submirror_ic_t	*smic;
1547	mdi_unit_t		*ui;
1548
1549	mincnt = reqcount;
1550	for (i = 0; i < NMIRROR; i++) {
1551		if (!SUBMIRROR_IS_READABLE(un, i))
1552			continue;
1553		sm = &un->un_sm[i];
1554		smic = &un->un_smic[i];
1555		cnt = reqcount;
1556
1557		/*
1558		 * If the current submirror is marked as inaccessible, do not
1559		 * try to access it.
1560		 */
1561		ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
1562		(void) md_unit_readerlock(ui);
1563		if (ui->ui_tstate & MD_INACCESSIBLE) {
1564			md_unit_readerexit(ui);
1565			continue;
1566		}
1567		md_unit_readerexit(ui);
1568
1569		s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
1570		    (sm->sm_dev, sm, blkno, &cnt);
1571
1572		if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
1573			continue;
1574		if (s->ms_state == CS_OKAY) {
1575			*cando = cnt;
1576			if (shared != NULL)
1577				*shared = s;
1578
1579			if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
1580			    cs != NULL) {
1581				cs->cs_buf.b_flags |= B_FAILFAST;
1582			}
1583
1584			return (un->un_sm[i].sm_dev);
1585		}
1586		if (s->ms_state != CS_LAST_ERRED)
1587			continue;
1588
1589		/* don't use B_FAILFAST since we're Last Erred */
1590
1591		if (mincnt > cnt)
1592			mincnt = cnt;
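		/*
		 * Among LAST_ERRED components, remember the one with the
		 * highest ms_lasterrcnt, i.e. the copy that errored most
		 * recently and so is most likely to hold current data.
		 */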
1593		if (s->ms_lasterrcnt > lasterrcnt) {
1594			lasterrcnt = s->ms_lasterrcnt;
1595			if (shared != NULL)
1596				*shared = s;
1597			dev = un->un_sm[i].sm_dev;
1598		}
1599	}
1600	*cando = mincnt;
1601	return (dev);
1602}
1603
1604/*
1605 * Given a 32-bit bitmap, this routine returns the bit number of the
1606 * nth set bit.  The value of n is passed via the index argument and
1607 * is zero-based.
1608 *
1609 * This routine is used to run through the writable submirror bitmap
1610 * when starting all of the writes.  The value returned is the index
1611 * of the appropriate submirror structure in the unit's un_sm array.
1612 */
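/*
 * For example, with mask = 0x1a (bits 1, 3 and 4 set), index 0 returns
 * bit 1, index 1 returns bit 3 and index 2 returns bit 4.
 */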
1613static int
1614md_find_nth_unit(uint_t mask, int index)
1615{
1616	int	bit, nfound;
1617
1618	for (bit = -1, nfound = -1; nfound != index; bit++) {
1619		ASSERT(mask != 0);
1620		nfound += (mask & 1);
1621		mask >>= 1;
1622	}
1623	return (bit);
1624}
1625
1626static int
1627fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
1628{
1629	mm_unit_t	*un;
1630	buf_t		*bp;
1631	int		i;
1632	unsigned	nunits = 0;
1633	int		iunit;
1634	uint_t		running_bm = 0;
1635	uint_t		sm_index;
1636
1637	bp = &cs->cs_buf;
1638	un = ps->ps_un;
1639
1640	for (i = 0; i < NMIRROR; i++) {
1641		if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
1642			continue;
1643		running_bm |= SMI2BIT(i);
1644		nunits++;
1645	}
1646	if (nunits == 0)
1647		return (1);
1648
1649	/*
1650	 * For directed mirror read (DMR) we only use the specified side and
1651	 * do not compute the source of the read.
1652	 * If we're running with MD_MPS_DIRTY_RD set we always return the
1653	 * first mirror side (this prevents unnecessary ownership switching).
1654	 * Otherwise we return the submirror according to the mirror read option
1655	 */
1656	if (ps->ps_flags & MD_MPS_DMR) {
1657		sm_index = un->un_dmr_last_read;
1658	} else if (ps->ps_flags & MD_MPS_DIRTY_RD) {
1659		sm_index = md_find_nth_unit(running_bm, 0);
1660	} else {
1661		/* Normal (non-DMR) operation */
1662		switch (un->un_read_option) {
1663		case RD_GEOMETRY:
1664			iunit = (int)(bp->b_lblkno /
1665			    howmany(un->c.un_total_blocks, nunits));
1666			sm_index = md_find_nth_unit(running_bm, iunit);
1667			break;
1668		case RD_FIRST:
1669			sm_index = md_find_nth_unit(running_bm, 0);
1670			break;
1671		case RD_LOAD_BAL:
1672			/* this is intentional to fall into the default */
1673		default:
1674			un->un_last_read = (un->un_last_read + 1) % nunits;
1675			sm_index = md_find_nth_unit(running_bm,
1676			    un->un_last_read);
1677			break;
1678		}
1679	}
1680	bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
1681	ps->ps_allfrom_sm = SMI2BIT(sm_index);
1682
1683	if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
1684		bp->b_flags |= B_FAILFAST;
1685	}
1686
1687	return (0);
1688}
1689
1690static
1691int
1692mirror_are_submirrors_available(mm_unit_t *un)
1693{
1694	int i;
1695	for (i = 0; i < NMIRROR; i++) {
1696		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1697
1698		if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
1699		    md_getmajor(tmpdev) != md_major)
1700			continue;
1701
1702		if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
1703		    (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
1704			return (0);
1705
1706		if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
1707			return (0);
1708	}
1709	return (1);
1710}
1711
1712void
1713build_submirror(mm_unit_t *un, int i, int snarfing)
1714{
1715	struct mm_submirror	*sm;
1716	struct mm_submirror_ic	*smic;
1717	md_unit_t		*su;
1718	set_t			setno;
1719
1720	sm = &un->un_sm[i];
1721	smic = &un->un_smic[i];
1722
1723	sm->sm_flags = 0; /* sometime we may need to do more here */
1724
1725	setno = MD_UN2SET(un);
1726
1727	if (!SMS_IS(sm, SMS_INUSE))
1728		return;
1729	if (snarfing) {
1730		sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
1731		    sm->sm_key, MD_NOTRUST_DEVT);
1732	} else {
1733		if (md_getmajor(sm->sm_dev) == md_major) {
1734			su = MD_UNIT(md_getminor(sm->sm_dev));
1735			un->c.un_flag |= (su->c.un_flag & MD_LABELED);
1736			/* submirror can no longer be soft partitioned */
1737			MD_CAPAB(su) &= (~MD_CAN_SP);
1738		}
1739	}
1740	smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
1741	    0, "shared by blk", 0);
1742	smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
1743	    0, "shared by indx", 0);
1744	smic->sm_get_component_count = (int (*)())md_get_named_service(
1745	    sm->sm_dev, 0, "get component count", 0);
1746	smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0,
1747	    "get block count skip size", 0);
1748	sm->sm_state &= ~SMS_IGNORE;
1749	if (SMS_IS(sm, SMS_OFFLINE))
1750		MD_STATUS(un) |= MD_UN_OFFLINE_SM;
1751	md_set_parent(sm->sm_dev, MD_SID(un));
1752}
1753
1754static void
1755mirror_cleanup(mm_unit_t *un)
1756{
1757	mddb_recid_t	recid;
1758	int		smi;
1759	sv_dev_t	sv[NMIRROR];
1760	int		nsv = 0;
1761
1762	/*
1763	 * If a MN diskset and this node is not the master, do
1764	 * not delete any records on snarf of the mirror records.
1765	 */
1766	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1767	    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
1768		return;
1769	}
1770
1771	for (smi = 0; smi < NMIRROR; smi++) {
1772		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1773			continue;
1774		sv[nsv].setno = MD_UN2SET(un);
1775		sv[nsv++].key = un->un_sm[smi].sm_key;
1776	}
1777
1778	recid = un->un_rr_dirty_recid;
1779	mddb_deleterec_wrapper(un->c.un_record_id);
1780	if (recid > 0)
1781		mddb_deleterec_wrapper(recid);
1782
1783	md_rem_names(sv, nsv);
1784}
1785
1786/*
1787 * Comparison function for the avl tree which tracks
1788 * outstanding writes on submirrors.
1789 *
1790 * Returns:
1791 *	-1: ps1 < ps2
1792 *	 0: ps1 and ps2 overlap
1793 *	 1: ps1 > ps2
1794 */
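/*
 * For example, a request covering blocks [100, 199] and one covering
 * blocks [150, 250] compare as equal (0), so the avl_find() in
 * wait_for_overlaps() treats them as overlapping.
 */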
1795static int
1796mirror_overlap_compare(const void *p1, const void *p2)
1797{
1798	const md_mps_t *ps1 = (md_mps_t *)p1;
1799	const md_mps_t *ps2 = (md_mps_t *)p2;
1800
1801	if (ps1->ps_firstblk < ps2->ps_firstblk) {
1802		if (ps1->ps_lastblk >= ps2->ps_firstblk)
1803			return (0);
1804		return (-1);
1805	}
1806
1807	if (ps1->ps_firstblk > ps2->ps_firstblk) {
1808		if (ps1->ps_firstblk <= ps2->ps_lastblk)
1809			return (0);
1810		return (1);
1811	}
1812
1813	return (0);
1814}
1815
1816/* Return a -1 if optimized record unavailable and set should be released */
1817int
1818mirror_build_incore(mm_unit_t *un, int snarfing)
1819{
1820	int		i;
1821
1822	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
1823		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
1824		return (1);
1825	}
1826
1827	if (mirror_are_submirrors_available(un) == 0)
1828		return (1);
1829
1830	if (MD_UNIT(MD_SID(un)) != NULL)
1831		return (0);
1832
1833	MD_STATUS(un) = 0;
1834
1835	/* pre-4.1 didn't define CAN_META_CHILD capability */
1836	MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;
1837
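	/*
	 * Set up the overlap tree used to track outstanding parent requests
	 * so that overlapping i/o can be serialised (see wait_for_overlaps()).
	 */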
1838	un->un_overlap_tree_flag = 0;
1839	avl_create(&un->un_overlap_root, mirror_overlap_compare,
1840	    sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node));
1841
1842	for (i = 0; i < NMIRROR; i++)
1843		build_submirror(un, i, snarfing);
1844
1845	if (unit_setup_resync(un, snarfing) != 0) {
1846		if (snarfing) {
1847			mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
1848			/*
1849			 * If a MN set and set is not stale, then return -1
1850			 * which will force the caller to unload the set.
1851			 * The MN diskset nodes will return failure if
1852			 * unit_setup_resync fails so that nodes won't
1853			 * get out of sync.
1854			 *
1855			 * If set is STALE, the master node can't allocate
1856			 * a resync record (if needed), but node needs to
1857			 * join the set so that user can delete broken mddbs.
1858			 * So, if set is STALE, just continue on.
1859			 */
1860			if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1861			    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
1862				return (-1);
1863			}
1864		} else
1865			return (1);
1866	}
1867
1868	mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL);
1869	cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL);
1870
1871	un->un_suspend_wr_flag = 0;
1872	mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
1873	cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);
1874
1875	/*
1876	 * Allocate mutexes for mirror-owner and resync-owner changes.
1877	 * All references to the owner message state field must be guarded
1878	 * by this mutex.
1879	 */
1880	mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);
1881
1882	/*
1883	 * Allocate mutex and condvar for resync thread manipulation. These
1884	 * will be used by mirror_resync_unit/mirror_ioctl_resync
1885	 */
1886	mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
1887	cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);
1888
1889	/*
1890	 * Allocate mutex and condvar for resync progress thread manipulation.
1891	 * This allows resyncs to be continued across an intervening reboot.
1892	 */
1893	mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
1894	cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);
1895
1896	/*
1897	 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This
1898	 * provides synchronization between a user-ioctl and the resulting
1899	 * strategy() call that performs the read().
1900	 */
1901	mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
1902	cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);
1903
1904	/*
1905	 * Allocate rwlocks for un_pernode_dirty_bm accessing.
1906	 */
1907	for (i = 0; i < MD_MNMAXSIDES; i++) {
1908		rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL);
1909	}
1910
1911	/* place various information in the in-core data structures */
1912	md_nblocks_set(MD_SID(un), un->c.un_total_blocks);
1913	MD_UNIT(MD_SID(un)) = un;
1914
1915	return (0);
1916}
1917
1918
1919void
1920reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
1921{
1922	mddb_recid_t	recid, vtoc_id;
1923	size_t		bitcnt;
1924	size_t		shortcnt;
1925	int		smi;
1926	sv_dev_t	sv[NMIRROR];
1927	int		nsv = 0;
1928	uint_t		bits = 0;
1929	minor_t		selfid;
1930	md_unit_t	*su;
1931	int		i;
1932
1933	md_destroy_unit_incore(mnum, &mirror_md_ops);
1934
1935	shortcnt = un->un_rrd_num * sizeof (short);
1936	bitcnt = howmany(un->un_rrd_num, NBBY);
1937
1938	if (un->un_outstanding_writes)
1939		kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
1940	if (un->un_goingclean_bm)
1941		kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
1942	if (un->un_goingdirty_bm)
1943		kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
1944	if (un->un_resync_bm)
1945		kmem_free((caddr_t)un->un_resync_bm, bitcnt);
1946	if (un->un_pernode_dirty_sum)
1947		kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num);
1948
1949	/*
1950	 * Destroy the taskq for deferred processing of DRL clean requests.
1951	 * This taskq will only be present for Multi Owner mirrors.
1952	 */
1953	if (un->un_drl_task != NULL)
1954		ddi_taskq_destroy(un->un_drl_task);
1955
1956	md_nblocks_set(mnum, -1ULL);
1957	MD_UNIT(mnum) = NULL;
1958
1959	/*
1960	 * Attempt release of its minor node
1961	 */
1962	md_remove_minor_node(mnum);
1963
1964	if (!removing)
1965		return;
1966
1967	for (smi = 0; smi < NMIRROR; smi++) {
1968		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1969			continue;
1970		/* reallow soft partitioning of submirror and reset parent */
1971		su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
1972		MD_CAPAB(su) |= MD_CAN_SP;
1973		md_reset_parent(un->un_sm[smi].sm_dev);
1974		reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);
1975
1976		sv[nsv].setno = MD_MIN2SET(mnum);
1977		sv[nsv++].key = un->un_sm[smi].sm_key;
1978		bits |= SMI2BIT(smi);
1979	}
1980
1981	MD_STATUS(un) |= MD_UN_BEING_RESET;
1982	recid = un->un_rr_dirty_recid;
1983	vtoc_id = un->c.un_vtoc_id;
1984	selfid = MD_SID(un);
1985
1986	mirror_commit(un, bits, 0);
1987
1988	avl_destroy(&un->un_overlap_root);
1989
1990	/* Destroy all mutexes and condvars before returning. */
1991	mutex_destroy(&un->un_suspend_wr_mx);
1992	cv_destroy(&un->un_suspend_wr_cv);
1993	mutex_destroy(&un->un_overlap_tree_mx);
1994	cv_destroy(&un->un_overlap_tree_cv);
1995	mutex_destroy(&un->un_owner_mx);
1996	mutex_destroy(&un->un_rs_thread_mx);
1997	cv_destroy(&un->un_rs_thread_cv);
1998	mutex_destroy(&un->un_rs_progress_mx);
1999	cv_destroy(&un->un_rs_progress_cv);
2000	mutex_destroy(&un->un_dmr_mx);
2001	cv_destroy(&un->un_dmr_cv);
2002
2003	for (i = 0; i < MD_MNMAXSIDES; i++) {
2004		rw_destroy(&un->un_pernode_dirty_mx[i]);
2005		if (un->un_pernode_dirty_bm[i])
2006			kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt);
2007	}
2008
2009	/*
2010	 * Remove self from the namespace
2011	 */
2012	if (un->c.un_revision & MD_FN_META_DEV) {
2013		(void) md_rem_selfname(un->c.un_self_id);
2014	}
2015
2016	/* This frees the unit structure. */
2017	mddb_deleterec_wrapper(un->c.un_record_id);
2018
2019	if (recid != 0)
2020		mddb_deleterec_wrapper(recid);
2021
2022	/* Remove the vtoc, if present */
2023	if (vtoc_id)
2024		mddb_deleterec_wrapper(vtoc_id);
2025
2026	md_rem_names(sv, nsv);
2027
2028	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
2029	    MD_MIN2SET(selfid), selfid);
2030}
2031
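/*
 * mirror_internal_open:
 * --------------------
 * Open the mirror unit. The MD_UL_OPENINPROGRESS flag is used (together
 * with the openclose lock) to single-thread opens, since opening all the
 * underlying devices may require the openclose lock to be dropped while a
 * STATE_UPDATE message is sent in a MN diskset.
 */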
2032int
2033mirror_internal_open(
2034	minor_t		mnum,
2035	int		flag,
2036	int		otyp,
2037	int		md_oflags,
2038	IOLOCK		*lockp		/* can be NULL */
2039)
2040{
2041	mdi_unit_t	*ui = MDI_UNIT(mnum);
2042	int		err = 0;
2043
2044tryagain:
2045	/* single thread */
2046	if (lockp) {
2047		/*
2048		 * If ioctl lock is held, use openclose_enter
2049		 * routine that will set the ioctl flag when
2050		 * grabbing the readerlock.
2051		 */
2052		(void) md_ioctl_openclose_enter(lockp, ui);
2053	} else {
2054		(void) md_unit_openclose_enter(ui);
2055	}
2056
2057	/*
2058	 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE
2059	 * message in a MN diskset and this requires that the openclose
2060	 * lock is dropped in order to send this message.  So, another
2061	 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from
2062	 * attempting an open while this thread has an open in progress.
2063	 * Call the *_lh version of the lock exit routines since the ui_mx
2064	 * mutex must be held from checking for OPENINPROGRESS until
2065	 * after the cv_wait call.
2066	 */
2067	mutex_enter(&ui->ui_mx);
2068	if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
2069		if (lockp) {
2070			(void) md_ioctl_openclose_exit_lh(lockp);
2071		} else {
2072			md_unit_openclose_exit_lh(ui);
2073		}
2074		cv_wait(&ui->ui_cv, &ui->ui_mx);
2075		mutex_exit(&ui->ui_mx);
2076		goto tryagain;
2077	}
2078
2079	ui->ui_lock |= MD_UL_OPENINPROGRESS;
2080	mutex_exit(&ui->ui_mx);
2081
2082	/* open devices, if necessary */
2083	if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) {
2084		if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0)
2085			goto out;
2086	}
2087
2088	/* count open */
2089	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
2090		goto out;
2091
2092	/* unlock, return success */
2093out:
2094	mutex_enter(&ui->ui_mx);
2095	ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
2096	mutex_exit(&ui->ui_mx);
2097
2098	if (lockp) {
2099		/*
2100		 * If ioctl lock is held, use openclose_exit
2101		 * routine that will clear the lockp reader flag.
2102		 */
2103		(void) md_ioctl_openclose_exit(lockp);
2104	} else {
2105		md_unit_openclose_exit(ui);
2106	}
2107	return (err);
2108}
2109
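/*
 * mirror_internal_close:
 * ---------------------
 * Close the mirror unit. On the last close the dirty region state is
 * processed, the underlying devices are closed and, for a MN set with
 * transient ABR/DMR capabilities set, those capabilities are cleared
 * across the cluster.
 */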
2110int
2111mirror_internal_close(
2112	minor_t		mnum,
2113	int		otyp,
2114	int		md_cflags,
2115	IOLOCK		*lockp		/* can be NULL */
2116)
2117{
2118	mdi_unit_t	*ui = MDI_UNIT(mnum);
2119	mm_unit_t	*un;
2120	int		err = 0;
2121
2122	/* single thread */
2123	if (lockp) {
2124		/*
2125		 * If ioctl lock is held, use openclose_enter
2126		 * routine that will set the ioctl flag when
2127		 * grabbing the readerlock.
2128		 */
2129		un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui);
2130	} else {
2131		un = (mm_unit_t *)md_unit_openclose_enter(ui);
2132	}
2133
2134	/* count closed */
2135	if ((err = md_unit_decopen(mnum, otyp)) != 0)
2136		goto out;
2137
2138	/* close devices, if necessary */
2139	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
2140		/*
2141		 * Clean up dirty bitmap for this unit. Do this
2142		 * before closing the underlying devices to avoid
2143		 * race conditions with reset_mirror() as a
2144		 * result of a 'metaset -r' command running in
2145		 * parallel. This might cause deallocation of
2146		 * dirty region bitmaps; with underlying metadevices
2147		 * in place this can't happen.
2148		 * Don't do this for a MN set with ABR set
2149		 */
2150		if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) {
2151			if (!MD_MNSET_SETNO(MD_UN2SET(un)) ||
2152			    !(ui->ui_tstate & MD_ABR_CAP))
2153				mirror_process_unit_resync(un);
2154		}
2155		(void) mirror_close_all_devs(un, md_cflags);
2156
2157		/*
2158		 * For a MN set with transient capabilities (eg ABR/DMR) set,
2159		 * clear these capabilities when the last open in the cluster
2160		 * is closed. To do this we send a message to all nodes to
2161		 * see if the device is open.
2162		 */
2163		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
2164		    (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) {
2165			if (lockp) {
2166				(void) md_ioctl_openclose_exit(lockp);
2167			} else {
2168				md_unit_openclose_exit(ui);
2169			}
2170
2171			/*
2172			 * if we are in the context of an ioctl, drop the
2173			 * ioctl lock.
2174			 * Otherwise, no other locks should be held.
2175			 */
2176			if (lockp) {
2177				IOLOCK_RETURN_RELEASE(0, lockp);
2178			}
2179
2180			mdmn_clear_all_capabilities(mnum);
2181
2182			/* if dropped the lock previously, regain it */
2183			if (lockp) {
2184				IOLOCK_RETURN_REACQUIRE(lockp);
2185			}
2186			return (0);
2187		}
2188		/* unlock and return success */
2189	}
2190out:
2191	/* Call whether lockp is NULL or not. */
2192	if (lockp) {
2193		md_ioctl_openclose_exit(lockp);
2194	} else {
2195		md_unit_openclose_exit(ui);
2196	}
2197	return (err);
2198}
2199
2200/*
2201 * When a component has completed resyncing and is now ok, check if the
2202 * corresponding component in the other submirrors is in the Last Erred
2203 * state.  If it is, we want to change that to the Erred state so we stop
2204 * using that component and start using this good component instead.
2205 *
2206 * This is called from set_sm_comp_state and recursively calls
2207 * set_sm_comp_state if it needs to change the Last Erred state.
2208 */
2209static void
2210reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
2211	IOLOCK *lockp)
2212{
2213	mm_submirror_t		*sm;
2214	mm_submirror_ic_t	*smic;
2215	int			ci;
2216	int			i;
2217	int			compcnt;
2218	int			changed = 0;
2219
2220	for (i = 0; i < NMIRROR; i++) {
2221		sm = &un->un_sm[i];
2222		smic = &un->un_smic[i];
2223
2224		if (!SMS_IS(sm, SMS_INUSE))
2225			continue;
2226
2227		/* ignore the submirror that we just made ok */
2228		if (i == smi)
2229			continue;
2230
2231		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2232		for (ci = 0; ci < compcnt; ci++) {
2233			md_m_shared_t	*shared;
2234
2235			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2236			    (sm->sm_dev, sm, ci);
2237
2238			if ((shared->ms_state & CS_LAST_ERRED) &&
2239			    !mirror_other_sources(un, i, ci, 1)) {
2240
2241				set_sm_comp_state(un, i, ci, CS_ERRED, extras,
2242				    flags, lockp);
2243				changed = 1;
2244			}
2245		}
2246	}
2247
2248	/* maybe there is a hotspare for this newly erred component */
2249	if (changed) {
2250		set_t	setno;
2251
2252		setno = MD_UN2SET(un);
2253		if (MD_MNSET_SETNO(setno)) {
2254			send_poke_hotspares(setno);
2255		} else {
2256			(void) poke_hotspares();
2257		}
2258	}
2259}
2260
2261/*
2262 * set_sm_comp_state
2263 *
2264 * Set the state of a submirror component to the specified new state.
2265 * If the mirror is in a multi-node set, send messages to all nodes to
2266 * block all writes to the mirror and then update the state and release the
2267 * writes. These messages are only sent if MD_STATE_XMIT is set in flags.
2268 * MD_STATE_XMIT will be unset in 2 cases:
2269 * 1. When the state is changed to CS_RESYNC as this state change
2270 * will already have been updated on each node by the processing of the
2271 * distributed metasync command, hence no need to xmit.
2272 * 2. When the state is changed to CS_OKAY after a resync has completed. Again
2273 * the resync completion will already have been processed on each node by
2274 * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
2275 * resync, hence no need to xmit.
2276 *
2277 * In case we are called from the update of a watermark (MD_STATE_WMUPDATE
2278 * will then be set in the ps->flags), this is due to
2279 * a metainit or similar. In this case the message that we send to propagate
2280 * the state change must not be a class1 message as that would deadlock with
2281 * the metainit command that is still being processed.
2282 * This we achieve by creating a class2 message MD_MN_MSG_STATE_UPDATE2
2283 * instead. This also makes the submessage generator create a class2
2284 * submessage rather than a class1 (which would also block).
2285 *
2286 * On entry, unit_writerlock is held
2287 * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is
2288 * also held.
2289 */
2290void
2291set_sm_comp_state(
2292	mm_unit_t	*un,
2293	int		smi,
2294	int		ci,
2295	int		newstate,
2296	mddb_recid_t	*extras,
2297	uint_t		flags,
2298	IOLOCK		*lockp
2299)
2300{
2301	mm_submirror_t		*sm;
2302	mm_submirror_ic_t	*smic;
2303	md_m_shared_t		*shared;
2304	int			origstate;
2305	void			(*get_dev)();
2306	ms_cd_info_t		cd;
2307	char			devname[MD_MAX_CTDLEN];
2308	int			err;
2309	set_t			setno = MD_UN2SET(un);
2310	md_mn_msg_stch_t	stchmsg;
2311	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
2312	md_mn_kresult_t		*kresult;
2313	int			rval;
2314	uint_t			msgflags;
2315	md_mn_msgtype_t		msgtype;
2316	int			save_lock = 0;
2317	mdi_unit_t		*ui_sm;
2318
2319	sm = &un->un_sm[smi];
2320	smic = &un->un_smic[smi];
2321
2322	/* If we have a real error status then turn off MD_INACCESSIBLE. */
2323	ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
2324	if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) &&
2325	    ui_sm->ui_tstate & MD_INACCESSIBLE) {
2326		ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
2327	}
2328
2329	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2330	    (sm->sm_dev, sm, ci);
2331	origstate = shared->ms_state;
2332
2333	/*
2334	 * If the new state is an error and the old one wasn't, generate
2335	 * a console message. We do this before we send the state to other
2336	 * nodes in a MN set because the state change may change the component
2337	 * name  if a hotspare is allocated.
2338	 * name if a hotspare is allocated.
2339	if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) &&
2340	    (newstate & (CS_ERRED|CS_LAST_ERRED))) {
2341
2342		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2343		    "get device", 0);
2344		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2345
2346		err = md_getdevname(setno, mddb_getsidenum(setno), 0,
2347		    cd.cd_dev, devname, sizeof (devname));
2348
2349		if (err == ENOENT) {
2350			(void) md_devname(setno, cd.cd_dev, devname,
2351			    sizeof (devname));
2352		}
2353
2354		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
2355		    md_shortname(md_getminor(sm->sm_dev)), devname);
2356
2357		if (newstate & CS_LAST_ERRED) {
2358			cmn_err(CE_WARN, "md: %s: %s last erred",
2359			    md_shortname(md_getminor(sm->sm_dev)),
2360			    devname);
2361
2362		} else if (shared->ms_flags & MDM_S_ISOPEN) {
2363			/*
2364			 * Close the broken device and clear the open flag on
2365			 * it.  Closing the device means the RCM framework will
2366			 * be able to unconfigure the device if required.
2367			 *
2368			 * We have to check that the device is open, otherwise
2369			 * the first open on it has resulted in the error that
2370			 * is being processed and the actual cd.cd_dev will be
2371			 * NODEV64.
2372			 *
2373			 * If this is a multi-node mirror, then the multinode
2374			 * state checks following this code will cause the
2375			 * slave nodes to close the mirror in the function
2376			 * mirror_set_state().
2377			 */
2378			md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2379			shared->ms_flags &= ~MDM_S_ISOPEN;
2380		}
2381
2382	} else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
2383	    (shared->ms_flags & MDM_S_ISOPEN)) {
2384		/*
2385		 * Similar to logic above except no log messages since we
2386		 * are just transitioning from Last Erred to Erred.
2387		 */
2388		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2389		    "get device", 0);
2390		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2391
2392		md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2393		shared->ms_flags &= ~MDM_S_ISOPEN;
2394	}
2395
2396	if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
2397	    (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
2398		/*
2399		 * For a multi-node mirror, send the state change to the
2400		 * master, which broadcasts to all nodes, including this
2401		 * one. Once the message is received, the state is set
2402		 * in-core and the master commits the change to disk.
2403		 * There is a case, comp_replace, where this function
2404		 * can be called from within an ioctl and therefore in this
2405		 * case, as the ioctl will already be called on each node,
2406		 * there is no need to xmit the state change to the master for
2407		 * distribution to the other nodes. MD_STATE_XMIT flag is used
2408		 * to indicate whether a xmit is required. The mirror's
2409		 * transient state is set to MD_ERR_PENDING to avoid sending
2410		 * multiple messages.
2411		 */
2412		if (newstate & (CS_ERRED|CS_LAST_ERRED))
2413			ui->ui_tstate |= MD_ERR_PENDING;
2414
2415		/*
2416		 * Send a state update message to all nodes. This message
2417		 * will generate 2 submessages, the first one to suspend
2418		 * all writes to the mirror and the second to update the
2419		 * state and resume writes.
2420		 */
2421		stchmsg.msg_stch_mnum = un->c.un_self_id;
2422		stchmsg.msg_stch_sm = smi;
2423		stchmsg.msg_stch_comp = ci;
2424		stchmsg.msg_stch_new_state = newstate;
2425		stchmsg.msg_stch_hs_id = shared->ms_hs_id;
2426#ifdef DEBUG
2427		if (mirror_debug_flag)
2428			printf("send set state, %x, %x, %x, %x, %x\n",
2429			    stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
2430			    stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
2431			    stchmsg.msg_stch_hs_id);
2432#endif
2433		if (flags & MD_STATE_WMUPDATE) {
2434			msgtype  = MD_MN_MSG_STATE_UPDATE2;
2435			/*
2436			 * When coming from an update of watermarks, there
2437			 * must already be a message logged that triggered
2438			 * this action. So, no need to log this message, too.
2439			 */
2440			msgflags = MD_MSGF_NO_LOG;
2441		} else {
2442			msgtype  = MD_MN_MSG_STATE_UPDATE;
2443			msgflags = MD_MSGF_DEFAULT_FLAGS;
2444		}
2445
2446		/*
2447		 * If we are in the context of an ioctl, drop the ioctl lock.
2448		 * lockp holds the list of locks held.
2449		 *
2450		 * Otherwise, increment the appropriate reacquire counters.
2451		 * If the openclose lock is held, we must reacquire the reader
2452		 * lock before releasing the openclose lock.
2453		 * Do not drop the ARRAY_WRITER lock as we may not be able
2454		 * to reacquire it.
2455		 */
2456		if (lockp) {
2457			if (lockp->l_flags & MD_ARRAY_WRITER) {
2458				save_lock = MD_ARRAY_WRITER;
2459				lockp->l_flags &= ~MD_ARRAY_WRITER;
2460			} else if (lockp->l_flags & MD_ARRAY_READER) {
2461				save_lock = MD_ARRAY_READER;
2462				lockp->l_flags &= ~MD_ARRAY_READER;
2463			}
2464			IOLOCK_RETURN_RELEASE(0, lockp);
2465		} else {
2466			if (flags & MD_STATE_OCHELD) {
2467				md_unit_writerexit(ui);
2468				(void) md_unit_readerlock(ui);
2469				md_unit_openclose_exit(ui);
2470			} else {
2471				md_unit_writerexit(ui);
2472			}
2473		}
2474
2475		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
2476		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
2477		    (char *)&stchmsg, sizeof (stchmsg), kresult);
2478
2479		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
2480			mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
2481			/* If we're shutting down already, pause things here. */
2482			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
2483				while (!md_mn_is_commd_present()) {
2484					delay(md_hz);
2485				}
2486			}
2487			cmn_err(CE_PANIC,
2488			    "ksend_message failure: STATE_UPDATE");
2489		}
2490		kmem_free(kresult, sizeof (md_mn_kresult_t));
2491
2492		/* if dropped the lock previously, regain it */
2493		if (lockp) {
2494			IOLOCK_RETURN_REACQUIRE(lockp);
2495			lockp->l_flags |= save_lock;
2496		} else {
2497			/*
2498			 * Reacquire dropped locks and update acquirecnts
2499			 * appropriately.
2500			 */
2501			if (flags & MD_STATE_OCHELD) {
2502				/*
2503				 * openclose also grabs readerlock.
2504				 */
2505				(void) md_unit_openclose_enter(ui);
2506				md_unit_readerexit(ui);
2507				(void) md_unit_writerlock(ui);
2508			} else {
2509				(void) md_unit_writerlock(ui);
2510			}
2511		}
2512
2513		ui->ui_tstate &= ~MD_ERR_PENDING;
2514	} else {
2515		shared->ms_state = newstate;
2516		uniqtime32(&shared->ms_timestamp);
2517
2518		if (newstate == CS_ERRED)
2519			shared->ms_flags |= MDM_S_NOWRITE;
2520		else
2521			shared->ms_flags &= ~MDM_S_NOWRITE;
2522
2523		shared->ms_flags &= ~MDM_S_IOERR;
2524		un->un_changecnt++;
2525		shared->ms_lasterrcnt = un->un_changecnt;
2526
2527		mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
2528		mirror_commit(un, SMI2BIT(smi), extras);
2529	}
2530
2531	if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
2532		/*
2533		 * Resetting the Last Erred state will recursively call back
2534		 * into this function (set_sm_comp_state) to update the state.
2535		 */
2536		reset_lasterred(un, smi, extras, flags, lockp);
2537	}
2538}
2539
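/*
 * find_another_logical:
 * --------------------
 * Check that the logical range [blk, blk + cnt) can be read from some
 * submirror other than 'esm', scanning in chunks of at most 1 Gigabyte.
 * Returns non-zero if any part of the range has no alternate source.
 */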
2540static int
2541find_another_logical(
2542	mm_unit_t		*un,
2543	mm_submirror_t		*esm,
2544	diskaddr_t		blk,
2545	u_longlong_t		cnt,
2546	int			must_be_open,
2547	int			state,
2548	int			err_cnt)
2549{
2550	u_longlong_t	cando;
2551	md_dev64_t	dev;
2552	md_m_shared_t	*s;
2553
2554	esm->sm_state |= SMS_IGNORE;
2555	while (cnt != 0) {
2556		u_longlong_t	 mcnt;
2557
2558		mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024));	/* 1 Gig Blks */
2559
2560		dev = select_read_unit(un, blk, mcnt, &cando,
2561		    must_be_open, &s, NULL);
2562		if (dev == (md_dev64_t)0)
2563			break;
2564
2565		if ((state == CS_LAST_ERRED) &&
2566		    (s->ms_state == CS_LAST_ERRED) &&
2567		    (err_cnt > s->ms_lasterrcnt))
2568			break;
2569
2570		cnt -= cando;
2571		blk += cando;
2572	}
2573	esm->sm_state &= ~SMS_IGNORE;
2574	return (cnt != 0);
2575}
2576
2577int
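/*
 * mirror_other_sources:
 * --------------------
 * Determine whether the data held by component 'ci' of submirror 'smi'
 * (or by every component of the submirror if ci is negative) can be read
 * from another submirror. Returns non-zero if no other source exists.
 */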
2578mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open)
2579{
2580	mm_submirror_t		*sm;
2581	mm_submirror_ic_t	*smic;
2582	size_t			count;
2583	diskaddr_t		block;
2584	u_longlong_t		skip;
2585	u_longlong_t		size;
2586	md_dev64_t		dev;
2587	int			cnt;
2588	md_m_shared_t		*s;
2589	int			not_found;
2590
2591	sm = &un->un_sm[smi];
2592	smic = &un->un_smic[smi];
2593	dev = sm->sm_dev;
2594
2595	/*
2596	 * Make sure every component of the submirror
2597	 * has other sources.
2598	 */
2599	if (ci < 0) {
2600		/* Check every component of the submirror */
2601		cnt = (*(smic->sm_get_component_count))(dev, sm);
2602		for (ci = 0; ci < cnt; ci++) {
2603			not_found = mirror_other_sources(un, smi, ci,
2604			    must_be_open);
2605			if (not_found)
2606				return (1);
2607		}
2608		return (0);
2609	}
2610
2611	/*
2612	 * Make sure this component has other sources
2613	 */
2614	(void) (*(smic->sm_get_bcss))
2615	    (dev, sm, ci, &block, &count, &skip, &size);
2616
2617	if (count == 0)
2618		return (1);
2619
2620	s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci);
2621
2622	while (count--) {
2623		if (block >= un->c.un_total_blocks)
2624			return (0);
2625
2626		if ((block + size) > un->c.un_total_blocks)
2627			size = un->c.un_total_blocks - block;
2628
2629		not_found = find_another_logical(un, sm, block, size,
2630		    must_be_open, s->ms_state, s->ms_lasterrcnt);
2631		if (not_found)
2632			return (1);
2633
2634		block += size + skip;
2635	}
2636	return (0);
2637}
2638
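/*
 * finish_error:
 * ------------
 * Final error handling for a parent request: flag the error back to a
 * resync originator for a write-after-read, retry the request if the
 * mirror configuration changed while it was in flight, or otherwise fail
 * the original buf.
 */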
2639static void
2640finish_error(md_mps_t *ps)
2641{
2642	struct buf	*pb;
2643	mm_unit_t	*un;
2644	mdi_unit_t	*ui;
2645	uint_t		new_str_flags;
2646
2647	pb = ps->ps_bp;
2648	un = ps->ps_un;
2649	ui = ps->ps_ui;
2650
2651	/*
2652	 * Must flag any error to the resync originator if we're performing
2653	 * a Write-after-Read. This corresponds to an i/o error on a resync
2654	 * target device and in this case we ought to abort the resync as there
2655	 * is nothing that can be done to recover from this without operator
2656	 * intervention. If we don't set the B_ERROR flag we will continue
2657	 * reading from the mirror but won't write to the target (as it will
2658	 * have been placed into an errored state).
2659	 * To handle the case of multiple components within a submirror we only
2660	 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR.
2661	 * The originator of the resync read will cause this bit to be set if
2662	 * the underlying component count is one for a submirror resync. All
2663	 * other resync types will have the flag set as there is no underlying
2664	 * resync which can be performed on a contained metadevice for these
2665	 * resync types (optimized or component).
2666	 */
2667
2668	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) {
2669		if (ps->ps_flags & MD_MPS_FLAG_ERROR)
2670			pb->b_flags |= B_ERROR;
2671		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2672		MPS_FREE(mirror_parent_cache, ps);
2673		md_unit_readerexit(ui);
2674		md_biodone(pb);
2675		return;
2676	}
2677	/*
2678	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2679	 * operation and therefore this I/O request has already been counted;
2680	 * the I/O count variable will be decremented by mirror_done()'s
2681	 * call to md_biodone().
2682	 */
2683	if (ps->ps_changecnt != un->un_changecnt) {
2684		new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
2685		if (ps->ps_flags & MD_MPS_WOW)
2686			new_str_flags |= MD_STR_WOW;
2687		if (ps->ps_flags & MD_MPS_MAPPED)
2688			new_str_flags |= MD_STR_MAPPED;
2689		/*
2690		 * If this I/O request was a read that was part of a resync,
2691		 * set MD_STR_WAR for the retried read to ensure that the
2692		 * resync write (i.e. write-after-read) will be performed
2693		 */
2694		if (ps->ps_flags & MD_MPS_RESYNC_READ)
2695			new_str_flags |= MD_STR_WAR;
2696		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2697		MPS_FREE(mirror_parent_cache, ps);
2698		md_unit_readerexit(ui);
2699		(void) md_mirror_strategy(pb, new_str_flags, NULL);
2700		return;
2701	}
2702
2703	pb->b_flags |= B_ERROR;
2704	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2705	MPS_FREE(mirror_parent_cache, ps);
2706	md_unit_readerexit(ui);
2707	md_biodone(pb);
2708}
2709
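/*
 * error_update_unit:
 * -----------------
 * Daemon routine run after an i/o error has been recorded against the
 * unit. Marks the failing components Erred (or Last Erred when no other
 * source exists), pokes the hotspare daemon and then completes the
 * original request via finish_error().
 */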
2710static void
2711error_update_unit(md_mps_t *ps)
2712{
2713	mm_unit_t		*un;
2714	mdi_unit_t		*ui;
2715	int			smi;	/* sub mirror index */
2716	int			ci;	/* errored component */
2717	set_t			setno;
2718	uint_t			flags;	/* for set_sm_comp_state() */
2719	uint_t			hspflags; /* for check_comp_4_hotspares() */
2720
2721	ui = ps->ps_ui;
2722	un = (mm_unit_t *)md_unit_writerlock(ui);
2723	setno = MD_UN2SET(un);
2724
2725	/* All of these updates have to be propagated in case of MN set */
2726	flags = MD_STATE_XMIT;
2727	hspflags = MD_HOTSPARE_XMIT;
2728
2729	/* special treatment if we are called during updating watermarks */
2730	if (ps->ps_flags & MD_MPS_WMUPDATE) {
2731		flags |= MD_STATE_WMUPDATE;
2732		hspflags |= MD_HOTSPARE_WMUPDATE;
2733	}
2734	smi = 0;
2735	ci = 0;
2736	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
2737		if (mirror_other_sources(un, smi, ci, 0) == 1) {
2738
2739			/* Never called from ioctl context, so (IOLOCK *)NULL */
2740			set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
2741			    (IOLOCK *)NULL);
2742			/*
2743			 * For a MN set, the NOTIFY is done when the state
2744			 * change is processed on each node
2745			 */
2746			if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2747				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
2748				    SVM_TAG_METADEVICE, setno, MD_SID(un));
2749			}
2750			continue;
2751		}
2752		/* Never called from ioctl context, so (IOLOCK *)NULL */
2753		set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
2754		    (IOLOCK *)NULL);
2755		/*
2756		 * For a MN set, the NOTIFY is done when the state
2757		 * change is processed on each node
2758		 */
2759		if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2760			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
2761			    SVM_TAG_METADEVICE, setno, MD_SID(un));
2762		}
2763		smi = 0;
2764		ci = 0;
2765	}
2766
2767	md_unit_writerexit(ui);
2768	if (MD_MNSET_SETNO(setno)) {
2769		send_poke_hotspares(setno);
2770	} else {
2771		(void) poke_hotspares();
2772	}
2773	(void) md_unit_readerlock(ui);
2774
2775	finish_error(ps);
2776}
2777
2778/*
2779 * When we have a B_FAILFAST IO error on a Last Erred component we need to
2780 * retry the IO without B_FAILFAST set so that we try to ensure that the
2781 * component "sees" each IO.
2782 */
2783static void
2784last_err_retry(md_mcs_t *cs)
2785{
2786	struct buf	*cb;
2787	md_mps_t	*ps;
2788	uint_t		flags;
2789
2790	cb = &cs->cs_buf;
2791	cb->b_flags &= ~B_FAILFAST;
2792
2793	/* if we're panicking just let this I/O error out */
2794	if (panicstr) {
2795		(void) mirror_done(cb);
2796		return;
2797	}
2798
2799	/* reissue the I/O */
2800
2801	ps = cs->cs_ps;
2802
2803	bioerror(cb, 0);
2804
2805	mutex_enter(&ps->ps_mx);
2806
2807	flags = MD_STR_NOTTOP;
2808	if (ps->ps_flags & MD_MPS_MAPPED)
2809		flags |= MD_STR_MAPPED;
2810	if (ps->ps_flags & MD_MPS_NOBLOCK)
2811		flags |= MD_NOBLOCK;
2812
2813	mutex_exit(&ps->ps_mx);
2814
2815	clear_retry_error(cb);
2816
2817	cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
2818	    md_shortname(getminor(cb->b_edev)));
2819
2820	md_call_strategy(cb, flags, NULL);
2821}
2822
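/*
 * mirror_error:
 * ------------
 * Handle an errored parent request. If a component error needs a state
 * update, defer the work to error_update_unit() on the master daemon
 * queue; otherwise finish the error directly.
 */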
2823static void
2824mirror_error(md_mps_t *ps)
2825{
2826	int		smi;	/* sub mirror index */
2827	int		ci;	/* errored component */
2828
2829	if (panicstr) {
2830		finish_error(ps);
2831		return;
2832	}
2833
2834	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
2835		mirror_overlap_tree_remove(ps);
2836
2837	smi = 0;
2838	ci = 0;
2839	if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
2840		md_unit_readerexit(ps->ps_ui);
2841		daemon_request(&md_mstr_daemon, error_update_unit,
2842		    (daemon_queue_t *)ps, REQ_OLD);
2843		return;
2844	}
2845
2846	finish_error(ps);
2847}
2848
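/*
 * copy_write_done:
 * ---------------
 * Completion routine for a write-on-write copy buffer. Propagates any
 * error to the parent buf and either schedules the next chunk via
 * copy_write_cont() or completes the original request.
 */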
2849static int
2850copy_write_done(struct buf *cb)
2851{
2852	md_mps_t	*ps;
2853	buf_t		*pb;
2854	char		*wowbuf;
2855	wowhdr_t	*wowhdr;
2856	ssize_t		wow_resid;
2857
2858	/* get wowbuf and save structure */
2859	wowbuf = cb->b_un.b_addr;
2860	wowhdr = WOWBUF_HDR(wowbuf);
2861	ps = wowhdr->wow_ps;
2862	pb = ps->ps_bp;
2863
2864	/* Save error information, then free cb */
2865	if (cb->b_flags & B_ERROR)
2866		pb->b_flags |= B_ERROR;
2867
2868	if (cb->b_flags & B_REMAPPED)
2869		bp_mapout(cb);
2870
2871	freerbuf(cb);
2872
2873	/* update residual and continue if needed */
2874	if ((pb->b_flags & B_ERROR) == 0) {
2875		wow_resid = pb->b_bcount - wowhdr->wow_offset;
2876		pb->b_resid = wow_resid;
2877		if (wow_resid > 0)  {
2878			daemon_request(&md_mstr_daemon, copy_write_cont,
2879			    (daemon_queue_t *)wowhdr, REQ_OLD);
2880			return (1);
2881		}
2882	}
2883
2884	/* Write is complete, release resources. */
2885	kmem_cache_free(mirror_wowblk_cache, wowhdr);
2886	ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
2887	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2888	MPS_FREE(mirror_parent_cache, ps);
2889	md_biodone(pb);
2890	return (0);
2891}
2892
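/*
 * copy_write_cont:
 * ---------------
 * Copy the next chunk (at most md_wowbuf_size bytes) of the original
 * write into the private WOW buffer and issue it as a new child write,
 * so that the submirrors are written from a stable copy of the data.
 */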
2893static void
2894copy_write_cont(wowhdr_t *wowhdr)
2895{
2896	buf_t		*pb;
2897	buf_t		*cb;
2898	char		*wowbuf;
2899	int		wow_offset;
2900	size_t		wow_resid;
2901	diskaddr_t	wow_blkno;
2902
2903	wowbuf = WOWHDR_BUF(wowhdr);
2904	pb = wowhdr->wow_ps->ps_bp;
2905
2906	/* get data on current location */
2907	wow_offset = wowhdr->wow_offset;
2908	wow_resid = pb->b_bcount - wow_offset;
2909	wow_blkno = pb->b_lblkno + lbtodb(wow_offset);
2910
2911	/* setup child buffer */
2912	cb = getrbuf(KM_SLEEP);
2913	cb->b_flags = B_WRITE;
2914	cb->b_edev = pb->b_edev;
2915	cb->b_un.b_addr = wowbuf;	/* change to point at WOWBUF */
2916	cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
2917	cb->b_iodone = copy_write_done;
2918	cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
2919	cb->b_lblkno = wow_blkno;
2920
2921	/* move offset to next section */
2922	wowhdr->wow_offset += cb->b_bcount;
2923
2924	/* copy and setup write for current section */
2925	bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);
2926
2927	/* do it */
2928	/*
2929	 * Do not set the MD_IO_COUNTED flag as this is a new I/O request
2930	 * that handles the WOW condition. The resultant increment on the
2931	 * I/O count variable is cleared by copy_write_done()'s call to
2932	 * md_biodone().
2933	 */
2934	(void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
2935	    | MD_STR_MAPPED, NULL);
2936}
2937
2938static void
2939md_mirror_copy_write(md_mps_t *ps)
2940{
2941	wowhdr_t	*wowhdr;
2942
2943	wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS);
2944	mirror_wowblk_init(wowhdr);
2945	wowhdr->wow_ps = ps;
2946	wowhdr->wow_offset = 0;
2947	copy_write_cont(wowhdr);
2948}
2949
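/*
 * handle_wow:
 * ----------
 * Handle a detected write-on-write condition: either reissue the write
 * directly (WOW_NOCOPY) or copy the data through the WOW buffer pool via
 * md_mirror_copy_write().
 */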
2950static void
2951handle_wow(md_mps_t *ps)
2952{
2953	buf_t		*pb;
2954
2955	pb = ps->ps_bp;
2956
2957	bp_mapin(pb);
2958
2959	md_mirror_wow_cnt++;
2960	if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
2961		cmn_err(CE_NOTE,
2962		    "md: %s, blk %lld, cnt %ld: Write on write %d occurred",
2963		    md_shortname(getminor(pb->b_edev)),
2964		    (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
2965	}
2966
2967	/*
2968	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2969	 * operation and therefore this I/O request has already been counted;
2970	 * the I/O count variable will be decremented by mirror_done()'s
2971	 * call to md_biodone().
2972	 */
2973	if (md_mirror_wow_flg & WOW_NOCOPY)
2974		(void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
2975		    MD_STR_MAPPED | MD_IO_COUNTED, ps);
2976	else
2977		md_mirror_copy_write(ps);
2978}
2979
2980/*
2981 * Return true if the specified submirror is either in the Last Erred
2982 * state or is transitioning into the Last Erred state.
2983 */
2984static bool_t
2985submirror_is_lasterred(mm_unit_t *un, int smi)
2986{
2987	mm_submirror_t		*sm;
2988	mm_submirror_ic_t	*smic;
2989	md_m_shared_t		*shared;
2990	int			ci;
2991	int			compcnt;
2992
2993	sm = &un->un_sm[smi];
2994	smic = &un->un_smic[smi];
2995
2996	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2997	for (ci = 0; ci < compcnt; ci++) {
2998		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2999		    (sm->sm_dev, sm, ci);
3000
3001		if (shared->ms_state == CS_LAST_ERRED)
3002			return (B_TRUE);
3003
3004		/*
3005		 * It is not currently Last Erred, check if entering Last Erred.
3006		 */
3007		if ((shared->ms_flags & MDM_S_IOERR) &&
3008		    ((shared->ms_state == CS_OKAY) ||
3009		    (shared->ms_state == CS_RESYNC))) {
3010			if (mirror_other_sources(un, smi, ci, 0) == 1)
3011				return (B_TRUE);
3012		}
3013	}
3014
3015	return (B_FALSE);
3016}
3017
3018
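/*
 * mirror_done:
 * -----------
 * iodone routine for child buffers. If a B_FAILFAST child failed against
 * a submirror that is (or is becoming) Last Erred, requeue the request so
 * that it is retried without B_FAILFAST; otherwise record the error and
 * continue with mirror_done_common().
 */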
3019static int
3020mirror_done(struct buf *cb)
3021{
3022	md_mps_t	*ps;
3023	md_mcs_t	*cs;
3024
3025	/*LINTED*/
3026	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3027	ps = cs->cs_ps;
3028
3029	mutex_enter(&ps->ps_mx);
3030
3031	/* check if we need to retry an errored failfast I/O */
3032	if (cb->b_flags & B_ERROR) {
3033		struct buf *pb = ps->ps_bp;
3034
3035		if (cb->b_flags & B_FAILFAST) {
3036			int		i;
3037			mm_unit_t	*un = ps->ps_un;
3038
3039			for (i = 0; i < NMIRROR; i++) {
3040				if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
3041					continue;
3042
3043				if (cb->b_edev ==
3044				    md_dev64_to_dev(un->un_sm[i].sm_dev)) {
3045
3046					/*
3047					 * This is the submirror that had the
3048					 * error.  Check if it is Last Erred.
3049					 */
3050					if (submirror_is_lasterred(un, i)) {
3051						daemon_queue_t *dqp;
3052
3053						mutex_exit(&ps->ps_mx);
3054						dqp = (daemon_queue_t *)cs;
3055						dqp->dq_prev = NULL;
3056						dqp->dq_next = NULL;
3057						daemon_request(&md_done_daemon,
3058						    last_err_retry, dqp,
3059						    REQ_OLD);
3060						return (1);
3061					}
3062					break;
3063				}
3064			}
3065		}
3066
3067		/* continue to process the buf without doing a retry */
3068		ps->ps_flags |= MD_MPS_ERROR;
3069		pb->b_error = cb->b_error;
3070	}
3071
3072	return (mirror_done_common(cb));
3073}
3074
3075/*
3076 * Split from the original mirror_done function so we can handle bufs after a
3077 * retry.
3078 * ps->ps_mx is already held in the caller of this function and the cb error
3079 * has already been checked and handled in the caller.
3080 */
3081static int
3082mirror_done_common(struct buf *cb)
3083{
3084	struct buf	*pb;
3085	mm_unit_t	*un;
3086	mdi_unit_t	*ui;
3087	md_mps_t	*ps;
3088	md_mcs_t	*cs;
3089	size_t		end_rr, start_rr, current_rr;
3090
3091	/*LINTED*/
3092	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3093	ps = cs->cs_ps;
3094	pb = ps->ps_bp;
3095
3096	if (cb->b_flags & B_REMAPPED)
3097		bp_mapout(cb);
3098
3099	ps->ps_frags--;
3100	if (ps->ps_frags != 0) {
3101		mutex_exit(&ps->ps_mx);
3102		kmem_cache_free(mirror_child_cache, cs);
3103		return (1);
3104	}
3105	un = ps->ps_un;
3106	ui = ps->ps_ui;
3107
3108	/*
3109	 * Do not update outstanding_writes if we're running with ABR
3110	 * set for this mirror or the write() was issued with MD_STR_ABR set.
3111	 * A resync initiated write() likewise has no outstanding_writes
3112	 * update.
3113	 */
3114	if (((cb->b_flags & B_READ) == 0) &&
3115	    (un->un_nsm >= 2) &&
3116	    (ps->ps_call == NULL) &&
3117	    !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
3118	    !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
3119		BLK_TO_RR(end_rr, ps->ps_lastblk, un);
3120		BLK_TO_RR(start_rr, ps->ps_firstblk, un);
3121		mutex_enter(&un->un_resync_mx);
3122		for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3123			un->un_outstanding_writes[current_rr]--;
3124		mutex_exit(&un->un_resync_mx);
3125	}
3126	kmem_cache_free(mirror_child_cache, cs);
3127	mutex_exit(&ps->ps_mx);
3128
3129	if (ps->ps_call != NULL) {
3130		daemon_request(&md_done_daemon, ps->ps_call,
3131		    (daemon_queue_t *)ps, REQ_OLD);
3132		return (1);
3133	}
3134
3135	if ((ps->ps_flags & MD_MPS_ERROR)) {
3136		daemon_request(&md_done_daemon, mirror_error,
3137		    (daemon_queue_t *)ps, REQ_OLD);
3138		return (1);
3139	}
3140
3141	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3142		mirror_overlap_tree_remove(ps);
3143
3144	/*
3145	 * Handle Write-on-Write problem.
3146	 * Skip in the case of Raw and Direct I/O as they are
3147	 * handled earlier.
3148	 *
3149	 */
3150	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
3151	    !(pb->b_flags & B_READ) &&
3152	    !(ps->ps_flags & MD_MPS_WOW) &&
3153	    !(pb->b_flags & B_PHYS) &&
3154	    any_pages_dirty(pb)) {
3155		md_unit_readerexit(ps->ps_ui);
3156		daemon_request(&md_mstr_daemon, handle_wow,
3157		    (daemon_queue_t *)ps, REQ_OLD);
3158		return (1);
3159	}
3160
3161	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3162	MPS_FREE(mirror_parent_cache, ps);
3163	md_unit_readerexit(ui);
3164	md_biodone(pb);
3165	return (0);
3166}
3167
3168/*
3169 * Clear error state in submirror component if the retry worked after
3170 * a failfast error.
3171 */
3172static void
3173clear_retry_error(struct buf *cb)
3174{
3175	int			smi;
3176	md_mcs_t		*cs;
3177	mm_unit_t		*un;
3178	mdi_unit_t		*ui_sm;
3179	mm_submirror_t		*sm;
3180	mm_submirror_ic_t	*smic;
3181	u_longlong_t		cnt;
3182	md_m_shared_t		*shared;
3183
3184	/*LINTED*/
3185	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3186	un = cs->cs_ps->ps_un;
3187
3188	for (smi = 0; smi < NMIRROR; smi++) {
3189		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
3190			continue;
3191
3192		if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev))
3193			break;
3194	}
3195
3196	if (smi >= NMIRROR)
3197		return;
3198
3199	sm = &un->un_sm[smi];
3200	smic = &un->un_smic[smi];
3201	cnt = cb->b_bcount;
3202
3203	ui_sm = MDI_UNIT(getminor(cb->b_edev));
3204	(void) md_unit_writerlock(ui_sm);
3205
3206	shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm,
3207	    cb->b_blkno, &cnt);
3208
3209	if (shared->ms_flags & MDM_S_IOERR) {
3210		shared->ms_flags &= ~MDM_S_IOERR;
3211
3212	} else {
3213		/* the buf spans components and the first one is not erred */
3214		int	cnt;
3215		int	i;
3216
3217		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
3218		for (i = 0; i < cnt; i++) {
3219			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3220			    (sm->sm_dev, sm, i);
3221
3222			if (shared->ms_flags & MDM_S_IOERR &&
3223			    shared->ms_state == CS_OKAY) {
3224
3225				shared->ms_flags &= ~MDM_S_IOERR;
3226				break;
3227			}
3228		}
3229	}
3230
3231	md_unit_writerexit(ui_sm);
3232}
3233
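/*
 * mirror_map_read:
 * ---------------
 * Select a readable submirror for the child buffer and set its device,
 * block number and byte count. Returns 0 if the chosen submirror covers
 * the whole range, otherwise the number of blocks it can satisfy.
 */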
3234static size_t
3235mirror_map_read(
3236	md_mps_t *ps,
3237	md_mcs_t *cs,
3238	diskaddr_t blkno,
3239	u_longlong_t	count
3240)
3241{
3242	mm_unit_t	*un;
3243	buf_t		*bp;
3244	u_longlong_t	cando;
3245
3246	bp = &cs->cs_buf;
3247	un = ps->ps_un;
3248
3249	bp->b_lblkno = blkno;
3250	if (fast_select_read_unit(ps, cs) == 0) {
3251		bp->b_bcount = ldbtob(count);
3252		return (0);
3253	}
3254	bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno,
3255	    count, &cando, 0, NULL, cs));
3256	bp->b_bcount = ldbtob(cando);
3257	if (count != cando)
3258		return (cando);
3259	return (0);
3260}
3261
3262static void
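/*
 * write_after_read:
 * ----------------
 * Issued after a resync-style read has completed. Converts the parent
 * request into the corresponding write pass (write-after-read) and
 * reissues it through mirror_write_strategy().
 */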
3263write_after_read(md_mps_t *ps)
3264{
3265	struct buf	*pb;
3266	int		flags;
3267
3268	if (ps->ps_flags & MD_MPS_ERROR) {
3269		mirror_error(ps);
3270		return;
3271	}
3272
3273	pb = ps->ps_bp;
3274	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3275	ps->ps_call = NULL;
3276	ps->ps_flags |= MD_MPS_WRITE_AFTER_READ;
3277	flags = MD_STR_NOTTOP | MD_STR_WAR;
3278	if (ps->ps_flags & MD_MPS_MAPPED)
3279		flags |= MD_STR_MAPPED;
3280	if (ps->ps_flags & MD_MPS_NOBLOCK)
3281		flags |= MD_NOBLOCK;
3282	if (ps->ps_flags & MD_MPS_DIRTY_RD)
3283		flags |= MD_STR_DIRTY_RD;
3284	(void) mirror_write_strategy(pb, flags, ps);
3285}
3286
3287static void
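/*
 * continue_serial:
 * ---------------
 * For mirrors using the serial write option, issue the write to the next
 * submirror once the previous submirror's write has completed.
 */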
3288continue_serial(md_mps_t *ps)
3289{
3290	md_mcs_t	*cs;
3291	buf_t		*cb;
3292	mm_unit_t	*un;
3293	int		flags;
3294
3295	un = ps->ps_un;
3296	cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
3297	mirror_child_init(cs);
3298	cb = &cs->cs_buf;
3299	ps->ps_call = NULL;
3300	ps->ps_frags = 1;
3301	(void) mirror_map_write(un, cs, ps, 0);
3302	flags = MD_STR_NOTTOP;
3303	if (ps->ps_flags & MD_MPS_MAPPED)
3304		flags |= MD_STR_MAPPED;
3305	md_call_strategy(cb, flags, NULL);
3306}
3307
3308static int
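/*
 * mirror_map_write:
 * ----------------
 * Clone the parent buf onto the next writable submirror, skipping the
 * label block for a write-after-read on a labeled metadevice and setting
 * B_FAILFAST where it is safe to do so. Returns 1 if further submirrors
 * remain to be written in parallel, 0 when this is the last (or next
 * serialized) write, and -1 if the request must be aborted.
 */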
3309mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
3310{
3311	int i;
3312	dev_t		dev;	/* needed for bioclone, so not md_dev64_t */
3313	buf_t		*cb;
3314	buf_t		*pb;
3315	diskaddr_t	blkno;
3316	size_t		bcount;
3317	off_t		offset;
3318
3319	pb = ps->ps_bp;
3320	cb = &cs->cs_buf;
3321	cs->cs_ps = ps;
3322
3323	i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);
3324
3325	dev = md_dev64_to_dev(un->un_sm[i].sm_dev);
3326
3327	blkno = pb->b_lblkno;
3328	bcount = pb->b_bcount;
3329	offset = 0;
3330	if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
3331		blkno = DK_LABEL_LOC + 1;
3332		/*
3333		 * This handles the case where we're requesting
3334		 * a write to block 0 on a label partition
3335		 * and the request size was smaller than the
3336		 * size of the label.  If this is the case
3337		 * then we'll return -1.  Failure to do so will
3338		 * either cause the calling thread to hang due to
3339		 * either cause the calling thread to hang due to
3340		 * an ssd bug, or worse, allow the bcount to go
3341		 * negative (i.e. wrap to a huge value).
3342		if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
3343			return (-1);
3344		bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
3345		offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
3346	}
3347
3348	cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
3349	    cb, KM_NOSLEEP);
3350	if (war)
3351		cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;
3352
3353	/*
3354	 * If the submirror is in the erred state, check if any component is
3355	 * in the Last Erred state.  If so, we don't want to use the B_FAILFAST
3356	 * flag on the IO.
3357	 *
3358	 * Provide a fast path for the non-erred case (which should be the
3359	 * normal case).
3360	 */
3361	if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
3362		if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
3363			mm_submirror_t		*sm;
3364			mm_submirror_ic_t	*smic;
3365			int			ci;
3366			int			compcnt;
3367
3368			sm = &un->un_sm[i];
3369			smic = &un->un_smic[i];
3370
3371			compcnt = (*(smic->sm_get_component_count))
3372			    (sm->sm_dev, un);
3373			for (ci = 0; ci < compcnt; ci++) {
3374				md_m_shared_t	*shared;
3375
3376				shared = (md_m_shared_t *)
3377				    (*(smic->sm_shared_by_indx))(sm->sm_dev,
3378				    sm, ci);
3379
3380				if (shared->ms_state == CS_LAST_ERRED)
3381					break;
3382			}
3383			if (ci >= compcnt)
3384				cb->b_flags |= B_FAILFAST;
3385
3386		} else {
3387			cb->b_flags |= B_FAILFAST;
3388		}
3389	}
3390
3391	ps->ps_current_sm++;
3392	if (ps->ps_current_sm != ps->ps_active_cnt) {
3393		if (un->un_write_option == WR_SERIAL) {
3394			ps->ps_call = continue_serial;
3395			return (0);
3396		}
3397		return (1);
3398	}
3399	return (0);
3400}
3401
3402/*
3403 * directed_read_done:
3404 * ------------------
3405 * Completion routine called when a DMR request has been returned from the
3406 * underlying driver. Wake-up the original ioctl() and return the data to
3407 * the user.
3408 */
3409static void
3410directed_read_done(md_mps_t *ps)
3411{
3412	mm_unit_t	*un;
3413	mdi_unit_t	*ui;
3414
3415	un = ps->ps_un;
3416	ui = ps->ps_ui;
3417
3418	md_unit_readerexit(ui);
3419	md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3420	ps->ps_call = NULL;
3421
3422	mutex_enter(&un->un_dmr_mx);
3423	cv_signal(&un->un_dmr_cv);
3424	mutex_exit(&un->un_dmr_mx);
3425
3426	/* release the parent structure */
3427	kmem_cache_free(mirror_parent_cache, ps);
3428}
3429
3430/*
3431 * daemon_io:
3432 * ------------
3433 * Called to issue a mirror_write_strategy() or mirror_read_strategy
3434 * call from a blockable context. NOTE: no mutex can be held on entry to this
3435 * routine
3436 */
3437static void
3438daemon_io(daemon_queue_t *dq)
3439{
3440	md_mps_t	*ps = (md_mps_t *)dq;
3441	int		flag = MD_STR_NOTTOP;
3442	buf_t		*pb = ps->ps_bp;
3443
3444	if (ps->ps_flags & MD_MPS_MAPPED)
3445		flag |= MD_STR_MAPPED;
3446	if (ps->ps_flags & MD_MPS_WOW)
3447		flag |= MD_STR_WOW;
3448	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)
3449		flag |= MD_STR_WAR;
3450	if (ps->ps_flags & MD_MPS_ABR)
3451		flag |= MD_STR_ABR;
3452	if (ps->ps_flags & MD_MPS_BLOCKABLE_IO)
3453		flag |= MD_STR_BLOCK_OK;
3454
3455	/*
3456	 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set
3457	 * MD_STR_WAR before calling mirror_read_strategy
3458	 */
3459	if (pb->b_flags & B_READ) {
3460		if (!(ps->ps_flags & MD_MPS_DIRTY_RD))
3461			flag |= MD_STR_WAR;
3462		mirror_read_strategy(pb, flag, ps);
3463	} else
3464		mirror_write_strategy(pb, flag, ps);
3465}
3466
3467/*
3468 * update_resync:
3469 * -------------
3470 * Called to update the in-core version of the resync record with the latest
3471 * version that was committed to disk when the previous mirror owner
3472 * relinquished ownership. This call is likely to block as we must hold-off
3473 * any current resync processing that may be occurring.
3474 * On completion of the resync record update we issue the mirror_write_strategy
3475 * call to complete the i/o that first started this sequence. To remove a race
3476 * condition between a new write() request which is submitted and the resync
3477 * record update we acquire the writerlock. This will hold off all i/o to the
3478 * mirror until the resync update has completed.
3479 * NOTE: no mutex can be held on entry to this routine
3480 */
3481static void
3482update_resync(daemon_queue_t *dq)
3483{
3484	md_mps_t	*ps = (md_mps_t *)dq;
3485	buf_t		*pb = ps->ps_bp;
3486	mdi_unit_t	*ui = ps->ps_ui;
3487	mm_unit_t	*un = MD_UNIT(ui->ui_link.ln_id);
3488	set_t		setno;
3489	int		restart_resync;
3490
3491	mutex_enter(&un->un_rrp_inflight_mx);
3492	(void) md_unit_writerlock(ui);
3493	ps->ps_un = un;
3494	setno = MD_MIN2SET(getminor(pb->b_edev));
3495	if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
3496		/*
3497		 * Synchronize our in-core view of what regions need to be
3498		 * resync'd with the on-disk version.
3499		 */
3500		mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
3501		    un->un_dirty_bm);
3502
3503		/* Region dirty map is now up to date */
3504	}
3505	restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
3506	md_unit_writerexit(ui);
3507	mutex_exit(&un->un_rrp_inflight_mx);
3508
3509	/* Restart the resync thread if it was previously blocked */
3510	if (restart_resync) {
3511		mutex_enter(&un->un_rs_thread_mx);
3512		un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
3513		cv_signal(&un->un_rs_thread_cv);
3514		mutex_exit(&un->un_rs_thread_mx);
3515	}
3516	/* Continue with original deferred i/o */
3517	daemon_io(dq);
3518}
3519
3520/*
3521 * owner_timeout:
3522 * -------------
3523 * Called if the original mdmn_ksend_message() failed and the request is to be
3524 * retried. Reattempt the original ownership change.
3525 *
3526 * NOTE: called at interrupt context (see timeout(9f)).
3527 */
3528static void
3529owner_timeout(void *arg)
3530{
3531	daemon_queue_t	*dq = (daemon_queue_t *)arg;
3532
3533	daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD);
3534}
3535
3536/*
3537 * become_owner:
3538 * ------------
3539 * Called to issue RPC request to become the owner of the mirror
3540 * associated with this i/o request. We assume that the ownership request
3541 * is synchronous, so if it succeeds we will issue the request via
3542 * mirror_write_strategy().
3543 * If multiple i/o's are outstanding we will be called from the mirror_daemon
3544 * service thread.
3545 * NOTE: no mutex should be held on entry to this routine.
3546 */
3547static void
3548become_owner(daemon_queue_t *dq)
3549{
3550	md_mps_t	*ps = (md_mps_t *)dq;
3551	mm_unit_t	*un = ps->ps_un;
3552	buf_t		*pb = ps->ps_bp;
3553	set_t		setno;
3554	md_mn_kresult_t	*kres;
3555	int		msg_flags = md_mirror_msg_flags;
3556	md_mps_t	*ps1;
3557
3558	ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);
3559
3560	/*
3561	 * If we're already the mirror owner we do not need to send a message
3562	 * but can simply process the i/o request immediately.
3563	 * If we've already sent the request to become owner we requeue the
3564	 * request as we're waiting for the synchronous ownership message to
3565	 * be processed.
3566	 */
3567	if (MD_MN_MIRROR_OWNER(un)) {
3568		/*
3569		 * As the strategy() call will potentially block we need to
3570		 * punt this to a separate thread and complete this request
3571		 * as quickly as possible. Note: if we're a read request
3572		 * this must be a resync, so we cannot afford to be queued
3573		 * behind any intervening i/o requests. In this case we put the
3574		 * request on the md_mirror_rs_daemon queue.
3575		 */
3576		if (pb->b_flags & B_READ) {
3577			daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
3578			    REQ_OLD);
3579		} else {
3580			daemon_request(&md_mirror_io_daemon, daemon_io, dq,
3581			    REQ_OLD);
3582		}
3583	} else {
3584		mutex_enter(&un->un_owner_mx);
3585		if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
3586			md_mn_req_owner_t	*msg;
3587			int			rval = 0;
3588
3589			/*
3590			 * Check to see that we haven't exceeded the maximum
3591			 * retry count. If we have we fail the i/o as the
3592			 * comms mechanism has become wedged beyond recovery.
3593			 */
3594			if (dq->qlen++ >= MD_OWNER_RETRIES) {
3595				mutex_exit(&un->un_owner_mx);
3596				cmn_err(CE_WARN,
3597				    "md_mirror: Request exhausted ownership "
3598				    "retry limit of %d attempts", dq->qlen);
3599				pb->b_error = EIO;
3600				pb->b_flags |= B_ERROR;
3601				pb->b_resid = pb->b_bcount;
3602				kmem_cache_free(mirror_parent_cache, ps);
3603				md_biodone(pb);
3604				return;
3605			}
3606
3607			/*
3608			 * Issue request to change ownership. The call is
3609			 * synchronous so when it returns we can complete the
3610			 * i/o (if successful), or enqueue it again so that
3611			 * the operation will be retried.
3612			 */
3613			un->un_owner_state |= MM_MN_OWNER_SENT;
3614			mutex_exit(&un->un_owner_mx);
3615
3616			msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
3617			setno = MD_MIN2SET(getminor(pb->b_edev));
3618			msg->mnum = MD_SID(un);
3619			msg->owner = md_mn_mynode_id;
3620			msg_flags |= MD_MSGF_NO_LOG;
3621			/*
3622			 * If this IO is triggered by updating a watermark,
3623			 * it might be issued by the creation of a softpartition
3624			 * while the commd subsystem is suspended.
3625			 * We don't want this message to block.
3626			 */
3627			if (ps->ps_flags & MD_MPS_WMUPDATE) {
3628				msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
3629			}
3630
3631			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
3632			rval = mdmn_ksend_message(setno,
3633			    MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0,
3634			    (char *)msg, sizeof (md_mn_req_owner_t), kres);
3635
3636			kmem_free(msg, sizeof (md_mn_req_owner_t));
3637
3638			if (MDMN_KSEND_MSG_OK(rval, kres)) {
3639				dq->qlen = 0;
3640				/*
3641				 * Successfully changed owner, reread the
3642				 * resync record so that we have a valid idea of
3643				 * any previously committed incomplete write()s.
3644				 * NOTE: As we need to acquire the resync mutex
3645				 * this may block, so we defer it to a separate
3646				 * thread handler. This makes us (effectively)
3647				 * non-blocking once the ownership message
3648				 * handling has completed.
3649				 */
3650				mutex_enter(&un->un_owner_mx);
3651				if (un->un_owner_state & MM_MN_BECOME_OWNER) {
3652					un->un_mirror_owner = md_mn_mynode_id;
3653					/* Sets owner of un_rr_dirty record */
3654					if (un->un_rr_dirty_recid)
3655						(void) mddb_setowner(
3656						    un->un_rr_dirty_recid,
3657						    md_mn_mynode_id);
3658					un->un_owner_state &=
3659					    ~MM_MN_BECOME_OWNER;
3660					/*
3661					 * Release the block on the current
3662					 * resync region if it is blocked
3663					 */
3664					ps1 = un->un_rs_prev_overlap;
3665					if ((ps1 != NULL) &&
3666					    (ps1->ps_flags & MD_MPS_ON_OVERLAP))
3667						mirror_overlap_tree_remove(ps1);
3668					mutex_exit(&un->un_owner_mx);
3669
3670					/*
3671					 * If we're a read, this must be a
3672					 * resync request, issue
3673					 * the i/o request on the
3674					 * md_mirror_rs_daemon queue. This is
3675					 * to avoid a deadlock between the
3676					 * resync_unit thread and
3677					 * subsequent i/o requests that may
3678					 * block on the resync region.
3679					 */
3680					if (pb->b_flags & B_READ) {
3681						daemon_request(
3682						    &md_mirror_rs_daemon,
3683						    update_resync, dq, REQ_OLD);
3684					} else {
3685						daemon_request(
3686						    &md_mirror_io_daemon,
3687						    update_resync, dq, REQ_OLD);
3688					}
3689					kmem_free(kres,
3690					    sizeof (md_mn_kresult_t));
3691					return;
3692				} else {
3693					/*
3694					 * Some other node has beaten us to
3695					 * obtain ownership. We need to
3696					 * reschedule our ownership request
3697					 */
3698					mutex_exit(&un->un_owner_mx);
3699				}
3700			} else {
3701				mdmn_ksend_show_error(rval, kres,
3702				    "MD_MN_MSG_REQUIRE_OWNER");
3703				/*
3704				 * Message transport failure is handled by the
3705				 * comms layer. If the ownership change request
3706				 * does not succeed we need to flag the error to
3707				 * the initiator of the i/o. This is handled by
3708				 * the retry logic above. As the request failed
3709				 * we do not know _who_ the owner of the mirror
3710				 * currently is. We reset our idea of the owner
3711				 * to None so that any further write()s will
3712				 * attempt to become the owner again. This stops
3713				 * multiple nodes writing to the same mirror
3714				 * simultaneously.
3715				 */
3716				mutex_enter(&un->un_owner_mx);
3717				un->un_owner_state &=
3718				    ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
3719				un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
3720				mutex_exit(&un->un_owner_mx);
3721			}
3722			kmem_free(kres, sizeof (md_mn_kresult_t));
3723		} else
3724			mutex_exit(&un->un_owner_mx);
3725
3726		/*
3727		 * Re-enqueue this request on the deferred i/o list. Delay the
3728		 * request for md_mirror_owner_to usecs to stop thrashing.
3729		 */
3730		(void) timeout(owner_timeout, dq,
3731		    drv_usectohz(md_mirror_owner_to));
3732	}
3733}
3734
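/*
 * mirror_write_strategy:
 * ---------------------
 * Top-level write path for the mirror. Handles suspended writes, ABR and
 * write-after-read requests, resync region overlap checks and, for
 * multi-node sets, acquisition of mirror ownership before the child
 * writes are issued to the submirrors.
 */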
3735static void
3736mirror_write_strategy(buf_t *pb, int flag, void *private)
3737{
3738	md_mps_t	*ps;
3739	md_mcs_t	*cs;
3740	int		more;
3741	mm_unit_t	*un;
3742	mdi_unit_t	*ui;
3743	buf_t		*cb;		/* child buf pointer */
3744	set_t		setno;
3745	int		rs_on_overlap = 0;
3746
3747	ui = MDI_UNIT(getminor(pb->b_edev));
3748	un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));
3749
3750
3751	md_kstat_waitq_enter(ui);
3752
3753	/*
3754	 * If a state change is in progress for this mirror in a MN set,
3755	 * suspend all non-resync writes until the state change is complete.
3756	 * The objective of this suspend is to ensure that it is not
3757	 * possible for one node to read data from a submirror that another node
3758	 * has not written to because of the state change. Therefore we
3759	 * suspend all writes until the state change has been made. As it is
3760	 * not possible to read from the target of a resync, there is no need
3761	 * to suspend resync writes.
3762	 * Note that we only block here if the caller can handle a busy-wait.
3763	 * The MD_STR_BLOCK_OK flag is set for daemon_io originated i/o only.
3764	 */
3765
3766	if (!(flag & MD_STR_WAR)) {
3767		if (flag & MD_STR_BLOCK_OK) {
3768			mutex_enter(&un->un_suspend_wr_mx);
3769			while (un->un_suspend_wr_flag) {
3770				cv_wait(&un->un_suspend_wr_cv,
3771				    &un->un_suspend_wr_mx);
3772			}
3773			mutex_exit(&un->un_suspend_wr_mx);
3774		}
3775		(void) md_unit_readerlock(ui);
3776	}
3777
3778	if (!(flag & MD_STR_NOTTOP)) {
3779		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
3780			md_kstat_waitq_exit(ui);
3781			return;
3782		}
3783	}
3784
3785	setno = MD_MIN2SET(getminor(pb->b_edev));
3786
3787	/* If an ABR write has been requested, set MD_STR_ABR flag */
3788	if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
3789		flag |= MD_STR_ABR;
3790
3791	if (private == NULL) {
3792		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
3793		mirror_parent_init(ps);
3794	} else {
3795		ps = private;
3796		private = NULL;
3797	}
3798	if (flag & MD_STR_MAPPED)
3799		ps->ps_flags |= MD_MPS_MAPPED;
3800
3801	if (flag & MD_STR_WOW)
3802		ps->ps_flags |= MD_MPS_WOW;
3803
3804	if (flag & MD_STR_ABR)
3805		ps->ps_flags |= MD_MPS_ABR;
3806
3807	if (flag & MD_STR_WMUPDATE)
3808		ps->ps_flags |= MD_MPS_WMUPDATE;
3809
3810	/*
3811	 * Save essential information from the original buf header
3812	 * in the parent save structure.
3813	 */
3814	ps->ps_un = un;
3815	ps->ps_ui = ui;
3816	ps->ps_bp = pb;
3817	ps->ps_addr = pb->b_un.b_addr;
3818	ps->ps_firstblk = pb->b_lblkno;
3819	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
3820	ps->ps_changecnt = un->un_changecnt;
3821
3822	/*
3823	 * Check for suspended writes here. This is where we can defer the
3824	 * write request to the daemon_io queue which will then call us with
3825	 * the MD_STR_BLOCK_OK flag set and we'll busy-wait (if necessary) at
3826	 * the top of this routine.
3827	 */
3828	if (!(flag & MD_STR_WAR) && !(flag & MD_STR_BLOCK_OK)) {
3829		mutex_enter(&un->un_suspend_wr_mx);
3830		if (un->un_suspend_wr_flag) {
3831			ps->ps_flags |= MD_MPS_BLOCKABLE_IO;
3832			mutex_exit(&un->un_suspend_wr_mx);
3833			md_unit_readerexit(ui);
3834			daemon_request(&md_mirror_daemon, daemon_io,
3835			    (daemon_queue_t *)ps, REQ_OLD);
3836			return;
3837		}
3838		mutex_exit(&un->un_suspend_wr_mx);
3839	}
3840
3841	/*
3842	 * If not MN owner and this is an ABR write, make sure the current
3843	 * resync region is in the overlaps tree
3844	 */
3845	mutex_enter(&un->un_owner_mx);
3846	if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
3847	    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3848		md_mps_t	*ps1;
3849		/* Block the current resync region, if not already blocked */
3850		ps1 = un->un_rs_prev_overlap;
3851
3852		if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
3853		    (ps1->ps_lastblk != 0))) {
3854			/* Drop locks to avoid deadlock */
3855			mutex_exit(&un->un_owner_mx);
3856			md_unit_readerexit(ui);
3857			wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
3858			rs_on_overlap = 1;
3859			(void) md_unit_readerlock(ui);
3860			mutex_enter(&un->un_owner_mx);
3861			/*
3862			 * Check to see if we have obtained ownership
3863			 * while waiting for overlaps. If we have, remove
3864			 * the resync_region entry from the overlap tree
3865			 */
3866			if (MD_MN_MIRROR_OWNER(un) &&
3867			    (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
3868				mirror_overlap_tree_remove(ps1);
3869				rs_on_overlap = 0;
3870			}
3871		}
3872	}
3873	mutex_exit(&un->un_owner_mx);
3874
3875
3876	/*
3877	 * The following keeps a write-after-read from writing back to the
3878	 * source in the case where all of the data came from one place.
3879	 */
3880	if (flag & MD_STR_WAR) {
3881		int	abort_write = 0;
3882		/*
3883		 * We are performing a write-after-read. This is either the
3884		 * result of a resync read or of a read in a dirty resync
3885		 * region when the optimized resync is not complete. If we are
3886		 * in a MN set and this is a resync-generated i/o, and the
3887		 * current block is not in the current resync region, terminate
3888		 * the write as another node must have completed this resync
3889		 * region.
3890		 */
3891		if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
3892		    !(flag & MD_STR_DIRTY_RD)) {
3893			if (!IN_RESYNC_REGION(un, ps))
3894				abort_write = 1;
3895		}
3896		if ((select_write_after_read_units(un, ps) == 0) ||
3897		    (abort_write)) {
3898#ifdef DEBUG
3899			if (mirror_debug_flag)
3900				printf("Abort resync write on %x, block %lld\n",
3901				    MD_SID(un), ps->ps_firstblk);
3902#endif
3903			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3904				mirror_overlap_tree_remove(ps);
3905			kmem_cache_free(mirror_parent_cache, ps);
3906			md_kstat_waitq_exit(ui);
3907			md_unit_readerexit(ui);
3908			md_biodone(pb);
3909			return;
3910		}
3911	} else {
3912		select_write_units(un, ps);
3913
3914		/* Drop readerlock to avoid deadlock */
3915		md_unit_readerexit(ui);
3916		wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
3917		un = md_unit_readerlock(ui);
3918		/*
3919		 * For a MN set with an ABR write, if we are now the
3920		 * owner and we have a resync region in the overlap
3921		 * tree, remove the entry from overlaps and retry the write.
3922		 */
3923
3924		if (MD_MNSET_SETNO(setno) &&
3925		    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3926			mutex_enter(&un->un_owner_mx);
3927			if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
3928				mirror_overlap_tree_remove(ps);
3929				md_kstat_waitq_exit(ui);
3930				mutex_exit(&un->un_owner_mx);
3931				md_unit_readerexit(ui);
3932				daemon_request(&md_mirror_daemon, daemon_io,
3933				    (daemon_queue_t *)ps, REQ_OLD);
3934				return;
3935			}
3936			mutex_exit(&un->un_owner_mx);
3937		}
3938	}
3939
3940	/*
3941	 * For Multinode mirrors with no owner and a Resync Region (not ABR)
3942	 * we need to become the mirror owner before continuing with the
3943	 * write(). For ABR mirrors we check that we 'own' the resync if
3944	 * we're in write-after-read mode. We do this _after_ ensuring that
3945	 * there are no overlaps to ensure that once we know that we are
3946	 * the owner, the readerlock will not be released until the write is
3947	 * complete. As a change of ownership in a MN set requires the
3948	 * writerlock, this ensures that ownership cannot be changed until
3949	 * the write is complete.
3950	 */
3951	if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
3952	    (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
3953		if (MD_MN_NO_MIRROR_OWNER(un))  {
3954			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3955				mirror_overlap_tree_remove(ps);
3956			md_kstat_waitq_exit(ui);
3957			ASSERT(!(flag & MD_STR_WAR));
3958			md_unit_readerexit(ui);
3959			daemon_request(&md_mirror_daemon, become_owner,
3960			    (daemon_queue_t *)ps, REQ_OLD);
3961			return;
3962		}
3963	}
3964
3965	/*
3966	 * Mark resync region if mirror has a Resync Region _and_ we are not
3967	 * a resync initiated write(). Don't mark region if we're flagged as
3968	 * an ABR write.
3969	 */
3970	if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
3971	    !(flag & MD_STR_WAR)) {
3972		if (mirror_mark_resync_region(un, ps->ps_firstblk,
3973		    ps->ps_lastblk, md_mn_mynode_id)) {
3974			pb->b_flags |= B_ERROR;
3975			pb->b_resid = pb->b_bcount;
3976			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3977				mirror_overlap_tree_remove(ps);
3978			kmem_cache_free(mirror_parent_cache, ps);
3979			md_kstat_waitq_exit(ui);
3980			md_unit_readerexit(ui);
3981			md_biodone(pb);
3982			return;
3983		}
3984	}
3985
3986	ps->ps_childbflags = pb->b_flags | B_WRITE;
3987	ps->ps_childbflags &= ~B_READ;
3988	if (flag & MD_STR_MAPPED)
3989		ps->ps_childbflags &= ~B_PAGEIO;
3990
3991	if (!(flag & MD_STR_NOTTOP) && panicstr)
3992		/* Disable WOW and don't free ps */
3993		ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);
3994
3995	md_kstat_waitq_to_runq(ui);
3996
3997	/*
3998	 * Treat Raw and Direct I/O as Write-on-Write always
3999	 */
4000
4001	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
4002	    (md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
4003	    (pb->b_flags & B_PHYS) &&
4004	    !(ps->ps_flags & MD_MPS_WOW)) {
4005		if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4006			mirror_overlap_tree_remove(ps);
4007		md_unit_readerexit(ui);
4008		daemon_request(&md_mstr_daemon, handle_wow,
4009		    (daemon_queue_t *)ps, REQ_OLD);
4010		return;
4011	}
4012
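	/*
	 * Issue the write via one or more child requests. Each pass
	 * allocates and initializes a child save structure, lets
	 * mirror_map_write() set up the child buf (and indicate whether
	 * further children are needed) and dispatches it with
	 * md_call_strategy(); ps_frags counts the children issued for
	 * this parent request.
	 */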
4013	ps->ps_frags = 1;
4014	do {
4015		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4016		mirror_child_init(cs);
4017		cb = &cs->cs_buf;
4018		more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));
4019
4020		/*
4021		 * This handles the case where we're requesting
4022		 * a write to block 0 on a label partition.  (more < 0)
4023		 * means that the request size was smaller than the
4024		 * size of the label.  If so this request is done.
4025		 */
4026		if (more < 0) {
4027			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4028				mirror_overlap_tree_remove(ps);
4029			md_kstat_runq_exit(ui);
4030			kmem_cache_free(mirror_child_cache, cs);
4031			kmem_cache_free(mirror_parent_cache, ps);
4032			md_unit_readerexit(ui);
4033			md_biodone(pb);
4034			return;
4035		}
4036		if (more) {
4037			mutex_enter(&ps->ps_mx);
4038			ps->ps_frags++;
4039			mutex_exit(&ps->ps_mx);
4040		}
4041		md_call_strategy(cb, flag, private);
4042	} while (more);
4043
4044	if (!(flag & MD_STR_NOTTOP) && panicstr) {
4045		while (!(ps->ps_flags & MD_MPS_DONE)) {
4046			md_daemon(1, &md_done_daemon);
4047			drv_usecwait(10);
4048		}
4049		kmem_cache_free(mirror_parent_cache, ps);
4050	}
4051}
4052
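/*
 * mirror_read_strategy:
 * --------------------
 * Handle a read request against a mirror. Each child request is mapped onto
 * a submirror by mirror_map_read(). A write-after-read is scheduled when the
 * request is resync generated (MD_STR_WAR) or falls within a dirty resync
 * region while an optimized resync is outstanding, so that all submirrors
 * end up with the same data. Directed (DKIOCDMR) reads never schedule a
 * write-after-read and complete via directed_read_done().
 */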
4053static void
4054mirror_read_strategy(buf_t *pb, int flag, void *private)
4055{
4056	md_mps_t	*ps;
4057	md_mcs_t	*cs;
4058	size_t		more;
4059	mm_unit_t	*un;
4060	mdi_unit_t	*ui;
4061	size_t		current_count;
4062	diskaddr_t	current_blkno;
4063	off_t		current_offset;
4064	buf_t		*cb;		/* child buf pointer */
4065	set_t		setno;
4066
4067	ui = MDI_UNIT(getminor(pb->b_edev));
4068
4069	md_kstat_waitq_enter(ui);
4070
4071	un = (mm_unit_t *)md_unit_readerlock(ui);
4072
4073	if (!(flag & MD_STR_NOTTOP)) {
4074		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
4075			md_kstat_waitq_exit(ui);
4076			return;
4077		}
4078	}
4079
4080	if (private == NULL) {
4081		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
4082		mirror_parent_init(ps);
4083	} else {
4084		ps = private;
4085		private = NULL;
4086	}
4087
4088	if (flag & MD_STR_MAPPED)
4089		ps->ps_flags |= MD_MPS_MAPPED;
4090	if (flag & MD_NOBLOCK)
4091		ps->ps_flags |= MD_MPS_NOBLOCK;
4092	if (flag & MD_STR_WMUPDATE)
4093		ps->ps_flags |= MD_MPS_WMUPDATE;
4094
4095	/*
4096	 * Check to see if this is a DMR driven read. If so we need to use the
4097	 * specified side (in un->un_dmr_last_read) for the source of the data.
4098	 */
4099	if (flag & MD_STR_DMR)
4100		ps->ps_flags |= MD_MPS_DMR;
4101
4102	/*
4103	 * Save essential information from the original buffhdr
4104	 * in the md_save structure.
4105	 */
4106	ps->ps_un = un;
4107	ps->ps_ui = ui;
4108	ps->ps_bp = pb;
4109	ps->ps_addr = pb->b_un.b_addr;
4110	ps->ps_firstblk = pb->b_lblkno;
4111	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
4112	ps->ps_changecnt = un->un_changecnt;
4113
4114	current_count = btodb(pb->b_bcount);
4115	current_blkno = pb->b_lblkno;
4116	current_offset = 0;
4117
4118	/*
4119	 * If flag has MD_STR_WAR set this means that the read is issued by a
4120	 * resync thread which may or may not be an optimised resync.
4121	 * resync thread which may or may not be an optimized resync.
4122	 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
4123	 * code has not completed; either a resync has not started since snarf,
4124	 * or there is an optimized resync in progress.
4125	 *
4126	 * We need to generate a write after this read in the following two
4127	 * cases,
4128	 *
4129	 * 1. Any Resync-Generated read
4130	 *
4131	 * 2. Any read to a DIRTY REGION if there is an optimized resync
4132	 *    pending or in progress.
4133	 *
4134	 * The write after read is done in these cases to ensure that all sides
4135	 * of the mirror are in sync with the read data and that it is not
4136	 * possible for an application to read the same block multiple times
4137	 * and get different data.
4138	 *
4139	 * This would be possible if the block was in a dirty region.
4140	 *
4141	 * If we're performing a directed read we don't write the data out as
4142	 * the application is responsible for restoring the mirror to a known
4143	 * state.
4144	 */
4145	if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
4146	    !(flag & MD_STR_DMR)) {
4147		size_t	start_rr, i, end_rr;
4148		int	region_dirty = 1;
4149
4150		/*
4151		 * We enter here under three circumstances,
4152		 *
4153		 * MD_UN_OPT_NOT_DONE	MD_STR_WAR
4154		 * 0			1
4155		 * 1			0
4156		 * 1			1
4157		 *
4158		 * To be optimal we only care to explicitly check for dirty
4159		 * regions in the second case since if MD_STR_WAR is set we
4160		 * always do the write after read.
4161		 */
4162		if (!(flag & MD_STR_WAR)) {
4163			BLK_TO_RR(end_rr, ps->ps_lastblk, un);
4164			BLK_TO_RR(start_rr, ps->ps_firstblk, un);
4165
4166			for (i = start_rr; i <= end_rr; i++)
4167				if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
4168					break;
4169		}
4170
4171		if ((region_dirty) &&
4172		    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
4173			ps->ps_call = write_after_read;
4174			/*
4175			 * Mark this as a RESYNC_READ in ps_flags.
4176			 * This is used if the read fails during a
4177			 * resync of a 3-way mirror to ensure that
4178			 * the retried read to the remaining
4179			 * good submirror has MD_STR_WAR set. This
4180			 * is needed to ensure that the resync write
4181			 * (write-after-read) takes place.
4182			 */
4183			ps->ps_flags |= MD_MPS_RESYNC_READ;
4184
4185			/*
4186			 * If MD_STR_FLAG_ERR is set in the flags we
4187			 * set MD_MPS_FLAG_ERROR so that an error on the resync
4188			 * write (issued by write_after_read) will be flagged
4189			 * to the biowait'ing resync thread. This allows us to
4190			 * avoid issuing further resync requests to a device
4191			 * that has had a write failure.
4192			 */
4193			if (flag & MD_STR_FLAG_ERR)
4194				ps->ps_flags |= MD_MPS_FLAG_ERROR;
4195
4196			setno = MD_UN2SET(un);
4197			/*
4198			 * Drop the readerlock to avoid
4199			 * deadlock
4200			 */
4201			md_unit_readerexit(ui);
4202			wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4203			un = md_unit_readerlock(ui);
4204			/*
4205			 * Ensure that we are owner
4206			 */
4207			if (MD_MNSET_SETNO(setno)) {
4208				/*
4209				 * For a non-resync read that requires a
4210				 * write-after-read to be done, set a flag
4211				 * in the parent structure, so that the
4212				 * write_strategy routine can omit the
4213				 * test that the write is still within the
4214				 * resync region
4215				 */
4216				if (!(flag & MD_STR_WAR))
4217					ps->ps_flags |= MD_MPS_DIRTY_RD;
4218
4219				/*
4220				 * Before reading the buffer, see if
4221				 * there is an owner.
4222				 */
4223				if (MD_MN_NO_MIRROR_OWNER(un))  {
4224					ps->ps_call = NULL;
4225					mirror_overlap_tree_remove(ps);
4226					md_kstat_waitq_exit(ui);
4227					md_unit_readerexit(ui);
4228					daemon_request(
4229					    &md_mirror_daemon,
4230					    become_owner,
4231					    (daemon_queue_t *)ps,
4232					    REQ_OLD);
4233					return;
4234				}
4235				/*
4236				 * For a resync read, check to see if I/O is
4237				 * outside of the current resync region, or
4238				 * the resync has finished. If so
4239				 * just terminate the I/O
4240				 */
4241				if ((flag & MD_STR_WAR) &&
4242				    (!(un->c.un_status & MD_UN_WAR) ||
4243				    (!IN_RESYNC_REGION(un, ps)))) {
4244#ifdef DEBUG
4245					if (mirror_debug_flag)
4246						printf("Abort resync read "
4247						    "%x: %lld\n",
4248						    MD_SID(un),
4249						    ps->ps_firstblk);
4250#endif
4251					mirror_overlap_tree_remove(ps);
4252					kmem_cache_free(mirror_parent_cache,
4253					    ps);
4254					md_kstat_waitq_exit(ui);
4255					md_unit_readerexit(ui);
4256					md_biodone(pb);
4257					return;
4258				}
4259			}
4260		}
4261	}
4262
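	/*
	 * For a directed read, completion is routed through
	 * directed_read_done(), which wakes the DKIOCDMR ioctl waiting
	 * on un_dmr_cv.
	 */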
4263	if (flag & MD_STR_DMR) {
4264		ps->ps_call = directed_read_done;
4265	}
4266
4267	if (!(flag & MD_STR_NOTTOP) && panicstr)
4268		ps->ps_flags |= MD_MPS_DONTFREE;
4269
4270	md_kstat_waitq_to_runq(ui);
4271
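	/*
	 * Issue the read via one or more child requests. Each pass clones
	 * part of the parent buf with md_bioclone(), maps it onto a
	 * submirror with mirror_map_read() and dispatches it with
	 * md_call_strategy(); the loop continues while mirror_map_read()
	 * indicates that part of the request remains to be mapped.
	 */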
4272	ps->ps_frags++;
4273	do {
4274		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4275		mirror_child_init(cs);
4276		cb = &cs->cs_buf;
4277		cs->cs_ps = ps;
4278
4279		cb = md_bioclone(pb, current_offset, current_count, NODEV,
4280		    current_blkno, mirror_done, cb, KM_NOSLEEP);
4281
4282		more = mirror_map_read(ps, cs, current_blkno,
4283		    (u_longlong_t)current_count);
4284		if (more) {
4285			mutex_enter(&ps->ps_mx);
4286			ps->ps_frags++;
4287			mutex_exit(&ps->ps_mx);
4288		}
4289
4290		/*
4291		 * Do these calculations now,
4292		 *  so that we pick up a valid b_bcount from the child bp.
4293		 */
4294		current_count -= more;
4295		current_offset += cb->b_bcount;
4296		current_blkno +=  more;
4297		md_call_strategy(cb, flag, private);
4298	} while (more);
4299
4300	if (!(flag & MD_STR_NOTTOP) && panicstr) {
4301		while (!(ps->ps_flags & MD_MPS_DONE)) {
4302			md_daemon(1, &md_done_daemon);
4303			drv_usecwait(10);
4304		}
4305		kmem_cache_free(mirror_parent_cache, ps);
4306	}
4307}
4308
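/*
 * md_mirror_strategy:
 * ------------------
 * Strategy entry point for the mirror driver. For a multi-owner set a
 * top-level request first waits for any MD_SET_HALTED condition to clear.
 * The request is then counted against the set's i/o count (unless already
 * counted) and handed to mirror_read_strategy() or mirror_write_strategy()
 * as appropriate.
 */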
4309void
4310md_mirror_strategy(buf_t *bp, int flag, void *private)
4311{
4312	set_t	setno = MD_MIN2SET(getminor(bp->b_edev));
4313
4314	/*
4315	 * When doing IO to a multi owner meta device, check if set is halted.
4316	 * We do this check without the needed lock held, for performance
4317	 * reasons.
4318	 * If an IO just slips through while the set is locked via an
4319	 * MD_MN_SUSPEND_SET, we don't care about it.
4320	 * Only check for suspension if we are a top-level i/o request
4321	 * (MD_STR_NOTTOP is cleared in 'flag').
4322	 */
4323	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
4324	    (MD_SET_HALTED | MD_SET_MNSET)) {
4325		if ((flag & MD_STR_NOTTOP) == 0) {
4326			mutex_enter(&md_mx);
4327			/* Here we loop until the set is no longer halted */
4328			while (md_set[setno].s_status & MD_SET_HALTED) {
4329				cv_wait(&md_cv, &md_mx);
4330			}
4331			mutex_exit(&md_mx);
4332		}
4333	}
4334
4335	if ((flag & MD_IO_COUNTED) == 0) {
4336		if ((flag & MD_NOBLOCK) == 0) {
4337			if (md_inc_iocount(setno) != 0) {
4338				bp->b_flags |= B_ERROR;
4339				bp->b_error = ENXIO;
4340				bp->b_resid = bp->b_bcount;
4341				biodone(bp);
4342				return;
4343			}
4344		} else {
4345			md_inc_iocount_noblock(setno);
4346		}
4347	}
4348
4349	if (bp->b_flags & B_READ)
4350		mirror_read_strategy(bp, flag, private);
4351	else
4352		mirror_write_strategy(bp, flag, private);
4353}
4354
4355/*
4356 * mirror_directed_read:
4357 * --------------------
4358 * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror
4359 * so that the application can determine what (if any) resync needs to be
4360 * performed. The data is copied out to the user-supplied buffer.
4361 *
4362 * Parameters:
4363 *	mdev	- dev_t for the mirror device
4364 *	vdr	- directed read parameters specifying location and submirror
4365 *		  to perform the read from
4366 *	mode	- used to ddi_copyout() any resulting data from the read
4367 *
4368 * Returns:
4369 *	0	success
4370 *	!0	error code
4371 *		EINVAL - invalid request format
4372 */
4373int
4374mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode)
4375{
4376	buf_t		*bp;
4377	minor_t		mnum = getminor(mdev);
4378	mdi_unit_t	*ui = MDI_UNIT(mnum);
4379	mm_unit_t	*un;
4380	mm_submirror_t	*sm;
4381	char		*sm_nm;
4382	uint_t		next_side;
4383	void		*kbuffer;
4384
4385	if (ui == NULL)
4386		return (ENXIO);
4387
4388	if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) {
4389		return (EINVAL);
4390	}
4391
4392	/* Check for aligned block access. We disallow non-aligned requests. */
4393	if (vdr->vdr_offset % DEV_BSIZE) {
4394		return (EINVAL);
4395	}
4396
4397	/*
4398	 * Allocate kernel buffer for target of read(). If we had a reliable
4399	 * (sorry functional) DDI this wouldn't be needed.
4400	 */
4401	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
4402	if (kbuffer == NULL) {
4403		cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx"
4404		    " bytes\n", vdr->vdr_nbytes);
4405		return (ENOMEM);
4406	}
4407
4408	bp = getrbuf(KM_SLEEP);
4409
4410	bp->b_un.b_addr = kbuffer;
4411	bp->b_flags = B_READ;
4412	bp->b_bcount = vdr->vdr_nbytes;
4413	bp->b_lblkno = lbtodb(vdr->vdr_offset);
4414	bp->b_edev = mdev;
4415
4416	un = md_unit_readerlock(ui);
4417
4418	/*
4419	 * If DKV_SIDE_INIT is set we need to determine the first available
4420	 * side to start reading from. If it isn't set we increment to the
4421	 * next readable submirror.
4422	 * If there are no readable submirrors we error out with DKV_DMR_ERROR.
4423	 * Note: we check for a readable submirror on completion of the i/o so
4424	 * we should _always_ have one available. If this becomes unavailable
4425	 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if
4426	 * a metadetach is made between the completion of one DKIOCDMR ioctl
4427	 * and the start of the next (i.e. a sys-admin 'accident' occurred).
4428	 * The chance of this is small, but not non-existent.
4429	 */
4430	if (vdr->vdr_side == DKV_SIDE_INIT) {
4431		next_side = 0;
4432	} else {
4433		next_side = vdr->vdr_side + 1;
4434	}
4435	while ((next_side < NMIRROR) &&
4436	    !SUBMIRROR_IS_READABLE(un, next_side))
4437		next_side++;
4438	if (next_side >= NMIRROR) {
4439		vdr->vdr_flags |= DKV_DMR_ERROR;
4440		freerbuf(bp);
4441		vdr->vdr_bytesread = 0;
4442		md_unit_readerexit(ui);
4443		return (0);
4444	}
4445
4446	/* Set the side to read from */
4447	un->un_dmr_last_read = next_side;
4448
4449	md_unit_readerexit(ui);
4450
4451	/*
4452	 * Save timestamp for verification purposes. Can be read by debugger
4453	 * to verify that this ioctl has been executed and to find the number
4454	 * of DMR reads and the time of the last DMR read.
4455	 */
4456	uniqtime(&mirror_dmr_stats.dmr_timestamp);
4457	mirror_dmr_stats.dmr_count++;
4458
4459	/* Issue READ request and wait for completion */
4460	mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL);
4461
4462	mutex_enter(&un->un_dmr_mx);
4463	cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
4464	mutex_exit(&un->un_dmr_mx);
4465
4466	/*
4467	 * Check to see if we encountered an error during the read. If so we
4468	 * can make no guarantee about any possibly returned data.
4469	 */
4470	if ((bp->b_flags & B_ERROR) == 0) {
4471		vdr->vdr_flags &= ~DKV_DMR_ERROR;
4472		if (bp->b_resid) {
4473			vdr->vdr_flags |= DKV_DMR_SHORT;
4474			vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid;
4475		} else {
4476			vdr->vdr_flags |= DKV_DMR_SUCCESS;
4477			vdr->vdr_bytesread = vdr->vdr_nbytes;
4478		}
4479		/* Copy the data read back out to the user supplied buffer */
4480		if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread,
4481		    mode)) {
4482			kmem_free(kbuffer, vdr->vdr_nbytes);
4483			return (EFAULT);
4484		}
4485
4486	} else {
4487		/* Error out with DKV_DMR_ERROR */
4488		vdr->vdr_flags |= DKV_DMR_ERROR;
4489		vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE);
4490	}
4491	/*
4492	 * Update the DMR parameters with the side and name of submirror that
4493	 * we have just read from (un->un_dmr_last_read)
4494	 */
4495	un = md_unit_readerlock(ui);
4496
4497	vdr->vdr_side = un->un_dmr_last_read;
4498	sm = &un->un_sm[un->un_dmr_last_read];
4499	sm_nm = md_shortname(md_getminor(sm->sm_dev));
4500
4501	(void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name));
4502
4503	/*
4504	 * Determine if we've completed the read cycle. This is true iff the
4505	 * next computed submirror (side) equals or exceeds NMIRROR. We cannot
4506	 * use un_nsm as we need to handle a sparse array of submirrors (which
4507	 * can occur if a submirror is metadetached).
4508	 */
4509	next_side = un->un_dmr_last_read + 1;
4510	while ((next_side < NMIRROR) &&
4511	    !SUBMIRROR_IS_READABLE(un, next_side))
4512		next_side++;
4513	if (next_side >= NMIRROR) {
4514		/* We've finished */
4515		vdr->vdr_flags |= DKV_DMR_DONE;
4516	}
4517
4518	md_unit_readerexit(ui);
4519	freerbuf(bp);
4520	kmem_free(kbuffer, vdr->vdr_nbytes);
4521
4522	return (0);
4523}
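
/*
 * Illustrative user-level use of DKIOCDMR (a sketch only, not part of the
 * driver). The caller starts with vdr_side set to DKV_SIDE_INIT and loops,
 * with DKV_DMR_NEXT_SIDE set on each call, until DKV_DMR_DONE is returned;
 * on each pass the driver fills in vdr_side/vdr_side_name with the submirror
 * just read. Here fd is assumed to be an open descriptor for the mirror
 * metadevice and the structure/flag definitions to come from <sys/dkio.h>.
 *
 *	vol_directed_rd_t	vdr;
 *	char			*buf = malloc(nbytes);
 *
 *	(void) memset(&vdr, 0, sizeof (vdr));
 *	vdr.vdr_side = DKV_SIDE_INIT;
 *	vdr.vdr_offset = offset;	// must be DEV_BSIZE aligned
 *	vdr.vdr_nbytes = nbytes;
 *	vdr.vdr_data = buf;
 *	do {
 *		vdr.vdr_flags = DKV_DMR_NEXT_SIDE;
 *		if (ioctl(fd, DKIOCDMR, &vdr) != 0)
 *			break;			// ioctl failure
 *		if (vdr.vdr_flags & DKV_DMR_ERROR)
 *			break;			// no readable submirror
 *		// vdr.vdr_bytesread bytes were read from vdr.vdr_side_name
 *	} while (!(vdr.vdr_flags & DKV_DMR_DONE));
 */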
4524
4525/*
4526 * mirror_resync_message:
4527 * ---------------------
4528 * Handle the multi-node resync messages that keep all nodes within a given
4529 * disk-set in sync with their view of a mirror's resync status.
4530 *
4531 * The message types dealt with are:
4532 * MD_MN_MSG_RESYNC_STARTING	- start a resync thread for a unit
4533 * MD_MN_MSG_RESYNC_NEXT	- specified next region to be resynced
4534 * MD_MN_MSG_RESYNC_FINISH	- stop the resync thread for a unit
4535 * MD_MN_MSG_RESYNC_PHASE_DONE	- end of a resync phase, opt, submirror or comp
4536 *
4537 * Returns:
4538 *	0	Success
4539 *	>0	Failure error number
4540 */
4541int
4542mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
4543{
4544	mdi_unit_t		*ui;
4545	mm_unit_t		*un;
4546	set_t			setno;
4547	int			is_ABR;
4548	int			smi;
4549	int			ci;
4550	sm_state_t		state;
4551	int			broke_out;
4552	mm_submirror_t		*sm;
4553	mm_submirror_ic_t	*smic;
4554	md_m_shared_t		*shared;
4555	md_error_t		mde = mdnullerror;
4556	md_mps_t		*ps;
4557	int			rs_active;
4558	int			rr, rr_start, rr_end;
4559
4560	/* Check that the given device is part of a multi-node set */
4561	setno = MD_MIN2SET(p->mnum);
4562	if (setno >= md_nsets) {
4563		return (ENXIO);
4564	}
4565	if (!MD_MNSET_SETNO(setno)) {
4566		return (EINVAL);
4567	}
4568
4569	if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
4570		return (EINVAL);
4571	if ((ui = MDI_UNIT(p->mnum)) == NULL)
4572		return (EINVAL);
4573	is_ABR = (ui->ui_tstate & MD_ABR_CAP);
4574
4575	/* Obtain the current resync status */
4576	(void) md_ioctl_readerlock(lockp, ui);
4577	rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0;
4578	md_ioctl_readerexit(lockp);
4579
4580	switch ((md_mn_msgtype_t)p->msg_type) {
4581	case MD_MN_MSG_RESYNC_STARTING:
4582		/* Start the resync thread for the mirror */
4583		(void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp);
4584		break;
4585
4586	case MD_MN_MSG_RESYNC_NEXT:
4587		/*
4588		 * We have to release any previously marked overlap regions
4589		 * so that i/o can resume. Then we need to block the region
4590		 * from [rs_start..rs_start+rs_size) so that no i/o is issued.
4591		 * Update un_rs_resync_done and un_rs_resync_2_do.
4592		 */
4593		(void) md_ioctl_readerlock(lockp, ui);
4594		/*
4595		 * Ignore the message if there is no active resync thread or
4596		 * if it is for a resync type that we have already completed.
4597		 * un_resync_completed is set to the last resync completed
4598		 * when processing a PHASE_DONE message.
4599		 */
4600		if (!rs_active || (p->rs_type == un->un_resync_completed))
4601			break;
4602		/*
4603		 * If this message is for the same resync and is for an earlier
4604		 * resync region, just ignore it. This can only occur if this
4605		 * node has progressed on to the next resync region before
4606		 * we receive this message. This can happen if the class for
4607		 * this message is busy and the originator has to retry, thus
4608		 * allowing this node to move on to the next resync region.
4609		 */
4610		if ((p->rs_type == un->un_rs_type) &&
4611		    (p->rs_start < un->un_resync_startbl))
4612			break;
4613		ps = un->un_rs_prev_overlap;
4614
4615		/* Allocate previous overlap reference if needed */
4616		if (ps == NULL) {
4617			ps = kmem_cache_alloc(mirror_parent_cache,
4618			    MD_ALLOCFLAGS);
4619			ps->ps_un = un;
4620			ps->ps_ui = ui;
4621			ps->ps_firstblk = 0;
4622			ps->ps_lastblk = 0;
4623			ps->ps_flags = 0;
4624			md_ioctl_readerexit(lockp);
4625			(void) md_ioctl_writerlock(lockp, ui);
4626			un->un_rs_prev_overlap = ps;
4627			md_ioctl_writerexit(lockp);
4628		} else
4629			md_ioctl_readerexit(lockp);
4630
4631		if (p->rs_originator != md_mn_mynode_id) {
4632			/*
4633			 * Clear our un_resync_bm for the regions completed.
4634			 * The owner (originator) will take care of itself.
4635			 */
4636			BLK_TO_RR(rr_end, ps->ps_lastblk, un);
4637			BLK_TO_RR(rr_start, p->rs_start, un);
4638			if (ps->ps_lastblk && rr_end < rr_start) {
4639				BLK_TO_RR(rr_start, ps->ps_firstblk, un);
4640				mutex_enter(&un->un_resync_mx);
4641				/*
4642				 * Update our resync bitmap to reflect that
4643				 * another node has synchronized this range.
4644				 */
4645				for (rr = rr_start; rr <= rr_end; rr++) {
4646					CLR_KEEPDIRTY(rr, un);
4647				}
4648				mutex_exit(&un->un_resync_mx);
4649			}
4650
4651			/*
4652			 * On all but the originating node, first update
4653			 * the resync state, then unblock the previous
4654			 * region and block the next one. No need
4655			 * to do this if the region is already blocked.
4656			 * Update the submirror state and flags from the
4657			 * originator. This keeps the cluster in sync with
4658			 * regards to the resync status.
4659			 */
4660
4661			(void) md_ioctl_writerlock(lockp, ui);
4662			un->un_rs_resync_done = p->rs_done;
4663			un->un_rs_resync_2_do = p->rs_2_do;
4664			un->un_rs_type = p->rs_type;
4665			un->un_resync_startbl = p->rs_start;
4666			md_ioctl_writerexit(lockp);
4667			/*
4668			 * Use un_owner_mx to ensure that an ownership change
4669			 * cannot happen at the same time as this message
4670			 */
4671			mutex_enter(&un->un_owner_mx);
4672			if (MD_MN_MIRROR_OWNER(un)) {
4673				ps->ps_firstblk = p->rs_start;
4674				ps->ps_lastblk = ps->ps_firstblk +
4675				    p->rs_size - 1;
4676			} else {
4677				if ((ps->ps_firstblk != p->rs_start) ||
4678				    (ps->ps_lastblk != p->rs_start +
4679				    p->rs_size - 1)) {
4680					/* Remove previous overlap range */
4681					if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4682						mirror_overlap_tree_remove(ps);
4683
4684					ps->ps_firstblk = p->rs_start;
4685					ps->ps_lastblk = ps->ps_firstblk +
4686					    p->rs_size - 1;
4687
4688					mutex_exit(&un->un_owner_mx);
4689					/* Block this range from all i/o. */
4690					if (ps->ps_firstblk != 0 ||
4691					    ps->ps_lastblk != 0)
4692						wait_for_overlaps(ps,
4693						    MD_OVERLAP_ALLOW_REPEAT);
4694					mutex_enter(&un->un_owner_mx);
4695					/*
4696					 * Check to see if we have obtained
4697					 * ownership while waiting for
4698					 * overlaps. If we have, remove
4699					 * the resync_region entry from the
4700					 * overlap tree
4701					 */
4702					if (MD_MN_MIRROR_OWNER(un) &&
4703					    (ps->ps_flags & MD_MPS_ON_OVERLAP))
4704						mirror_overlap_tree_remove(ps);
4705				}
4706			}
4707			mutex_exit(&un->un_owner_mx);
4708
4709			/*
4710			 * If this is the first RESYNC_NEXT message (i.e.
4711			 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
4712			 * issue RESYNC_START NOTIFY event
4713			 */
4714			if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
4715				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
4716				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4717				    MD_SID(un));
4718			}
4719
4720			/* Ensure that our local resync thread is running */
4721			if (un->un_rs_thread == NULL) {
4722				(void) mirror_resync_unit(p->mnum, NULL,
4723				    &p->mde, lockp);
4724			}
4725		}
4726
4727		break;
4728	case MD_MN_MSG_RESYNC_FINISH:
4729		/*
4730		 * Complete the resync by stopping the resync thread.
4731		 * Also release the previous overlap region field.
4732		 * Update the resync_progress_thread by cv_signal'ing it so
4733		 * that we mark the end of the resync as soon as possible. This
4734		 * avoids an unnecessary delay should we panic after resync
4735		 * completion.
4736		 */
4737#ifdef DEBUG
4738		if (!rs_active) {
4739			if (mirror_debug_flag)
4740				printf("RESYNC_FINISH (mnum = %x), "
4741				    "Resync *NOT* active",
4742				    p->mnum);
4743		}
4744#endif
4745
4746		if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
4747		    (p->rs_originator != md_mn_mynode_id)) {
4748			mutex_enter(&un->un_rs_thread_mx);
4749			un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
4750			un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
4751			un->un_rs_thread_flags &=
4752			    ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
4753			cv_signal(&un->un_rs_thread_cv);
4754			mutex_exit(&un->un_rs_thread_mx);
4755		}
4756		if (is_ABR) {
4757			/* Resync finished, if ABR set owner to NULL */
4758			mutex_enter(&un->un_owner_mx);
4759			un->un_mirror_owner = 0;
4760			mutex_exit(&un->un_owner_mx);
4761		}
4762		(void) md_ioctl_writerlock(lockp, ui);
4763		ps = un->un_rs_prev_overlap;
4764		if (ps != NULL) {
4765			/* Remove previous overlap range */
4766			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4767				mirror_overlap_tree_remove(ps);
4768			/*
4769			 * Release the overlap range reference
4770			 */
4771			un->un_rs_prev_overlap = NULL;
4772			kmem_cache_free(mirror_parent_cache,
4773			    ps);
4774		}
4775		md_ioctl_writerexit(lockp);
4776
4777		/* Mark the resync as complete in the metadb */
4778		un->un_rs_resync_done = p->rs_done;
4779		un->un_rs_resync_2_do = p->rs_2_do;
4780		un->un_rs_type = p->rs_type;
4781		mutex_enter(&un->un_rs_progress_mx);
4782		cv_signal(&un->un_rs_progress_cv);
4783		mutex_exit(&un->un_rs_progress_mx);
4784
4785		un = md_ioctl_writerlock(lockp, ui);
4786		un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
4787		/* Deal with any pending grow_unit */
4788		if (un->c.un_status & MD_UN_GROW_PENDING) {
4789			if ((mirror_grow_unit(un, &mde) != 0) ||
4790			    (! mdismderror(&mde, MDE_GROW_DELAYED))) {
4791				un->c.un_status &= ~MD_UN_GROW_PENDING;
4792			}
4793		}
4794		md_ioctl_writerexit(lockp);
4795		break;
4796
4797	case MD_MN_MSG_RESYNC_PHASE_DONE:
4798		/*
4799		 * A phase of the resync (optimized, component or
4800		 * submirror) is complete. Update mirror status.
4801		 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the
4802		 * mirror owner is performing a resync. If we have just snarfed
4803		 * this set, then we must clear any of the flags set at snarf
4804		 * time by unit_setup_resync().
4805		 * Note that unit_setup_resync() sets up these flags to
4806		 * indicate that an optimized resync is required. These flags
4807		 * need to be reset because if we get here,  the mirror owner
4808		 * will have handled the optimized resync.
4809		 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and
4810		 * MD_UN_WAR. In addition, for each submirror,
4811		 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
4812		 * set to SMS_OFFLINE.
4813		 */
4814#ifdef DEBUG
4815		if (mirror_debug_flag)
4816			printf("phase done mess received from %d, mnum=%x,"
4817			    "type=%x, flags=%x\n", p->rs_originator, p->mnum,
4818			    p->rs_type, p->rs_flags);
4819#endif
4820		/*
4821		 * Ignore the message if there is no active resync thread.
4822		 */
4823		if (!rs_active)
4824			break;
4825
4826		broke_out = p->rs_flags & MD_MN_RS_ERR;
4827		switch (RS_TYPE(p->rs_type)) {
4828		case MD_RS_OPTIMIZED:
4829			un = md_ioctl_writerlock(lockp, ui);
4830			if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4831				/* If we are originator, just clear rs_type */
4832				if (p->rs_originator == md_mn_mynode_id) {
4833					SET_RS_TYPE_NONE(un->un_rs_type);
4834					md_ioctl_writerexit(lockp);
4835					break;
4836				}
4837				/*
4838				 * If CLEAR_OPT_NOT_DONE is set, only clear the
4839				 * flags if OPT_NOT_DONE is set *and* rs_type
4840				 * is MD_RS_NONE.
4841				 */
4842				if ((un->c.un_status & MD_UN_OPT_NOT_DONE) &&
4843				    (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) {
4844					/* No resync in progress */
4845					un->c.un_status &= ~MD_UN_OPT_NOT_DONE;
4846					un->c.un_status &= ~MD_UN_WAR;
4847				} else {
4848					/*
4849					 * We are in the middle of an
4850					 * optimized resync and this message
4851					 * should be ignored.
4852					 */
4853					md_ioctl_writerexit(lockp);
4854					break;
4855				}
4856			} else {
4857				/*
4858				 * This is the end of an optimized resync,
4859				 * clear the OPT_NOT_DONE and OFFLINE_SM flags
4860				 */
4861
4862				un->c.un_status &= ~MD_UN_KEEP_DIRTY;
4863				if (!broke_out)
4864					un->c.un_status &= ~MD_UN_WAR;
4865
4866				/*
4867				 * Clear our un_resync_bm for the regions
4868				 * completed.  The owner (originator) will
4869				 * take care of itself.
4870				 */
4871				if (p->rs_originator != md_mn_mynode_id &&
4872				    (ps = un->un_rs_prev_overlap) != NULL) {
4873					BLK_TO_RR(rr_start, ps->ps_firstblk,
4874					    un);
4875					BLK_TO_RR(rr_end, ps->ps_lastblk, un);
4876					mutex_enter(&un->un_resync_mx);
4877					for (rr = rr_start; rr <= rr_end;
4878					    rr++) {
4879						CLR_KEEPDIRTY(rr, un);
4880					}
4881					mutex_exit(&un->un_resync_mx);
4882				}
4883			}
4884
4885			/*
4886			 * Set resync_completed to last resync type and then
4887			 * clear resync_type to indicate no resync in progress
4888			 */
4889			un->un_resync_completed = un->un_rs_type;
4890			SET_RS_TYPE_NONE(un->un_rs_type);
4891
4892			/*
4893			 * If resync is as a result of a submirror ONLINE,
4894			 * reset the submirror state to SMS_RUNNING if the
4895			 * resync was ok else set back to SMS_OFFLINE.
4896			 */
4897			for (smi = 0; smi < NMIRROR; smi++) {
4898				un->un_sm[smi].sm_flags &=
4899				    ~MD_SM_RESYNC_TARGET;
4900				if (SMS_BY_INDEX_IS(un, smi,
4901				    SMS_OFFLINE_RESYNC)) {
4902					if (p->rs_flags &
4903					    MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4904						state = SMS_OFFLINE;
4905					} else {
4906						state = (broke_out ?
4907						    SMS_OFFLINE : SMS_RUNNING);
4908					}
4909					mirror_set_sm_state(
4910					    &un->un_sm[smi],
4911					    &un->un_smic[smi], state,
4912					    broke_out);
4913					mirror_commit(un, NO_SUBMIRRORS,
4914					    0);
4915				}
4916				/*
4917				 * If we still have an offline submirror, reset
4918				 * the OFFLINE_SM flag in the mirror status
4919				 */
4920				if (SMS_BY_INDEX_IS(un, smi,
4921				    SMS_OFFLINE))
4922					un->c.un_status |=
4923					    MD_UN_OFFLINE_SM;
4924			}
4925			md_ioctl_writerexit(lockp);
4926			break;
4927		case MD_RS_SUBMIRROR:
4928			un = md_ioctl_writerlock(lockp, ui);
4929			smi = RS_SMI(p->rs_type);
4930			sm = &un->un_sm[smi];
4931			smic = &un->un_smic[smi];
4932			/* Clear RESYNC target */
4933			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
4934			/*
4935			 * Set resync_completed to last resync type and then
4936			 * clear resync_type to indicate no resync in progress
4937			 */
4938			un->un_resync_completed = un->un_rs_type;
4939			SET_RS_TYPE_NONE(un->un_rs_type);
4940			/*
4941			 * If the resync completed ok reset the submirror
4942			 * state to SMS_RUNNING else reset it to SMS_ATTACHED
4943			 */
4944			state = (broke_out ?
4945			    SMS_ATTACHED : SMS_RUNNING);
4946			mirror_set_sm_state(sm, smic, state, broke_out);
4947			un->c.un_status &= ~MD_UN_WAR;
4948			mirror_commit(un, SMI2BIT(smi), 0);
4949			md_ioctl_writerexit(lockp);
4950			break;
4951		case MD_RS_COMPONENT:
4952			un = md_ioctl_writerlock(lockp, ui);
4953			smi = RS_SMI(p->rs_type);
4954			ci = RS_CI(p->rs_type);
4955			sm = &un->un_sm[smi];
4956			smic = &un->un_smic[smi];
4957			shared = (md_m_shared_t *)
4958			    (*(smic->sm_shared_by_indx))
4959			    (sm->sm_dev, sm, ci);
4960			un->c.un_status &= ~MD_UN_WAR;
4961			/* Clear RESYNC target */
4962			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
4963			/*
4964			 * Set resync_completed to last resync type and then
4965			 * clear resync_type to indicate no resync in progress
4966			 */
4967			un->un_resync_completed = un->un_rs_type;
4968			SET_RS_TYPE_NONE(un->un_rs_type);
4969
4970			/*
4971			 * If the resync completed ok, set the component state
4972			 * to CS_OKAY.
4973			 */
4974			if (broke_out)
4975				shared->ms_flags |= MDM_S_RS_TRIED;
4976			else {
4977				/*
4978				 * As we don't transmit the changes,
4979				 * no need to drop the lock.
4980				 */
4981				set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
4982				    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
4983			}
4984			md_ioctl_writerexit(lockp);
4985		default:
4986			break;
4987		}
4988		/*
4989		 * If the purpose of this PHASE_DONE message is just to
4990		 * indicate to all other nodes that the optimized resync
4991		 * required (OPT_NOT_DONE) flag is to be cleared, there is
4992		 * no need to generate a notify event as there has not
4993		 * actually been a resync.
4994		 */
4995		if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) {
4996			if (broke_out) {
4997				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
4998				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4999				    MD_SID(un));
5000			} else {
5001				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
5002				    SVM_TAG_METADEVICE, MD_UN2SET(un),
5003				    MD_SID(un));
5004			}
5005		}
5006		break;
5007
5008	default:
5009#ifdef DEBUG
5010		cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type"
5011		    " %x\n", p->msg_type);
5012#endif
5013		return (EINVAL);
5014	}
5015	return (0);
5016}
5017
5018/* Return -1 if snarf of an optimized record failed and the set should be released */
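/*
 * mirror_snarf:
 * ------------
 * Build the incore state for all mirror records in the given set. For
 * MD_SNARF_CLEANUP, records marked MD_PRV_CLEANUP are torn down. Otherwise
 * each MIRROR_REC is converted from the old small (32-bit) on-disk form to
 * the incore form if necessary, a minor node and incore unit are created,
 * and the resync timeout is started. Any RESYNC_REC left unclaimed once all
 * mirrors have been snarfed is marked for deletion.
 */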
5019static int
5020mirror_snarf(md_snarfcmd_t cmd, set_t setno)
5021{
5022	mddb_recid_t	recid;
5023	int		gotsomething;
5024	int		all_mirrors_gotten;
5025	mm_unit_t	*un;
5026	mddb_type_t	typ1;
5027	mddb_de_ic_t    *dep;
5028	mddb_rb32_t	*rbp;
5029	size_t		newreqsize;
5030	mm_unit_t	*big_un;
5031	mm_unit32_od_t	*small_un;
5032	int		retval;
5033	mdi_unit_t	*ui;
5034
5035	if (cmd == MD_SNARF_CLEANUP) {
5036		if (md_get_setstatus(setno) & MD_SET_STALE)
5037			return (0);
5038
5039		recid = mddb_makerecid(setno, 0);
5040		typ1 = (mddb_type_t)md_getshared_key(setno,
5041		    mirror_md_ops.md_driver.md_drivername);
5042		while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5043			if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
5044				un = (mm_unit_t *)mddb_getrecaddr(recid);
5045				mirror_cleanup(un);
5046				recid = mddb_makerecid(setno, 0);
5047			}
5048		}
5049		return (0);
5050	}
5051
5052	all_mirrors_gotten = 1;
5053	gotsomething = 0;
5054
5055	recid = mddb_makerecid(setno, 0);
5056	typ1 = (mddb_type_t)md_getshared_key(setno,
5057	    mirror_md_ops.md_driver.md_drivername);
5058
5059	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5060		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5061			continue;
5062
5063		dep = mddb_getrecdep(recid);
5064		dep->de_flags = MDDB_F_MIRROR;
5065		rbp = dep->de_rb;
5066
5067		switch (rbp->rb_revision) {
5068		case MDDB_REV_RB:
5069		case MDDB_REV_RBFN:
5070			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
5071				/*
5072				 * This means, we have an old and small
5073				 * record and this record hasn't already
5074				 * been converted.  Before we create an
5075				 * incore metadevice from this we have to
5076				 * convert it to a big record.
5077				 */
5078				small_un =
5079				    (mm_unit32_od_t *)mddb_getrecaddr(recid);
5080				newreqsize = sizeof (mm_unit_t);
5081				big_un = (mm_unit_t *)kmem_zalloc(newreqsize,
5082				    KM_SLEEP);
5083				mirror_convert((caddr_t)small_un,
5084				    (caddr_t)big_un, SMALL_2_BIG);
5085				kmem_free(small_un, dep->de_reqsize);
5086
5087				/*
5088				 * Update userdata and incore userdata;
5089				 * incores are at the end of un.
5090				 */
5091				dep->de_rb_userdata_ic = big_un;
5092				dep->de_rb_userdata = big_un;
5093				dep->de_icreqsize = newreqsize;
5094				un = big_un;
5095				rbp->rb_private |= MD_PRV_CONVD;
5096			} else {
5097				/*
5098				 * Unit already converted, just get the
5099				 * record address.
5100				 */
5101				un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5102				    sizeof (*un), 0);
5103			}
5104			un->c.un_revision &= ~MD_64BIT_META_DEV;
5105			break;
5106		case MDDB_REV_RB64:
5107		case MDDB_REV_RB64FN:
5108			/* Big device */
5109			un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5110			    sizeof (*un), 0);
5111			un->c.un_revision |= MD_64BIT_META_DEV;
5112			un->c.un_flag |= MD_EFILABEL;
5113			break;
5114		}
5115		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
5116
5117		/*
5118		 * Create minor device node for snarfed entry.
5119		 */
5120		(void) md_create_minor_node(setno, MD_SID(un));
5121
5122		if (MD_UNIT(MD_SID(un)) != NULL) {
5123			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5124			continue;
5125		}
5126		all_mirrors_gotten = 0;
5127		retval = mirror_build_incore(un, 1);
5128		if (retval == 0) {
5129			mddb_setrecprivate(recid, MD_PRV_GOTIT);
5130			md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
5131			resync_start_timeout(setno);
5132			gotsomething = 1;
5133		} else {
5134			return (retval);
5135		}
5136		/*
5137		 * Set flag to indicate that the mirror has not yet
5138		 * been through a reconfig. This flag is used for MN sets
5139		 * when determining whether to update the mirror state from
5140		 * the Master node.
5141		 */
5142		if (MD_MNSET_SETNO(setno)) {
5143			ui = MDI_UNIT(MD_SID(un));
5144			ui->ui_tstate |= MD_RESYNC_NOT_DONE;
5145		}
5146	}
5147
5148	if (!all_mirrors_gotten)
5149		return (gotsomething);
5150
5151	recid = mddb_makerecid(setno, 0);
5152	while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
5153		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
5154			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5155
5156	return (0);
5157}
5158
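/*
 * mirror_halt:
 * -----------
 * Halt processing for the mirror driver. MD_HALT_CHECK returns non-zero if
 * any mirror unit in the set is still open; MD_HALT_DOIT resets every
 * mirror unit in the set and then, for the local set or if any mirror was
 * present, waits for the global dr_timeout to finish. The CLOSE, OPEN and
 * UNLOAD sub-commands are no-ops.
 */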
5159static int
5160mirror_halt(md_haltcmd_t cmd, set_t setno)
5161{
5162	unit_t		i;
5163	mdi_unit_t	*ui;
5164	minor_t		mnum;
5165	int		reset_mirror_flag = 0;
5166
5167	if (cmd == MD_HALT_CLOSE)
5168		return (0);
5169
5170	if (cmd == MD_HALT_OPEN)
5171		return (0);
5172
5173	if (cmd == MD_HALT_UNLOAD)
5174		return (0);
5175
5176	if (cmd == MD_HALT_CHECK) {
5177		for (i = 0; i < md_nunits; i++) {
5178			mnum = MD_MKMIN(setno, i);
5179			if ((ui = MDI_UNIT(mnum)) == NULL)
5180				continue;
5181			if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5182				continue;
5183			if (md_unit_isopen(ui))
5184				return (1);
5185		}
5186		return (0);
5187	}
5188
5189	if (cmd != MD_HALT_DOIT)
5190		return (1);
5191
5192	for (i = 0; i < md_nunits; i++) {
5193		mnum = MD_MKMIN(setno, i);
5194		if ((ui = MDI_UNIT(mnum)) == NULL)
5195			continue;
5196		if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5197			continue;
5198		reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0);
5199
5200		/* Set a flag if there is at least one mirror metadevice. */
5201		reset_mirror_flag = 1;
5202	}
5203
5204	/*
5205	 * Only wait for the global dr_timeout to finish
5206	 *  - if there are mirror metadevices in this diskset or
5207	 *  - if this is the local set since an unload of the md_mirror
5208	 *    driver could follow a successful mirror halt in the local set.
5209	 */
5210	if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) {
5211		while ((mirror_md_ops.md_head == NULL) &&
5212		    (mirror_timeout.dr_timeout_id != 0))
5213			delay(md_hz);
5214	}
5215
5216	return (0);
5217}
5218
5219/*ARGSUSED3*/
5220static int
5221mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
5222{
5223	IOLOCK	lock;
5224	minor_t		mnum = getminor(*dev);
5225	set_t		setno;
5226
5227	/*
5228	 * When doing an open of a multi owner metadevice, check to see if this
5229	 * node is a starting node and if a reconfig cycle is underway.
5230	 * If so, the system isn't sufficiently set up to handle the
5231	 * open (which involves I/O during sp_validate), so fail with ENXIO.
5232	 */
5233	setno = MD_MIN2SET(mnum);
5234	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
5235	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
5236			return (ENXIO);
5237	}
5238
5239	if (md_oflags & MD_OFLG_FROMIOCTL) {
5240		/*
5241		 * This indicates that the caller is an ioctl service routine.
5242		 * In this case we initialise our stack-based IOLOCK and pass
5243		 * this into the internal open routine. This allows multi-owner
5244		 * metadevices to avoid deadlocking if an error is encountered
5245		 * during the open() attempt. The failure case is:
5246		 * s-p -> mirror -> s-p (with error). Attempting to metaclear
5247		 * this configuration would deadlock as the mirror code has to
5248		 * send a state-update to the other nodes when it detects the
5249		 * failure of the underlying submirror with an errored soft-part
5250		 * on it. As there is a class1 message in progress (metaclear)
5251		 * set_sm_comp_state() cannot send another class1 message;
5252		 * instead we do not send a state_update message as the
5253		 * metaclear is distributed and the failed submirror will be
5254		 * cleared from the configuration by the metaclear.
5255		 */
5256		IOLOCK_INIT(&lock);
5257		return (mirror_internal_open(getminor(*dev), flag, otyp,
5258		    md_oflags, &lock));
5259	} else {
5260		return (mirror_internal_open(getminor(*dev), flag, otyp,
5261		    md_oflags, (IOLOCK *)NULL));
5262	}
5263}
5264
5265
5266/*ARGSUSED1*/
5267static int
5268mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
5269{
5270	return (mirror_internal_close(getminor(dev), otyp, md_cflags,
5271	    (IOLOCK *)NULL));
5272}
5273
5274
5275/*
5276 * This routine dumps memory to the disk.  It assumes that the memory has
5277 * already been mapped into mainbus space.  It is called at disk interrupt
5278 * priority when the system is in trouble.
5279 *
5280 */
5281static int
5282mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
5283{
5284	mm_unit_t	*un;
5285	dev_t		mapdev;
5286	int		result;
5287	int		smi;
5288	int		any_succeed = 0;
5289	int		save_result = 0;
5290
5291	/*
5292	 * There is no need to grab the unit lock
5293	 * because nothing else is supposed to be happening.
5294	 * Also, dump is not supposed to sleep.
5295	 */
5296	un = (mm_unit_t *)MD_UNIT(getminor(dev));
5297
5298	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
5299		return (EINVAL);
5300
5301	if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
5302		return (EINVAL);
5303
5304	for (smi = 0; smi < NMIRROR; smi++) {
5305		if (!SUBMIRROR_IS_WRITEABLE(un, smi))
5306			continue;
5307		mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
5308		result = bdev_dump(mapdev, addr, blkno, nblk);
5309		if (result)
5310			save_result = result;
5311
5312		if (result == 0)
5313			any_succeed++;
5314	}
5315
5316	if (any_succeed)
5317		return (0);
5318
5319	return (save_result);
5320}
5321
5322/*
5323 * NAME: mirror_probe_dev
5324 *
5325 * DESCRIPTION: force-opens every component of a mirror.
5326 *
5327 * On entry the unit writerlock is held
5328 */
5329static int
5330mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
5331{
5332	int		i;
5333	int		smi;
5334	int		ci;
5335	mm_unit_t	*un;
5336	int		md_devopen = 0;
5337	set_t		setno;
5338	int		sm_cnt;
5339	int		sm_unavail_cnt;
5340
5341	if (md_unit_isopen(ui))
5342		md_devopen++;
5343
5344	un = MD_UNIT(mnum);
5345	setno = MD_UN2SET(un);
5346
5347	sm_cnt = 0;
5348	sm_unavail_cnt = 0;
5349	for (i = 0; i < NMIRROR; i++) {
5350		md_dev64_t tmpdev;
5351		mdi_unit_t	*sm_ui;
5352
5353		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
5354			continue;
5355		}
5356
5357		sm_cnt++;
5358		tmpdev = un->un_sm[i].sm_dev;
5359		(void) md_layered_open(mnum, &tmpdev,
5360		    MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
5361		un->un_sm[i].sm_dev = tmpdev;
5362
5363		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
5364
5365		/*
5366		 * Logic similar to that in mirror_open_all_devs.  We set or
5367		 * clear the submirror Unavailable bit.
5368		 */
5369		(void) md_unit_writerlock(sm_ui);
5370		if (submirror_unavailable(un, i, 1)) {
5371			sm_ui->ui_tstate |= MD_INACCESSIBLE;
5372			sm_unavail_cnt++;
5373		} else {
5374			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
5375		}
5376		md_unit_writerexit(sm_ui);
5377	}
5378
5379	/*
5380	 * If all of the submirrors are unavailable, the mirror is also
5381	 * unavailable.
5382	 */
5383	if (sm_cnt == sm_unavail_cnt) {
5384		ui->ui_tstate |= MD_INACCESSIBLE;
5385	} else {
5386		ui->ui_tstate &= ~MD_INACCESSIBLE;
5387	}
5388
5389	/*
5390	 * Start checking from probe failures. If failures occur we
5391	 * set the appropriate erred state only if the metadevice is in
5392	 * use. This is specifically to prevent unnecessary resyncs.
5393	 * For instance if the disks were accidentally disconnected when
5394	 * the system booted up then until the metadevice is accessed
5395	 * (like file system mount) the user can shutdown, recable and
5396	 * reboot w/o incurring a potentially huge resync.
5397	 */
5398
5399	smi = 0;
5400	ci = 0;
5401	while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {
5402
5403		if (mirror_other_sources(un, smi, ci, 0) == 1) {
5404			/*
5405			 * Note that for a MN set, there is no need to call
5406			 * SE_NOTIFY as that is done when processing the
5407			 * state change
5408			 */
5409			if (md_devopen) {
5410				/*
5411				 * Never called from ioctl context,
5412				 * so (IOLOCK *)NULL
5413				 */
5414				set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
5415				    0, MD_STATE_XMIT, (IOLOCK *)NULL);
5416				if (!MD_MNSET_SETNO(setno)) {
5417					SE_NOTIFY(EC_SVM_STATE,
5418					    ESC_SVM_LASTERRED,
5419					    SVM_TAG_METADEVICE, setno,
5420					    MD_SID(un));
5421				}
5422				continue;
5423			} else {
5424				(void) mirror_close_all_devs(un,
5425				    MD_OFLG_PROBEDEV);
5426				if (!MD_MNSET_SETNO(setno)) {
5427					SE_NOTIFY(EC_SVM_STATE,
5428					    ESC_SVM_OPEN_FAIL,
5429					    SVM_TAG_METADEVICE, setno,
5430					    MD_SID(un));
5431				}
5432				mirror_openfail_console_info(un, smi, ci);
5433				return (ENXIO);
5434			}
5435		}
5436
5437		/*
5438		 * Note that for a MN set, there is no need to call
5439		 * SE_NOTIFY as that is done when processing the
5440		 * state change
5441		 */
5442		if (md_devopen) {
5443			/* Never called from ioctl context, so (IOLOCK *)NULL */
5444			set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
5445			    MD_STATE_XMIT, (IOLOCK *)NULL);
5446			if (!MD_MNSET_SETNO(setno)) {
5447				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
5448				    SVM_TAG_METADEVICE, setno,
5449				    MD_SID(un));
5450			}
5451		}
5452		mirror_openfail_console_info(un, smi, ci);
5453		ci++;
5454	}
5455
5456	if (MD_MNSET_SETNO(setno)) {
5457		send_poke_hotspares(setno);
5458	} else {
5459		(void) poke_hotspares();
5460	}
5461	(void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);
5462
5463	return (0);
5464}
5465
5466
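/*
 * mirror_imp_set:
 * --------------
 * Fix up mirror records when a set is imported: for every MIRROR_REC the
 * submirror devices, the unit's self/parent minor numbers, and the unit and
 * un_rr_dirty_recid record ids are re-based on the imported set number.
 * Returns 1 if at least one record was updated, otherwise 0.
 */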
5467static int
5468mirror_imp_set(
5469	set_t	setno
5470)
5471{
5472
5473	mddb_recid_t	recid;
5474	int		gotsomething, i;
5475	mddb_type_t	typ1;
5476	mddb_de_ic_t	*dep;
5477	mddb_rb32_t	*rbp;
5478	mm_unit32_od_t	*un32;
5479	mm_unit_t	*un64;
5480	md_dev64_t	self_devt;
5481	minor_t		*self_id;	/* minor needs to be updated */
5482	md_parent_t	*parent_id;	/* parent needs to be updated */
5483	mddb_recid_t	*record_id;	/* record id needs to be updated */
5484	mddb_recid_t	*optrec_id;
5485	md_dev64_t	tmpdev;
5486
5487
5488	gotsomething = 0;
5489
5490	typ1 = (mddb_type_t)md_getshared_key(setno,
5491	    mirror_md_ops.md_driver.md_drivername);
5492	recid = mddb_makerecid(setno, 0);
5493
5494	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5495		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5496			continue;
5497
5498		dep = mddb_getrecdep(recid);
5499		rbp = dep->de_rb;
5500
5501		switch (rbp->rb_revision) {
5502		case MDDB_REV_RB:
5503		case MDDB_REV_RBFN:
5504			/*
5505			 * Small device
5506			 */
5507			un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
5508			self_id = &(un32->c.un_self_id);
5509			parent_id = &(un32->c.un_parent);
5510			record_id = &(un32->c.un_record_id);
5511			optrec_id = &(un32->un_rr_dirty_recid);
5512
5513			for (i = 0; i < un32->un_nsm; i++) {
5514				tmpdev = md_expldev(un32->un_sm[i].sm_dev);
5515				un32->un_sm[i].sm_dev = md_cmpldev
5516				    (md_makedevice(md_major, MD_MKMIN(setno,
5517				    MD_MIN2UNIT(md_getminor(tmpdev)))));
5518
5519				if (!md_update_minor(setno, mddb_getsidenum
5520				    (setno), un32->un_sm[i].sm_key))
5521				goto out;
5522			}
5523			break;
5524		case MDDB_REV_RB64:
5525		case MDDB_REV_RB64FN:
5526			un64 = (mm_unit_t *)mddb_getrecaddr(recid);
5527			self_id = &(un64->c.un_self_id);
5528			parent_id = &(un64->c.un_parent);
5529			record_id = &(un64->c.un_record_id);
5530			optrec_id = &(un64->un_rr_dirty_recid);
5531
5532			for (i = 0; i < un64->un_nsm; i++) {
5533				tmpdev = un64->un_sm[i].sm_dev;
5534				un64->un_sm[i].sm_dev = md_makedevice
5535				    (md_major, MD_MKMIN(setno, MD_MIN2UNIT
5536				    (md_getminor(tmpdev))));
5537
5538				if (!md_update_minor(setno, mddb_getsidenum
5539				    (setno), un64->un_sm[i].sm_key))
5540				goto out;
5541			}
5542			break;
5543		}
5544
5545		/*
5546		 * If this is a top level and a friendly name metadevice,
5547		 * update its minor in the namespace.
5548		 */
5549		if ((*parent_id == MD_NO_PARENT) &&
5550		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
5551		    (rbp->rb_revision == MDDB_REV_RB64FN))) {
5552
5553			self_devt = md_makedevice(md_major, *self_id);
5554			if (!md_update_top_device_minor(setno,
5555			    mddb_getsidenum(setno), self_devt))
5556				goto out;
5557		}
5558
5559		/*
5560		 * Update unit with the imported setno
5561		 *
5562		 */
5563		mddb_setrecprivate(recid, MD_PRV_GOTIT);
5564
5565		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
5566		if (*parent_id != MD_NO_PARENT)
5567			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
5568		*record_id = MAKERECID(setno, DBID(*record_id));
5569		*optrec_id = MAKERECID(setno, DBID(*optrec_id));
5570
5571		gotsomething = 1;
5572	}
5573
5574out:
5575	return (gotsomething);
5576}
5577
5578/*
5579 * NAME: mirror_check_offline
5580 *
5581 * DESCRIPTION: return offline_status = 1 if any submirrors are offline
5582 *
5583 * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is
5584 * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE
5585 * ioctl.
5586 */
5587int
5588mirror_check_offline(md_dev64_t dev, int *offline_status)
5589{
5590	mm_unit_t		*un;
5591	md_error_t		mde = mdnullerror;
5592
5593	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5594		return (EINVAL);
5595	*offline_status = 0;
5596	if (un->c.un_status & MD_UN_OFFLINE_SM)
5597		*offline_status = 1;
5598	return (0);
5599}
5600
5601/*
5602 * NAME: mirror_inc_abr_count
5603 *
5604 * DESCRIPTION: increment the count of layered soft parts with ABR set
5605 *
5606 * Called from ioctl, so access to un_abr_count is protected by the global
5607 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5608 */
5609int
5610mirror_inc_abr_count(md_dev64_t dev)
5611{
5612	mm_unit_t		*un;
5613	md_error_t		mde = mdnullerror;
5614
5615	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5616		return (EINVAL);
5617	un->un_abr_count++;
5618	return (0);
5619}
5620
5621/*
5622 * NAME: mirror_dec_abr_count
5623 *
5624 * DESCRIPTION: decrement the count of layered soft parts with ABR set
5625 *
5626 * Called from ioctl, so access to un_abr_count is protected by the global
5627 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5628 */
5629int
5630mirror_dec_abr_count(md_dev64_t dev)
5631{
5632	mm_unit_t		*un;
5633	md_error_t		mde = mdnullerror;
5634
5635	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5636		return (EINVAL);
5637	un->un_abr_count--;
5638	return (0);
5639}
5640
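/* Named services exported by the mirror driver, looked up by name. */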
5641static md_named_services_t mirror_named_services[] = {
5642	{(intptr_t (*)()) poke_hotspares,		"poke hotspares"    },
5643	{(intptr_t (*)()) mirror_rename_listkids,	MDRNM_LIST_URKIDS   },
5644	{mirror_rename_check,				MDRNM_CHECK	    },
5645	{(intptr_t (*)()) mirror_renexch_update_kids,	MDRNM_UPDATE_KIDS   },
5646	{(intptr_t (*)()) mirror_exchange_parent_update_to,
5647			MDRNM_PARENT_UPDATE_TO},
5648	{(intptr_t (*)()) mirror_exchange_self_update_from_down,
5649			MDRNM_SELF_UPDATE_FROM_DOWN },
5650	{(intptr_t (*)())mirror_probe_dev,		"probe open test" },
5651	{(intptr_t (*)())mirror_check_offline,		MD_CHECK_OFFLINE },
5652	{(intptr_t (*)())mirror_inc_abr_count,		MD_INC_ABR_COUNT },
5653	{(intptr_t (*)())mirror_dec_abr_count,		MD_DEC_ABR_COUNT },
5654	{ NULL,						0		    }
5655};
5656
5657md_ops_t mirror_md_ops = {
5658	mirror_open,		/* open */
5659	mirror_close,		/* close */
5660	md_mirror_strategy,	/* strategy */
5661	NULL,			/* print */
5662	mirror_dump,		/* dump */
5663	NULL,			/* read */
5664	NULL,			/* write */
5665	md_mirror_ioctl,	/* mirror_ioctl, */
5666	mirror_snarf,		/* mirror_snarf */
5667	mirror_halt,		/* mirror_halt */
5668	NULL,			/* aread */
5669	NULL,			/* awrite */
5670	mirror_imp_set,		/* import set */
5671	mirror_named_services
5672};
5673
5674/* module specific initialization */
5675static void
5676init_init()
5677{
5678	md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);
5679
5680	/* Initialize the parent and child save memory pools */
5681	mirror_parent_cache = kmem_cache_create("md_mirror_parent",
5682	    sizeof (md_mps_t), 0, mirror_parent_constructor,
5683	    mirror_parent_destructor, mirror_run_queue, NULL, NULL,
5684	    0);
5685
5686	mirror_child_cache = kmem_cache_create("md_mirror_child",
5687	    sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
5688	    mirror_child_constructor, mirror_child_destructor,
5689	    mirror_run_queue, NULL, NULL, 0);
5690
5691	/*
5692	 * Ensure wowbuf_size is a multiple of DEV_BSIZE,
5693	 * then initialize wowbuf memory pool.
5694	 */
5695	md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
5696	if (md_wowbuf_size <= 0)
5697		md_wowbuf_size = 2 * DEV_BSIZE;
5698	if (md_wowbuf_size > (32 * DEV_BSIZE))
5699		md_wowbuf_size = (32 * DEV_BSIZE);
5700
5701	md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
5702	mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
5703	    md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
5704
5705	mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5706	mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5707
5708	mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
5709}
5710
5711/* module specific uninitialization (undo init_init()) */
5712static void
5713fini_uninit()
5714{
5715	kmem_cache_destroy(mirror_parent_cache);
5716	kmem_cache_destroy(mirror_child_cache);
5717	kmem_cache_destroy(mirror_wowblk_cache);
5718	mirror_parent_cache = mirror_child_cache =
5719	    mirror_wowblk_cache = NULL;
5720
5721	mutex_destroy(&mirror_timeout.dr_mx);
5722	mutex_destroy(&hotspare_request.dr_mx);
5723	mutex_destroy(&non_ff_drv_mutex);
5724}
5725
5726/* define the module linkage */
5727MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit())
5728