mirror_ioctl.c revision 8452:89d32dfdae6e
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <sys/param.h>
28#include <sys/systm.h>
29#include <sys/conf.h>
30#include <sys/file.h>
31#include <sys/user.h>
32#include <sys/uio.h>
33#include <sys/t_lock.h>
34#include <sys/buf.h>
35#include <sys/dkio.h>
36#include <sys/vtoc.h>
37#include <sys/kmem.h>
38#include <vm/page.h>
39#include <sys/sysmacros.h>
40#include <sys/types.h>
41#include <sys/mkdev.h>
42#include <sys/stat.h>
43#include <sys/open.h>
44#include <sys/modctl.h>
45#include <sys/ddi.h>
46#include <sys/sunddi.h>
47
48#include <sys/lvm/mdvar.h>
49#include <sys/lvm/md_names.h>
50#include <sys/lvm/md_mddb.h>
51#include <sys/lvm/md_stripe.h>
52#include <sys/lvm/md_mirror.h>
53
54#include <sys/model.h>
55
56#include <sys/sysevent/eventdefs.h>
57#include <sys/sysevent/svm.h>
58#include <sys/lvm/mdmn_commd.h>
59
60extern int		md_status;
61extern kmutex_t		md_mx;
62extern kcondvar_t	md_cv;
63
64extern unit_t		md_nunits;
65extern set_t		md_nsets;
66extern md_set_t		md_set[];
67
68extern md_ops_t		mirror_md_ops;
69extern int		md_ioctl_cnt;
70extern md_krwlock_t	md_unit_array_rw;
71extern major_t		md_major;
72extern mdq_anchor_t	md_ff_daemonq;
73extern void		md_probe_one();
74extern void		mirror_openfail_console_info();
75
76#ifdef DEBUG
77extern int		mirror_debug_flag;
78#endif
79
80static void
81mirror_resume_writes(mm_unit_t *un)
82{
83	/*
84	 * Release the block on writes to the mirror and resume any blocked
85	 * resync thread.
86	 * This is only required for MN sets
87	 */
88	if (MD_MNSET_SETNO(MD_UN2SET(un))) {
89#ifdef DEBUG
90		if (mirror_debug_flag)
91			printf("mirror_resume_writes: mnum %x\n", MD_SID(un));
92#endif
93		mutex_enter(&un->un_suspend_wr_mx);
94		un->un_suspend_wr_flag = 0;
95		cv_broadcast(&un->un_suspend_wr_cv);
96		mutex_exit(&un->un_suspend_wr_mx);
97		mutex_enter(&un->un_rs_thread_mx);
98		un->un_rs_thread_flags &= ~MD_RI_BLOCK;
99		cv_signal(&un->un_rs_thread_cv);
100		mutex_exit(&un->un_rs_thread_mx);
101	}
102}
103
104mm_unit_t *
105mirror_getun(minor_t mnum, md_error_t *mde, int flags, IOLOCK *lock)
106{
107	mm_unit_t	*un;
108	mdi_unit_t	*ui;
109	set_t		setno = MD_MIN2SET(mnum);
110
111	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
112		(void) mdmderror(mde, MDE_INVAL_UNIT, mnum);
113		return (NULL);
114	}
115
116	if (!(flags & STALE_OK)) {
117		if (md_get_setstatus(setno) & MD_SET_STALE) {
118			(void) mdmddberror(mde, MDE_DB_STALE, mnum, setno);
119			return (NULL);
120		}
121	}
122
123	ui = MDI_UNIT(mnum);
124	if (flags & NO_OLD) {
125		if (ui != NULL) {
126			(void) mdmderror(mde, MDE_UNIT_ALREADY_SETUP, mnum);
127			return (NULL);
128		}
129		return ((mm_unit_t *)1);
130	}
131
132	if (ui == NULL) {
133		(void) mdmderror(mde, MDE_UNIT_NOT_SETUP, mnum);
134		return (NULL);
135	}
136
137	if (flags & ARRAY_WRITER)
138		md_array_writer(lock);
139	else if (flags & ARRAY_READER)
140		md_array_reader(lock);
141
142	if (!(flags & NO_LOCK)) {
143		if (flags & WR_LOCK)
144			(void) md_ioctl_writerlock(lock, ui);
145		else /* RD_LOCK */
146			(void) md_ioctl_readerlock(lock, ui);
147	}
148	un = (mm_unit_t *)MD_UNIT(mnum);
149
150	if (un->c.un_type != MD_METAMIRROR) {
151		(void) mdmderror(mde, MDE_NOT_MM, mnum);
152		return (NULL);
153	}
154
155	return (un);
156}
157
158static int
159mirror_set(
160	void		*d,
161	int		mode
162)
163{
164	minor_t		mnum;
165	mm_unit_t	*un;
166	mddb_recid_t	recid;
167	mddb_type_t	typ1;
168	int		err;
169	int		i;
170	set_t		setno;
171	md_set_params_t	*msp = d;
172
173
174	mnum = msp->mnum;
175
176	mdclrerror(&msp->mde);
177
178	if (mirror_getun(mnum, &msp->mde, NO_OLD, NULL) == NULL)
179		return (0);
180
181	setno = MD_MIN2SET(mnum);
182
183	typ1 = (mddb_type_t)md_getshared_key(setno,
184	    mirror_md_ops.md_driver.md_drivername);
185
186	/*
187	 * Create the db record for this mdstruct
188	 * We don't store incore elements ondisk
189	 */
190
191	if (msp->options & MD_CRO_64BIT) {
192#if defined(_ILP32)
193		return (mdmderror(&msp->mde, MDE_UNIT_TOO_LARGE, mnum));
194#else
195		recid = mddb_createrec((size_t)msp->size, typ1, MIRROR_REC,
196		    MD_CRO_64BIT | MD_CRO_MIRROR | MD_CRO_FN, setno);
197#endif
198	} else {
199		/*
200		 * It's important to use the correct size here
201		 */
202		msp->size = sizeof (mm_unit32_od_t);
203		recid = mddb_createrec((size_t)msp->size, typ1, MIRROR_REC,
204		    MD_CRO_32BIT | MD_CRO_MIRROR | MD_CRO_FN, setno);
205	}
206	if (recid < 0)
207		return (mddbstatus2error(&msp->mde, (int)recid,
208		    mnum, setno));
209
210	/* Resize to include incore fields */
211	un = (mm_unit_t *)mddb_getrecaddr_resize(recid, sizeof (*un), 0);
212	/*
213	 * It is okay that we muck with the mdstruct here,
214	 * since no one else will know about the mdstruct
215	 * until we commit it. If we crash, the record will
216	 * be automatically purged, since we haven't
217	 * committed it yet.
218	 */
219
220	/* copy in the user's mdstruct */
221	if (err = ddi_copyin((caddr_t)(uintptr_t)msp->mdp, un,
222	    (uint_t)msp->size, mode)) {
223		mddb_deleterec_wrapper(recid);
224		return (EFAULT);
225	}
226	/* All 64 bit metadevices only support EFI labels. */
227	if (msp->options & MD_CRO_64BIT) {
228		un->c.un_flag |= MD_EFILABEL;
229	}
230
231	un->c.un_revision |= MD_FN_META_DEV;
232	MD_RECID(un)	= recid;
233	MD_CAPAB(un)	= MD_CAN_PARENT | MD_CAN_META_CHILD | MD_CAN_SP;
234	MD_PARENT(un)	= MD_NO_PARENT;
235
236	for (i = 0; i < NMIRROR; i++) {
237		struct mm_submirror	*sm;
238
239		sm = &un->un_sm[i];
240		if (!SMS_IS(sm, SMS_INUSE))
241			continue;
242
243		/* ensure that the submirror is a metadevice */
244		if (md_getmajor(sm->sm_dev) != md_major)
245			return (mdmderror(&msp->mde, MDE_INVAL_UNIT,
246			    md_getminor(sm->sm_dev)));
247
248		if (md_get_parent(sm->sm_dev) == MD_NO_PARENT)
249			continue;
250
251		/* mirror creation should fail here */
252		md_nblocks_set(mnum, -1ULL);
253		MD_UNIT(mnum) = NULL;
254
255		mddb_deleterec_wrapper(recid);
256		return (mdmderror(&msp->mde, MDE_IN_USE,
257		    md_getminor(sm->sm_dev)));
258	}
259
260	if (err = mirror_build_incore(un, 0)) {
261		md_nblocks_set(mnum, -1ULL);
262		MD_UNIT(mnum) = NULL;
263
264		mddb_deleterec_wrapper(recid);
265		return (err);
266	}
267
268	/*
269	 * Update unit availability
270	 */
271	md_set[setno].s_un_avail--;
272
273	mirror_commit(un, ALL_SUBMIRRORS, 0);
274	md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
275	mirror_check_failfast(mnum);
276	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_METADEVICE, setno,
277	    MD_SID(un));
278
279	resync_start_timeout(setno);
280	return (0);
281}
282
283static int
284mirror_get(
285	void		*migp,
286	int		mode,
287	IOLOCK		*lock
288)
289{
290	mm_unit_t	*un;
291	md_i_get_t	*migph = migp;
292
293	mdclrerror(&migph->mde);
294
295	if ((un = mirror_getun(migph->id, &migph->mde, RD_LOCK, lock)) == NULL)
296		return (0);
297
298	if (migph->size == 0) {
299		migph->size = un->c.un_size;
300		return (0);
301	}
302
303	if (migph->size < un->c.un_size) {
304		return (EFAULT);
305	}
306	if (ddi_copyout(un, (caddr_t)(uintptr_t)migph->mdp,
307	    un->c.un_size, mode))
308		return (EFAULT);
309	return (0);
310}
311
312static int
313mirror_getdevs(
314	void			*mgdp,
315	int			mode,
316	IOLOCK			*lock
317)
318{
319	mm_unit_t		*un;
320	md_dev64_t		*udevs;
321	int			cnt;
322	int			i;
323	md_dev64_t		unit_dev;
324	md_getdevs_params_t	*mgdph = mgdp;
325
326
327	mdclrerror(&mgdph->mde);
328
329	if ((un = mirror_getun(mgdph->mnum,
330	    &mgdph->mde, RD_LOCK, lock)) == NULL)
331		return (0);
332
333	udevs = (md_dev64_t *)(uintptr_t)mgdph->devs;
334
335	for (cnt = 0, i = 0; i < NMIRROR; i++) {
336		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
337			continue;
338		if (cnt < mgdph->cnt) {
339			unit_dev = un->un_sm[i].sm_dev;
340			if (md_getmajor(unit_dev) != md_major) {
341				unit_dev = md_xlate_mini_2_targ(unit_dev);
342				if (unit_dev == NODEV64)
343					return (ENODEV);
344			}
345
346			if (ddi_copyout((caddr_t)&unit_dev, (caddr_t)udevs,
347			    sizeof (*udevs), mode) != 0)
348				return (EFAULT);
349			++udevs;
350		}
351		++cnt;
352	}
353
354	mgdph->cnt = cnt;
355	return (0);
356}
357
358static int
359mirror_reset(
360	md_i_reset_t	*mirp
361)
362{
363	minor_t		mnum = mirp->mnum;
364	mm_unit_t	*un;
365	mdi_unit_t	*ui;
366	set_t		setno = MD_MIN2SET(mnum);
367
368	mdclrerror(&mirp->mde);
369
370	if ((un = mirror_getun(mnum, &mirp->mde, NO_LOCK, NULL)) == NULL)
371		return (0);
372
373	if (MD_HAS_PARENT(un->c.un_parent)) {
374		return (mdmderror(&mirp->mde, MDE_IN_USE, mnum));
375	}
376
377	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
378
379	/* single thread */
380	ui = MDI_UNIT(mnum);
381	(void) md_unit_openclose_enter(ui);
382
383	if (md_unit_isopen(ui)) {
384		md_unit_openclose_exit(ui);
385		rw_exit(&md_unit_array_rw.lock);
386		return (mdmderror(&mirp->mde, MDE_IS_OPEN, mnum));
387	}
388
389	md_unit_openclose_exit(ui);
390
391	if (!mirp->force) {
392		int	smi;
393		for (smi = 0; smi < NMIRROR; smi++) {
394			if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
395				continue;
396
397			if (!SMS_BY_INDEX_IS(un, smi, SMS_RUNNING)) {
398				rw_exit(&md_unit_array_rw.lock);
399				return (mdmderror(&mirp->mde,
400				    MDE_C_WITH_INVAL_SM, mnum));
401			}
402		}
403	}
404
405	reset_mirror(un, mnum, 1);
406
407	/*
408	 * Update unit availability
409	 */
410	md_set[setno].s_un_avail++;
411
412	/*
413	 * If MN set, reset s_un_next so all nodes can have
414	 * the same view of the next available slot when
415	 * nodes are -w and -j
416	 */
417	if (MD_MNSET_SETNO(setno)) {
418		(void) md_upd_set_unnext(setno, MD_MIN2UNIT(mnum));
419	}
420
421	rw_exit(&md_unit_array_rw.lock);
422	return (0);
423}
424
425static int
426mirror_get_geom(
427	mm_unit_t	*un,
428	struct dk_geom	*geomp
429)
430{
431	md_get_geom((md_unit_t *)un, geomp);
432
433	return (0);
434}
435
436static int
437mirror_get_vtoc(
438	mm_unit_t	*un,
439	struct vtoc	*vtocp
440)
441{
442	md_get_vtoc((md_unit_t *)un, vtocp);
443
444	return (0);
445}
446
447static int
448mirror_set_vtoc(
449	mm_unit_t	*un,
450	struct vtoc	*vtocp
451)
452{
453	return (md_set_vtoc((md_unit_t *)un, vtocp));
454}
455
456static int
457mirror_get_extvtoc(
458	mm_unit_t	*un,
459	struct extvtoc	*vtocp
460)
461{
462	md_get_extvtoc((md_unit_t *)un, vtocp);
463
464	return (0);
465}
466
467static int
468mirror_set_extvtoc(
469	mm_unit_t	*un,
470	struct extvtoc	*vtocp
471)
472{
473	return (md_set_extvtoc((md_unit_t *)un, vtocp));
474}
475
476static int
477mirror_get_cgapart(
478	mm_unit_t	*un,
479	struct dk_map	*dkmapp
480)
481{
482	md_get_cgapart((md_unit_t *)un, dkmapp);
483	return (0);
484}
485
486static int
487mirror_getcomp_by_dev(mm_unit_t *un, replace_params_t *params,
488    int *smi, int *cip)
489{
490	mm_submirror_t		*sm;
491	mm_submirror_ic_t	*smic;
492	ms_comp_t		*comp;
493	ms_unit_t		*mous;
494	int			ci;
495	int			i;
496	int			compcnt;
497	ms_cd_info_t		cd;
498	void			(*get_dev)();
499	md_dev64_t		dev = md_expldev(params->old_dev);
500	md_error_t		*ep = &params->mde;
501	minor_t			mnum = params->mnum;
502	mdkey_t			devkey;
503	int			nkeys;
504	set_t			setno;
505	side_t			side;
506
507	setno = MD_MIN2SET(MD_SID(un));
508	side = mddb_getsidenum(setno);
509
510	if (md_getkeyfromdev(setno, side, dev, &devkey, &nkeys) != 0)
511		return (mddeverror(ep, MDE_NAME_SPACE, dev));
512
513	for (i = 0; i < NMIRROR; i++) {
514		sm = &un->un_sm[i];
515		smic = &un->un_smic[i];
516
517		if (!SMS_IS(sm, SMS_INUSE))
518			continue;
519
520		get_dev =
521		    (void (*)())md_get_named_service(sm->sm_dev, 0,
522		    "get device", 0);
523		compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
524
525		/*
526		 * For each of the underlying stripe components get
527		 * the info.
528		 */
529		for (ci = 0; ci < compcnt; ci++) {
530			(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
531			if ((cd.cd_dev == dev) || (cd.cd_orig_dev == dev)) {
532				*cip = ci;
533				*smi = i;
534				return (1);
535			}
536		}
537
538		/*
539		 * now we rescan looking only for NODEV. If we find
540		 * NODEV then we will check the keys to see if its a match.
541		 *
542		 * If no key was found to match dev, then there is
543		 * no way to compare keys - so continue.
544		 */
545		if (nkeys == 0) {
546			continue;
547		}
548		mous = MD_UNIT(md_getminor(sm->sm_dev));
549
550		for (ci = 0; ci < compcnt; ci++) {
551
552			comp = (struct ms_comp *)
553			    ((void *)&((char *)mous)[mous->un_ocomp]);
554
555			(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
556
557			if (cd.cd_dev == NODEV64 || cd.cd_orig_dev == NODEV64) {
558				comp += ci;
559				if (comp->un_key == devkey) {
560					if (nkeys > 1) {
561						return (mddeverror(
562						    ep, MDE_MULTNM, dev));
563					}
564					*cip = ci;
565					*smi = i;
566					return (1);
567				}
568			}
569		}
570	}
571	return (mdcomperror(ep, MDE_CANT_FIND_COMP, mnum, dev));
572}
573
574/*
575 * comp_replace:
576 * ----------------
577 * Called to implement the component replace function
578 *
579 * Owner is returned in the parameter block passed in by the caller.
580 *
581 * Returns:
582 *	0	success
583 *	error code if the functions fails
584 *
585 * For a MN set, on entry all writes to the mirror are suspended, on exit
586 * from this function, writes must be resumed when not a dryrun.
587 */
588static int
589comp_replace(
590	replace_params_t	*params,
591	IOLOCK			*lock
592)
593{
594	minor_t			mnum = params->mnum;
595	set_t			setno;
596	side_t			side;
597	mm_unit_t		*un;
598	mdi_unit_t		*ui;
599	ms_unit_t		*ms_un;
600	mdi_unit_t		*ms_ui;
601	ms_comp_t		*comp;
602	mm_submirror_t		*sm;
603	md_dev64_t		smdev;
604	mddb_recid_t		recids[6]; /* recids for stripe on SP */
605	int			smi, ci;
606	ms_new_dev_t		nd;
607	int			(*repl_dev)();
608	void			(*repl_done)();
609	void			*repl_data;
610	int			err = 0;
611	ms_cd_info_t		cd;
612	void			(*get_dev)();
613
614	mdclrerror(&params->mde);
615
616	if ((un = mirror_getun(mnum, &params->mde, WRITERS, lock)) == NULL) {
617		return (0);
618	}
619
620	ui = MDI_UNIT(mnum);
621	if (ui->ui_tstate & MD_INACCESSIBLE) {
622		(void) mdmderror(&params->mde, MDE_IN_UNAVAIL_STATE, mnum);
623		goto errexit;
624	}
625
626	/*
627	 * replace cannot be done while a resync is active or we are
628	 * still waiting for an optimized resync to be started
629	 */
630	if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE | MD_UN_OPT_NOT_DONE)) {
631		(void) mdmderror(&params->mde, MDE_RESYNC_ACTIVE, mnum);
632		goto errexit;
633	}
634
635	if (mirror_getcomp_by_dev(un, params, &smi, &ci) == 0) {
636		goto errexit;
637	}
638
639	if (un->un_nsm == 1) {
640		(void) mdmderror(&params->mde, MDE_LAST_SM_RE, mnum);
641		goto errexit;
642	}
643
644	if (mirror_other_sources(un, smi, ci, 0) != 0) {
645		(void) mdcomperror(&params->mde, MDE_REPL_INVAL_STATE,
646		    mnum, md_expldev(params->old_dev));
647		goto errexit;
648	}
649
650	sm = &un->un_sm[smi];
651	if (sm->sm_state & (SMS_OFFLINE | SMS_OFFLINE_RESYNC)) {
652		(void) mdmderror(&params->mde, MDE_ILLEGAL_SM_STATE, mnum);
653		goto errexit;
654	}
655
656	get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
657	    "get device", 0);
658	(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
659
660	repl_dev = (int (*)())md_get_named_service(sm->sm_dev, 0,
661	    "replace device", 0);
662
663	smdev = sm->sm_dev;
664	ms_un = MD_UNIT(md_getminor(smdev));
665
666	if (params->cmd == ENABLE_COMP) {
667		md_dev64_t	this_dev;
668		int		numkeys;
669		mdkey_t		this_key;
670
671		this_dev = ((cd.cd_orig_dev == 0) ? cd.cd_dev :
672		    cd.cd_orig_dev);
673		setno = MD_MIN2SET(md_getminor(smdev));
674		side = mddb_getsidenum(setno);
675		comp = (struct ms_comp *)
676		    ((void *)&((char *)ms_un)[ms_un->un_ocomp]);
677		comp += ci;
678		/*
679		 * We trust the dev_t because we cannot determine the
680		 * dev_t from the device id since a new disk is in the
681		 * same location. Since this is a call from metareplace -e dx
682		 * AND it is SCSI a new dev_t is not generated.  So the
683		 * dev_t from the mddb is used. Before enabling the device
684		 * we check to make sure that multiple entries for the same
685		 * device does not exist in the namespace. If they do we
686		 * fail the ioctl.
687		 * One of the many ways multiple entries in the name space
688		 * can occur is if one removed the failed component in the
689		 * stripe of a mirror and put another disk that was part of
690		 * another metadevice. After reboot metadevadm would correctly
691		 * update the device name for the metadevice whose component
692		 * has moved. However now in the metadb there are two entries
693		 * for the same name (ctds) that belong to different
694		 * metadevices. One is valid, the other is a ghost or "last
695		 * know as" ctds.
696		 */
697		this_dev =  md_getdevnum(setno, side,
698		    comp->un_key, MD_TRUST_DEVT);
699
700		/*
701		 * Verify that multiple keys for the same
702		 * dev_t don't exist
703		 */
704
705		if (md_getkeyfromdev(setno, side, this_dev,
706		    &this_key, &numkeys) != 0) {
707			(void) mddeverror(&params->mde, MDE_NAME_SPACE,
708			    md_expldev(params->old_dev));
709			goto errexit;
710		}
711		/*
712		 * Namespace has multiple entries
713		 * for the same devt
714		 */
715		if (numkeys > 1) {
716			(void) mddeverror(&params->mde, MDE_MULTNM,
717			    md_expldev(params->old_dev));
718			goto errexit;
719		}
720		if ((numkeys == 0) || (comp->un_key != this_key)) {
721			(void) mdcomperror(&params->mde, MDE_CANT_FIND_COMP,
722			    mnum, this_dev);
723			goto errexit;
724		}
725
726		if ((md_getmajor(this_dev) != md_major) &&
727		    (md_devid_found(setno, side, this_key) == 1)) {
728			if (md_update_namespace_did(setno, side,
729			    this_key, &params->mde) != 0) {
730				(void) mddeverror(&params->mde, MDE_NAME_SPACE,
731				    this_dev);
732				goto errexit;
733			}
734		}
735
736		if (md_expldev(params->new_dev) != this_dev) {
737			(void) mddeverror(&params->mde, MDE_FIX_INVAL_STATE,
738			    md_expldev(params->new_dev));
739			goto errexit;
740		}
741
742		/* in case of dryrun, don't actually do anything */
743		if ((params->options & MDIOCTL_DRYRUN) == 0) {
744			err = (*repl_dev)(sm->sm_dev, 0, ci, NULL, recids, 6,
745			    &repl_done, &repl_data);
746		}
747	} else if ((params->options & MDIOCTL_DRYRUN) == 0) {
748		nd.nd_dev = md_expldev(params->new_dev);
749		nd.nd_key = params->new_key;
750		nd.nd_start_blk = params->start_blk;
751		nd.nd_nblks = params->number_blks;
752		nd.nd_labeled = params->has_label;
753		nd.nd_hs_id = 0;
754
755		err = (*repl_dev)(sm->sm_dev, 0, ci, &nd, recids, 6,
756		    &repl_done, &repl_data);
757
758	}
759
760	if (err != 0) {
761		(void) mdcomperror(&params->mde, err, mnum,
762		    md_expldev(params->new_dev));
763		goto errexit;
764	}
765	/* In case of a dryun we're done. */
766	if (params->options & MDIOCTL_DRYRUN) {
767		mdclrerror(&params->mde);
768		return (0);
769	}
770
771	/* set_sm_comp_state() commits the modified records */
772	set_sm_comp_state(un, smi, ci, CS_RESYNC, recids, MD_STATE_NO_XMIT,
773	    lock);
774
775	(*repl_done)(sm->sm_dev, repl_data);
776
777	/*
778	 * If the mirror is open then need to make sure that the submirror,
779	 * on which the replace ran, is also open and if not then open it.
780	 * This is only a concern for a single component sub-mirror stripe
781	 * as it may not be open due to the failure of the single component.
782	 *
783	 * This check has to be done after the call to (*repl_done)
784	 * as that function releases the writer lock on the submirror.
785	 */
786	if (md_unit_isopen(ui)) {
787		minor_t ms_mnum = md_getminor(sm->sm_dev);
788
789		ms_ui = MDI_UNIT(ms_mnum);
790
791		if (!md_unit_isopen(ms_ui)) {
792			/*
793			 * Underlying submirror is not open so open it.
794			 */
795			if (md_layered_open(ms_mnum, &smdev, MD_OFLG_NULL)) {
796				mirror_openfail_console_info(un, smi, ci);
797				goto errexit;
798			}
799		}
800	}
801
802	mirror_check_failfast(mnum);
803
804	if (params->cmd == ENABLE_COMP) {
805		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ENABLE, SVM_TAG_METADEVICE,
806		    MD_UN2SET(un), MD_SID(un));
807	} else {
808		SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REPLACE, SVM_TAG_METADEVICE,
809		    MD_UN2SET(un), MD_SID(un));
810	}
811
812	md_ioctl_writerexit(lock);
813	/*
814	 * Reset any saved resync location flags as we've now replaced the
815	 * component. This means we have to resync the _whole_ component.
816	 */
817	un->un_rs_resync_done = un->un_rs_resync_2_do = 0;
818	un->un_rs_type = MD_RS_NONE;
819	mirror_resume_writes(un);
820	if (!MD_MNSET_SETNO(MD_UN2SET(un)))
821		(void) mirror_resync_unit(mnum, NULL, &params->mde, lock);
822	mdclrerror(&params->mde);
823	return (0);
824errexit:
825	/* We need to resume writes unless this is a dryrun */
826	if (!(params->options & MDIOCTL_DRYRUN))
827		mirror_resume_writes(un);
828	return (0);
829}
830
831/*
832 * mirror_attach:
833 * ----------------
834 * Called to implement the submirror attach function
835 *
836 * Owner is returned in the parameter block passed in by the caller.
837 *
838 * Returns:
839 *	0	success
840 *	error code if the functions fails
841 *
842 * For a MN set, on entry all writes to the mirror are suspended, on exit
843 * from this function, writes must be resumed when not a dryrun.
844 */
845static int
846mirror_attach(
847	md_att_struct_t	*att,
848	IOLOCK		*lock
849)
850{
851	minor_t			mnum = att->mnum;
852	mm_unit_t		*un;
853	md_unit_t		*su;
854	mm_submirror_t		*sm;
855	mm_submirror_ic_t	*smic;
856	int			smi;
857	md_dev64_t		sm_dev;
858	minor_t			sm_mnum;
859	mdkey_t			indx;
860	set_t			setno;
861	uint_t			options;
862
863	/*
864	 * This routine should not be called during upgrade.
865	 */
866	if (MD_UPGRADE)  {
867		return (0);
868	}
869
870	mdclrerror(&att->mde);
871	options = att->options;
872
873	if ((un = mirror_getun(mnum, &att->mde, WRITERS, lock)) == NULL) {
874		return (0);
875	}
876
877	setno = MD_UN2SET(un);
878
879	for (smi = 0; smi < NMIRROR; smi++)
880		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
881			break;
882
883	if (smi == NMIRROR) {
884		(void) mdmderror(&att->mde, MDE_MIRROR_FULL, mnum);
885		goto errexit;
886	}
887
888	sm = &un->un_sm[smi];
889	smic = &un->un_smic[smi];
890	sm_dev = att->submirror;
891	sm_mnum = md_getminor(sm_dev);
892
893	if (md_get_parent(sm_dev) != MD_NO_PARENT) {
894		(void) mdmderror(&att->mde, MDE_IN_USE, sm_mnum);
895		goto errexit;
896	}
897
898	if (md_unit_isopen(MDI_UNIT(sm_mnum))) {
899		(void) mdmderror(&att->mde, MDE_IS_OPEN, sm_mnum);
900		goto errexit;
901	}
902
903	/* Check the size */
904	su = (md_unit_t *)MD_UNIT(sm_mnum);
905	if (un->c.un_total_blocks > su->c.un_total_blocks) {
906		(void) mdmderror(&att->mde, MDE_SM_TOO_SMALL, sm_mnum);
907		goto errexit;
908	}
909
910	/* Don't attach labeled sm to unlabeled mirrors */
911	if ((su->c.un_flag & MD_LABELED) && !(un->c.un_flag & MD_LABELED)) {
912		(void) mdmderror(&att->mde, MDE_NO_LABELED_SM, sm_mnum);
913		goto errexit;
914	}
915
916	indx = md_setshared_name(setno,
917	    ddi_major_to_name(md_getmajor(sm_dev)), 0L);
918
919	/* Open the sm, only if the mirror is open */
920	if (md_unit_isopen(MDI_UNIT(mnum))) {
921		if (md_layered_open(mnum, &sm_dev, MD_OFLG_NULL)) {
922			(void) md_remshared_name(setno, indx);
923			(void) mdmderror(&att->mde, MDE_SM_OPEN_ERR,
924			    md_getminor(att->submirror));
925			goto errexit;
926		}
927		/* in dryrun mode, don't leave the device open */
928		if (options & MDIOCTL_DRYRUN) {
929			md_layered_close(sm_dev, MD_OFLG_NULL);
930		}
931	}
932
933	/*
934	 * After this point the checks are done and action is taken.
935	 * So, clean up and return in case of dryrun.
936	 */
937
938	if (options & MDIOCTL_DRYRUN) {
939		md_ioctl_writerexit(lock);
940		mdclrerror(&att->mde);
941		return (0);
942	}
943
944	sm->sm_key = att->key;
945	sm->sm_dev = sm_dev;
946	md_set_parent(sm_dev, MD_SID(un));
947	mirror_set_sm_state(sm, smic, SMS_ATTACHED_RESYNC, 1);
948	build_submirror(un, smi, 0);
949	un->un_nsm++;
950	mirror_commit(un, SMI2BIT(smi), 0);
951	mirror_check_failfast(mnum);
952	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_ATTACH, SVM_TAG_METADEVICE,
953	    MD_UN2SET(un), MD_SID(un));
954
955	mirror_resume_writes(un);
956	md_ioctl_writerexit(lock);
957	if (!MD_MNSET_SETNO(setno))
958		(void) mirror_resync_unit(mnum, NULL, &att->mde, lock);
959	mdclrerror(&att->mde);
960	return (0);
961errexit:
962	/* We need to resume writes unless this is a dryrun */
963	if (!(options & MDIOCTL_DRYRUN))
964		mirror_resume_writes(un);
965	return (0);
966}
967
968
969void
970reset_comp_states(mm_submirror_t *sm, mm_submirror_ic_t *smic)
971{
972	int		compcnt;
973	int		i;
974	md_m_shared_t	*shared;
975
976	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
977	for (i = 0; i < compcnt; i++) {
978		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
979		    (sm->sm_dev, sm, i);
980
981		shared->ms_state = CS_OKAY;
982		shared->ms_flags &= ~MDM_S_NOWRITE;
983		shared->ms_lasterrcnt = 0;
984	}
985}
986
987
988/*
989 * mirror_detach:
990 * ----------------
991 * Called to implement the submirror detach function
992 *
993 * Owner is returned in the parameter block passed in by the caller.
994 *
995 * Returns:
996 *	0	success
997 *	error code if the functions fails
998 *
999 * For a MN set, on entry all writes to the mirror are suspended, on exit
1000 * from this function, writes must be resumed.
1001 */
1002static int
1003mirror_detach(
1004	md_detach_params_t	*det,
1005	IOLOCK			*lock
1006)
1007{
1008	minor_t			mnum = det->mnum;
1009	mm_unit_t		*un;
1010	mdi_unit_t		*ui;
1011	mm_submirror_t		*sm;
1012	mm_submirror_t		*old_sm;
1013	mm_submirror_t		*new_sm;
1014	mm_submirror_ic_t	*smic;
1015	int			smi;
1016	md_dev64_t		sm_dev;
1017	md_unit_t		*su;
1018	sv_dev_t		sv;
1019	mddb_recid_t		recids[2];
1020	int			nsv = 0;
1021	int			smi_remove;
1022
1023	mdclrerror(&det->mde);
1024
1025	if ((un = mirror_getun(mnum, &det->mde, WRITERS, lock)) == NULL) {
1026		return (0);
1027	}
1028
1029	ui = MDI_UNIT(mnum);
1030	if (ui->ui_tstate & MD_INACCESSIBLE) {
1031		mirror_resume_writes(un);
1032		return (mdmderror(&det->mde, MDE_IN_UNAVAIL_STATE, mnum));
1033	}
1034	/*
1035	 * detach cannot be done while a resync is active or we are
1036	 * still waiting for an optimized resync to be started
1037	 */
1038	if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE | MD_UN_OPT_NOT_DONE)) {
1039		mirror_resume_writes(un);
1040		return (mdmderror(&det->mde, MDE_RESYNC_ACTIVE, mnum));
1041	}
1042
1043	for (smi = 0; smi < NMIRROR; smi++) {
1044		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
1045			continue;
1046		}
1047		if (un->un_sm[smi].sm_dev == det->submirror) {
1048			smi_remove = smi;
1049			break;
1050		}
1051	}
1052
1053	if (smi == NMIRROR) {
1054		mirror_resume_writes(un);
1055		return (mdmderror(&det->mde, MDE_CANT_FIND_SM, mnum));
1056	}
1057
1058	if (un->un_nsm == 1) {
1059		mirror_resume_writes(un);
1060		return (mdmderror(&det->mde, MDE_LAST_SM, mnum));
1061	}
1062
1063	if (mirror_other_sources(un, smi, WHOLE_SM, 0) != 0) {
1064		mirror_resume_writes(un);
1065		return (mdmderror(&det->mde, MDE_NO_READABLE_SM, mnum));
1066	}
1067
1068	sm = &un->un_sm[smi];
1069	smic = &un->un_smic[smi];
1070	sm_dev = sm->sm_dev;
1071	su = (md_unit_t *)MD_UNIT(md_getminor(sm_dev));
1072
1073	/*
1074	 * Need to pass in the extra record id,
1075	 * cause mirror_commit() will not commit
1076	 * a sm (from the smmask) if the slot is unused.
1077	 * Which it is, since we are detaching.
1078	 */
1079	recids[0] = ((md_unit_t *)MD_UNIT(md_getminor(sm_dev)))->c.un_record_id;
1080	recids[1] = 0;
1081
1082	mirror_set_sm_state(sm, smic, SMS_UNUSED, det->force_detach);
1083	/*
1084	 * If there are any erred components
1085	 * then make the detach fail and do not unparent the
1086	 * submirror.
1087	 */
1088	if (sm->sm_state == SMS_UNUSED) {
1089		/* reallow soft partitioning of submirror */
1090		MD_CAPAB(su) |= MD_CAN_SP;
1091		md_reset_parent(sm_dev);
1092		reset_comp_states(sm, smic);
1093		un->un_nsm--;
1094		/* Close the sm, only if the mirror is open */
1095		if (md_unit_isopen(MDI_UNIT(mnum)))
1096			md_layered_close(sm_dev, MD_OFLG_NULL);
1097		sv.setno = MD_UN2SET(un);
1098		sv.key = sm->sm_key;
1099		nsv = 1;
1100	} else
1101		(void) mdmderror(&det->mde, MDE_SM_FAILED_COMPS, mnum);
1102
1103	/*
1104	 * Perhaps the mirror changed it's size due to this detach.
1105	 * (void) mirror_grow_unit(un, &mde);
1106	 */
1107
1108	/*
1109	 * NOTE: We are passing the detached sm recid
1110	 * and not the smmask field. This is correct.
1111	 */
1112	mirror_commit(un, 0, recids);
1113	md_rem_names(&sv, nsv);
1114	if (sm->sm_state == SMS_UNUSED) {
1115		SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DETACH, SVM_TAG_METADEVICE,
1116		    MD_UN2SET(un), MD_SID(un));
1117	}
1118
1119	/*
1120	 * Reshuffle the submirror devices in the array as we potentially
1121	 * have a dead record in the middle of it.
1122	 */
1123	for (smi = 0; nsv && (smi < NMIRROR); smi++) {
1124		if (smi < smi_remove) {
1125			continue;
1126		}
1127		if (smi > smi_remove) {
1128			old_sm = &un->un_sm[smi];
1129			new_sm = &un->un_sm[smi - 1];
1130			new_sm->sm_key = old_sm->sm_key;
1131			new_sm->sm_dev = old_sm->sm_dev;
1132			new_sm->sm_state = old_sm->sm_state;
1133			new_sm->sm_flags = old_sm->sm_flags;
1134			new_sm->sm_shared = old_sm->sm_shared;
1135			new_sm->sm_hsp_id = old_sm->sm_hsp_id;
1136			new_sm->sm_timestamp = old_sm->sm_timestamp;
1137			bzero(old_sm, sizeof (mm_submirror_t));
1138		}
1139	}
1140	mirror_commit(un, 0, NULL);
1141	mirror_resume_writes(un);
1142	return (0);
1143}
1144
1145/*
1146 * mirror_offline:
1147 * ----------------
1148 * Called to implement the submirror offline function
1149 *
1150 * Owner is returned in the parameter block passed in by the caller.
1151 *
1152 * Returns:
1153 *	0	success
1154 *	error code if the functions fails
1155 *
1156 * For a MN set, on entry all writes to the mirror are suspended, on exit
1157 * from this function, writes must be resumed.
1158 */
1159static int
1160mirror_offline(
1161	md_i_off_on_t	*miop,
1162	IOLOCK		*lock
1163)
1164{
1165	minor_t			mnum = miop->mnum;
1166	mm_unit_t		*un;
1167	mm_submirror_t		*sm;
1168	mm_submirror_ic_t	*smic;
1169	int			smi;
1170	mdi_unit_t		*ui = MDI_UNIT(mnum);
1171
1172	mdclrerror(&miop->mde);
1173
1174	if ((un = mirror_getun(mnum, &miop->mde, WR_LOCK, lock)) == NULL) {
1175		return (0);
1176	}
1177
1178	/*
1179	 * offline cannot be done while a resync is active or we are
1180	 * still waiting for an optimized resync to be started
1181	 */
1182	if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE | MD_UN_OPT_NOT_DONE)) {
1183		mirror_resume_writes(un);
1184		return (mdmderror(&miop->mde, MDE_RESYNC_ACTIVE, mnum));
1185	}
1186
1187	/*
1188	 * Reject mirror_offline if ABR is set
1189	 */
1190	if ((ui->ui_tstate & MD_ABR_CAP) || un->un_abr_count) {
1191		mirror_resume_writes(un);
1192		return (mderror(&miop->mde, MDE_ABR_SET));
1193	}
1194
1195	for (smi = 0; smi < NMIRROR; smi++) {
1196		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1197			continue;
1198		if (un->un_sm[smi].sm_dev == miop->submirror)
1199			break;
1200	}
1201
1202	if (smi == NMIRROR) {
1203		mirror_resume_writes(un);
1204		return (mdmderror(&miop->mde, MDE_CANT_FIND_SM, mnum));
1205	}
1206
1207	sm = &un->un_sm[smi];
1208	smic = &un->un_smic[smi];
1209	if (!SMS_IS(sm, SMS_RUNNING) && !miop->force_offline) {
1210		mirror_resume_writes(un);
1211		return (mdmderror(&miop->mde, MDE_ILLEGAL_SM_STATE, mnum));
1212	}
1213
1214	if (mirror_other_sources(un, smi, WHOLE_SM, 0) != 0) {
1215		mirror_resume_writes(un);
1216		return (mdmderror(&miop->mde, MDE_NO_READABLE_SM, mnum));
1217	}
1218	mirror_set_sm_state(sm, smic, SMS_OFFLINE, 1);
1219	mirror_resume_writes(un);
1220
1221	MD_STATUS(un) |= MD_UN_OFFLINE_SM;
1222	mirror_commit(un, NO_SUBMIRRORS, 0);
1223	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OFFLINE, SVM_TAG_METADEVICE,
1224	    MD_UN2SET(un), MD_SID(un));
1225	return (0);
1226}
1227
1228/*
1229 * mirror_online:
1230 * ----------------
1231 * Called to implement the submirror online function
1232 *
1233 * Owner is returned in the parameter block passed in by the caller.
1234 *
1235 * Returns:
1236 *	0	success
1237 *	error code if the functions fails
1238 *
1239 * For a MN set, on entry all writes to the mirror are suspended, on exit
1240 * from this function, writes must be resumed.
1241 */
1242static int
1243mirror_online(
1244	md_i_off_on_t	*miop,
1245	IOLOCK		*lock
1246)
1247{
1248	minor_t			mnum = miop->mnum;
1249	mm_unit_t		*un;
1250	mm_submirror_t		*sm;
1251	mm_submirror_ic_t	*smic;
1252	int			smi;
1253	set_t			setno = MD_MIN2SET(mnum);
1254
1255	mdclrerror(&miop->mde);
1256
1257	if ((un = mirror_getun(mnum, &miop->mde, WR_LOCK, lock)) == NULL) {
1258		return (0);
1259	}
1260
1261	for (smi = 0; smi < NMIRROR; smi++) {
1262		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1263			continue;
1264		if (un->un_sm[smi].sm_dev == miop->submirror)
1265			break;
1266	}
1267	if (smi == NMIRROR) {
1268		mirror_resume_writes(un);
1269		return (mdmderror(&miop->mde, MDE_CANT_FIND_SM, mnum));
1270	}
1271
1272	sm = &un->un_sm[smi];
1273	smic = &un->un_smic[smi];
1274	if (!SMS_IS(sm, SMS_OFFLINE)) {
1275		mirror_resume_writes(un);
1276		return (mdmderror(&miop->mde, MDE_ILLEGAL_SM_STATE, mnum));
1277	}
1278
1279	/*
1280	 * online cannot be done while a resync is active or we are
1281	 * still waiting for an optimized resync to be started
1282	 */
1283	if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE | MD_UN_OPT_NOT_DONE)) {
1284		mirror_resume_writes(un);
1285		return (mdmderror(&miop->mde, MDE_RESYNC_ACTIVE, mnum));
1286	}
1287
1288	mirror_set_sm_state(sm, smic, SMS_OFFLINE_RESYNC, 1);
1289	mirror_commit(un, NO_SUBMIRRORS, 0);
1290	mirror_check_failfast(mnum);
1291	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ONLINE, SVM_TAG_METADEVICE,
1292	    MD_UN2SET(un), MD_SID(un));
1293
1294
1295	/* for MN sets, re-read the resync record from disk */
1296	if (MD_MNSET_SETNO(MD_UN2SET(un)))
1297		(void) mddb_reread_rr(setno, un->un_rr_dirty_recid);
1298
1299	bcopy((caddr_t)un->un_dirty_bm, (caddr_t)un->un_resync_bm,
1300	    howmany(un->un_rrd_num, NBBY));
1301	MD_STATUS(un) |= MD_UN_OPT_NOT_DONE;
1302	sm->sm_flags |= MD_SM_RESYNC_TARGET;
1303	mirror_resume_writes(un);
1304	md_ioctl_writerexit(lock);
1305	if (!MD_MNSET_SETNO(setno))
1306		return (mirror_resync_unit(mnum, NULL, &miop->mde, lock));
1307	else return (0);
1308}
1309
1310int
1311mirror_grow_unit(
1312	mm_unit_t		*un,
1313	md_error_t		*ep
1314)
1315{
1316	md_unit_t		*su;
1317	mm_submirror_t		*sm;
1318	int			smi;
1319	diskaddr_t		total_blocks;
1320	diskaddr_t		current_tb;
1321	int			spc;		/* sectors per head */
1322	minor_t			mnum = MD_SID(un);
1323
1324	/*
1325	 * grow_unit cannot be done while a resync is active or we are
1326	 * still waiting for an optimized resync to be started. Set
1327	 * flag to indicate GROW_PENDING and once the resync is complete
1328	 * the grow_unit function will be executed.
1329	 */
1330	if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE | MD_UN_OPT_NOT_DONE)) {
1331		MD_STATUS(un) |= MD_UN_GROW_PENDING;
1332		mirror_commit(un, NO_SUBMIRRORS, 0);
1333		return (mdmderror(ep, MDE_GROW_DELAYED, MD_SID(un)));
1334	}
1335
1336	/*
1337	 * Find the smallest submirror
1338	 */
1339	total_blocks = 0;
1340	for (smi = 0; smi < NMIRROR; smi++) {
1341		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1342			continue;
1343		sm = &un->un_sm[smi];
1344		/*
1345		 * Growth is not possible if there is one or more
1346		 * submirrors made up of non-Metadevices.
1347		 */
1348		if (md_getmajor(sm->sm_dev) != md_major)
1349			return (0);
1350
1351		su = MD_UNIT(md_getminor(sm->sm_dev));
1352		if ((total_blocks == 0) ||
1353		    (su->c.un_total_blocks < total_blocks))
1354			total_blocks = su->c.un_total_blocks;
1355	}
1356
1357	/*
1358	 * If the smallest submirror is not larger
1359	 * than the mirror, we are all done.
1360	 */
1361	if (total_blocks <= un->c.un_total_blocks)
1362		return (0);
1363
1364	/*
1365	 * Growing the mirror now.
1366	 * First: Round down the actual_tb to be a multiple
1367	 * 	of nheads * nsects.
1368	 */
1369	spc = un->c.un_nhead * un->c.un_nsect;
1370	current_tb = (total_blocks/spc) * spc;
1371
1372	un->c.un_total_blocks = current_tb;
1373	md_nblocks_set(mnum, un->c.un_total_blocks);
1374	un->c.un_actual_tb = total_blocks;
1375
1376	/* Is the mirror growing from 32 bit device to 64 bit device? */
1377	if (((un->c.un_revision & MD_64BIT_META_DEV) == 0) &&
1378	    (un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS)) {
1379#if defined(_ILP32)
1380		return (mdmderror(ep, MDE_UNIT_TOO_LARGE, mnum));
1381#else
1382		mddb_type_t	typ1;
1383		mddb_recid_t	recid;
1384		set_t		setno;
1385		mddb_recid_t	old_recid = un->c.un_record_id;
1386		mddb_recid_t	old_vtoc;
1387		mddb_de_ic_t    *dep, *old_dep;
1388		md_create_rec_option_t	options;
1389
1390		/* yup, new device size. So we need to replace the record */
1391		typ1 = (mddb_type_t)md_getshared_key(MD_UN2SET(un),
1392		    mirror_md_ops.md_driver.md_drivername);
1393		setno = MD_MIN2SET(mnum);
1394
1395		/* Preserve the friendly name properties of growing unit */
1396		options = MD_CRO_64BIT | MD_CRO_MIRROR;
1397		if (un->c.un_revision & MD_FN_META_DEV)
1398			options |= MD_CRO_FN;
1399		recid = mddb_createrec(offsetof(mm_unit_t, un_smic), typ1,
1400		    MIRROR_REC, options, setno);
1401		/* Resize to include incore fields */
1402		un->c.un_revision |= MD_64BIT_META_DEV;
1403		/* All 64 bit metadevices only support EFI labels. */
1404		un->c.un_flag |= MD_EFILABEL;
1405		/*
1406		 * If the device had a vtoc record attached to it, we remove
1407		 * the vtoc record, because the layout has changed completely.
1408		 */
1409		old_vtoc = un->c.un_vtoc_id;
1410		if (old_vtoc != 0) {
1411			un->c.un_vtoc_id =
1412			    md_vtoc_to_efi_record(old_vtoc, setno);
1413		}
1414		MD_RECID(un) = recid;
1415		dep = mddb_getrecdep(recid);
1416		old_dep = mddb_getrecdep(old_recid);
1417		kmem_free(dep->de_rb_userdata, dep->de_reqsize);
1418		dep->de_rb_userdata = old_dep->de_rb_userdata;
1419		dep->de_reqsize = old_dep->de_reqsize;
1420		dep->de_rb_userdata_ic = old_dep->de_rb_userdata_ic;
1421		dep->de_icreqsize = old_dep->de_icreqsize;
1422		mirror_commit(un, NO_SUBMIRRORS, 0);
1423		old_dep->de_rb_userdata = NULL;
1424		old_dep->de_rb_userdata_ic = NULL;
1425		mddb_deleterec_wrapper(old_recid);
1426		/*
1427		 * If there was a vtoc record, it is no longer needed, because
1428		 * a new efi record has been created for this un.
1429		 */
1430		if (old_vtoc != 0) {
1431			mddb_deleterec_wrapper(old_vtoc);
1432		}
1433#endif
1434	}
1435
1436	if ((current_tb/un->un_rrd_blksize) > MD_MAX_NUM_RR) {
1437		if (mirror_resize_resync_regions(un, current_tb)) {
1438			return (mdmderror(ep, MDE_RR_ALLOC_ERROR, MD_SID(un)));
1439		}
1440		mirror_check_failfast(mnum);
1441		SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE,
1442		    MD_UN2SET(un), MD_SID(un));
1443		return (0);
1444	}
1445
1446	if (mirror_add_resync_regions(un, current_tb)) {
1447		return (mdmderror(ep, MDE_RR_ALLOC_ERROR, MD_SID(un)));
1448	}
1449
1450	mirror_check_failfast(mnum);
1451	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE,
1452	    MD_UN2SET(un), MD_SID(un));
1453
1454	return (0);
1455}
1456
1457static int
1458mirror_grow(
1459	void			*mgp,
1460	IOLOCK			*lock
1461)
1462{
1463	mm_unit_t		*un;
1464	md_grow_params_t	*mgph = mgp;
1465
1466	mdclrerror(&mgph->mde);
1467
1468	if ((un = mirror_getun(mgph->mnum,
1469	    &mgph->mde, WR_LOCK, lock)) == NULL)
1470		return (0);
1471
1472	if (MD_STATUS(un) & MD_UN_GROW_PENDING)
1473		return (0);
1474
1475	return (mirror_grow_unit(un, &mgph->mde));
1476}
1477
1478static int
1479mirror_change(
1480	md_mirror_params_t	*mmp,
1481	IOLOCK			*lock
1482)
1483{
1484	mm_params_t		*pp = &mmp->params;
1485	mm_unit_t		*un;
1486
1487	mdclrerror(&mmp->mde);
1488
1489	if ((un = mirror_getun(mmp->mnum, &mmp->mde, WR_LOCK, lock)) == NULL)
1490		return (0);
1491
1492	if (pp->change_read_option)
1493		un->un_read_option = pp->read_option;
1494
1495	if (pp->change_write_option)
1496		un->un_write_option = pp->write_option;
1497
1498	if (pp->change_pass_num)
1499		un->un_pass_num = pp->pass_num;
1500
1501	mirror_commit(un, NO_SUBMIRRORS, 0);
1502
1503	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_CHANGE, SVM_TAG_METADEVICE,
1504	    MD_UN2SET(un), MD_SID(un));
1505	return (0);
1506}
1507
1508static int
1509mirror_get_resync(
1510	md_resync_ioctl_t	*ri
1511)
1512{
1513	minor_t			mnum = ri->ri_mnum;
1514	mm_unit_t		*un;
1515	u_longlong_t		percent;
1516	uint_t			cnt;
1517	uint_t			rr;
1518	diskaddr_t		d;
1519
1520	mdclrerror(&ri->mde);
1521
1522	if ((un = mirror_getun(mnum, &ri->mde, STALE_OK|NO_LOCK, NULL)) == NULL)
1523		return (0);
1524
1525	ri->ri_flags = 0;
1526	if (md_get_setstatus(MD_MIN2SET(mnum)) & MD_SET_STALE) {
1527		ri->ri_percent_done = 0;
1528		ri->ri_percent_dirty = 0;
1529		return (0);
1530	}
1531
1532	if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE|MD_UN_RESYNC_CANCEL)) {
1533		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
1534			ri->ri_flags |= MD_RI_INPROGRESS;
1535		/* Return state of resync thread */
1536		ri->ri_flags |= (un->un_rs_thread_flags & MD_RI_BLOCK);
1537		d = un->un_rs_resync_2_do;
1538		if (d) {
1539			percent = un->un_rs_resync_done;
1540			if (un->c.un_total_blocks >
1541			    MD_MAX_BLKS_FOR_SMALL_DEVS) {
1542				percent *= 1000;
1543				percent /= d;
1544				if (percent > 1000)
1545					percent = 1000;
1546			} else {
1547				percent *= 100;
1548				percent /= d;
1549			}
1550			ri->ri_percent_done = (int)percent;
1551		} else {
1552			ri->ri_percent_done = 0;
1553		}
1554	}
1555	if (un->un_nsm < 2) {
1556		ri->ri_percent_dirty = 0;
1557		return (0);
1558	}
1559	cnt = 0;
1560	for (rr = 0; rr < un->un_rrd_num; rr++)
1561		if (IS_REGION_DIRTY(rr, un))
1562			cnt++;
1563	d = un->un_rrd_num;
1564	if (d) {
1565		percent = cnt;
1566		percent *= 100;
1567		percent += d - 1;		/* round up */
1568		percent /= d;
1569	} else
1570		percent = 0;
1571	ri->ri_percent_dirty = (int)percent;
1572	return (0);
1573}
1574
1575/*
1576 * mirror_get_owner:
1577 * ----------------
1578 * Called to obtain the current owner of a mirror.
1579 *
1580 * Owner is returned in the parameter block passed in by the caller.
1581 *
1582 * Returns:
1583 *	0	success
1584 *	EINVAL	metadevice does not exist or is not a member of a multi-owned
1585 *		set.
1586 */
1587static int
1588mirror_get_owner(md_set_mmown_params_t *p, IOLOCK *lock)
1589{
1590	mm_unit_t	*un;
1591	set_t		setno;
1592
1593	if ((un = mirror_getun(p->d.mnum, &p->mde, RD_LOCK, lock)) == NULL)
1594		return (EINVAL);
1595
1596	setno = MD_UN2SET(un);
1597	if (!MD_MNSET_SETNO(setno)) {
1598		return (EINVAL);
1599	}
1600	p->d.owner = un->un_mirror_owner;
1601	return (0);
1602}
1603
1604/*
1605 * mirror_choose_owner_thread:
1606 * --------------------------
1607 * Called to send a CHOOSE_OWNER message to the commd running on the master
1608 * node. This needs to run in a separate context so that mutex livelock is
1609 * avoided. This can occur because the original request is issued from a call
1610 * to metaioctl() which acquires the global ioctl lock, calls down into the
1611 * mirror_ioctl code and then attempts to mdmn_ksend_message() to the master
1612 * node. As the handler for the choose_owner message needs to send another
1613 * ioctl through the metaioctl() entry point, any other use (by rpc.metad or
1614 * mdcommd checking on set ownership) will deadlock the system leading to
1615 * cluster reconfiguration timeouts and eventually a node or (at worst) a
1616 * cluster-wide panic
1617 */
1618static void
1619mirror_choose_owner_thread(md_mn_msg_chooseid_t	*msg)
1620{
1621	int		rval;
1622	md_mn_kresult_t	*kres;
1623	set_t		setno = MD_MIN2SET(msg->msg_chooseid_mnum);
1624
1625	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
1626	rval = mdmn_ksend_message(setno, MD_MN_MSG_CHOOSE_OWNER,
1627	    MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, 0, (char *)msg,
1628	    sizeof (md_mn_msg_chooseid_t), kres);
1629	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
1630		mdmn_ksend_show_error(rval, kres, "CHOOSE OWNER");
1631		cmn_err(CE_WARN, "ksend_message failure: CHOOSE_OWNER");
1632	}
1633
1634	kmem_free(kres, sizeof (md_mn_kresult_t));
1635	kmem_free(msg, sizeof (md_mn_msg_chooseid_t));
1636	thread_exit();
1637}
1638
1639/*
1640 * mirror_owner_thread:
1641 * -------------------
1642 * Called to request an ownership change from a thread context. This issues
1643 * a mdmn_ksend_message() and then completes the appropriate ownership change
1644 * on successful completion of the message transport.
1645 * The originating application must poll for completion on the 'flags' member
1646 * of the MD_MN_MM_OWNER_STATUS ioctl() parameter block.
1647 * Success is marked by a return value of MD_MN_MM_RES_OK, Failure by
1648 * MD_MN_MM_RES_FAIL
1649 */
1650static void
1651mirror_owner_thread(md_mn_req_owner_t *ownp)
1652{
1653	int		rval;
1654	set_t		setno = MD_MIN2SET(ownp->mnum);
1655	mm_unit_t	*un = MD_UNIT(ownp->mnum);
1656	md_mn_kresult_t	*kresult;
1657	md_mps_t	*ps1;
1658
1659	un->un_mirror_owner_status = 0;
1660
1661	mutex_enter(&un->un_owner_mx);
1662	un->un_owner_state |= MM_MN_OWNER_SENT;
1663	mutex_exit(&un->un_owner_mx);
1664
1665	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
1666	rval = mdmn_ksend_message(setno, MD_MN_MSG_REQUIRE_OWNER,
1667	    MD_MSGF_NO_LOG, 0, (char *)ownp, sizeof (md_mn_req_owner_t),
1668	    kresult);
1669
1670	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
1671		/*
1672		 * Message transport layer failed. Return the failure code to
1673		 * the application.
1674		 */
1675		mdmn_ksend_show_error(rval, kresult, "CHANGE OWNER");
1676		mutex_enter(&un->un_owner_mx);
1677		un->un_owner_state &= ~(MM_MN_BECOME_OWNER|MM_MN_OWNER_SENT);
1678		mutex_exit(&un->un_owner_mx);
1679		un->un_mirror_owner_status =
1680		    MD_MN_MM_RESULT | MD_MN_MM_RES_FAIL;
1681	} else {
1682		/*
1683		 * Ownership change succeeded. Update in-core version of
1684		 * mirror owner.
1685		 */
1686		mutex_enter(&un->un_owner_mx);
1687		if (un->un_owner_state & MM_MN_BECOME_OWNER) {
1688			un->un_mirror_owner = md_mn_mynode_id;
1689			/* Sets node owner of un_rr_dirty record */
1690			if (un->un_rr_dirty_recid)
1691				(void) mddb_setowner(un->un_rr_dirty_recid,
1692				    md_mn_mynode_id);
1693			/*
1694			 * Release the block on the current resync region if it
1695			 * is blocked
1696			 */
1697			ps1 = un->un_rs_prev_overlap;
1698			if ((ps1 != NULL) &&
1699			    (ps1->ps_flags & MD_MPS_ON_OVERLAP))
1700				mirror_overlap_tree_remove(ps1);
1701		}
1702
1703		un->un_owner_state &= ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
1704		mutex_exit(&un->un_owner_mx);
1705		un->un_mirror_owner_status =
1706		    MD_MN_MM_RESULT | MD_MN_MM_RES_OK;
1707
1708		/* Restart the resync thread if it was previously blocked */
1709		if (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) {
1710			mutex_enter(&un->un_rs_thread_mx);
1711			un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
1712			cv_signal(&un->un_rs_thread_cv);
1713			mutex_exit(&un->un_rs_thread_mx);
1714		}
1715	}
1716	kmem_free(kresult, sizeof (md_mn_kresult_t));
1717	kmem_free(ownp, sizeof (md_mn_req_owner_t));
1718	thread_exit();
1719}
1720
1721/*
1722 * mirror_set_owner:
1723 * ----------------
1724 * Called to change the owner of a mirror to the specified node. If we
1725 * are not the owner of the mirror, we do nothing apart from update the in-core
1726 * ownership. It can also be used to choose a new owner for the resync of a
1727 * mirror, this case is specified by the flag MD_MN_MM_CHOOSE_OWNER, see below.
1728 *
1729 * The p->d.flags bitfield controls how subsequent ownership changes will be
1730 * handled:
1731 *	MD_MN_MM_SPAWN_THREAD
1732 *		a separate thread is created which emulates the behaviour of
1733 *		become_owner() [mirror.c]. This is needed when changing the
1734 *		ownership from user context as there needs to be a controlling
1735 *		kernel thread which updates the owner info on the originating
1736 *		node. Successful completion of the mdmn_ksend_message() means
1737 *		that the owner field can be changed.
1738 *
1739 *	MD_MN_MM_PREVENT_CHANGE
1740 *		Disallow any change of ownership once this ownership change has
1741 *		been processed. The only way of changing the owner away from
1742 *		the p->d.owner node specified in the call is to issue a request
1743 *		with MD_MN_MM_ALLOW_CHANGE set in the flags. Any request to
1744 *		become owner from a different node while the PREVENT_CHANGE
1745 *		is in operation will result in an EAGAIN return value.
1746 *		un->un_owner_state has MM_MN_PREVENT_CHANGE set.
1747 *
1748 *	MD_MN_MM_ALLOW_CHANGE
1749 *		Allow the owner to be changed by a subsequent request.
1750 *		un->un_owner_state has MM_MN_PREVENT_CHANGE cleared.
1751 *
1752 *	MD_MN_MM_CHOOSE_OWNER
1753 *		Choose a new owner for a mirror resync. In this case, the new
1754 *		owner argument is not used. The selection of a new owner
1755 *		is a round robin allocation using a resync owner count. This
1756 *		ioctl passes this value in a message to the master node
1757 *		which uses it to select a node from the node list and then
1758 *		sends it a message to become the owner.
1759 *
1760 * If we are the current owner, we must stop further i/o from being scheduled
1761 * and wait for any pending i/o to drain. We wait for any in-progress resync
1762 * bitmap updates to complete and we can then set the owner. If an update to
1763 * the resync bitmap is attempted after this we simply don't write this out to
1764 * disk until the ownership is restored.
1765 *
1766 * If we are the node that wants to become the owner we update the in-core
1767 * owner and return. The i/o that initiated the ownership change will complete
1768 * on successful return from this ioctl.
1769 *
1770 * Return Value:
1771 *	0		Success
1772 * 	EINVAL		Invalid unit referenced
1773 *	EAGAIN		Ownership couldn't be transferred away or change of
1774 *			ownership is prevented. Caller should retry later on.
1775 */
1776static int
1777mirror_set_owner(md_set_mmown_params_t *p, IOLOCK *lock)
1778{
1779	mdi_unit_t	*ui;
1780	mm_unit_t	*un;
1781	set_t		setno;
1782
1783	if ((un = mirror_getun(p->d.mnum, &p->mde, RD_LOCK, lock)) == NULL)
1784		return (EINVAL);
1785	ui = MDI_UNIT(p->d.mnum);
1786	setno = MD_MIN2SET(p->d.mnum);
1787	if (!MD_MNSET_SETNO(setno)) {
1788		return (EINVAL);
1789	}
1790
1791	/*
1792	 * If we are choosing a new resync owner, send a message to the master
1793	 * to make the choice.
1794	 */
1795	if (p->d.flags & MD_MN_MM_CHOOSE_OWNER) {
1796		/* Release ioctl lock before we call ksend_message() */
1797		md_ioctl_readerexit(lock);
1798		/* If we're resetting the owner pass the node id in */
1799		if (p->d.owner != MD_MN_MIRROR_UNOWNED) {
1800			return (mirror_choose_owner(un, &p->d));
1801		} else {
1802			return (mirror_choose_owner(un, NULL));
1803		}
1804	}
1805
1806	/*
1807	 * Check for whether we have to spawn a thread to issue this request.
1808	 * If set we issue a mdmn_ksend_message() to cause the appropriate
1809	 * ownership change. On completion of this request the calling
1810	 * application _must_ poll the structure 'flags' field to determine the
1811	 * result of the request. All this is necessary until we have true
1812	 * multi-entrant ioctl support.
1813	 * If we are just clearing the owner, then MD_MN_MM_SPAWN_THREAD can
1814	 * be ignored.
1815	 */
1816	if ((p->d.flags & MD_MN_MM_SPAWN_THREAD) && (p->d.owner != 0)) {
1817		md_mn_req_owner_t	*ownp;
1818		ownp = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
1819		p->d.flags &= ~MD_MN_MM_SPAWN_THREAD;
1820		bcopy(&p->d, ownp, sizeof (md_mn_req_owner_t));
1821		if (thread_create(NULL, 0, mirror_owner_thread, (caddr_t)ownp,
1822		    0, &p0, TS_RUN, 60) == NULL) {
1823			kmem_free(ownp, sizeof (md_mn_req_owner_t));
1824			return (EFAULT);
1825		} else {
1826			return (0);
1827		}
1828	}
1829
1830	/*
1831	 * If setting owner to NULL, this is being done because the owner has
1832	 * died and therefore we set OPT_NOT_DONE to ensure that the
1833	 * mirror is marked as "Needs Maintenance" and that an optimized
1834	 * resync will be done when we resync the mirror, Also clear the
1835	 * PREVENT_CHANGE flag and remove the last resync region from the
1836	 * overlap tree.
1837	 */
1838	if (p->d.owner == 0) {
1839		md_mps_t	*ps;
1840		int		i;
1841
1842		md_ioctl_readerexit(lock);
1843		un = md_ioctl_writerlock(lock, ui);
1844		/*
1845		 * If the ABR capability is not set and the pass_num is non-zero
1846		 * there is need to perform an optimized resync
1847		 * Therefore set OPT_NOT_DONE, setup the resync_bm and set
1848		 * the submirrors as resync targets.
1849		 */
1850		if (!(ui->ui_tstate & MD_ABR_CAP) && un->un_pass_num) {
1851			MD_STATUS(un) |= MD_UN_OPT_NOT_DONE;
1852
1853			(void) mddb_reread_rr(setno, un->un_rr_dirty_recid);
1854			bcopy((caddr_t)un->un_dirty_bm,
1855			    (caddr_t)un->un_resync_bm,
1856			    howmany(un->un_rrd_num, NBBY));
1857			for (i = 0; i < NMIRROR; i++) {
1858				if ((SUBMIRROR_IS_READABLE(un, i)) ||
1859				    SMS_BY_INDEX_IS(un, i,
1860				    SMS_OFFLINE_RESYNC))
1861					un->un_sm[i].sm_flags |=
1862					    MD_SM_RESYNC_TARGET;
1863			}
1864		}
1865		mutex_enter(&un->un_owner_mx);
1866		un->un_owner_state &= ~MD_MN_MM_PREVENT_CHANGE;
1867		mutex_exit(&un->un_owner_mx);
1868		ps = un->un_rs_prev_overlap;
1869		if ((ps != NULL) && (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
1870			mirror_overlap_tree_remove(ps);
1871			ps->ps_firstblk = 0;
1872			ps->ps_lastblk = 0;
1873		}
1874		md_ioctl_writerexit(lock);
1875		un = md_ioctl_readerlock(lock, ui);
1876	}
1877
1878	mutex_enter(&un->un_owner_mx);
1879	if (!(un->un_owner_state & MM_MN_BECOME_OWNER)) {
1880		/*
1881		 * If we are not trying to become owner ourselves check
1882		 * to see if we have to change the owner
1883		 */
1884		if (un->un_mirror_owner == p->d.owner) {
1885			/*
1886			 * No need to change owner,
1887			 * Clear/set PREVENT_CHANGE bit
1888			 */
1889			if (p->d.flags & MD_MN_MM_PREVENT_CHANGE) {
1890				un->un_owner_state |= MM_MN_PREVENT_CHANGE;
1891			} else if (p->d.flags & MD_MN_MM_ALLOW_CHANGE) {
1892				un->un_owner_state &= ~MM_MN_PREVENT_CHANGE;
1893			}
1894			mutex_exit(&un->un_owner_mx);
1895			return (0);
1896		}
1897	}
1898
1899	/*
1900	 * Disallow ownership change if previously requested to. This can only
1901	 * be reset by issuing a request with MD_MN_MM_ALLOW_CHANGE set in the
1902	 * flags field.
1903	 */
1904	if ((un->un_owner_state & MM_MN_PREVENT_CHANGE) &&
1905	    !(p->d.flags & MD_MN_MM_ALLOW_CHANGE)) {
1906		mutex_exit(&un->un_owner_mx);
1907#ifdef DEBUG
1908		cmn_err(CE_WARN, "mirror_ioctl: Node %x attempted to become "
1909		    "owner while node %x has exclusive access to %s",
1910		    p->d.owner, un->un_mirror_owner, md_shortname(MD_SID(un)));
1911#endif
1912		return (EAGAIN);
1913	}
1914	if (p->d.owner == md_mn_mynode_id) {
1915		/*
1916		 * I'm becoming the mirror owner. Flag this so that the
1917		 * message sender can change the in-core owner when all
1918		 * nodes have processed this message
1919		 */
1920		un->un_owner_state &= ~MM_MN_OWNER_SENT;
1921		un->un_owner_state |= MM_MN_BECOME_OWNER;
1922		un->un_owner_state |= (p->d.flags & MD_MN_MM_PREVENT_CHANGE) ?
1923		    MM_MN_PREVENT_CHANGE : 0;
1924		un->un_owner_state &= (p->d.flags & MD_MN_MM_ALLOW_CHANGE) ?
1925		    ~MM_MN_PREVENT_CHANGE : ~0;
1926
1927		mutex_exit(&un->un_owner_mx);
1928	} else if ((un->un_mirror_owner == md_mn_mynode_id) ||
1929	    un->un_owner_state & MM_MN_BECOME_OWNER) {
1930		mutex_exit(&un->un_owner_mx);
1931
1932		/*
1933		 * I'm releasing ownership. Block and drain i/o. This also
1934		 * blocks until any in-progress resync record update completes.
1935		 */
1936		md_ioctl_readerexit(lock);
1937		un = md_ioctl_writerlock(lock, ui);
1938		/* Block the resync thread */
1939		mutex_enter(&un->un_rs_thread_mx);
1940		un->un_rs_thread_flags |= MD_RI_BLOCK_OWNER;
1941		mutex_exit(&un->un_rs_thread_mx);
1942		mutex_enter(&un->un_owner_mx);
1943		un->un_mirror_owner = p->d.owner;
1944
1945		/* Sets node owner of un_rr_dirty record */
1946		if (un->un_rr_dirty_recid)
1947			(void) mddb_setowner(un->un_rr_dirty_recid, p->d.owner);
1948		un->un_owner_state &= ~MM_MN_BECOME_OWNER;
1949		un->un_owner_state |= (p->d.flags & MD_MN_MM_PREVENT_CHANGE) ?
1950		    MM_MN_PREVENT_CHANGE : 0;
1951		un->un_owner_state &= (p->d.flags & MD_MN_MM_ALLOW_CHANGE) ?
1952		    ~MM_MN_PREVENT_CHANGE : ~0;
1953		mutex_exit(&un->un_owner_mx);
1954		/*
1955		 * Allow further i/o to occur. Any write() from another node
1956		 * will now cause another ownership change to occur.
1957		 */
1958		md_ioctl_writerexit(lock);
1959	} else {
1960		/* Update the in-core mirror owner */
1961		un->un_mirror_owner = p->d.owner;
1962		/* Sets node owner of un_rr_dirty record */
1963		if (un->un_rr_dirty_recid)
1964			(void) mddb_setowner(un->un_rr_dirty_recid, p->d.owner);
1965		un->un_owner_state |= (p->d.flags & MD_MN_MM_PREVENT_CHANGE) ?
1966		    MM_MN_PREVENT_CHANGE : 0;
1967		un->un_owner_state &= (p->d.flags & MD_MN_MM_ALLOW_CHANGE) ?
1968		    ~MM_MN_PREVENT_CHANGE : ~0;
1969		mutex_exit(&un->un_owner_mx);
1970	}
1971	return (0);
1972}
1973/*
1974 * mirror_allocate_hotspare:
1975 * ------------------------
1976 * Called to allocate a hotspare for a failed component. This function is
1977 * called by the MD_MN_ALLOCATE_HOTSPARE ioctl.
1978 */
1979static int
1980mirror_allocate_hotspare(md_alloc_hotsp_params_t *p, IOLOCK *lockp)
1981{
1982	set_t		setno;
1983	mm_unit_t	*un;
1984
1985#ifdef DEBUG
1986	if (mirror_debug_flag)
1987		printf("mirror_allocate_hotspare: mnum,sm,comp = %x, %x, %x\n",
1988		    p->mnum, p->sm, p->comp);
1989#endif
1990
1991	if ((un = mirror_getun(p->mnum, &p->mde, WR_LOCK, lockp)) == NULL)
1992		return (EINVAL);
1993
1994	/* This function is only valid for a multi-node set */
1995	setno = MD_MIN2SET(p->mnum);
1996	if (!MD_MNSET_SETNO(setno)) {
1997		return (EINVAL);
1998	}
1999	(void) check_comp_4_hotspares(un, p->sm, p->comp, MD_HOTSPARE_NO_XMIT,
2000	    p->hs_id, lockp);
2001	md_ioctl_writerexit(lockp);
2002	return (0);
2003}
2004
2005/*
2006 * mirror_get_owner_status:
2007 * -----------------------
2008 * Return the status of a previously issued ioctl to change ownership. This is
2009 * required for soft-partition support as the request to change mirror owner
2010 * needs to be run from a separate daemon thread.
2011 *
2012 * Returns:
2013 *	0	Success (contents of un_mirror_owner_status placed in 'flags')
2014 *	EINVAL	Invalid unit
2015 */
2016static int
2017mirror_get_owner_status(md_mn_own_status_t *p, IOLOCK *lock)
2018{
2019	mm_unit_t	*un;
2020	set_t		setno;
2021
2022	if ((un = mirror_getun(p->mnum, &p->mde, RD_LOCK, lock)) == NULL)
2023		return (EINVAL);
2024
2025	setno = MD_MIN2SET(p->mnum);
2026	if (!MD_MNSET_SETNO(setno)) {
2027		return (EINVAL);
2028	}
2029
2030	p->flags = un->un_mirror_owner_status;
2031	return (0);
2032}
2033
2034/*
2035 * mirror_set_state:
2036 * ---------------
2037 * Called to set the state of the component of a submirror to the specified
2038 * value. This function is called by the MD_MN_SET_STATE ioctl.
2039 */
2040static int
2041mirror_set_state(md_set_state_params_t *p, IOLOCK *lockp)
2042{
2043	mm_unit_t		*un;
2044	mm_submirror_t		*sm;
2045	mm_submirror_ic_t	*smic;
2046	md_m_shared_t		*shared;
2047	set_t			setno;
2048
2049#ifdef DEBUG
2050	if (mirror_debug_flag)
2051		printf("mirror_set_state: mnum,sm,comp,state, hs_id = %x, "
2052		    "%x, %x, %x %x\n", p->mnum, p->sm, p->comp,
2053		    p->state, p->hs_id);
2054#endif
2055	if ((un = mirror_getun(p->mnum, &p->mde, WR_LOCK, lockp)) == NULL)
2056		return (EINVAL);
2057
2058	/* This function is only valid for a multi-node set */
2059	setno = MD_MIN2SET(p->mnum);
2060	if (!MD_MNSET_SETNO(setno)) {
2061		return (EINVAL);
2062	}
2063	sm = &un->un_sm[p->sm];
2064	smic = &un->un_smic[p->sm];
2065
2066	/* Set state in component and update ms_flags */
2067	shared = (md_m_shared_t *)
2068	    (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, p->comp);
2069	/*
2070	 * If a CS_ERRED state is being sent, verify that the sender
2071	 * has the same view of the component that this node currently has.
2072	 *
2073	 * There is a case where the sender was sending a CS_ERRED when a
2074	 * component was in error, but before the sender returns from
2075	 * ksend_message the component has been hotspared and resync'd.
2076	 *
2077	 * In this case, the hs_id will be different from the shared ms_hs_id,
2078	 * so the component has already been hotspared.  Just return in this
2079	 * case.
2080	 */
2081	if (p->state == CS_ERRED) {
2082		if (shared->ms_hs_id != p->hs_id) {
2083#ifdef DEBUG
2084			if (mirror_debug_flag) {
2085				printf("mirror_set_state: short circuit "
2086				    "hs_id=0x%x, ms_hs_id=0x%x\n",
2087				    p->hs_id, shared->ms_hs_id);
2088			}
2089#endif
2090			/* release the block on writes to the mirror */
2091			mirror_resume_writes(un);
2092			md_ioctl_writerexit(lockp);
2093			return (0);
2094		}
2095	}
2096
2097	/*
2098	 * If the device is newly errored then make sure that it is
2099	 * closed. Closing the device allows for the RCM framework
2100	 * to unconfigure the device if required.
2101	 */
2102	if (!(shared->ms_state & CS_ERRED) && (p->state & CS_ERRED) &&
2103	    (shared->ms_flags & MDM_S_ISOPEN)) {
2104		void		(*get_dev)();
2105		ms_cd_info_t	cd;
2106
2107		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2108		    "get device", 0);
2109		(void) (*get_dev)(sm->sm_dev, sm, p->comp, &cd);
2110
2111		md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2112		shared->ms_flags &= ~MDM_S_ISOPEN;
2113	}
2114
2115	shared->ms_state = p->state;
2116	uniqtime32(&shared->ms_timestamp);
2117
2118	if (p->state == CS_ERRED) {
2119		shared->ms_flags |= MDM_S_NOWRITE;
2120	} else
2121		shared->ms_flags &= ~MDM_S_NOWRITE;
2122
2123	shared->ms_flags &= ~MDM_S_IOERR;
2124	un->un_changecnt++;
2125	shared->ms_lasterrcnt = un->un_changecnt;
2126
2127	/* Update state in submirror */
2128	mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
2129	/*
2130	 * Commit the state change to the metadb, only the master will write
2131	 * to disk
2132	 */
2133	mirror_commit(un, SMI2BIT(p->sm), 0);
2134
2135	/* release the block on writes to the mirror */
2136	mirror_resume_writes(un);
2137
2138	/* generate NOTIFY events for error state changes */
2139	if (p->state == CS_ERRED) {
2140		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE,
2141		    MD_UN2SET(un), MD_SID(un));
2142	} else if (p->state == CS_LAST_ERRED) {
2143		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE,
2144		    MD_UN2SET(un), MD_SID(un));
2145	}
2146	md_ioctl_writerexit(lockp);
2147	return (0);
2148}
2149
2150/*
2151 * mirror_suspend_writes:
2152 * ---------------------
2153 * Called to suspend writes to a mirror region. The flag un_suspend_wr_flag is
2154 * tested in mirror_write_strategy, and if set all writes are blocked.
2155 * This function is called by the MD_MN_SUSPEND_WRITES ioctl.
2156 */
2157static int
2158mirror_suspend_writes(md_suspend_wr_params_t *p)
2159{
2160	set_t		setno;
2161	mm_unit_t	*un;
2162
2163#ifdef DEBUG
2164	if (mirror_debug_flag)
2165		printf("mirror_suspend_writes: mnum = %x\n", p->mnum);
2166#endif
2167	if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
2168		return (EINVAL); /* No unit */
2169
2170	/* This function is only valid for a multi-node set */
2171	setno = MD_MIN2SET(p->mnum);
2172	if (!MD_MNSET_SETNO(setno)) {
2173		return (EINVAL);
2174	}
2175
2176	/*
2177	 * Mark the resync as blocked. This will stop any currently running
2178	 * thread and will prevent a new resync from attempting to perform
2179	 * i/o
2180	 */
2181	mutex_enter(&un->un_rs_thread_mx);
2182	un->un_rs_thread_flags |= MD_RI_BLOCK;
2183	mutex_exit(&un->un_rs_thread_mx);
2184
2185	mutex_enter(&un->un_suspend_wr_mx);
2186	un->un_suspend_wr_flag = 1;
2187	mutex_exit(&un->un_suspend_wr_mx);
2188
2189	return (0);
2190}
2191
2192/*
2193 * mirror_set_capability:
2194 * ------------------------
2195 * Called to set or clear a capability for a mirror
2196 * called by the MD_MN_SET_CAP ioctl.
2197 */
2198static int
2199mirror_set_capability(md_mn_setcap_params_t *p, IOLOCK *lockp)
2200{
2201	set_t		setno;
2202	mm_unit_t	*un;
2203	mdi_unit_t	*ui;
2204
2205#ifdef DEBUG
2206	if (mirror_debug_flag)
2207		printf("mirror_set_capability: mnum = %x\n", p->mnum);
2208#endif
2209	if ((un = mirror_getun(p->mnum, &p->mde, RD_LOCK, lockp)) == NULL)
2210		return (EINVAL);
2211
2212	/* This function is only valid for a multi-node set */
2213	setno = MD_MIN2SET(p->mnum);
2214	if (!MD_MNSET_SETNO(setno)) {
2215		return (EINVAL);
2216	}
2217	ui = MDI_UNIT(p->mnum);
2218
2219	if (p->sc_set & DKV_ABR_CAP) {
2220		ui->ui_tstate |= MD_ABR_CAP; /* Set ABR capability */
2221		/* Clear DRL and set owner to 0 if no resync active */
2222		mirror_process_unit_resync(un);
2223		if (!(un->c.un_status & MD_UN_RESYNC_ACTIVE)) {
2224			mutex_enter(&un->un_owner_mx);
2225			un->un_mirror_owner = 0;
2226			mutex_exit(&un->un_owner_mx);
2227		}
2228	} else {
2229		ui->ui_tstate &= ~MD_ABR_CAP; /* Clear ABR capability */
2230	}
2231	if (p->sc_set & DKV_DMR_CAP) {
2232		ui->ui_tstate |= MD_DMR_CAP; /* Set DMR capability */
2233	} else {
2234		ui->ui_tstate &= ~MD_DMR_CAP; /* Clear DMR capability */
2235	}
2236	return (0);
2237}
2238
2239/*
2240 * mirror_choose_owner:
2241 * ------------------------
2242 * Called to choose an owner for a mirror resync. Can be called when starting
2243 * resync or by the MD_MN_SET_MM_OWNER ioctl with the MD_MN_MM_CHOOSE_OWNER flag
2244 * set. The ioctl is called with this flag set when we are in the cluster
2245 * reconfig and we wish to set a new owner for a resync whose owner has left
2246 * the cluster. We use a resync owner count to implement a round robin
2247 * allocation of resync owners. We send a message to the master including
2248 * this count and the message handler uses it to select an owner from the
2249 * nodelist and then sends a SET_MM_OWNER message to the chosen node to
2250 * become the owner.
2251 *
2252 * Input:
2253 *	un	- unit reference
2254 *	ownp	- owner information (if non-NULL)
2255 */
2256int
2257mirror_choose_owner(mm_unit_t *un, md_mn_req_owner_t *ownp)
2258{
2259	set_t		setno;
2260	md_mn_msg_chooseid_t	*msg;
2261
2262	/* This function is only valid for a multi-node set */
2263	setno = MD_UN2SET(un);
2264	if (!MD_MNSET_SETNO(setno)) {
2265		return (EINVAL);
2266	}
2267
2268
2269#ifdef DEBUG
2270	if (mirror_debug_flag)
2271		printf("send choose owner message, mnum = %x,"
2272		    "rcnt = %d\n", MD_SID(un), md_set[setno].s_rcnt);
2273#endif
2274
2275	/*
2276	 * setup message with current resync count
2277	 * and then increment the count. If we're called with a non-NULL
2278	 * owner then we are reestablishing the owner of the mirror. In this
2279	 * case we have to flag this to the message handler and set rcnt to
2280	 * the new owner node.
2281	 */
2282	msg = kmem_zalloc(sizeof (md_mn_msg_chooseid_t), KM_SLEEP);
2283	msg->msg_chooseid_mnum = MD_SID(un);
2284	if (ownp == NULL) {
2285		mutex_enter(&md_mx);
2286		msg->msg_chooseid_rcnt = md_set[setno].s_rcnt;
2287		md_set[setno].s_rcnt++;
2288		mutex_exit(&md_mx);
2289		msg->msg_chooseid_set_node = B_FALSE;
2290	} else {
2291		msg->msg_chooseid_rcnt = ownp->owner;
2292		msg->msg_chooseid_set_node = B_TRUE;
2293	}
2294
2295	/*
2296	 * Spawn a thread to issue the ksend_message() call so that we can
2297	 * drop the ioctl lock hierarchy that is blocking further rpc.metad and
2298	 * commd set ownership checking.
2299	 */
2300	if (thread_create(NULL, 0, mirror_choose_owner_thread, (caddr_t)msg,
2301	    0, &p0, TS_RUN, 60) == NULL) {
2302		kmem_free(msg, sizeof (md_mn_msg_chooseid_t));
2303		return (EFAULT);
2304	} else {
2305		return (0);
2306	}
2307}
2308
2309/*
2310 * mirror_get_status:
2311 * ----------------------------------
2312 * Called by nodes which are not the master node of the cluster. Obtains the
2313 * master abr state and the submirror status for each valid submirror of the
2314 * unit so that the status returned by metastat is consistent across the
2315 * cluster.
2316 * We update tstate for the mirror and both the sm_flag and the sm_state for
2317 * each submirror.
2318 *
2319 * Input:
2320 *	un	mirror to obtain status from
2321 *
2322 * Calling Convention:
2323 *	writerlock (either ioctl or unit) must be held
2324 */
2325void
2326mirror_get_status(mm_unit_t *un, IOLOCK *lockp)
2327{
2328	mm_submirror_t		*sm;
2329	int			smi;
2330	int			rval;
2331	md_mn_kresult_t		*kres;
2332	md_mn_msg_mir_state_t	msg;
2333	md_mn_msg_mir_state_res_t	*res;
2334	set_t			setno = MD_UN2SET(un);
2335	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
2336
2337
2338	ASSERT(ui->ui_lock & MD_UL_WRITER);
2339
2340	/*
2341	 * Get all of the information for the mirror.
2342	 */
2343	bzero(&msg, sizeof (msg));
2344	msg.mir_state_mnum = MD_SID(un);
2345
2346	/*
2347	 * Must drop the writerlock over ksend_message since another
2348	 * thread on this node could be running a higher class message
2349	 * and be trying grab the readerlock.
2350	 *
2351	 * If we are in the context of an ioctl, drop the ioctl lock.
2352	 * lockp holds the list of locks held.
2353	 */
2354	if (lockp) {
2355		IOLOCK_RETURN_RELEASE(0, lockp);
2356	} else {
2357		md_unit_writerexit(ui);
2358	}
2359
2360	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
2361	rval = mdmn_ksend_message(setno, MD_MN_MSG_GET_MIRROR_STATE,
2362	    MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, 0, (char *)&msg,
2363	    sizeof (msg), kres);
2364
2365	/* if the node hasn't yet joined, it's Ok. */
2366	if ((!MDMN_KSEND_MSG_OK(rval, kres)) &&
2367	    (kres->kmmr_comm_state != MDMNE_NOT_JOINED)) {
2368		mdmn_ksend_show_error(rval, kres, "GET_MIRROR_STATE");
2369		cmn_err(CE_WARN, "ksend_message failure: GET_MIRROR_STATE");
2370	}
2371
2372	/* if dropped the lock previously, regain it */
2373	if (lockp) {
2374		IOLOCK_RETURN_REACQUIRE(lockp);
2375	} else {
2376		/*
2377		 * Reacquire dropped locks and update acquirecnts
2378		 * appropriately.
2379		 */
2380		(void) md_unit_writerlock(ui);
2381	}
2382
2383	/*
2384	 * Check to see if we've got a believable amount of returned data.
2385	 * If not, we simply return as there is no usable information.
2386	 */
2387	if (kres->kmmr_res_size < sizeof (*res)) {
2388		cmn_err(CE_WARN, "GET_MIRROR_STATE: returned %d bytes, expected"
2389		    " %d\n", kres->kmmr_res_size, (int)sizeof (*res));
2390		kmem_free(kres, sizeof (md_mn_kresult_t));
2391		return;
2392	}
2393
2394	/*
2395	 * Copy the results from the call back into our sm_state/sm_flags
2396	 */
2397	res = (md_mn_msg_mir_state_res_t *)kres->kmmr_res_data;
2398#ifdef DEBUG
2399	if (mirror_debug_flag)
2400		printf("mirror_get_status: %s\n", md_shortname(MD_SID(un)));
2401#endif
2402	for (smi = 0; smi < NMIRROR; smi++) {
2403		sm = &un->un_sm[smi];
2404#ifdef DEBUG
2405		if (mirror_debug_flag) {
2406			printf("curr state %4x, new state %4x\n", sm->sm_state,
2407			    res->sm_state[smi]);
2408			printf("curr_flags %4x, new flags %4x\n", sm->sm_flags,
2409			    res->sm_flags[smi]);
2410		}
2411#endif
2412		sm->sm_state = res->sm_state[smi];
2413		sm->sm_flags = res->sm_flags[smi];
2414	}
2415
2416	/* Set ABR if set on the Master node */
2417	ui->ui_tstate |= (res->mir_tstate & MD_ABR_CAP);
2418
2419	kmem_free(kres, sizeof (md_mn_kresult_t));
2420}
2421
2422/*
2423 * mirror_get_mir_state:
2424 * -------------------
2425 * Obtain the ABR state of a mirror and the state of all submirrors from the
2426 * master node for the unit specified in sm_state->mnum.
2427 * Called by MD_MN_GET_MIRROR_STATE ioctl.
2428 */
2429static int
2430mirror_get_mir_state(md_mn_get_mir_state_t *p, IOLOCK *lockp)
2431{
2432	mm_unit_t	*un;
2433	set_t		setno;
2434	md_error_t	mde;
2435
2436	mdclrerror(&mde);
2437
2438	if ((un = mirror_getun(p->mnum, &mde, WR_LOCK, lockp)) == NULL) {
2439		return (EINVAL);
2440	}
2441	setno = MD_MIN2SET(p->mnum);
2442	if (!MD_MNSET_SETNO(setno)) {
2443		return (EINVAL);
2444	}
2445
2446	/*
2447	 * We've now got a writerlock on the unit structure (so no-one can
2448	 * modify the incore values) and we'll now send the message to the
2449	 * master node. Since we're only called as part of a reconfig cycle
2450	 * we don't need to release the unit locks across the ksend_message as
2451	 * only the master node will process it, and we never send this to
2452	 * ourselves if we're the master.
2453	 */
2454
2455	mirror_get_status(un, lockp);
2456
2457	return (0);
2458}
2459
2460static int
2461mirror_admin_ioctl(int cmd, void *data, int mode, IOLOCK *lockp)
2462{
2463	size_t	sz = 0;
2464	void	*d = NULL;
2465	int	err = 0;
2466
2467	/* We can only handle 32-bit clients for internal commands */
2468	if ((mode & DATAMODEL_MASK) != DATAMODEL_ILP32) {
2469		return (EINVAL);
2470	}
2471	/* dispatch ioctl */
2472	switch (cmd) {
2473
2474	case MD_IOCSET:
2475	{
2476		if (! (mode & FWRITE))
2477			return (EACCES);
2478
2479		sz = sizeof (md_set_params_t);
2480
2481		d = kmem_alloc(sz, KM_SLEEP);
2482
2483		if (ddi_copyin(data, d, sz, mode)) {
2484			err = EFAULT;
2485			break;
2486		}
2487
2488		err = mirror_set(d, mode);
2489		break;
2490	}
2491
2492	case MD_IOCGET:
2493	{
2494		if (! (mode & FREAD))
2495			return (EACCES);
2496
2497		sz = sizeof (md_i_get_t);
2498
2499		d = kmem_alloc(sz, KM_SLEEP);
2500
2501		if (ddi_copyin(data, d, sz, mode)) {
2502			err = EFAULT;
2503			break;
2504		}
2505
2506		err = mirror_get(d, mode, lockp);
2507		break;
2508	}
2509
2510	case MD_IOCRESET:
2511	{
2512		if (! (mode & FWRITE))
2513			return (EACCES);
2514
2515		sz = sizeof (md_i_reset_t);
2516		d = kmem_alloc(sz, KM_SLEEP);
2517
2518		if (ddi_copyin(data, d, sz, mode)) {
2519			err = EFAULT;
2520			break;
2521		}
2522
2523		err = mirror_reset((md_i_reset_t *)d);
2524		break;
2525	}
2526
2527	case MD_IOCSETSYNC:
2528	case MD_MN_SETSYNC:
2529	{
2530		if (! (mode & FWRITE))
2531			return (EACCES);
2532
2533		sz = sizeof (md_resync_ioctl_t);
2534		d = kmem_alloc(sz, KM_SLEEP);
2535
2536		if (ddi_copyin(data, d, sz, mode)) {
2537			err = EFAULT;
2538			break;
2539		}
2540
2541		err = mirror_ioctl_resync((md_resync_ioctl_t *)d, lockp);
2542		break;
2543	}
2544
2545	case MD_IOCGETSYNC:
2546	{
2547		if (! (mode & FREAD))
2548			return (EACCES);
2549
2550		sz = sizeof (md_resync_ioctl_t);
2551		d = kmem_alloc(sz, KM_SLEEP);
2552
2553		if (ddi_copyin(data, d, sz, mode)) {
2554			err = EFAULT;
2555			break;
2556		}
2557
2558		err = mirror_get_resync((md_resync_ioctl_t *)d);
2559		break;
2560	}
2561
2562	case MD_IOCREPLACE:
2563	{
2564		if (! (mode & FWRITE))
2565			return (EACCES);
2566
2567		sz = sizeof (replace_params_t);
2568		d = kmem_alloc(sz, KM_SLEEP);
2569
2570		if (ddi_copyin(data, d, sz, mode)) {
2571			err = EFAULT;
2572			break;
2573		}
2574
2575		err = comp_replace((replace_params_t *)d, lockp);
2576		break;
2577	}
2578
2579	case MD_IOCOFFLINE:
2580	{
2581		if (! (mode & FWRITE))
2582			return (EACCES);
2583
2584		sz = sizeof (md_i_off_on_t);
2585		d = kmem_alloc(sz, KM_SLEEP);
2586
2587		if (ddi_copyin(data, d, sz, mode)) {
2588			err = EFAULT;
2589			break;
2590		}
2591
2592		err = mirror_offline((md_i_off_on_t *)d, lockp);
2593		break;
2594	}
2595
2596	case MD_IOCONLINE:
2597	{
2598		if (! (mode & FWRITE))
2599			return (EACCES);
2600
2601		sz = sizeof (md_i_off_on_t);
2602		d = kmem_alloc(sz, KM_SLEEP);
2603
2604		if (ddi_copyin(data, d, sz, mode)) {
2605			err = EFAULT;
2606			break;
2607		}
2608
2609		err = mirror_online((md_i_off_on_t *)d, lockp);
2610		break;
2611	}
2612
2613	case MD_IOCDETACH:
2614	{
2615		if (! (mode & FWRITE))
2616			return (EACCES);
2617
2618		sz = sizeof (md_detach_params_t);
2619		d = kmem_alloc(sz, KM_SLEEP);
2620
2621		if (ddi_copyin(data, d, sz, mode)) {
2622			err = EFAULT;
2623			break;
2624		}
2625
2626		err = mirror_detach((md_detach_params_t *)d, lockp);
2627		break;
2628	}
2629
2630	case MD_IOCATTACH:
2631	{
2632
2633		if (! (mode & FWRITE))
2634			return (EACCES);
2635
2636		sz = sizeof (md_att_struct_t);
2637		d = kmem_alloc(sz, KM_SLEEP);
2638
2639		if (ddi_copyin(data, d, sz, mode)) {
2640			err = EFAULT;
2641			break;
2642		}
2643
2644		err = mirror_attach((md_att_struct_t *)d, lockp);
2645		break;
2646	}
2647
2648	case MD_IOCGET_DEVS:
2649	{
2650		if (! (mode & FREAD))
2651			return (EACCES);
2652
2653		sz = sizeof (md_getdevs_params_t);
2654
2655		d = kmem_alloc(sz, KM_SLEEP);
2656
2657		if (ddi_copyin(data, d, sz, mode)) {
2658			err = EFAULT;
2659			break;
2660		}
2661
2662		err = mirror_getdevs(d, mode, lockp);
2663		break;
2664	}
2665
2666	case MD_IOCGROW:
2667	{
2668		if (! (mode & FWRITE))
2669			return (EACCES);
2670
2671		sz = sizeof (md_grow_params_t);
2672
2673		d = kmem_alloc(sz, KM_SLEEP);
2674
2675		if (ddi_copyin(data, d, sz, mode)) {
2676			err = EFAULT;
2677			break;
2678		}
2679
2680		err = mirror_grow(d, lockp);
2681		break;
2682	}
2683
2684	case MD_IOCCHANGE:
2685	{
2686		if (! (mode & FWRITE))
2687			return (EACCES);
2688
2689		sz = sizeof (md_mirror_params_t);
2690		d = kmem_alloc(sz, KM_SLEEP);
2691
2692		if (ddi_copyin(data, d, sz, mode)) {
2693			err = EFAULT;
2694			break;
2695		}
2696
2697		err = mirror_change((md_mirror_params_t *)d, lockp);
2698		break;
2699	}
2700
2701	case MD_IOCPROBE_DEV:
2702	{
2703		md_probedev_impl_t	*p = NULL;
2704		md_probedev_t		*ph = NULL;
2705		daemon_queue_t		*hdr = NULL;
2706		int			i;
2707		size_t			sz2 = 0;
2708
2709		if (! (mode & FREAD))
2710			return (EACCES);
2711
2712
2713		sz = sizeof (md_probedev_t);
2714		d = kmem_alloc(sz, KM_SLEEP);
2715
2716		/* now copy in the data */
2717		if (ddi_copyin(data, d, sz, mode)) {
2718			err = EFAULT;
2719			goto free_mem;
2720		}
2721
2722		/*
2723		 * Sanity test the args. Test name should have the keyword
2724		 * probe.
2725		 */
2726
2727		p = kmem_alloc(sizeof (md_probedev_impl_t), KM_SLEEP);
2728
2729		p->probe_sema = NULL;
2730		p->probe_mx = NULL;
2731		p->probe.mnum_list = (uint64_t)NULL;
2732
2733		ph = (struct md_probedev *)d;
2734
2735		p->probe.nmdevs = ph->nmdevs;
2736		(void) strcpy(p->probe.test_name, ph->test_name);
2737		bcopy(&ph->md_driver, &(p->probe.md_driver),
2738		    sizeof (md_driver_t));
2739
2740		if ((p->probe.nmdevs < 1) ||
2741		    (strstr(p->probe.test_name, "probe") == NULL)) {
2742			err = EINVAL;
2743			goto free_mem;
2744		}
2745
2746
2747		sz2 = sizeof (minor_t) * p->probe.nmdevs;
2748		p->probe.mnum_list = (uint64_t)(uintptr_t)kmem_alloc(sz2,
2749		    KM_SLEEP);
2750
2751		if (ddi_copyin((void *)(uintptr_t)ph->mnum_list,
2752		    (void *)(uintptr_t)p->probe.mnum_list, sz2, mode)) {
2753			err = EFAULT;
2754			goto free_mem;
2755		}
2756
2757		if (err = md_init_probereq(p, &hdr))
2758			goto free_mem;
2759
2760		/*
2761		 * put the request on the queue and wait.
2762		 */
2763
2764		daemon_request_new(&md_ff_daemonq, md_probe_one, hdr, REQ_NEW);
2765
2766		(void) IOLOCK_RETURN(0, lockp);
2767		/* wait for the events to occur */
2768		for (i = 0; i < p->probe.nmdevs; i++) {
2769			sema_p(PROBE_SEMA(p));
2770		}
2771		while (md_ioctl_lock_enter() == EINTR)
2772		;
2773
2774		/*
2775		 * clean up. The hdr list is freed in the probe routines
2776		 * since the list is NULL by the time we get here.
2777		 */
2778free_mem:
2779		if (p) {
2780			if (p->probe_sema != NULL) {
2781				sema_destroy(PROBE_SEMA(p));
2782				kmem_free(p->probe_sema, sizeof (ksema_t));
2783			}
2784			if (p->probe_mx != NULL) {
2785				mutex_destroy(PROBE_MX(p));
2786				kmem_free(p->probe_mx, sizeof (kmutex_t));
2787			}
2788			if ((uintptr_t)p->probe.mnum_list)
2789				kmem_free((void *)(uintptr_t)
2790				    p->probe.mnum_list, sz2);
2791
2792			kmem_free(p, sizeof (md_probedev_impl_t));
2793		}
2794		break;
2795	}
2796
2797	case MD_MN_SET_MM_OWNER:
2798	{
2799		if (! (mode & FWRITE))
2800			return (EACCES);
2801
2802		sz = sizeof (md_set_mmown_params_t);
2803		d = kmem_alloc(sz, KM_SLEEP);
2804
2805		if (ddi_copyin(data, d, sz, mode) != 0) {
2806			err = EFAULT;
2807			break;
2808		}
2809
2810		err = mirror_set_owner((md_set_mmown_params_t *)d, lockp);
2811		break;
2812	}
2813
2814	case MD_MN_GET_MM_OWNER:
2815	{
2816		if (! (mode & FREAD))
2817			return (EACCES);
2818
2819		sz = sizeof (md_set_mmown_params_t);
2820		d = kmem_alloc(sz, KM_SLEEP);
2821
2822		if (ddi_copyin(data, d, sz, mode) != 0) {
2823			err = EFAULT;
2824			break;
2825		}
2826
2827		err = mirror_get_owner((md_set_mmown_params_t *)d, lockp);
2828		break;
2829	}
2830
2831	case MD_MN_MM_OWNER_STATUS:
2832	{
2833		if (! (mode & FREAD))
2834			return (EACCES);
2835
2836		sz = sizeof (md_mn_own_status_t);
2837		d = kmem_alloc(sz, KM_SLEEP);
2838
2839		if (ddi_copyin(data, d, sz, mode) != 0) {
2840			err = EFAULT;
2841			break;
2842		}
2843
2844		err = mirror_get_owner_status((md_mn_own_status_t *)d, lockp);
2845		break;
2846	}
2847
2848	case MD_MN_SET_STATE:
2849	{
2850		if (! (mode & FWRITE))
2851			return (EACCES);
2852
2853		sz = sizeof (md_set_state_params_t);
2854		d = kmem_alloc(sz, KM_SLEEP);
2855
2856		if (ddi_copyin(data, d, sz, mode)) {
2857			err = EFAULT;
2858			break;
2859		}
2860
2861		err  = mirror_set_state((md_set_state_params_t *)d, lockp);
2862		break;
2863	}
2864
2865	case MD_MN_SUSPEND_WRITES:
2866	{
2867		if (! (mode & FREAD))
2868			return (EACCES);
2869
2870		sz = sizeof (md_suspend_wr_params_t);
2871		d = kmem_alloc(sz, KM_SLEEP);
2872
2873		if (ddi_copyin(data, d, sz, mode) != 0) {
2874			err = EFAULT;
2875			break;
2876		}
2877
2878		err = mirror_suspend_writes((md_suspend_wr_params_t *)d);
2879		break;
2880	}
2881
2882	case MD_MN_RESYNC:
2883	{
2884		sz = sizeof (md_mn_rs_params_t);
2885		d = kmem_alloc(sz, KM_SLEEP);
2886
2887		if (ddi_copyin(data, d, sz, mode) != 0) {
2888			err = EFAULT;
2889			break;
2890		}
2891
2892		err = mirror_resync_message((md_mn_rs_params_t *)d, lockp);
2893		break;
2894	}
2895
2896	case MD_MN_ALLOCATE_HOTSPARE:
2897	{
2898		if (! (mode & FWRITE))
2899			return (EACCES);
2900
2901		sz = sizeof (md_alloc_hotsp_params_t);
2902		d = kmem_alloc(sz, KM_SLEEP);
2903
2904		if (ddi_copyin(data, d, sz, mode)) {
2905			err = EFAULT;
2906			break;
2907		}
2908
2909		err  = mirror_allocate_hotspare((md_alloc_hotsp_params_t *)d,
2910		    lockp);
2911		break;
2912	}
2913
2914	case MD_MN_POKE_HOTSPARES:
2915	{
2916		(void) poke_hotspares();
2917		break;
2918	}
2919
2920	case MD_MN_SET_CAP:
2921	{
2922		if (! (mode & FWRITE))
2923			return (EACCES);
2924
2925		sz = sizeof (md_mn_setcap_params_t);
2926		d = kmem_alloc(sz, KM_SLEEP);
2927
2928		if (ddi_copyin(data, d, sz, mode)) {
2929			err = EFAULT;
2930			break;
2931		}
2932
2933		err  = mirror_set_capability((md_mn_setcap_params_t *)d,
2934		    lockp);
2935		break;
2936	}
2937
2938	case MD_MN_GET_MIRROR_STATE:
2939	{
2940		sz = sizeof (md_mn_get_mir_state_t);
2941		d = kmem_zalloc(sz, KM_SLEEP);
2942
2943		if (ddi_copyin(data, d, sz, mode)) {
2944			err = EFAULT;
2945			break;
2946		}
2947
2948		err = mirror_get_mir_state((md_mn_get_mir_state_t *)d,
2949		    lockp);
2950		break;
2951	}
2952
2953	case MD_MN_RR_DIRTY:
2954	{
2955		sz = sizeof (md_mn_rr_dirty_params_t);
2956		d = kmem_zalloc(sz, KM_SLEEP);
2957
2958		if (ddi_copyin(data, d, sz, mode)) {
2959			err = EFAULT;
2960			break;
2961		}
2962
2963		err = mirror_set_dirty_rr((md_mn_rr_dirty_params_t *)d);
2964		break;
2965	}
2966
2967	case MD_MN_RR_CLEAN:
2968	{
2969		md_mn_rr_clean_params_t tmp;
2970
2971		/* get the first part of the structure to find the size */
2972		if (ddi_copyin(data, &tmp, sizeof (tmp), mode)) {
2973			err = EFAULT;
2974			break;
2975		}
2976
2977		sz = MDMN_RR_CLEAN_PARAMS_SIZE(&tmp);
2978		d = kmem_zalloc(sz, KM_SLEEP);
2979
2980		if (ddi_copyin(data, d, sz, mode)) {
2981			err = EFAULT;
2982			break;
2983		}
2984
2985		err = mirror_set_clean_rr((md_mn_rr_clean_params_t *)d);
2986		break;
2987	}
2988
2989	default:
2990		return (ENOTTY);
2991	}
2992
2993	/*
2994	 * copyout and free any args
2995	 */
2996	if (sz != 0) {
2997		if (err == 0) {
2998			if (ddi_copyout(d, data, sz, mode) != 0) {
2999				err = EFAULT;
3000			}
3001		}
3002		kmem_free(d, sz);
3003	}
3004	return (err);
3005}
3006
3007int
3008md_mirror_ioctl(
3009	dev_t		ddi_dev,
3010	int		cmd,
3011	void		*data,
3012	int		mode,
3013	IOLOCK		*lockp
3014)
3015{
3016	minor_t		mnum = getminor(ddi_dev);
3017	mm_unit_t	*un;
3018	int		err = 0;
3019
3020	/* handle admin ioctls */
3021	if (mnum == MD_ADM_MINOR)
3022		return (mirror_admin_ioctl(cmd, data, mode, lockp));
3023
3024	/* check unit */
3025	if ((MD_MIN2SET(mnum) >= md_nsets) ||
3026	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
3027	    ((un = MD_UNIT(mnum)) == NULL))
3028		return (ENXIO);
3029	/* is this a supported ioctl? */
3030	err = md_check_ioctl_against_unit(cmd, un->c);
3031	if (err != 0) {
3032		return (err);
3033	}
3034
3035	/* dispatch ioctl */
3036	switch (cmd) {
3037
3038	case DKIOCINFO:
3039	{
3040		struct dk_cinfo	*p;
3041
3042		if (! (mode & FREAD))
3043			return (EACCES);
3044
3045		p = kmem_alloc(sizeof (*p), KM_SLEEP);
3046
3047		get_info(p, mnum);
3048		if (ddi_copyout((caddr_t)p, data, sizeof (*p), mode) != 0)
3049			err = EFAULT;
3050
3051		kmem_free(p, sizeof (*p));
3052		return (err);
3053	}
3054
3055	case DKIOCGMEDIAINFO:
3056	{
3057		struct dk_minfo	p;
3058
3059		if (! (mode & FREAD))
3060			return (EACCES);
3061
3062		get_minfo(&p, mnum);
3063		if (ddi_copyout(&p, data, sizeof (struct dk_minfo), mode) != 0)
3064			err = EFAULT;
3065
3066		return (err);
3067	}
3068
3069	case DKIOCGGEOM:
3070	{
3071		struct dk_geom	*p;
3072
3073		if (! (mode & FREAD))
3074			return (EACCES);
3075
3076		p = kmem_alloc(sizeof (*p), KM_SLEEP);
3077
3078		if ((err = mirror_get_geom(un, p)) == 0) {
3079			if (ddi_copyout((caddr_t)p, data, sizeof (*p),
3080			    mode) != 0)
3081				err = EFAULT;
3082		}
3083
3084		kmem_free(p, sizeof (*p));
3085		return (err);
3086	}
3087
3088	case DKIOCGVTOC:
3089	{
3090		struct vtoc	vtoc;
3091
3092		if (! (mode & FREAD))
3093			return (EACCES);
3094
3095		if ((err = mirror_get_vtoc(un, &vtoc)) != 0) {
3096			return (err);
3097		}
3098
3099		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
3100			if (ddi_copyout(&vtoc, data, sizeof (vtoc), mode))
3101				err = EFAULT;
3102		}
3103#ifdef _SYSCALL32
3104		else {
3105			struct vtoc32 vtoc32;
3106			vtoctovtoc32(vtoc, vtoc32);
3107			if (ddi_copyout(&vtoc32, data, sizeof (vtoc32), mode))
3108				err = EFAULT;
3109		}
3110#endif /* _SYSCALL32 */
3111
3112		return (err);
3113	}
3114
3115	case DKIOCSVTOC:
3116	{
3117		struct vtoc	vtoc;
3118
3119		if (! (mode & FWRITE))
3120			return (EACCES);
3121
3122		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
3123			if (ddi_copyin(data, &vtoc, sizeof (vtoc), mode)) {
3124				err = EFAULT;
3125			}
3126		}
3127#ifdef _SYSCALL32
3128		else {
3129			struct vtoc32 vtoc32;
3130			if (ddi_copyin(data, &vtoc32, sizeof (vtoc32), mode)) {
3131				err = EFAULT;
3132			} else {
3133				vtoc32tovtoc(vtoc32, vtoc);
3134			}
3135		}
3136#endif /* _SYSCALL32 */
3137
3138		if (err == 0)
3139			err = mirror_set_vtoc(un, &vtoc);
3140
3141		return (err);
3142	}
3143
3144	case DKIOCGEXTVTOC:
3145	{
3146		struct extvtoc	extvtoc;
3147
3148		if (! (mode & FREAD))
3149			return (EACCES);
3150
3151		if ((err = mirror_get_extvtoc(un, &extvtoc)) != 0) {
3152			return (err);
3153		}
3154
3155		if (ddi_copyout(&extvtoc, data, sizeof (extvtoc), mode))
3156			err = EFAULT;
3157
3158		return (err);
3159	}
3160
3161	case DKIOCSEXTVTOC:
3162	{
3163		struct extvtoc	extvtoc;
3164
3165		if (! (mode & FWRITE))
3166			return (EACCES);
3167
3168		if (ddi_copyin(data, &extvtoc, sizeof (extvtoc), mode)) {
3169			err = EFAULT;
3170		}
3171
3172		if (err == 0)
3173			err = mirror_set_extvtoc(un, &extvtoc);
3174
3175		return (err);
3176	}
3177
3178	case DKIOCGAPART:
3179	{
3180		struct dk_map	dmp;
3181
3182		if ((err = mirror_get_cgapart(un, &dmp)) != 0) {
3183			return (err);
3184		}
3185
3186		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
3187			if (ddi_copyout((caddr_t)&dmp, data, sizeof (dmp),
3188			    mode) != 0)
3189				err = EFAULT;
3190		}
3191#ifdef _SYSCALL32
3192		else {
3193			struct dk_map32 dmp32;
3194
3195			dmp32.dkl_cylno = dmp.dkl_cylno;
3196			dmp32.dkl_nblk = dmp.dkl_nblk;
3197
3198			if (ddi_copyout((caddr_t)&dmp32, data, sizeof (dmp32),
3199			    mode) != 0)
3200				err = EFAULT;
3201		}
3202#endif /* _SYSCALL32 */
3203
3204		return (err);
3205	}
3206	case DKIOCGETEFI:
3207	{
3208		/*
3209		 * This one can be done centralized,
3210		 * no need to put in the same code for all types of metadevices
3211		 */
3212		return (md_dkiocgetefi(mnum, data, mode));
3213	}
3214	case DKIOCSETEFI:
3215	{
3216		/*
3217		 * This one can be done centralized,
3218		 * no need to put in the same code for all types of metadevices
3219		 */
3220		return (md_dkiocsetefi(mnum, data, mode));
3221	}
3222	case DKIOCPARTITION:
3223	{
3224		return (md_dkiocpartition(mnum, data, mode));
3225	}
3226
3227	case DKIOCGETVOLCAP:
3228	{
3229		volcap_t	vc;
3230		mdi_unit_t	*ui;
3231
3232		/* Only valid for MN sets */
3233		if (!MD_MNSET_SETNO(MD_MIN2SET(mnum)))
3234			return (EINVAL);
3235
3236		ui = MDI_UNIT(mnum);
3237		if (! (mode & FREAD))
3238			return (EACCES);
3239
3240		vc.vc_info = DKV_ABR_CAP | DKV_DMR_CAP;
3241		vc.vc_set = 0;
3242		if (ui->ui_tstate & MD_ABR_CAP) {
3243			vc.vc_set |= DKV_ABR_CAP;
3244		}
3245		if (ddi_copyout(&vc, data, sizeof (volcap_t), mode))
3246			err = EFAULT;
3247		return (err);
3248	}
3249
3250	case DKIOCSETVOLCAP:
3251	{
3252		volcap_t	vc;
3253		volcapset_t	volcap = 0;
3254		mdi_unit_t	*ui;
3255
3256		/* Only valid for MN sets */
3257		if (!MD_MNSET_SETNO(MD_MIN2SET(mnum)))
3258			return (EINVAL);
3259
3260		ui = MDI_UNIT(mnum);
3261		if (! (mode & FWRITE))
3262			return (EACCES);
3263
3264		if (ddi_copyin(data, &vc, sizeof (volcap_t), mode))
3265			return (EFAULT);
3266
3267		/* Not valid if a submirror is offline */
3268		if (un->c.un_status & MD_UN_OFFLINE_SM) {
3269			return (EINVAL);
3270		}
3271		if (ui->ui_tstate & MD_ABR_CAP)
3272			volcap |= DKV_ABR_CAP;
3273		/* Only send capability message if there is a change */
3274		if ((vc.vc_set & (DKV_ABR_CAP)) != volcap)
3275			err = mdmn_send_capability_message(mnum, vc, lockp);
3276		return (err);
3277	}
3278
3279	case DKIOCDMR:
3280	{
3281		vol_directed_rd_t	*vdr;
3282
3283#ifdef _MULTI_DATAMODEL
3284		vol_directed_rd32_t	*vdr32;
3285#endif	/* _MULTI_DATAMODEL */
3286
3287		/* Only valid for MN sets */
3288		if (!MD_MNSET_SETNO(MD_MIN2SET(mnum)))
3289			return (EINVAL);
3290
3291		vdr = kmem_zalloc(sizeof (vol_directed_rd_t), KM_NOSLEEP);
3292		if (vdr == NULL)
3293			return (ENOMEM);
3294
3295#ifdef _MULTI_DATAMODEL
3296		vdr32 = kmem_zalloc(sizeof (vol_directed_rd32_t), KM_NOSLEEP);
3297		if (vdr32 == NULL) {
3298			kmem_free(vdr, sizeof (vol_directed_rd_t));
3299			return (ENOMEM);
3300		}
3301
3302		switch (ddi_model_convert_from(mode & FMODELS)) {
3303		case DDI_MODEL_ILP32:
3304			/*
3305			 * If we're called from a higher-level driver we don't
3306			 * need to manipulate the data. Its already been done by
3307			 * the caller.
3308			 */
3309			if (!(mode & FKIOCTL)) {
3310				if (ddi_copyin(data, vdr32, sizeof (*vdr32),
3311				    mode)) {
3312					kmem_free(vdr, sizeof (*vdr));
3313					return (EFAULT);
3314				}
3315				vdr->vdr_flags = vdr32->vdr_flags;
3316				vdr->vdr_offset = vdr32->vdr_offset;
3317				vdr->vdr_nbytes = vdr32->vdr_nbytes;
3318				vdr->vdr_data =
3319				    (void *)(uintptr_t)vdr32->vdr_data;
3320				vdr->vdr_side = vdr32->vdr_side;
3321				break;
3322			}
3323			/* FALLTHROUGH */
3324
3325		case DDI_MODEL_NONE:
3326			if (ddi_copyin(data, vdr, sizeof (*vdr), mode)) {
3327				kmem_free(vdr32, sizeof (*vdr32));
3328				kmem_free(vdr, sizeof (*vdr));
3329				return (EFAULT);
3330			}
3331			break;
3332
3333		default:
3334			kmem_free(vdr32, sizeof (*vdr32));
3335			kmem_free(vdr, sizeof (*vdr));
3336			return (EFAULT);
3337		}
3338#else	/* ! _MULTI_DATAMODEL */
3339		if (ddi_copyin(data, vdr, sizeof (*vdr), mode)) {
3340			kmem_free(vdr, sizeof (*vdr));
3341			return (EFAULT);
3342		}
3343#endif	/* _MULTI_DATAMODEL */
3344
3345		err = mirror_directed_read(ddi_dev, vdr, mode);
3346
3347		if (err == 0) {
3348#ifdef _MULTI_DATAMODEL
3349			switch (ddi_model_convert_from(mode & FMODELS)) {
3350			case DDI_MODEL_ILP32:
3351				if (!(mode & FKIOCTL)) {
3352					vdr32->vdr_flags = vdr->vdr_flags;
3353					vdr32->vdr_offset = vdr->vdr_offset;
3354					vdr32->vdr_side = vdr->vdr_side;
3355					vdr32->vdr_bytesread =
3356					    vdr->vdr_bytesread;
3357					bcopy(vdr->vdr_side_name,
3358					    vdr32->vdr_side_name,
3359					    sizeof (vdr32->vdr_side_name));
3360
3361					if (ddi_copyout(vdr32, data,
3362					    sizeof (*vdr32), mode)) {
3363						err = EFAULT;
3364					}
3365					break;
3366				}
3367				/* FALLTHROUGH */
3368
3369			case DDI_MODEL_NONE:
3370				if (ddi_copyout(vdr, data, sizeof (*vdr), mode))
3371					err = EFAULT;
3372				break;
3373			}
3374#else	/* ! _MULTI_DATAMODEL */
3375			if (ddi_copyout(vdr, data, sizeof (*vdr), mode))
3376				err = EFAULT;
3377#endif	/* _MULTI_DATAMODEL */
3378			if (vdr->vdr_flags &  DKV_DMR_ERROR)
3379				err = EIO;
3380		}
3381
3382#ifdef _MULTI_DATAMODEL
3383		kmem_free(vdr32, sizeof (*vdr32));
3384#endif	/* _MULTI_DATAMODEL */
3385
3386		kmem_free(vdr, sizeof (*vdr));
3387
3388		return (err);
3389	}
3390
3391	default:
3392		return (ENOTTY);
3393	}
3394}
3395
3396/*
3397 * rename named service entry points and support functions
3398 */
3399
3400/*
3401 * rename/exchange role swap functions
3402 *
3403 * most of these are handled by generic role swap functions
3404 */
3405
3406/*
3407 * MDRNM_UPDATE_KIDS
3408 * rename/exchange of our child or grandchild
3409 */
3410void
3411mirror_renexch_update_kids(md_rendelta_t *delta, md_rentxn_t *rtxnp)
3412{
3413	mm_submirror_t		*sm;
3414	int			smi;
3415
3416	ASSERT(rtxnp);
3417	ASSERT((MDRNOP_RENAME == rtxnp->op) || (rtxnp->op == MDRNOP_EXCHANGE));
3418	ASSERT(rtxnp->recids);
3419	ASSERT(delta);
3420	ASSERT(delta->unp);
3421	ASSERT(delta->old_role == MDRR_PARENT);
3422	ASSERT(delta->new_role == MDRR_PARENT);
3423
3424	/*
3425	 * since our role isn't changing (parent->parent)
3426	 * one of our children must be changing
3427	 * find the child being modified, and update
3428	 * our notion of it
3429	 */
3430	for (smi = 0; smi < NMIRROR; smi++) {
3431		mm_unit_t *un = (mm_unit_t *)delta->unp;
3432
3433		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
3434			continue;
3435		}
3436		sm = &un->un_sm[smi];
3437
3438		if (md_getminor(sm->sm_dev) == rtxnp->from.mnum) {
3439			sm->sm_dev = md_makedevice(md_major, rtxnp->to.mnum);
3440			sm->sm_key = rtxnp->to.key;
3441			break;
3442		}
3443	}
3444
3445	md_store_recid(&rtxnp->rec_idx, rtxnp->recids, delta->unp);
3446}
3447
3448/*
3449 * exchange down (self->child)
3450 */
3451void
3452mirror_exchange_self_update_from_down(
3453	md_rendelta_t	*delta,
3454	md_rentxn_t	*rtxnp
3455)
3456{
3457	int			smi;
3458	mm_submirror_t		*found;
3459	minor_t			from_min, to_min;
3460	sv_dev_t		sv;
3461
3462	ASSERT(rtxnp);
3463	ASSERT(MDRNOP_EXCHANGE == rtxnp->op);
3464	ASSERT(rtxnp->recids);
3465	ASSERT(rtxnp->rec_idx >= 0);
3466	ASSERT(delta);
3467	ASSERT(delta->unp);
3468	ASSERT(delta->uip);
3469	ASSERT(delta->old_role == MDRR_SELF);
3470	ASSERT(delta->new_role == MDRR_CHILD);
3471	ASSERT(md_getminor(delta->dev) == rtxnp->from.mnum);
3472
3473	from_min = rtxnp->from.mnum;
3474	to_min = rtxnp->to.mnum;
3475
3476	/*
3477	 * self id changes in our own unit struct
3478	 */
3479
3480	MD_SID(delta->unp) = to_min;
3481
3482	/*
3483	 * parent identifier need not change
3484	 */
3485
3486	/*
3487	 * point the set array pointers at the "new" unit and unit in-cores
3488	 * Note: the other half of this transfer is done in the "update_to"
3489	 * exchange named service.
3490	 */
3491
3492	MDI_VOIDUNIT(to_min) = delta->uip;
3493	MD_VOIDUNIT(to_min) = delta->unp;
3494
3495	/*
3496	 * transfer kstats
3497	 */
3498
3499	delta->uip->ui_kstat = rtxnp->to.kstatp;
3500
3501	/*
3502	 * the unit in-core reference to the get next link's id changes
3503	 */
3504
3505	delta->uip->ui_link.ln_id = to_min;
3506
3507	/*
3508	 * find the child whose identity we're assuming
3509	 */
3510
3511	for (found = NULL, smi = 0; !found && smi < NMIRROR; smi++) {
3512		mm_submirror_t		*sm;
3513		mm_unit_t		*un = (mm_unit_t *)delta->unp;
3514
3515		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
3516			continue;
3517		}
3518		sm = &un->un_sm[smi];
3519
3520		if (md_getminor(sm->sm_dev) == to_min) {
3521			found = sm;
3522		}
3523	}
3524	ASSERT(found);
3525
3526	/*
3527	 * Update the sub-mirror's identity
3528	 */
3529	found->sm_dev = md_makedevice(md_major, rtxnp->from.mnum);
3530	sv.key = found->sm_key;
3531
3532	ASSERT(rtxnp->from.key != MD_KEYWILD);
3533	ASSERT(rtxnp->from.key != MD_KEYBAD);
3534
3535	found->sm_key = rtxnp->from.key;
3536
3537	/*
3538	 * delete the key for the old sub-mirror from the name space
3539	 */
3540
3541	sv.setno = MD_MIN2SET(from_min);
3542	md_rem_names(&sv, 1);
3543
3544	/*
3545	 * and store the record id (from the unit struct) into recids
3546	 */
3547
3548	md_store_recid(&rtxnp->rec_idx, rtxnp->recids, delta->unp);
3549}
3550
3551/*
3552 * exchange down (parent->self)
3553 */
3554void
3555mirror_exchange_parent_update_to(
3556		md_rendelta_t	*delta,
3557		md_rentxn_t	*rtxnp
3558)
3559{
3560	int			smi;
3561	mm_submirror_t		*found;
3562	minor_t			from_min, to_min;
3563	sv_dev_t		sv;
3564
3565	ASSERT(rtxnp);
3566	ASSERT(MDRNOP_EXCHANGE == rtxnp->op);
3567	ASSERT(rtxnp->recids);
3568	ASSERT(rtxnp->rec_idx >= 0);
3569	ASSERT(delta);
3570	ASSERT(delta->unp);
3571	ASSERT(delta->uip);
3572	ASSERT(delta->old_role == MDRR_PARENT);
3573	ASSERT(delta->new_role == MDRR_SELF);
3574	ASSERT(md_getminor(delta->dev) == rtxnp->to.mnum);
3575
3576	from_min = rtxnp->from.mnum;
3577	to_min = rtxnp->to.mnum;
3578
3579	/*
3580	 * self id changes in our own unit struct
3581	 */
3582
3583	MD_SID(delta->unp) = from_min;
3584
3585	/*
3586	 * parent identifier need not change
3587	 */
3588
3589	/*
3590	 * point the set array pointers at the "new" unit and unit in-cores
3591	 * Note: the other half of this transfer is done in the "update_to"
3592	 * exchange named service.
3593	 */
3594
3595	MDI_VOIDUNIT(from_min) = delta->uip;
3596	MD_VOIDUNIT(from_min) = delta->unp;
3597
3598	/*
3599	 * transfer kstats
3600	 */
3601
3602	delta->uip->ui_kstat = rtxnp->from.kstatp;
3603
3604	/*
3605	 * the unit in-core reference to the get next link's id changes
3606	 */
3607
3608	delta->uip->ui_link.ln_id = from_min;
3609
3610	/*
3611	 * find the child whose identity we're assuming
3612	 */
3613
3614	for (found = NULL, smi = 0; !found && smi < NMIRROR; smi++) {
3615		mm_submirror_t		*sm;
3616		mm_unit_t		*un = (mm_unit_t *)delta->unp;
3617
3618		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
3619			continue;
3620		}
3621		sm = &un->un_sm[smi];
3622
3623		if (md_getminor(sm->sm_dev) == from_min) {
3624			found = sm;
3625		}
3626	}
3627	ASSERT(found);
3628
3629	/*
3630	 * Update the sub-mirror's identity
3631	 */
3632	found->sm_dev = md_makedevice(md_major, rtxnp->to.mnum);
3633	sv.key = found->sm_key;
3634
3635	ASSERT(rtxnp->to.key != MD_KEYWILD);
3636	ASSERT(rtxnp->to.key != MD_KEYBAD);
3637
3638	found->sm_key = rtxnp->to.key;
3639
3640	/*
3641	 * delete the key for the old sub-mirror from the name space
3642	 */
3643
3644	sv.setno = MD_MIN2SET(to_min);
3645	md_rem_names(&sv, 1);
3646
3647	/*
3648	 * and store the record id (from the unit struct) into recids
3649	 */
3650
3651	md_store_recid(&rtxnp->rec_idx, rtxnp->recids, delta->unp);
3652}
3653
3654/*
3655 * MDRNM_LIST_URKIDS: named svc entry point
3656 * all all delta entries appropriate for our children onto the
3657 * deltalist pointd to by dlpp
3658 */
3659int
3660mirror_rename_listkids(md_rendelta_t **dlpp, md_rentxn_t *rtxnp)
3661{
3662	minor_t			from_min, to_min;
3663	mm_unit_t		*from_un;
3664	md_rendelta_t		*new, *p;
3665	int			smi;
3666	int			n_children;
3667	mm_submirror_t		*sm;
3668
3669	ASSERT(rtxnp);
3670	ASSERT(dlpp);
3671	ASSERT((rtxnp->op == MDRNOP_EXCHANGE) || (rtxnp->op == MDRNOP_RENAME));
3672
3673	from_min = rtxnp->from.mnum;
3674	to_min = rtxnp->to.mnum;
3675	n_children = 0;
3676
3677	if (!MDI_UNIT(from_min) || !(from_un = MD_UNIT(from_min))) {
3678		(void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, from_min);
3679		return (-1);
3680	}
3681
3682	for (p = *dlpp; p && p->next != NULL; p = p->next) {
3683		/* NULL */
3684	}
3685
3686	for (smi = 0; smi < NMIRROR; smi++) {
3687		minor_t	child_min;
3688
3689		if (!SMS_BY_INDEX_IS(from_un, smi, SMS_INUSE)) {
3690			continue;
3691		}
3692
3693		sm = &from_un->un_sm[smi];
3694		child_min = md_getminor(sm->sm_dev);
3695
3696		p = new = md_build_rendelta(MDRR_CHILD,
3697		    to_min == child_min? MDRR_SELF: MDRR_CHILD,
3698		    sm->sm_dev, p,
3699		    MD_UNIT(child_min), MDI_UNIT(child_min),
3700		    &rtxnp->mde);
3701
3702		if (!new) {
3703			if (mdisok(&rtxnp->mde)) {
3704				(void) mdsyserror(&rtxnp->mde, ENOMEM);
3705			}
3706			return (-1);
3707		}
3708		++n_children;
3709	}
3710
3711	return (n_children);
3712}
3713
3714/*
3715 * support routine for MDRNM_CHECK
3716 */
3717static int
3718mirror_may_renexch_self(
3719	mm_unit_t	*un,
3720	mdi_unit_t	*ui,
3721	md_rentxn_t	*rtxnp)
3722{
3723	minor_t			 from_min;
3724	minor_t			 to_min;
3725	bool_t			 toplevel;
3726	bool_t			 related;
3727	int			 smi;
3728	mm_submirror_t		*sm;
3729
3730	from_min = rtxnp->from.mnum;
3731	to_min = rtxnp->to.mnum;
3732
3733	if (!un || !ui) {
3734		(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
3735		    from_min);
3736		return (EINVAL);
3737	}
3738
3739	ASSERT(MD_CAPAB(un) & MD_CAN_META_CHILD);
3740	if (!(MD_CAPAB(un) & MD_CAN_META_CHILD)) {
3741		(void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
3742		return (EINVAL);
3743	}
3744
3745	if (MD_PARENT(un) == MD_MULTI_PARENT) {
3746		(void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
3747		return (EINVAL);
3748	}
3749
3750	toplevel = !MD_HAS_PARENT(MD_PARENT(un));
3751
3752	/* we're related if trying to swap with our parent */
3753	related = (!toplevel) && (MD_PARENT(un) == to_min);
3754
3755	switch (rtxnp->op) {
3756	case MDRNOP_EXCHANGE:
3757		/*
3758		 * check for a swap with our child
3759		 */
3760		for (smi = 0; smi < NMIRROR; smi++) {
3761
3762			if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
3763				continue;
3764			}
3765
3766			sm = &un->un_sm[smi];
3767			if (md_getminor(sm->sm_dev) == to_min) {
3768				related |= TRUE;
3769			}
3770		}
3771		if (!related) {
3772			(void) mdmderror(&rtxnp->mde,
3773			    MDE_RENAME_TARGET_UNRELATED, to_min);
3774			return (EINVAL);
3775		}
3776
3777		break;
3778
3779	case MDRNOP_RENAME:
3780		/*
3781		 * if from is top-level and is open, then the kernel is using
3782		 * the md_dev64_t.
3783		 */
3784
3785		if (toplevel && md_unit_isopen(ui)) {
3786			(void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
3787			    from_min);
3788			return (EBUSY);
3789		}
3790		break;
3791
3792	default:
3793		(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
3794		    from_min);
3795		return (EINVAL);
3796	}
3797
3798	return (0);	/* ok */
3799}
3800
3801/*
3802 * Named service entry point: MDRNM_CHECK
3803 */
3804intptr_t
3805mirror_rename_check(
3806	md_rendelta_t	*delta,
3807	md_rentxn_t	*rtxnp)
3808{
3809	mm_submirror_t		*sm;
3810	mm_submirror_ic_t	*smic;
3811	md_m_shared_t		*shared;
3812	int			ci;
3813	int			i;
3814	int			compcnt;
3815	mm_unit_t		*un;
3816	int			err = 0;
3817
3818	ASSERT(delta);
3819	ASSERT(rtxnp);
3820	ASSERT(delta->unp);
3821	ASSERT(delta->uip);
3822	ASSERT((rtxnp->op == MDRNOP_RENAME) || (rtxnp->op == MDRNOP_EXCHANGE));
3823
3824	if (!delta || !rtxnp || !delta->unp || !delta->uip) {
3825		(void) mdsyserror(&rtxnp->mde, EINVAL);
3826		return (EINVAL);
3827	}
3828
3829	un = (mm_unit_t *)delta->unp;
3830
3831	for (i = 0; i < NMIRROR; i++) {
3832		sm = &un->un_sm[i];
3833		smic = &un->un_smic[i];
3834
3835		if (!SMS_IS(sm, SMS_INUSE))
3836			continue;
3837
3838		ASSERT(smic->sm_get_component_count);
3839		if (!smic->sm_get_component_count) {
3840			(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
3841			    md_getminor(delta->dev));
3842			return (ENXIO);
3843		}
3844
3845		compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
3846
3847		for (ci = 0; ci < compcnt; ci++) {
3848
3849			ASSERT(smic->sm_shared_by_indx);
3850			if (!smic->sm_shared_by_indx) {
3851				(void) mdmderror(&rtxnp->mde,
3852				    MDE_RENAME_CONFIG_ERROR,
3853				    md_getminor(delta->dev));
3854				return (ENXIO);
3855			}
3856
3857			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3858			    (sm->sm_dev, sm, ci);
3859
3860			ASSERT(shared);
3861			if (!shared) {
3862				(void) mdmderror(&rtxnp->mde,
3863				    MDE_RENAME_CONFIG_ERROR,
3864				    md_getminor(delta->dev));
3865				return (ENXIO);
3866			}
3867
3868			if (shared->ms_hs_id != 0) {
3869				(void) mdmderror(&rtxnp->mde,
3870				    MDE_SM_FAILED_COMPS,
3871				    md_getminor(delta->dev));
3872				return (EIO);
3873			}
3874
3875			switch (shared->ms_state) {
3876			case CS_OKAY:
3877				break;
3878
3879			case CS_RESYNC:
3880				(void) mdmderror(&rtxnp->mde,
3881				    MDE_RESYNC_ACTIVE,
3882				    md_getminor(delta->dev));
3883				return (EBUSY);
3884
3885			default:
3886				(void) mdmderror(&rtxnp->mde,
3887				    MDE_SM_FAILED_COMPS,
3888				    md_getminor(delta->dev));
3889				return (EINVAL);
3890			}
3891
3892		}
3893	}
3894
3895	/* self does additional checks */
3896	if (delta->old_role == MDRR_SELF) {
3897		err = mirror_may_renexch_self(un, delta->uip, rtxnp);
3898	}
3899
3900	return (err);
3901}
3902
3903/* end of rename/exchange */
3904