1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * NAME:	raid_ioctl.c
29 *
30 * DESCRIPTION: RAID driver source file containing IOCTL operations.
31 *
32 * ROUTINES PROVIDED FOR EXTERNAL USE:
33 *	  raid_commit() - commits MD database updates for a RAID metadevice
34 *	md_raid_ioctl() - RAID metadevice IOCTL operations entry point.
35 *
36 * ROUTINES PROVIDED FOR INTERNAL USE:
37 *	 raid_getun() - Performs unit checking on a RAID metadevice
38 *    init_col_nextio() - normal backend when zeroing column of RAID metadevice.
39 *	 init_col_int() - I/O interrupt while zeroing column of RAID metadevice.
40 *  raid_init_columns() - Zero one or more columns of a RAID metadevice.
41 *	     raid_set() - used to create a RAID metadevice
42 *	     raid_get() - used to get the unit structure of a RAID metadevice
43 *	 raid_replace() - used to replace a component of a RAID metadevice
44 *	    raid_grow() - Concatenate to a RAID metadevice
45 *	  raid_change() - change dynamic values of a RAID metadevice
46 *	   raid_reset() - used to reset (clear / remove) a RAID metadevice
47 *	raid_get_geom() - used to get the geometry of a RAID metadevice
48 *	raid_get_vtoc() - used to get the VTOC on a RAID metadevice
49 *	raid_set_vtoc() - used to set the VTOC on a RAID metadevice
50 *	raid_get_extvtoc() - used to get the extended VTOC on a RAID metadevice
51 *	raid_set_extvtoc() - used to set the extended VTOC on a RAID metadevice
52 *	 raid_getdevs() - return all devices within a RAID metadevice
53 *   raid_admin_ioctl() - IOCTL operations unique to metadevices and RAID
54 */
55
56
57#include <sys/param.h>
58#include <sys/systm.h>
59#include <sys/conf.h>
60#include <sys/file.h>
61#include <sys/user.h>
62#include <sys/uio.h>
63#include <sys/t_lock.h>
64#include <sys/buf.h>
65#include <sys/dkio.h>
66#include <sys/vtoc.h>
67#include <sys/kmem.h>
68#include <vm/page.h>
69#include <sys/sysmacros.h>
70#include <sys/types.h>
71#include <sys/mkdev.h>
72#include <sys/stat.h>
73#include <sys/open.h>
74#include <sys/disp.h>
75#include <sys/modctl.h>
76#include <sys/ddi.h>
77#include <sys/sunddi.h>
78#include <sys/cred.h>
79#include <sys/lvm/mdvar.h>
80#include <sys/lvm/md_names.h>
81#include <sys/lvm/md_mddb.h>
82#include <sys/lvm/md_raid.h>
83#include <sys/lvm/md_convert.h>
84
85#include <sys/sysevent/eventdefs.h>
86#include <sys/sysevent/svm.h>
87
88extern int		md_status;
89extern unit_t		md_nunits;
90extern set_t		md_nsets;
91extern md_set_t		md_set[];
92extern md_ops_t		raid_md_ops;
93extern major_t		md_major;
94extern md_krwlock_t	md_unit_array_rw;
95extern mdq_anchor_t	md_done_daemon;
96extern mdq_anchor_t	md_ff_daemonq;
97extern	int		mdopen();
98extern	int		mdclose();
99extern	void		md_probe_one();
100extern int		md_init_probereq(md_probedev_impl_t *,
101				daemon_queue_t **);
102extern md_resync_t	md_cpr_resync;
103
104
105extern void dump_mr_unit(mr_unit_t *);
106
107typedef struct raid_ci {
108	DAEMON_QUEUE
109	struct raid_ci	*ci_next;
110	mr_unit_t	*ci_un;
111	int		ci_col;
112	int		ci_err;
113	int		ci_flag;
114	size_t		ci_zerosize;
115	diskaddr_t	ci_blkno;
116	diskaddr_t	ci_lastblk;
117	buf_t		ci_buf;
118} raid_ci_t;
119/* values for the ci_flag */
120#define	COL_INITING	(0x0001)
121#define	COL_INIT_DONE	(0x0002)
122#define	COL_READY	(0x0004)
123
124/*
125 * NAME:	raid_getun
126 * DESCRIPTION: performs a lot of unit checking on a RAID metadevice
127 * PARAMETERS:	minor_t	      mnum - minor device number for RAID unit
128 *		md_error_t    *mde - pointer to error reporting structure
129 *		int	     flags - pointer to error reporting structure
130 *					STALE_OK - allow stale MD memory
131 *					  NO_OLD - unit must not exist
132 *					 NO_LOCK - no IOCTL lock needed
133 *					 WR_LOCK - write IOCTL lock needed
134 *					 RD_LOCK - read IOCTL lock needed
135 *		IOLOCK	     *lock - pointer to IOCTL lock
136 *
137 * LOCKS:	obtains unit reader or writer lock via IOLOCK
138 *
139 */
140static mr_unit_t *
141raid_getun(minor_t mnum, md_error_t *mde, int flags, IOLOCK *lock)
142{
143	mr_unit_t	*un;
144	mdi_unit_t	*ui;
145	set_t		setno = MD_MIN2SET(mnum);
146
147	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
148		(void) mdmderror(mde, MDE_INVAL_UNIT, mnum);
149		return (NULL);
150	}
151
152	if (!(flags & STALE_OK)) {
153		if (md_get_setstatus(setno) & MD_SET_STALE) {
154			(void) mdmddberror(mde, MDE_DB_STALE, mnum, setno);
155			return (NULL);
156		}
157	}
158
159	ui = MDI_UNIT(mnum);
160	if (flags & NO_OLD) {
161		if (ui != NULL) {
162			(void) mdmderror(mde, MDE_UNIT_ALREADY_SETUP, mnum);
163			return (NULL);
164		}
165		return ((mr_unit_t *)1);
166	}
167
168	if (ui == NULL) {
169		(void) mdmderror(mde, MDE_UNIT_NOT_SETUP, mnum);
170		return (NULL);
171	}
172	if (flags & ARRAY_WRITER)
173		md_array_writer(lock);
174	else if (flags & ARRAY_READER)
175		md_array_reader(lock);
176
177	if (!(flags & NO_LOCK)) {
178		if (flags & WR_LOCK) {
179			(void) md_ioctl_io_lock(lock, ui);
180			(void) md_ioctl_writerlock(lock, ui);
181		} else /* RD_LOCK */
182			(void) md_ioctl_readerlock(lock, ui);
183	}
184	un = (mr_unit_t *)MD_UNIT(mnum);
185
186	if (un->c.un_type != MD_METARAID) {
187		(void) mdmderror(mde, MDE_NOT_RAID, mnum);
188		return (NULL);
189	}
190
191	return (un);
192}
193
194
195/*
196 * NAME:	raid_commit
197 * DESCRIPTION: commits MD database updates for a RAID metadevice
198 * PARAMETERS:	mr_unit_t	 *un - RAID unit to update in the MD database
199 *		mddb_recid_t *extras - array of other record IDs to update
200 *
201 * LOCKS:	assumes caller holds unit writer lock
202 *
203 */
204void
205raid_commit(mr_unit_t *un, mddb_recid_t	*extras)
206{
207	mddb_recid_t	*recids;
208	int 		ri = 0;
209	int		nrecids = 0;
210
211	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
212		return;
213
214	/* Count the extra recids */
215	if (extras != NULL) {
216		while (extras[nrecids] != 0) {
217			nrecids++;
218		}
219	}
220
221	/*
222	 * Allocate space for two recids in addition to the extras:
223	 * one for the unit structure, one for the null terminator.
224	 */
225	nrecids += 2;
226	recids = (mddb_recid_t *)
227	    kmem_zalloc(nrecids * sizeof (mddb_recid_t), KM_SLEEP);
228
229	if (un != NULL) {
230		ASSERT(MDI_UNIT(MD_SID(un)) ? UNIT_WRITER_HELD(un) : 1);
231		recids[ri++] = un->c.un_record_id;
232	}
233
234	if (extras != NULL) {
235		while (*extras != 0) {
236			recids[ri++] = *extras;
237			extras++;
238		}
239	}
240
241	if (ri > 0) {
242		mddb_commitrecs_wrapper(recids);
243	}
244
245	kmem_free(recids, nrecids * sizeof (mddb_recid_t));
246}
247
248static int
249raid_check_pw(mr_unit_t *un)
250{
251	buf_t		bp;
252	char		*buf;
253	mr_column_t	*colptr;
254	minor_t		mnum = MD_SID(un);
255	int		i;
256	int		err = 0;
257	minor_t		unit;
258
259	buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP);
260
261	for (i = 0; i < un->un_totalcolumncnt; i++) {
262		md_dev64_t tmpdev;
263
264		colptr = &un->un_column[i];
265
266		tmpdev = colptr->un_dev;
267		/*
268		 * Open by device id
269		 * If this device is hotspared
270		 * use the hotspare key
271		 */
272		tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
273		    colptr->un_hs_key : colptr->un_orig_key);
274		if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
275			colptr->un_dev = tmpdev;
276			return (1);
277		}
278		colptr->un_dev = tmpdev;
279
280		bzero((caddr_t)&bp, sizeof (buf_t));
281		bp.b_back = &bp;
282		bp.b_forw = &bp;
283		bp.b_flags = B_READ | B_BUSY;
284		sema_init(&bp.b_io, 0, NULL,
285		    SEMA_DEFAULT, NULL);
286		sema_init(&bp.b_sem, 0, NULL,
287		    SEMA_DEFAULT, NULL);
288		bp.b_edev = md_dev64_to_dev(colptr->un_dev);
289		bp.b_lblkno = colptr->un_pwstart;
290		bp.b_bcount = DEV_BSIZE;
291		bp.b_bufsize = DEV_BSIZE;
292		bp.b_un.b_addr = (caddr_t)buf;
293		bp.b_offset = -1;
294		(void) md_call_strategy(&bp, 0, NULL);
295		if (biowait(&bp))
296			err = 1;
297		if (i == 0) {
298			if (un->c.un_revision & MD_64BIT_META_DEV) {
299				unit = ((raid_pwhdr_t *)buf)->rpw_unit;
300			} else {
301				unit = ((raid_pwhdr32_od_t *)buf)->rpw_unit;
302			}
303		}
304		/*
305		 * depending upon being an 64bit or 32 bit raid, the
306		 * pre write headers have different layout
307		 */
308		if (un->c.un_revision & MD_64BIT_META_DEV) {
309			if ((((raid_pwhdr_t *)buf)->rpw_column != i) ||
310			    (((raid_pwhdr_t *)buf)->rpw_unit != unit))
311				err = 1;
312		} else {
313			if ((((raid_pwhdr32_od_t *)buf)->rpw_column != i) ||
314			    (((raid_pwhdr32_od_t *)buf)->rpw_unit != unit))
315				err = 1;
316		}
317		md_layered_close(colptr->un_dev, MD_OFLG_NULL);
318		if (err)
319			break;
320	}
321	kmem_free(buf, DEV_BSIZE);
322	return (err);
323}
324
325/*
326 * NAME:	init_col_nextio
327 * DESCRIPTION: normal backend process when zeroing column of a RAID metadevice.
328 * PARAMETERS:	raid_ci_t *cur - struct for column being zeroed
329 *
330 * LOCKS:	assumes caller holds unit reader lock,
331 *		preiodically releases and reacquires unit reader lock,
332 *		broadcasts on unit conditional variable (un_cv)
333 *
334 */
335#define	INIT_RLS_CNT	10
336static void
337init_col_nextio(raid_ci_t *cur)
338{
339	mr_unit_t	*un;
340
341	un = cur->ci_un;
342
343	cur->ci_blkno += cur->ci_zerosize;
344
345	mutex_enter(&un->un_mx);
346	/* ===> update un_percent_done */
347	un->un_init_iocnt += btodb(cur->ci_buf.b_bcount);
348	mutex_exit(&un->un_mx);
349
350	/*
351	 * When gorwing a device, normal I/O is still going on.
352	 * The init thread still holds the unit reader lock which
353	 * prevents I/O from doing state changes.
354	 * So every INIT_RLS_CNT init I/Os, we will release the
355	 * unit reader lock.
356	 *
357	 * CAVEAT:
358	 * We know we are in the middle of a grow operation and the
359	 * unit cannot be grown or removed (through reset or halt)
360	 * so the mr_unit_t structure will not move or disappear.
361	 * In addition, we know that only one of the init I/Os
362	 * can be in col_init_nextio at a time because they are
363	 * placed on the md_done_daemon queue and md only processes
364	 * one element of this queue at a time. In addition, any
365	 * code that needs to acquire the unit writer lock to change
366	 * state is supposed to be on the md_mstr_daemon queue so
367	 * it can be processing while we sit here waiting to get the
368	 * unit reader lock back.
369	 */
370
371	if (cur->ci_blkno < cur->ci_lastblk) {
372		/* truncate last chunk to end_addr if needed */
373		if (cur->ci_blkno + cur->ci_zerosize > cur->ci_lastblk) {
374			cur->ci_zerosize = (size_t)
375			    (cur->ci_lastblk - cur->ci_blkno);
376		}
377
378		/* set address and length for I/O bufs */
379		cur->ci_buf.b_bufsize = dbtob(cur->ci_zerosize);
380		cur->ci_buf.b_bcount = dbtob(cur->ci_zerosize);
381		cur->ci_buf.b_lblkno = cur->ci_blkno;
382
383		(void) md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
384		return;
385	}
386	/* finished initializing this column */
387	mutex_enter(&un->un_mx);
388	cur->ci_flag = COL_INIT_DONE;
389	uniqtime32(&un->un_column[cur->ci_col].un_devtimestamp);
390	mutex_exit(&un->un_mx);
391	cv_broadcast(&un->un_cv);
392}
393
394/*
395 * NAME:	init_col_int
396 * DESCRIPTION: I/O interrupt while zeroing column of a RAID metadevice.
397 * PARAMETERS:	buf_t	  *cb - I/O buffer for which interrupt occurred
398 *
399 * LOCKS:	assumes caller holds unit reader or writer lock
400 *
401 */
402static int
403init_col_int(buf_t *cb)
404{
405	raid_ci_t	*cur;
406
407	cur = (raid_ci_t *)cb->b_chain;
408	if (cb->b_flags & B_ERROR) {
409		mutex_enter(&cur->ci_un->un_mx);
410		cur->ci_err = EIO;
411		mutex_exit(&cur->ci_un->un_mx);
412		cv_broadcast(&cur->ci_un->un_cv);
413		return (1);
414	}
415	daemon_request(&md_done_daemon, init_col_nextio,
416	    (daemon_queue_t *)cur, REQ_OLD);
417	return (1);
418}
419
420/*
421 * NAME:	raid_init_columns
422 * DESCRIPTION: Zero one or more columns of a RAID metadevice.
423 * PARAMETERS:	minor_t	 mnum - RAID unit minor identifier
424 *
425 * LOCKS:	obtains and releases unit reader lock,
426 *		obtains and releases unit writer lock,
427 *		obtains and releases md_unit_array_rw write lock,
428 *		obtains and releases unit mutex (un_mx) lock,
429 *		waits on unit conditional variable (un_cv)
430 *
431 */
432static void
433raid_init_columns(minor_t mnum)
434{
435	mr_unit_t	*un;
436	mdi_unit_t	*ui;
437	raid_ci_t	*ci_chain = NULL, *cur;
438	rus_state_t	state;
439	caddr_t		zero_addr;
440	diskaddr_t	end_off;
441	size_t		zerosize;
442	int		err = 0;
443	int		ix;
444	int		colcnt = 0;
445	int		col;
446	set_t		setno = MD_MIN2SET(mnum);
447
448	/*
449	 * Increment the raid resync count for cpr
450	 */
451	mutex_enter(&md_cpr_resync.md_resync_mutex);
452	md_cpr_resync.md_raid_resync++;
453	mutex_exit(&md_cpr_resync.md_resync_mutex);
454
455	/*
456	 * initialization is a multiple step process.  The first step
457	 * is to go through the unit structure and start each device
458	 * in the init state writing zeros over the component.
459	 * Next initialize the prewrite areas, so the device can be
460	 * used if a metainit -k is done.  Now close the componenets.
461	 *
462	 * Once this complete set the state of each component being
463	 * zeroed and set the correct state for the unit.
464	 *
465	 * last commit the records.
466	 */
467
468	ui = MDI_UNIT(mnum);
469	un = md_unit_readerlock(ui);
470
471	/* check for active init on this column */
472	/* exiting is cpr safe */
473	if ((un->un_init_colcnt > 0) && (un->un_resync_index != -1)) {
474		md_unit_readerexit(ui);
475		(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
476		/*
477		 * Decrement the raid resync count for cpr
478		 */
479		mutex_enter(&md_cpr_resync.md_resync_mutex);
480		md_cpr_resync.md_raid_resync--;
481		mutex_exit(&md_cpr_resync.md_resync_mutex);
482		thread_exit();
483	}
484
485	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_START, SVM_TAG_METADEVICE, setno,
486	    MD_SID(un));
487	un->un_init_colcnt = 0;
488	un->un_init_iocnt = 0;
489	end_off = un->un_pwsize + (un->un_segsize * un->un_segsincolumn);
490	zerosize = (size_t)MIN((diskaddr_t)un->un_maxio, end_off);
491
492	/* allocate zero-filled buffer */
493	zero_addr = kmem_zalloc(dbtob(zerosize), KM_SLEEP);
494
495	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
496		if (un->un_column[ix].un_devstate != RCS_INIT)
497			continue;
498		/* allocate new column init structure */
499		cur = (raid_ci_t *)kmem_zalloc((sizeof (raid_ci_t)), KM_SLEEP);
500		ASSERT(cur != NULL);
501		un->un_init_colcnt++;
502		cur->ci_next = ci_chain;
503		ci_chain = cur;
504		cur->ci_un = un;
505		cur->ci_col = ix;
506		cur->ci_err = 0;
507		cur->ci_flag = COL_INITING;
508		cur->ci_zerosize = zerosize;
509		cur->ci_blkno = un->un_column[ix].un_pwstart;
510		cur->ci_lastblk = cur->ci_blkno + un->un_pwsize
511		    + (un->un_segsize * un->un_segsincolumn);
512		/* initialize static buf fields */
513		cur->ci_buf.b_un.b_addr = zero_addr;
514		cur->ci_buf.b_chain = (buf_t *)cur;
515		cur->ci_buf.b_back = &cur->ci_buf;
516		cur->ci_buf.b_forw = &cur->ci_buf;
517		cur->ci_buf.b_iodone = init_col_int;
518		cur->ci_buf.b_flags = B_BUSY | B_WRITE;
519		cur->ci_buf.b_edev = md_dev64_to_dev(un->un_column[ix].un_dev);
520		sema_init(&cur->ci_buf.b_io, 0, NULL, SEMA_DEFAULT, NULL);
521		sema_init(&cur->ci_buf.b_sem, 0, NULL, SEMA_DEFAULT, NULL);
522		/* set address and length for I/O bufs */
523		cur->ci_buf.b_bufsize = dbtob(zerosize);
524		cur->ci_buf.b_bcount = dbtob(zerosize);
525		cur->ci_buf.b_lblkno = un->un_column[ix].un_pwstart;
526		cur->ci_buf.b_offset = -1;
527
528		if (! (un->un_column[ix].un_devflags & MD_RAID_DEV_ISOPEN)) {
529			md_dev64_t tmpdev = un->un_column[ix].un_dev;
530			/*
531			 * Open by device id
532			 * If this column is hotspared then
533			 * use the hotspare key
534			 */
535			tmpdev = md_resolve_bydevid(mnum, tmpdev,
536			    HOTSPARED(un, ix) ?
537			    un->un_column[ix].un_hs_key :
538			    un->un_column[ix].un_orig_key);
539			if ((cur->ci_err = md_layered_open(mnum, &tmpdev,
540			    MD_OFLG_NULL)) == 0)
541				un->un_column[ix].un_devflags |=
542				    MD_RAID_DEV_ISOPEN;
543			un->un_column[ix].un_dev = tmpdev;
544		}
545		if (cur->ci_err == 0)
546			md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
547	}
548
549	md_unit_readerexit(ui);
550	state = un->un_state;
551	colcnt = un->un_init_colcnt;
552	mutex_enter(&un->un_mx);
553	while (colcnt) {
554		cv_wait(&un->un_cv, &un->un_mx);
555
556		colcnt = 0;
557		for (cur = ci_chain; cur != NULL; cur = cur->ci_next) {
558			col = cur->ci_col;
559			if ((cur->ci_flag != COL_INITING) || (cur->ci_err)) {
560				if (cur->ci_err)
561					err = cur->ci_err;
562				else if (cur->ci_flag == COL_INIT_DONE) {
563					(void) init_pw_area(un,
564					    un->un_column[col].un_dev,
565					    un->un_column[col].un_pwstart,
566					    col);
567					cur->ci_flag = COL_READY;
568				}
569			} else {
570				colcnt++;
571			}
572		}
573	}
574	mutex_exit(&un->un_mx);
575
576	/* This prevents new opens */
577	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
578	(void) md_io_writerlock(ui);
579	un = (mr_unit_t *)md_unit_writerlock(ui);
580	while (ci_chain) {
581		cur = ci_chain;
582
583		/* take this element out of the chain */
584		ci_chain = cur->ci_next;
585		/* free this element */
586		sema_destroy(&cur->ci_buf.b_io);
587		sema_destroy(&cur->ci_buf.b_sem);
588		if (cur->ci_err)
589			raid_set_state(cur->ci_un, cur->ci_col,
590			    RCS_INIT_ERRED, 0);
591		else
592			raid_set_state(cur->ci_un, cur->ci_col,
593			    RCS_OKAY, 0);
594		kmem_free(cur, sizeof (raid_ci_t));
595	}
596
597	/* free the zeroed buffer */
598	kmem_free(zero_addr, dbtob(zerosize));
599
600	/* determine new unit state */
601	if (err == 0) {
602		if (state == RUS_INIT)
603			un->un_state = RUS_OKAY;
604		else {
605			un->c.un_total_blocks = un->un_grow_tb;
606			md_nblocks_set(mnum, un->c.un_total_blocks);
607			un->un_grow_tb = 0;
608			if (raid_state_cnt(un, RCS_OKAY) ==
609			    un->un_totalcolumncnt)
610				un->un_state = RUS_OKAY;
611		}
612	} else {  /* error orcurred */
613		if (state & RUS_INIT)
614			un->un_state = RUS_DOI;
615	}
616	uniqtime32(&un->un_timestamp);
617	MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
618	un->un_init_colcnt = 0;
619	un->un_init_iocnt = 0;
620	raid_commit(un, NULL);
621	md_unit_writerexit(ui);
622	(void) md_io_writerexit(ui);
623	rw_exit(&md_unit_array_rw.lock);
624	if (err) {
625		if (un->un_state & RUS_DOI) {
626			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
627			    SVM_TAG_METADEVICE, setno, MD_SID(un));
628		} else {
629			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
630			    SVM_TAG_METADEVICE, setno, MD_SID(un));
631		}
632	} else {
633		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_SUCCESS,
634		    SVM_TAG_METADEVICE, setno, MD_SID(un));
635	}
636	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
637	/*
638	 * Decrement the raid resync count for cpr
639	 */
640	mutex_enter(&md_cpr_resync.md_resync_mutex);
641	md_cpr_resync.md_raid_resync--;
642	mutex_exit(&md_cpr_resync.md_resync_mutex);
643	thread_exit();
644	/*NOTREACHED*/
645}
646
647static int
648raid_init_unit(minor_t mnum, md_error_t *ep)
649{
650	mdi_unit_t	*ui;
651	mr_unit_t	*un;
652	int		rval, i;
653	set_t		setno = MD_MIN2SET(mnum);
654
655	ui = MDI_UNIT(mnum);
656	if (md_get_setstatus(setno) & MD_SET_STALE)
657		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
658
659	/* Don't start an init if the device is not available */
660	if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
661		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
662	}
663
664	if (raid_internal_open(mnum, (FREAD | FWRITE),
665	    OTYP_LYR, MD_OFLG_ISINIT)) {
666		rval = mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum);
667		goto out;
668	}
669
670	un = md_unit_readerlock(ui);
671	un->un_percent_done = 0;
672	md_unit_readerexit(ui);
673	/* start resync_unit thread */
674	(void) thread_create(NULL, 0, raid_init_columns,
675	    (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);
676
677	return (0);
678
679out:
680	un = md_unit_writerlock(ui);
681	MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
682	/* recover state */
683	for (i = 0; i < un->un_totalcolumncnt; i++)
684		if (COLUMN_STATE(un, i) == RCS_INIT)
685			raid_set_state(un, i, RCS_ERRED, 0);
686	if (un->un_state & RUS_INIT)
687		un->un_state = RUS_DOI;
688	raid_commit(un, NULL);
689	md_unit_writerexit(ui);
690	if (un->un_state & RUS_DOI) {
691		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
692		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
693	} else {
694		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
695		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
696	}
697	return (rval);
698}
699
700/*
701 * NAME:	raid_regen
702 *
703 * DESCRIPTION:	regenerate all the parity on the raid device.  This
704 *		routine starts a thread that will regenerate the
705 *		parity on a raid device.  If an I/O error occurs during
706 *		this process the entire device is placed in error.
707 *
708 * PARAMETERS:	md_set_params_t *msp - ioctl packet
709 */
710static void
711regen_unit(minor_t mnum)
712{
713	mdi_unit_t	*ui = MDI_UNIT(mnum);
714	mr_unit_t	*un = MD_UNIT(mnum);
715	buf_t		buf, *bp;
716	caddr_t		buffer;
717	int		err = 0;
718	diskaddr_t	total_segments;
719	diskaddr_t	line;
720	size_t		iosize;
721
722	/*
723	 * Increment raid resync count for cpr
724	 */
725	mutex_enter(&md_cpr_resync.md_resync_mutex);
726	md_cpr_resync.md_raid_resync++;
727	mutex_exit(&md_cpr_resync.md_resync_mutex);
728
729	iosize = dbtob(un->un_segsize);
730	buffer = kmem_alloc(iosize, KM_SLEEP);
731	bp = &buf;
732	total_segments = un->un_segsincolumn;
733	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_START, SVM_TAG_METADEVICE,
734	    MD_UN2SET(un), MD_SID(un));
735	un->un_percent_done = 0;
736	init_buf(bp, B_READ | B_BUSY, iosize);
737
738	for (line = 0; line < total_segments; line++) {
739		bp->b_lblkno = line *
740		    ((un->un_origcolumncnt - 1) * un->un_segsize);
741		bp->b_un.b_addr = buffer;
742		bp->b_bcount = iosize;
743		bp->b_iodone = NULL;
744		/*
745		 * The following assignment is only correct because
746		 * md_raid_strategy is fine when it's only a minor number
747		 * and not a real dev_t. Yuck.
748		 */
749		bp->b_edev = mnum;
750		md_raid_strategy(bp, MD_STR_NOTTOP, NULL);
751		if (biowait(bp)) {
752			err = 1;
753			break;
754		}
755		un->un_percent_done = (uint_t)((line * 1000) /
756		    un->un_segsincolumn);
757		/* just to avoid rounding errors */
758		if (un->un_percent_done > 1000)
759			un->un_percent_done = 1000;
760		reset_buf(bp, B_READ | B_BUSY, iosize);
761	}
762	destroy_buf(bp);
763	kmem_free(buffer, iosize);
764
765	(void) md_io_writerlock(ui);
766	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
767	(void) md_io_writerexit(ui);
768	un = md_unit_writerlock(ui);
769	if (!err &&
770	    (raid_state_cnt(un, RCS_OKAY) == un->un_totalcolumncnt))
771			un->un_state = RUS_OKAY;
772	raid_commit(un, NULL);
773	md_unit_writerexit(ui);
774	if (err ||
775	    raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) {
776		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_FAILED,
777		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
778	} else {
779		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_DONE, SVM_TAG_METADEVICE,
780		    MD_UN2SET(un), MD_SID(un));
781	}
782
783	/*
784	 * Decrement the raid resync count for cpr
785	 */
786	mutex_enter(&md_cpr_resync.md_resync_mutex);
787	md_cpr_resync.md_raid_resync--;
788	mutex_exit(&md_cpr_resync.md_resync_mutex);
789	thread_exit();
790}
791
792static int
793raid_regen_unit(minor_t mnum, md_error_t *ep)
794{
795	mdi_unit_t	*ui;
796	mr_unit_t	*un;
797	int		i;
798	set_t		setno = MD_MIN2SET(mnum);
799
800	ui = MDI_UNIT(mnum);
801	un = (mr_unit_t *)MD_UNIT(mnum);
802
803	if (md_get_setstatus(setno) & MD_SET_STALE)
804		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
805
806	/* Don't start a regen if the device is not available */
807	if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
808		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
809	}
810
811	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
812		(void) md_unit_writerlock(ui);
813		for (i = 0; i < un->un_totalcolumncnt; i++)
814			raid_set_state(un, i, RCS_ERRED, 0);
815		md_unit_writerexit(ui);
816		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
817	}
818
819	/* start resync_unit thread */
820	(void) thread_create(NULL, 0, regen_unit,
821	    (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);
822
823	return (0);
824}
825
826static int
827raid_regen(md_regen_param_t *mrp, IOLOCK *lock)
828{
829	minor_t		mnum = mrp->mnum;
830	mr_unit_t	*un;
831
832	mdclrerror(&mrp->mde);
833
834	un = md_unit_readerlock(MDI_UNIT(mnum));
835
836	if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
837		md_unit_readerexit(MDI_UNIT(mnum));
838		return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
839	}
840
841	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
842	    (raid_state_cnt(un, RCS_RESYNC))) {
843		md_unit_readerexit(MDI_UNIT(mnum));
844		return (mdmderror(&mrp->mde, MDE_RESYNC_ACTIVE, mnum));
845	}
846
847	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
848		md_unit_readerexit(MDI_UNIT(mnum));
849		return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
850	}
851
852	if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
853	    (! (un->un_state & RUS_OKAY))) {
854		md_unit_readerexit(MDI_UNIT(mnum));
855		return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
856	}
857
858	md_unit_readerexit(MDI_UNIT(mnum));
859
860	/* get locks and recheck to be sure something did not change */
861	if ((un = raid_getun(mnum, &mrp->mde, WRITERS, lock)) == NULL)
862		return (0);
863
864	if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
865	    (! (un->un_state & RUS_OKAY))) {
866		return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
867	}
868
869	raid_set_state(un, 0, RCS_REGEN, 0);
870	raid_commit(un, NULL);
871	md_ioctl_droplocks(lock);
872	return (raid_regen_unit(mnum, &mrp->mde));
873}
874
875/*
876 * NAME:	raid_set
877 * DESCRIPTION: used to create a RAID metadevice
878 * PARAMETERS:	md_set_params_t *d   - pointer to set data structure
879 *		int		mode - must be FWRITE
880 *
881 * LOCKS:	none
882 *
883 */
884static int
885raid_set(void	*d, int mode)
886{
887	minor_t		mnum;
888	mr_unit_t	*un;
889	mddb_recid_t	mr_recid;
890	mddb_recid_t	*recids;
891	mddb_type_t	typ1;
892	int		err;
893	set_t		setno;
894	int		num_recs;
895	int		rid;
896	int		col;
897	md_set_params_t	*msp = d;
898
899
900	mnum = msp->mnum;
901	setno = MD_MIN2SET(mnum);
902
903	mdclrerror(&msp->mde);
904
905	if (raid_getun(mnum, &msp->mde, NO_OLD, NULL) == NULL)
906		return (0);
907
908	typ1 = (mddb_type_t)md_getshared_key(setno,
909	    raid_md_ops.md_driver.md_drivername);
910
911	/* create the db record for this mdstruct */
912
913	if (msp->options & MD_CRO_64BIT) {
914#if defined(_ILP32)
915		return (mdmderror(&msp->mde, MDE_UNIT_TOO_LARGE, mnum));
916#else
917		mr_recid = mddb_createrec(msp->size, typ1, 0,
918		    MD_CRO_64BIT | MD_CRO_RAID | MD_CRO_FN, setno);
919#endif
920	} else {
921		mr_recid = mddb_createrec(msp->size, typ1, 0,
922		    MD_CRO_32BIT | MD_CRO_RAID | MD_CRO_FN, setno);
923	}
924
925	if (mr_recid < 0)
926		return (mddbstatus2error(&msp->mde,
927		    (int)mr_recid, mnum, setno));
928
929	/* get the address of the mdstruct */
930	un = (mr_unit_t *)mddb_getrecaddr(mr_recid);
931	/*
932	 * It is okay that we muck with the mdstruct here,
933	 * since no one else will know about the mdstruct
934	 * until we commit it. If we crash, the record will
935	 * be automatically purged, since we haven't
936	 * committed it yet.
937	 */
938
939	/* copy in the user's mdstruct */
940	if (err = ddi_copyin((caddr_t)(uintptr_t)msp->mdp, un,
941	    msp->size, mode)) {
942		mddb_deleterec_wrapper(mr_recid);
943		return (EFAULT);
944	}
945	/* All 64 bit metadevices only support EFI labels. */
946	if (msp->options & MD_CRO_64BIT) {
947		un->c.un_flag |= MD_EFILABEL;
948	}
949
950	/*
951	 * allocate the real recids array.  since we may have to commit
952	 * underlying metadevice records, we need an array of size:
953	 * total number of components in raid + 3 (1 for the raid itself,
954	 * one for the hotspare, one for the end marker).
955	 */
956	num_recs = un->un_totalcolumncnt + 3;
957	rid = 0;
958	recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
959	recids[rid++] = mr_recid;
960
961	MD_SID(un) = mnum;
962	MD_RECID(un) = recids[0];
963	MD_CAPAB(un) = MD_CAN_PARENT | MD_CAN_SP;
964	MD_PARENT(un) = MD_NO_PARENT;
965	un->un_resync_copysize = 0;
966	un->c.un_revision |= MD_FN_META_DEV;
967
968	if (UNIT_STATE(un) == RUS_INIT)
969		MD_STATUS(un) |= MD_UN_GROW_PENDING;
970
971	if ((UNIT_STATE(un) != RUS_INIT) && raid_check_pw(un)) {
972		mddb_deleterec_wrapper(mr_recid);
973		err = mderror(&msp->mde, MDE_RAID_INVALID);
974		goto out;
975	}
976
977	if (err = raid_build_incore(un, 0)) {
978		if (un->mr_ic) {
979			kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
980			    un->un_totalcolumncnt);
981			kmem_free(un->mr_ic, sizeof (*un->mr_ic));
982		}
983
984		md_nblocks_set(mnum, -1ULL);
985		MD_UNIT(mnum) = NULL;
986
987		mddb_deleterec_wrapper(mr_recid);
988		goto out;
989	}
990
991	/*
992	 * Update unit availability
993	 */
994	md_set[setno].s_un_avail--;
995
996	recids[rid] = 0;
997	if (un->un_hsp_id != -1) {
998		/* increment the reference count of the hot spare pool */
999		err = md_hot_spare_ifc(HSP_INCREF, un->un_hsp_id, 0, 0,
1000		    &recids[rid], NULL, NULL, NULL);
1001		if (err) {
1002			md_nblocks_set(mnum, -1ULL);
1003			MD_UNIT(mnum) = NULL;
1004
1005			mddb_deleterec_wrapper(mr_recid);
1006			goto out;
1007		}
1008		rid++;
1009	}
1010
1011	/*
1012	 * set the parent on any metadevice components.
1013	 * NOTE: currently soft partitions are the only metadevices
1014	 * which can appear within a RAID metadevice.
1015	 */
1016	for (col = 0; col < un->un_totalcolumncnt; col++) {
1017		mr_column_t	*mr_col = &un->un_column[col];
1018		md_unit_t	*comp_un;
1019
1020		if (md_getmajor(mr_col->un_dev) == md_major) {
1021			comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
1022			recids[rid++] = MD_RECID(comp_un);
1023			md_set_parent(mr_col->un_dev, MD_SID(un));
1024		}
1025	}
1026
1027	/* set the end marker */
1028	recids[rid] = 0;
1029
1030	mddb_commitrecs_wrapper(recids);
1031	md_create_unit_incore(mnum, &raid_md_ops, 1);
1032
1033	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_METADEVICE, setno,
1034	    MD_SID(un));
1035
1036out:
1037	kmem_free(recids, (num_recs * sizeof (mddb_recid_t)));
1038	if (err)
1039		return (err);
1040
1041	/* only attempt to init a device that is in the init state */
1042	if (UNIT_STATE(un) != RUS_INIT)
1043		return (0);
1044
1045	return (raid_init_unit(mnum, &msp->mde));
1046}
1047
1048/*
1049 * NAME:	raid_get
1050 * DESCRIPTION: used to get the unit structure of a RAID metadevice
1051 * PARAMETERS:	md_i_get_t   *migp - pointer to get data structure
1052 *		int	      mode - must be FREAD
1053 *		IOLOCK	     *lock - pointer to IOCTL lock
1054 *
1055 * LOCKS:	obtains unit reader lock via IOLOCK
1056 *
1057 */
1058static int
1059raid_get(
1060	void		*migp,
1061	int		mode,
1062	IOLOCK		*lock
1063)
1064{
1065	minor_t		mnum;
1066	mr_unit_t	*un;
1067	md_i_get_t	*migph = migp;
1068
1069
1070	mnum = migph->id;
1071
1072	mdclrerror(&migph->mde);
1073
1074	if ((un = raid_getun(mnum, &migph->mde,
1075	    RD_LOCK, lock)) == NULL)
1076		return (0);
1077
1078	if (migph->size == 0) {
1079		migph->size = un->c.un_size;
1080		return (0);
1081	}
1082
1083	if (migph->size < un->c.un_size) {
1084		return (EFAULT);
1085	}
1086	if (ddi_copyout(un, (void *)(uintptr_t)migph->mdp,
1087	    un->c.un_size, mode))
1088		return (EFAULT);
1089
1090	return (0);
1091}
1092
1093
1094/*
1095 * NAME:	raid_replace
1096 * DESCRIPTION: used to replace a component of a RAID metadevice
1097 * PARAMETERS:	replace_params_t *mrp - pointer to replace data structure
1098 *		IOLOCK	     *lock - pointer to IOCTL lock
1099 *
1100 * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
1101 *		obtains and releases md_unit_array_rw write lock
1102 *
1103 */
1104static int
1105raid_replace(
1106	replace_params_t	*mrp,
1107	IOLOCK			*lock
1108)
1109{
1110	minor_t		mnum = mrp->mnum;
1111	md_dev64_t	odev = mrp->old_dev;
1112	md_error_t	*ep = &mrp->mde;
1113	mr_unit_t	*un;
1114	rcs_state_t	state;
1115	int		ix, col = -1;
1116	int		force = 0;
1117	int		err = 0;
1118	replace_cmd_t	cmd;
1119	set_t		setno;
1120	side_t		side;
1121	mdkey_t		devkey;
1122	int		nkeys;
1123	mddb_recid_t	extra_recids[3] = { 0, 0, 0 };
1124	int		extra_rids = 0;
1125	md_error_t	mde = mdnullerror;
1126	sv_dev_t	sv = {MD_SET_BAD, MD_SIDEWILD, MD_KEYWILD};
1127
1128	mdclrerror(ep);
1129	setno = MD_MIN2SET(mnum);
1130	side = mddb_getsidenum(setno);
1131
1132	un = md_unit_readerlock(MDI_UNIT(mnum));
1133
1134	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
1135	    (raid_state_cnt(un, RCS_RESYNC) != 0)) {
1136		md_unit_readerexit(MDI_UNIT(mnum));
1137		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
1138	}
1139
1140	if (un->un_state & RUS_DOI) {
1141		md_unit_readerexit(MDI_UNIT(mnum));
1142		return (mdmderror(ep, MDE_RAID_DOI, mnum));
1143	}
1144
1145	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
1146	    (MD_STATUS(un) & MD_UN_GROW_PENDING)) {
1147		md_unit_readerexit(MDI_UNIT(mnum));
1148		return (mdmderror(ep, MDE_IN_USE, mnum));
1149	}
1150
1151	md_unit_readerexit(MDI_UNIT(mnum));
1152
1153	/* get locks and recheck to be sure something did not change */
1154	if ((un = raid_getun(mnum, ep, WRITERS, lock)) == NULL)
1155		return (0);
1156
1157	if (md_getkeyfromdev(setno, side, odev, &devkey, &nkeys) != 0) {
1158		return (mddeverror(ep, MDE_NAME_SPACE, odev));
1159	}
1160
1161	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
1162		md_dev64_t tmpdevt = un->un_column[ix].un_orig_dev;
1163		/*
1164		 * Try to resolve devt again if NODEV64
1165		 */
1166		if (tmpdevt == NODEV64) {
1167			tmpdevt = md_resolve_bydevid(mnum, tmpdevt,
1168			    un->un_column[ix].un_orig_key);
1169			un->un_column[ix].un_orig_dev = tmpdevt;
1170		}
1171
1172		if (un->un_column[ix].un_orig_dev == odev) {
1173			col = ix;
1174			break;
1175		} else {
1176			if (un->un_column[ix].un_orig_dev == NODEV64) {
1177				/*
1178				 * Now we use the keys to match.
1179				 * If no key found, continue.
1180				 */
1181				if (nkeys == 0) {
1182					continue;
1183				}
1184				if (un->un_column[ix].un_orig_key == devkey) {
1185					if (nkeys > 1)
1186						return (mddeverror(ep,
1187						    MDE_MULTNM, odev));
1188					col = ix;
1189					break;
1190				}
1191			}
1192		}
1193	}
1194
1195	if (col == -1)
1196		return (mdcomperror(ep, MDE_CANT_FIND_COMP,
1197		    mnum, odev));
1198
1199	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
1200	    (raid_state_cnt(un, RCS_RESYNC) != 0))
1201		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
1202
1203	if (un->un_state & RUS_DOI)
1204		return (mdcomperror(ep, MDE_REPL_INVAL_STATE, mnum,
1205		    un->un_column[col].un_dev));
1206
1207	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
1208	    (MD_STATUS(un) & MD_UN_GROW_PENDING))
1209		return (mdmderror(ep, MDE_IN_USE, mnum));
1210
1211	if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == FORCE_REPLACE_COMP))
1212		force = 1;
1213	if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == ENABLE_COMP))
1214		cmd = ENABLE_COMP;
1215	if ((mrp->cmd == FORCE_REPLACE_COMP) || (mrp->cmd == REPLACE_COMP))
1216		cmd = REPLACE_COMP;
1217
1218	if (un->un_state == RUS_LAST_ERRED) {
1219		/* Must use -f force flag for unit in LAST_ERRED state */
1220		if (!force)
1221			return (mdmderror(ep, MDE_RAID_NEED_FORCE, mnum));
1222
1223		/* Must use -f force flag on ERRED column first */
1224		if (un->un_column[col].un_devstate != RCS_ERRED) {
1225			for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
1226				if (un->un_column[ix].un_devstate & RCS_ERRED)
1227					return (mdcomperror(ep,
1228					    MDE_RAID_COMP_ERRED, mnum,
1229					    un->un_column[ix].un_dev));
1230			}
1231		}
1232
1233		/* must use -f force flag on LAST_ERRED columns next */
1234		if ((un->un_column[col].un_devstate != RCS_LAST_ERRED) &&
1235		    (un->un_column[col].un_devstate != RCS_ERRED))
1236			return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
1237			    mnum, un->un_column[col].un_dev));
1238	}
1239
1240	if (un->un_state == RUS_ERRED) {
1241		if (! (un->un_column[col].un_devstate &
1242		    (RCS_ERRED | RCS_INIT_ERRED)))
1243			return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
1244			    mnum, un->un_column[ix].un_dev));
1245	}
1246
1247	ASSERT(!(un->un_column[col].un_devflags & MD_RAID_ALT_ISOPEN));
1248	ASSERT(!(un->un_column[col].un_devflags & MD_RAID_WRITE_ALT));
1249
1250	state = un->un_column[col].un_devstate;
1251	if (state & RCS_INIT_ERRED) {
1252		MD_STATUS(un) |= MD_UN_GROW_PENDING;
1253		un->un_percent_done = 0;
1254		raid_set_state(un, col, RCS_INIT, 0);
1255	} else if (((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) &&
1256	    resync_request(mnum, col, 0, ep))
1257		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
1258
1259
1260	if (cmd == REPLACE_COMP) {
1261		md_dev64_t tmpdev = mrp->new_dev;
1262
1263		/*
1264		 * open the device by device id
1265		 */
1266		tmpdev = md_resolve_bydevid(mnum, tmpdev, mrp->new_key);
1267		if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
1268			return (mdcomperror(ep, MDE_COMP_OPEN_ERR, mnum,
1269			    tmpdev));
1270		}
1271
1272		/*
1273		 * If it's a metadevice, make sure it gets reparented
1274		 */
1275		if (md_getmajor(tmpdev) == md_major) {
1276			minor_t		new_mnum = md_getminor(tmpdev);
1277			md_unit_t	*new_un = MD_UNIT(new_mnum);
1278
1279			md_set_parent(tmpdev, MD_SID(un));
1280			extra_recids[extra_rids++] = MD_RECID(new_un);
1281		}
1282
1283		mrp->new_dev = tmpdev;
1284		un->un_column[col].un_orig_dev = tmpdev;
1285		un->un_column[col].un_orig_key = mrp->new_key;
1286		un->un_column[col].un_orig_pwstart = mrp->start_blk;
1287		un->un_column[col].un_orig_devstart =
1288		    mrp->start_blk + un->un_pwsize;
1289
1290		/*
1291		 * If the old device was a metadevice, make sure to
1292		 * reset its parent.
1293		 */
1294		if (md_getmajor(odev) == md_major) {
1295			minor_t		old_mnum = md_getminor(odev);
1296			md_unit_t	*old_un = MD_UNIT(old_mnum);
1297
1298			md_reset_parent(odev);
1299			extra_recids[extra_rids++] =
1300			    MD_RECID(old_un);
1301		}
1302
1303		if (HOTSPARED(un, col)) {
1304			md_layered_close(mrp->new_dev, MD_OFLG_NULL);
1305			un->un_column[col].un_alt_dev = mrp->new_dev;
1306			un->un_column[col].un_alt_pwstart = mrp->start_blk;
1307			un->un_column[col].un_alt_devstart =
1308			    mrp->start_blk + un->un_pwsize;
1309			un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
1310		} else {
1311			/*
1312			 * not hot spared.  Close the old device and
1313			 * move the new device in.
1314			 */
1315			if (un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN)
1316				md_layered_close(odev, MD_OFLG_NULL);
1317			un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
1318			un->un_column[col].un_dev = mrp->new_dev;
1319			un->un_column[col].un_pwstart = mrp->start_blk;
1320			un->un_column[col].un_devstart =
1321			    mrp->start_blk + un->un_pwsize;
1322			if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) {
1323				un->un_column[col].un_devflags |=
1324				    MD_RAID_REGEN_RESYNC;
1325			}
1326		}
1327		/*
1328		 * If the old device is not a metadevice then
1329		 * save off the set number and key so that it
1330		 * can be removed from the namespace later.
1331		 */
1332		if (md_getmajor(odev) != md_major) {
1333			sv.setno = setno;
1334			sv.key = devkey;
1335		}
1336	}
1337
1338	if (cmd == ENABLE_COMP) {
1339		md_dev64_t tmpdev = un->un_column[col].un_orig_dev;
1340		mdkey_t raidkey =  un->un_column[col].un_orig_key;
1341
1342		/*
1343		 * We trust the dev_t because we cannot determine the
1344		 * dev_t from the device id since a new disk is in the
1345		 * same location. Since this is a call from metareplace -e dx
1346		 * AND it is SCSI a new dev_t is not generated.  So the
1347		 * dev_t from the mddb is used. Before enabling the device
1348		 * we check to make sure that multiple entries for the same
1349		 * device does not exist in the namespace. If they do we
1350		 * fail the ioctl.
1351		 * One of the many ways multiple entries in the name space
1352		 * can occur is if one removed the failed component in a
1353		 * RAID metadevice and put another disk that was part of
1354		 * another metadevice. After reboot metadevadm would correctly
1355		 * update the device name for the metadevice whose component
1356		 * has moved. However now in the metadb there are two entries
1357		 * for the same name (ctds) that belong to different
1358		 * metadevices. One is valid, the other is a ghost or "last
1359		 * know as" ctds.
1360		 */
1361		tmpdev = md_resolve_bydevid(mnum, tmpdev, raidkey);
1362		if (tmpdev == NODEV64)
1363			tmpdev = md_getdevnum(setno, side, raidkey,
1364			    MD_TRUST_DEVT);
1365		/*
1366		 * check for multiple entries in namespace for the
1367		 * same dev
1368		 */
1369
1370		if (md_getkeyfromdev(setno, side, tmpdev, &devkey,
1371		    &nkeys) != 0)
1372			return (mddeverror(ep, MDE_NAME_SPACE, tmpdev));
1373		/*
1374		 * If number of keys are greater that
1375		 * 1, then we have an invalid
1376		 * namespace. STOP and return.
1377		 */
1378		if (nkeys > 1)
1379			return (mddeverror(ep, MDE_MULTNM, tmpdev));
1380		if (devkey != raidkey)
1381			return (mdcomperror(ep, MDE_CANT_FIND_COMP,
1382			    mnum, tmpdev));
1383
1384		if (un->un_column[col].un_orig_dev == NODEV64)
1385			un->un_column[col].un_orig_dev = tmpdev;
1386
1387		if (HOTSPARED(un, col)) {
1388			un->un_column[col].un_alt_dev =
1389			    un->un_column[col].un_orig_dev;
1390			un->un_column[col].un_alt_pwstart =
1391			    un->un_column[col].un_orig_pwstart;
1392			un->un_column[col].un_alt_devstart =
1393			    un->un_column[col].un_orig_devstart;
1394			un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
1395		} else {
1396			if (!(un->un_column[col].un_devflags &
1397			    MD_RAID_DEV_ISOPEN)) {
1398				if (md_layered_open(mnum, &tmpdev,
1399				    MD_OFLG_NULL)) {
1400					un->un_column[col].un_dev = tmpdev;
1401					return (mdcomperror(ep,
1402					    MDE_COMP_OPEN_ERR, mnum, tmpdev));
1403				}
1404				ASSERT(tmpdev != NODEV64 &&
1405				    tmpdev != 0);
1406
1407				if ((md_getmajor(tmpdev) != md_major) &&
1408				    (md_devid_found(setno, side, raidkey)
1409				    == 1)) {
1410					if (md_update_namespace_did(setno, side,
1411					    raidkey, &mde) != 0) {
1412						cmn_err(CE_WARN,
1413						    "md: could not"
1414						    " update namespace\n");
1415					}
1416				}
1417				un->un_column[col].un_dev =
1418				    un->un_column[col].un_orig_dev;
1419			}
1420			un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
1421			un->un_column[col].un_devflags |= MD_RAID_REGEN_RESYNC;
1422		}
1423	}
1424	if (mrp->has_label) {
1425		un->un_column[col].un_devflags |= MD_RAID_HAS_LABEL;
1426	} else {
1427		un->un_column[col].un_devflags &= ~MD_RAID_HAS_LABEL;
1428	}
1429
1430	raid_commit(un, extra_recids);
1431
1432	/* If the component has been replaced - clean up the name space */
1433	if (sv.setno != MD_SET_BAD) {
1434		md_rem_names(&sv, 1);
1435	}
1436
1437	md_ioctl_droplocks(lock);
1438
1439	if ((cmd == ENABLE_COMP) || (cmd == FORCE_ENABLE_COMP)) {
1440		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ENABLE, SVM_TAG_METADEVICE,
1441		    setno, MD_SID(un));
1442	} else {
1443		SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REPLACE, SVM_TAG_METADEVICE,
1444		    setno, MD_SID(un));
1445	}
1446
1447	if (un->un_column[col].un_devstate & RCS_INIT)
1448		err = raid_init_unit(mnum, ep);
1449	else if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0)
1450		err = raid_resync_unit(mnum, ep);
1451
1452	mdclrerror(ep);
1453	if (!err)
1454		return (0);
1455
1456	/* be sure state */
1457	/* is already set by this time */
1458	/* fix state  and commit record */
1459	un = md_unit_writerlock(MDI_UNIT(mnum));
1460	if (state & RCS_INIT_ERRED)
1461		raid_set_state(un, col, state, 1);
1462	else if (state & RCS_OKAY)
1463		raid_set_state(un, col, RCS_ERRED, 0);
1464	else
1465		raid_set_state(un, col, state, 1);
1466	raid_commit(un, NULL);
1467	md_unit_writerexit(MDI_UNIT(mnum));
1468	mdclrerror(ep);
1469	return (0);
1470}
1471
1472
1473/*
1474 * NAME:	raid_set_sync
1475 * DESCRIPTION: used to sync a component of a RAID metadevice
1476 * PARAMETERS:	md_resync_ioctl_t *mrp - pointer to resync data structure
1477 *		int	      mode - must be FWRITE
1478 *		IOLOCK	     *lock - pointer to IOCTL lock
1479 *
1480 * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
1481 *		obtains and releases md_unit_array_rw write lock
1482 *
1483 */
1484static int
1485raid_set_sync(
1486	md_resync_ioctl_t	*rip,
1487	IOLOCK			*lock
1488)
1489{
1490	minor_t			mnum = rip->ri_mnum;
1491	mr_unit_t		*un;
1492	int			init = 0;
1493	int			resync = 0;
1494	int			regen = 0;
1495	int			ix;
1496	int			err;
1497
1498	mdclrerror(&rip->mde);
1499
1500	if ((un = raid_getun(mnum, &rip->mde, WRITERS, lock)) == NULL)
1501		return (0);
1502
1503	if (un->un_state & RUS_DOI)
1504		return (mdmderror(&rip->mde, MDE_RAID_DOI, mnum));
1505
1506	if (un->c.un_status & MD_UN_RESYNC_ACTIVE)
1507		return (mdmderror(&rip->mde, MDE_RESYNC_ACTIVE, mnum));
1508
1509	/* This prevents new opens */
1510
1511	rip->ri_flags = 0;
1512	if (un->un_state & RUS_REGEN)
1513		regen++;
1514
1515	if (raid_state_cnt(un, RCS_RESYNC))
1516		resync++;
1517
1518	if (raid_state_cnt(un, RCS_INIT) || (un->un_state & RUS_INIT))
1519		init++;
1520
1521	ASSERT(!(resync && init && regen));
1522	md_ioctl_droplocks(lock);
1523	rip->ri_percent_done = 0;
1524
1525	if (init) {
1526		MD_STATUS(un) |= MD_UN_GROW_PENDING;
1527		return (raid_init_unit(mnum, &rip->mde));
1528	}
1529
1530	/*
1531	 * If resync is needed, it will call raid_internal_open forcing
1532	 * replay before the open completes.
1533	 * Otherwise, call raid_internal_open directly to force
1534	 * replay to complete during boot (metasync -r).
1535	 * NOTE: the unit writer lock must remain held while setting
1536	 *	 MD_UN_RESYNC_ACTIVE but must be released before
1537	 *	 calling raid_resync_unit or raid_internal_open.
1538	 */
1539	if (resync) {
1540		ASSERT(resync < 2);
1541		un = md_unit_writerlock(MDI_UNIT(mnum));
1542		MD_STATUS(un) |= MD_UN_RESYNC_ACTIVE;
1543		/* Must release unit writer lock for resync */
1544		/*
1545		 * correctly setup the devices before trying to start the
1546		 * resync operation.
1547		 */
1548		for (ix = 0; un->un_totalcolumncnt; ix++) {
1549			if (un->un_column[ix].un_devstate & RCS_RESYNC) {
1550				if ((un->un_column[ix].un_devflags &
1551				    MD_RAID_COPY_RESYNC) &&
1552				    HOTSPARED(un, ix)) {
1553					un->un_column[ix].un_alt_dev =
1554					    un->un_column[ix].un_orig_dev;
1555					un->un_column[ix].un_alt_devstart =
1556					    un->un_column[ix].un_orig_devstart;
1557					un->un_column[ix].un_alt_pwstart =
1558					    un->un_column[ix].un_orig_pwstart;
1559				}
1560				break;
1561			}
1562		}
1563		ASSERT(un->un_column[ix].un_devflags &
1564		    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC));
1565		rip->ri_percent_done = 0;
1566		un->un_column[ix].un_devflags |= MD_RAID_RESYNC;
1567		(void) resync_request(mnum, ix, 0, NULL);
1568		md_unit_writerexit(MDI_UNIT(mnum));
1569		err = raid_resync_unit(mnum, &rip->mde);
1570		return (err);
1571	}
1572
1573	if (regen) {
1574		err = raid_regen_unit(mnum, &rip->mde);
1575		return (err);
1576	}
1577
1578	/* The unit requires not work so just force replay of the device */
1579	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0))
1580		return (mdmderror(&rip->mde,
1581		    MDE_RAID_OPEN_FAILURE, mnum));
1582	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
1583
1584	return (0);
1585}
1586
1587/*
1588 * NAME:	raid_get_resync
1589 * DESCRIPTION: used to check resync status on a component of a RAID metadevice
1590 * PARAMETERS:	md_resync_ioctl_t *mrp - pointer to resync data structure
1591 *		int	      mode - must be FWRITE
1592 *		IOLOCK	     *lock - pointer to IOCTL lock
1593 *
1594 * LOCKS:	none
1595 *
1596 */
1597static int
1598raid_get_resync(
1599	md_resync_ioctl_t	*rip,
1600	IOLOCK			*lock
1601)
1602{
1603	minor_t			mnum = rip->ri_mnum;
1604	mr_unit_t		*un;
1605	u_longlong_t		percent;
1606	int			cnt;
1607	int			ix;
1608	uint64_t		d;
1609
1610	mdclrerror(&rip->mde);
1611
1612	if ((un = raid_getun(mnum, &rip->mde, RD_LOCK, lock)) == NULL)
1613		return (0);
1614
1615	rip->ri_flags = 0;
1616	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
1617		d = un->un_segsincolumn;
1618		percent = d ? ((1000 * un->un_resync_line_index) / d) : 0;
1619		if (percent > 1000)
1620			percent = 1000;	/* can't go over 100% */
1621		rip->ri_percent_done = (int)percent;
1622		rip->ri_flags |= MD_RI_INPROGRESS;
1623	}
1624
1625	if (UNIT_STATE(un) & RUS_INIT) {
1626		d = un->un_segsize * un->un_segsincolumn *
1627		    un->un_totalcolumncnt;
1628		percent =
1629		    d ? ((1000 * (u_longlong_t)un->un_init_iocnt) / d) : 0;
1630		if (percent > 1000)
1631			percent = 1000;	/* can't go over 100% */
1632		rip->ri_percent_done = (int)percent;
1633		rip->ri_flags |= MD_GROW_INPROGRESS;
1634	} else if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
1635		d = un->un_segsize * un->un_segsincolumn * un->un_init_colcnt;
1636		percent =
1637		    d ? (((u_longlong_t)un->un_init_iocnt * 1000) / d) : 0;
1638		if (percent > 1000)
1639			percent = 1000;
1640		rip->ri_percent_done = (int)percent;
1641		rip->ri_flags |= MD_GROW_INPROGRESS;
1642	}
1643
1644	if (un->un_state & RUS_REGEN)
1645		rip->ri_percent_done = un->un_percent_done;
1646
1647	cnt = 0;
1648	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
1649		switch (un->un_column[ix].un_devstate) {
1650		case RCS_INIT:
1651		case RCS_ERRED:
1652		case RCS_LAST_ERRED:
1653			cnt++;
1654			break;
1655		default:
1656			break;
1657		}
1658	}
1659	d = un->un_totalcolumncnt;
1660	rip->ri_percent_dirty = d ? (((u_longlong_t)cnt * 100) / d) : 0;
1661	return (0);
1662}
1663
1664/*
1665 * NAME:	raid_grow
1666 * DESCRIPTION: Concatenate to a RAID metadevice
1667 * PARAMETERS:	md_grow_params_t *mgp
1668 *			      - pointer to IOCGROW data structure
1669 *		int	 mode - must be FWRITE
1670 *		IOLOCK *lockp - IOCTL read/write and unit_array_rw lock
1671 *
1672 * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
1673 *		obtains and releases md_unit_array_rw write lock
1674 *
1675 */
1676static int
1677raid_grow(void *mgp, int mode, IOLOCK *lock)
1678{
1679	minor_t		mnum;
1680	mr_unit_t	*un, *new_un;
1681	mdi_unit_t	*ui;
1682	mddb_type_t	typ1;
1683	mddb_recid_t	mr_recid;
1684	mddb_recid_t	old_vtoc = 0;
1685	mddb_recid_t	*recids;
1686	md_create_rec_option_t options;
1687	int		err;
1688	int		col, i;
1689	int64_t		tb, atb;
1690	u_longlong_t	unrev;
1691	int		tc;
1692	int		rval = 0;
1693	set_t		setno;
1694	mr_column_ic_t	*mrc;
1695	int		num_recs, rid;
1696	md_grow_params_t	*mgph = mgp;
1697
1698
1699	mnum = mgph->mnum;
1700
1701	mdclrerror(&mgph->mde);
1702
1703	ui = MDI_UNIT(mnum);
1704	un = md_unit_readerlock(ui);
1705
1706	if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
1707		md_unit_readerexit(ui);
1708		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
1709	}
1710
1711	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
1712		md_unit_readerexit(ui);
1713		return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, mnum));
1714	}
1715
1716	if (UNIT_STATE(un) & RUS_LAST_ERRED) {
1717		md_unit_readerexit(ui);
1718		return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, mnum));
1719	}
1720
1721	if (UNIT_STATE(un) & RUS_DOI) {
1722		md_unit_readerexit(ui);
1723		return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));
1724	}
1725
1726	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
1727		md_unit_readerexit(ui);
1728		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
1729	}
1730
1731	md_unit_readerexit(ui);
1732
1733	if ((un = raid_getun(mnum, &mgph->mde, WRITERS, lock)) ==
1734	    NULL)
1735		return (0);
1736
1737	if (MD_STATUS(un) & MD_UN_GROW_PENDING)
1738		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
1739
1740	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
1741		return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, mnum));
1742
1743	if (un->c.un_size >= mgph->size)
1744		return (EINVAL);
1745
1746	if (UNIT_STATE(un) & RUS_LAST_ERRED)
1747		return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, mnum));
1748
1749	if (UNIT_STATE(un) & RUS_DOI)
1750		return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));
1751
1752	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT))
1753		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
1754
1755	setno = MD_MIN2SET(mnum);
1756
1757	typ1 = (mddb_type_t)md_getshared_key(setno,
1758	    raid_md_ops.md_driver.md_drivername);
1759
1760	/*
1761	 * Preserve the friendly name nature of the device that is
1762	 * growing.
1763	 */
1764	options = MD_CRO_RAID;
1765	if (un->c.un_revision & MD_FN_META_DEV)
1766		options |= MD_CRO_FN;
1767	if (mgph->options & MD_CRO_64BIT) {
1768#if defined(_ILP32)
1769		return (mdmderror(&mgph->mde, MDE_UNIT_TOO_LARGE, mnum));
1770#else
1771		mr_recid = mddb_createrec(mgph->size, typ1, 0,
1772		    MD_CRO_64BIT | options, setno);
1773#endif
1774	} else {
1775		mr_recid = mddb_createrec(mgph->size, typ1, 0,
1776		    MD_CRO_32BIT | options, setno);
1777	}
1778	if (mr_recid < 0) {
1779		rval = mddbstatus2error(&mgph->mde, (int)mr_recid,
1780		    mnum, setno);
1781		return (rval);
1782	}
1783
1784	/* get the address of the new unit */
1785	new_un = (mr_unit_t *)mddb_getrecaddr(mr_recid);
1786
1787	/*
1788	 * It is okay that we muck with the new unit here,
1789	 * since no one else will know about the unit struct
1790	 * until we commit it. If we crash, the record will
1791	 * be automatically purged, since we haven't
1792	 * committed it yet and the old unit struct will be found.
1793	 */
1794
1795	/* copy in the user's unit struct */
1796	err = ddi_copyin((void *)(uintptr_t)mgph->mdp, new_un,
1797	    mgph->size, mode);
1798	if (err) {
1799		mddb_deleterec_wrapper(mr_recid);
1800		return (EFAULT);
1801	}
1802
1803	/* make sure columns are being added */
1804	if (un->un_totalcolumncnt >= new_un->un_totalcolumncnt) {
1805		mddb_deleterec_wrapper(mr_recid);
1806		return (EINVAL);
1807	}
1808
1809	/*
1810	 * Save a few of the new unit structs fields.
1811	 * Before they get clobbered.
1812	 */
1813	tc = new_un->un_totalcolumncnt;
1814	tb = new_un->c.un_total_blocks;
1815	atb = new_un->c.un_actual_tb;
1816	unrev = new_un->c.un_revision;
1817
1818	/*
1819	 * Copy the old unit struct (static stuff)
1820	 * into new unit struct
1821	 */
1822	bcopy((caddr_t)un, (caddr_t)new_un, un->c.un_size);
1823
1824	/*
1825	 * Restore a few of the new unit struct values.
1826	 */
1827	new_un->un_totalcolumncnt = tc;
1828	new_un->c.un_actual_tb = atb;
1829	new_un->un_grow_tb = tb;
1830	new_un->c.un_revision = unrev;
1831	new_un->c.un_record_id = mr_recid;
1832	new_un->c.un_size = mgph->size;
1833
1834	ASSERT(new_un->mr_ic == un->mr_ic);
1835
1836	/*
1837	 * Save old column slots
1838	 */
1839	mrc = un->un_column_ic;
1840
1841	/*
1842	 * Allocate new column slot
1843	 */
1844	new_un->un_column_ic = (mr_column_ic_t *)
1845	    kmem_zalloc(sizeof (mr_column_ic_t) * new_un->un_totalcolumncnt,
1846	    KM_SLEEP);
1847
1848	/*
1849	 * Restore old column slots
1850	 * Free the old column slots
1851	 */
1852	bcopy(mrc, new_un->un_column_ic,
1853	    sizeof (mr_column_ic_t) * un->un_totalcolumncnt);
1854	kmem_free(mrc, sizeof (mr_column_ic_t) * un->un_totalcolumncnt);
1855
1856	/* All 64 bit metadevices only support EFI labels. */
1857	if (mgph->options & MD_CRO_64BIT) {
1858		new_un->c.un_flag |= MD_EFILABEL;
1859		/*
1860		 * If the device was previously smaller than a terabyte,
1861		 * and had a vtoc record attached to it, we remove the
1862		 * vtoc record, because the layout has changed completely.
1863		 */
1864		if (((un->c.un_revision & MD_64BIT_META_DEV) == 0) &&
1865		    (un->c.un_vtoc_id != 0)) {
1866			old_vtoc = un->c.un_vtoc_id;
1867			new_un->c.un_vtoc_id =
1868			    md_vtoc_to_efi_record(old_vtoc, setno);
1869		}
1870	}
1871
1872
1873	/*
1874	 * allocate the real recids array.  since we may have to commit
1875	 * underlying metadevice records, we need an array of size:
1876	 * total number of new components being attach + 2 (one for the
1877	 * raid itself, one for the end marker).
1878	 */
1879	num_recs = new_un->un_totalcolumncnt + 2;
1880	rid = 0;
1881	recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
1882	recids[rid++] = mr_recid;
1883
1884	for (col = un->un_totalcolumncnt;
1885	    (col < new_un->un_totalcolumncnt); col++) {
1886		mr_column_t	*mr_col = &new_un->un_column[col];
1887		md_unit_t	*comp_un;
1888
1889		if (raid_build_pw_reservation(new_un, col) != 0) {
1890			/* release pwslots already allocated by grow */
1891			for (i = un->un_totalcolumncnt; i < col; i++) {
1892				raid_free_pw_reservation(new_un, i);
1893			}
1894			kmem_free(new_un->un_column_ic,
1895			    sizeof (mr_column_ic_t) *
1896			    new_un->un_totalcolumncnt);
1897			kmem_free(new_un->mr_ic, sizeof (*un->mr_ic));
1898			kmem_free(recids, num_recs * sizeof (mddb_recid_t));
1899			mddb_deleterec_wrapper(mr_recid);
1900			return (EINVAL);
1901		}
1902		/*
1903		 * set parent on metadevices being added.
1904		 * NOTE: currently soft partitions are the only metadevices
1905		 * which can appear within a RAID metadevice.
1906		 */
1907		if (md_getmajor(mr_col->un_dev) == md_major) {
1908			comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
1909			recids[rid++] = MD_RECID(comp_un);
1910			md_set_parent(mr_col->un_dev, MD_SID(new_un));
1911		}
1912		new_un->un_column[col].un_devflags = 0;
1913	}
1914
1915	/* set end marker */
1916	recids[rid] = 0;
1917
1918	/* commit new unit struct */
1919	mddb_commitrecs_wrapper(recids);
1920
1921	/* delete old unit struct */
1922	mddb_deleterec_wrapper(un->c.un_record_id);
1923
1924	/* place new unit in in-core array */
1925	md_nblocks_set(mnum, new_un->c.un_total_blocks);
1926	MD_UNIT(mnum) = new_un;
1927
1928	/*
1929	 * If old_vtoc has a non zero value, we know:
1930	 * - This unit crossed the border from smaller to larger one TB
1931	 * - There was a vtoc record for the unit,
1932	 * - This vtoc record is no longer needed, because
1933	 *   a new efi record has been created for this un.
1934	 */
1935	if (old_vtoc != 0) {
1936		mddb_deleterec_wrapper(old_vtoc);
1937	}
1938
1939	/* free recids */
1940	kmem_free(recids, num_recs * sizeof (mddb_recid_t));
1941
1942	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE,
1943	    MD_UN2SET(new_un), MD_SID(new_un));
1944	MD_STATUS(new_un) |= MD_UN_GROW_PENDING;
1945
1946	/*
1947	 * Since the md_ioctl_writelock aquires the unit write lock
1948	 * and open/close aquires the unit reader lock it is necessary
1949	 * to drop the unit write lock and then reaquire it as needed
1950	 * later.
1951	 */
1952	md_unit_writerexit(ui);
1953
1954	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
1955		rval = mdmderror(&mgph->mde, MDE_RAID_OPEN_FAILURE, mnum);
1956		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
1957		    MD_UN2SET(new_un), MD_SID(new_un));
1958		return (rval);
1959	}
1960	(void) md_unit_writerlock(ui);
1961	for (i = 0; i < new_un->un_totalcolumncnt; i++) {
1962		if (new_un->un_column[i].un_devstate & RCS_OKAY)
1963			(void) init_pw_area(new_un, new_un->un_column[i].un_dev,
1964			    new_un->un_column[i].un_pwstart, i);
1965	}
1966	md_unit_writerexit(ui);
1967	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
1968	(void) md_unit_writerlock(ui);
1969	/* create a background thread to initialize the columns */
1970	md_ioctl_droplocks(lock);
1971
1972	return (raid_init_unit(mnum, &mgph->mde));
1973}
1974
1975/*
1976 * NAME:	raid_reset
1977 * DESCRIPTION: used to reset (clear / remove) a RAID metadevice
1978 * PARAMETERS:	md_i_reset_t *mirp - pointer to reset data structure
1979 *
1980 * LOCKS:	obtains and releases md_unit_array_rw write lock
1981 *
1982 */
1983static int
1984raid_reset(md_i_reset_t	*mirp)
1985{
1986	minor_t		mnum = mirp->mnum;
1987	mr_unit_t	*un;
1988	mdi_unit_t	*ui;
1989	set_t		setno = MD_MIN2SET(mnum);
1990
1991	mdclrerror(&mirp->mde);
1992
1993	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
1994	/*
1995	 * NOTE: need to get md_unit_writerlock to avoid conflict
1996	 * with raid_init thread.
1997	 */
1998	if ((un = raid_getun(mnum, &mirp->mde, NO_LOCK, NULL)) ==
1999	    NULL) {
2000		rw_exit(&md_unit_array_rw.lock);
2001		return (0);
2002	}
2003	ui = MDI_UNIT(mnum);
2004
2005	if (MD_HAS_PARENT(MD_PARENT(un))) {
2006		rw_exit(&md_unit_array_rw.lock);
2007		return (mdmderror(&mirp->mde, MDE_IN_USE, mnum));
2008	}
2009
2010	un = (mr_unit_t *)md_unit_openclose_enter(ui);
2011	if (md_unit_isopen(MDI_UNIT(mnum))) {
2012		md_unit_openclose_exit(ui);
2013		rw_exit(&md_unit_array_rw.lock);
2014		return (mdmderror(&mirp->mde, MDE_IS_OPEN, mnum));
2015	}
2016	md_unit_openclose_exit(ui);
2017	if (UNIT_STATE(un) != RUS_OKAY && !mirp->force) {
2018		rw_exit(&md_unit_array_rw.lock);
2019		return (mdmderror(&mirp->mde, MDE_RAID_NEED_FORCE, mnum));
2020	}
2021
2022	reset_raid(un, mnum, 1);
2023
2024	/*
2025	 * Update unit availability
2026	 */
2027	md_set[setno].s_un_avail++;
2028
2029	/*
2030	 * If MN set, reset s_un_next so all nodes can have
2031	 * the same view of the next available slot when
2032	 * nodes are -w and -j
2033	 */
2034	if (MD_MNSET_SETNO(setno)) {
2035		(void) md_upd_set_unnext(setno, MD_MIN2UNIT(mnum));
2036	}
2037
2038	rw_exit(&md_unit_array_rw.lock);
2039
2040	return (0);
2041}
2042
2043/*
2044 * NAME:	raid_get_geom
2045 * DESCRIPTION: used to get the geometry of a RAID metadevice
2046 * PARAMETERS:	mr_unit_t    *un - RAID unit to get the geometry for
2047 *		struct dk_geom *gp - pointer to geometry data structure
2048 *
2049 * LOCKS:	none
2050 *
2051 */
2052static int
2053raid_get_geom(
2054	mr_unit_t	*un,
2055	struct dk_geom	*geomp
2056)
2057{
2058	md_get_geom((md_unit_t *)un, geomp);
2059
2060	return (0);
2061}
2062
2063/*
2064 * NAME:	raid_get_vtoc
2065 * DESCRIPTION: used to get the VTOC on a RAID metadevice
2066 * PARAMETERS:	mr_unit_t    *un - RAID unit to get the VTOC from
2067 *		struct vtoc *vtocp - pointer to VTOC data structure
2068 *
2069 * LOCKS:	none
2070 *
2071 */
2072static int
2073raid_get_vtoc(
2074	mr_unit_t	*un,
2075	struct vtoc	*vtocp
2076)
2077{
2078	md_get_vtoc((md_unit_t *)un, vtocp);
2079
2080	return (0);
2081}
2082
2083/*
2084 * NAME:	raid_set_vtoc
2085 * DESCRIPTION: used to set the VTOC on a RAID metadevice
2086 * PARAMETERS:	mr_unit_t    *un - RAID unit to set the VTOC on
2087 *		struct vtoc *vtocp - pointer to VTOC data structure
2088 *
2089 * LOCKS:	none
2090 *
2091 */
2092static int
2093raid_set_vtoc(
2094	mr_unit_t	*un,
2095	struct vtoc	*vtocp
2096)
2097{
2098	return (md_set_vtoc((md_unit_t *)un, vtocp));
2099}
2100
2101
2102/*
2103 * NAME:	raid_get_extvtoc
2104 * DESCRIPTION: used to get the extended VTOC on a RAID metadevice
2105 * PARAMETERS:	mr_unit_t    *un - RAID unit to get the VTOC from
2106 *		struct extvtoc *vtocp - pointer to extended VTOC data structure
2107 *
2108 * LOCKS:	none
2109 *
2110 */
2111static int
2112raid_get_extvtoc(
2113	mr_unit_t	*un,
2114	struct extvtoc	*vtocp
2115)
2116{
2117	md_get_extvtoc((md_unit_t *)un, vtocp);
2118
2119	return (0);
2120}
2121
2122/*
2123 * NAME:	raid_set_extvtoc
2124 * DESCRIPTION: used to set the extended VTOC on a RAID metadevice
2125 * PARAMETERS:	mr_unit_t    *un - RAID unit to set the VTOC on
2126 *		struct extvtoc *vtocp - pointer to extended VTOC data structure
2127 *
2128 * LOCKS:	none
2129 *
2130 */
2131static int
2132raid_set_extvtoc(
2133	mr_unit_t	*un,
2134	struct extvtoc	*vtocp
2135)
2136{
2137	return (md_set_extvtoc((md_unit_t *)un, vtocp));
2138}
2139
2140
2141
2142/*
2143 * NAME:	raid_get_cgapart
2144 * DESCRIPTION: used to get the dk_map on a RAID metadevice
2145 * PARAMETERS:	mr_unit_t    *un - RAID unit to set the VTOC on
2146 *		struct vtoc *dkmapp - pointer to dk_map data structure
2147 *
2148 * LOCKS:	none
2149 *
2150 */
2151
2152static int
2153raid_get_cgapart(
2154	mr_unit_t	*un,
2155	struct dk_map	*dkmapp
2156)
2157{
2158	md_get_cgapart((md_unit_t *)un, dkmapp);
2159	return (0);
2160}
2161
2162/*
2163 * NAME:	raid_getdevs
2164 * DESCRIPTION: return all devices within a RAID metadevice
2165 * PARAMETERS:	md_getdevs_params_t *mgdp
2166 *			      - pointer to getdevs IOCTL data structure
2167 *		int	 mode - should be FREAD
2168 *		IOLOCK *lockp - IOCTL read/write lock
2169 *
2170 * LOCKS:	obtains unit reader lock via IOLOCK
2171 *
2172 */
2173static int
2174raid_getdevs(
2175	void			*mgdp,
2176	int			mode,
2177	IOLOCK			*lock
2178)
2179{
2180	minor_t			mnum;
2181	mr_unit_t		*un;
2182	md_dev64_t		*udevs;
2183	int			i, cnt;
2184	md_dev64_t		unit_dev;
2185	md_getdevs_params_t	*mgdph = mgdp;
2186
2187
2188	mnum = mgdph->mnum;
2189
2190	/* check out unit */
2191	mdclrerror(&mgdph->mde);
2192
2193	if ((un = raid_getun(mnum, &mgdph->mde, RD_LOCK, lock)) == NULL)
2194		return (0);
2195
2196	udevs = (md_dev64_t *)(uintptr_t)mgdph->devs;
2197
2198	for (cnt = 0, i = 0; i < un->un_totalcolumncnt; i++, cnt++) {
2199		if (cnt < mgdph->cnt) {
2200			unit_dev = un->un_column[i].un_orig_dev;
2201			if (md_getmajor(unit_dev) != md_major) {
2202				if ((unit_dev = md_xlate_mini_2_targ
2203				    (unit_dev)) == NODEV64)
2204					return (ENODEV);
2205			}
2206
2207			if (ddi_copyout((caddr_t)&unit_dev,
2208			    (caddr_t)&udevs[cnt], sizeof (*udevs), mode) != 0)
2209				return (EFAULT);
2210		}
2211		if (HOTSPARED(un, i)) {
2212			cnt++;
2213			if (cnt >= mgdph->cnt)
2214				continue;
2215
2216			unit_dev = un->un_column[i].un_dev;
2217			if (md_getmajor(unit_dev) != md_major) {
2218				if ((unit_dev = md_xlate_mini_2_targ
2219				    (unit_dev)) == NODEV64)
2220					return (ENODEV);
2221			}
2222
2223			if (ddi_copyout((caddr_t)&unit_dev,
2224			    (caddr_t)&udevs[cnt], sizeof (*udevs), mode) != 0)
2225				return (EFAULT);
2226		}
2227	}
2228	mgdph->cnt = cnt;
2229	return (0);
2230}
2231
2232/*
2233 * NAME:	raid_change
2234 * DESCRIPTION: used to change the following dynamic values:
2235 *			the hot spare pool
2236 *		in the unit structure of a RAID metadevice
2237 * PARAMETERS:	md_change_params_t   *mcp - pointer to change data structure
2238 *		IOLOCK	     *lock - pointer to IOCTL lock
2239 *
2240 * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun)
2241 *
2242 */
2243static int
2244raid_change(
2245	md_raid_params_t	*mrp,
2246	IOLOCK			*lock
2247)
2248{
2249	minor_t		mnum = mrp->mnum;
2250	mr_unit_t	*un;
2251	int		ix;
2252	mddb_recid_t	recids[3] = {0, 0, 0};
2253	int		err;
2254	int		irecid;
2255	int		inc_new_hsp = 0;
2256
2257	mdclrerror(&mrp->mde);
2258
2259	if ((un = raid_getun(mnum, &mrp->mde, WR_LOCK, lock)) == NULL)
2260		return (0);
2261
2262	if (!mrp->params.change_hsp_id)
2263		return (0);
2264
2265	/* verify that no hotspare is in use */
2266	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
2267		if (HOTSPARED(un, ix)) {
2268			return (mdmderror(&mrp->mde, MDE_HS_IN_USE, mnum));
2269		}
2270	}
2271
2272	/* replace the hot spare pool */
2273
2274	irecid = 0;
2275	if (mrp->params.hsp_id != -1) {
2276		/* increment the reference count of the new hsp */
2277		err = md_hot_spare_ifc(HSP_INCREF, mrp->params.hsp_id, 0, 0,
2278		    &recids[0], NULL, NULL, NULL);
2279		if (err) {
2280			return (mdhsperror(&mrp->mde, MDE_INVAL_HSP,
2281			    mrp->params.hsp_id));
2282		}
2283		inc_new_hsp = 1;
2284		irecid++;
2285	}
2286
2287	if (un->un_hsp_id != -1) {
2288		/* decrement the reference count of the old hsp */
2289		err = md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
2290		    &recids[irecid], NULL, NULL, NULL);
2291		if (err) {
2292			err = mdhsperror(&mrp->mde, MDE_INVAL_HSP,
2293			    mrp->params.hsp_id);
2294			if (inc_new_hsp) {
2295				(void) md_hot_spare_ifc(HSP_DECREF,
2296				    mrp->params.hsp_id, 0, 0,
2297				    &recids[0], NULL, NULL, NULL);
2298				/*
2299				 * Don't need to commit the record,
2300				 * because it wasn't committed before
2301				 */
2302			}
2303			return (err);
2304		}
2305	}
2306
2307	un->un_hsp_id = mrp->params.hsp_id;
2308
2309	raid_commit(un, recids);
2310	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_CHANGE, SVM_TAG_METADEVICE,
2311	    MD_UN2SET(un), MD_SID(un));
2312
2313	/* Now trigger hot spare processing in case one is needed. */
2314	if ((un->un_hsp_id != -1) && (un->un_state == RUS_ERRED))
2315		(void) raid_hotspares();
2316
2317	return (0);
2318}
2319
2320/*
2321 * NAME:	raid_admin_ioctl
2322 * DESCRIPTION: IOCTL operations unique to metadevices and RAID
2323 * PARAMETERS:	int	  cmd - IOCTL command to be executed
2324 *		void	*data - pointer to IOCTL data structure
2325 *		int	 mode - either FREAD or FWRITE
2326 *		IOLOCK *lockp - IOCTL read/write lock
2327 *
2328 * LOCKS:	none
2329 *
2330 */
2331static int
2332raid_admin_ioctl(
2333	int		cmd,
2334	void		*data,
2335	int		mode,
2336	IOLOCK		*lockp
2337)
2338{
2339	size_t		sz = 0;
2340	void		*d = NULL;
2341	int		err = 0;
2342
2343	/* We can only handle 32-bit clients for internal commands */
2344	if ((mode & DATAMODEL_MASK) != DATAMODEL_ILP32) {
2345		return (EINVAL);
2346	}
2347
2348
2349	/* dispatch ioctl */
2350	switch (cmd) {
2351
2352	case MD_IOCSET:
2353	{
2354		if (! (mode & FWRITE))
2355			return (EACCES);
2356
2357		sz = sizeof (md_set_params_t);
2358		d = kmem_alloc(sz, KM_SLEEP);
2359
2360		if (ddi_copyin(data, d, sz, mode)) {
2361			err = EFAULT;
2362			break;
2363		}
2364
2365		err = raid_set(d, mode);
2366		break;
2367	}
2368
2369	case MD_IOCGET:
2370	{
2371		if (! (mode & FREAD))
2372			return (EACCES);
2373
2374		sz = sizeof (md_i_get_t);
2375		d = kmem_alloc(sz, KM_SLEEP);
2376
2377		if (ddi_copyin(data, d, sz, mode)) {
2378			err = EFAULT;
2379			break;
2380		}
2381
2382		err = raid_get(d, mode, lockp);
2383		break;
2384	}
2385
2386	case MD_IOCREPLACE:
2387	{
2388		if (! (mode & FWRITE))
2389			return (EACCES);
2390
2391		sz = sizeof (replace_params_t);
2392		d = kmem_alloc(sz, KM_SLEEP);
2393
2394		if (ddi_copyin(data, d, sz, mode)) {
2395			err = EFAULT;
2396			break;
2397		}
2398
2399		err = raid_replace((replace_params_t *)d, lockp);
2400		break;
2401	}
2402
2403	case MD_IOCSETSYNC:
2404	{
2405		if (! (mode & FWRITE))
2406			return (EACCES);
2407
2408		sz = sizeof (md_resync_ioctl_t);
2409		d = kmem_alloc(sz, KM_SLEEP);
2410
2411		if (ddi_copyin(data, d, sz, mode)) {
2412			err = EFAULT;
2413			break;
2414		}
2415
2416		err = raid_set_sync((md_resync_ioctl_t *)d, lockp);
2417		break;
2418	}
2419
2420	case MD_IOCGETSYNC:
2421	{
2422		if (! (mode & FREAD))
2423			return (EACCES);
2424
2425		sz = sizeof (md_resync_ioctl_t);
2426		d = kmem_alloc(sz, KM_SLEEP);
2427
2428		if (ddi_copyin(data, d, sz, mode)) {
2429			err = EFAULT;
2430			break;
2431		}
2432		err = raid_get_resync((md_resync_ioctl_t *)d, lockp);
2433
2434		break;
2435	}
2436
2437	case MD_IOCGROW:
2438	{
2439		if (! (mode & FWRITE))
2440			return (EACCES);
2441
2442		sz = sizeof (md_grow_params_t);
2443		d = kmem_alloc(sz, KM_SLEEP);
2444
2445		if (ddi_copyin(data, d, sz, mode)) {
2446			err = EFAULT;
2447			break;
2448		}
2449
2450		err = raid_grow(d, mode, lockp);
2451		break;
2452	}
2453
2454	case MD_IOCCHANGE:
2455	{
2456		if (! (mode & FWRITE))
2457			return (EACCES);
2458
2459		sz = sizeof (md_raid_params_t);
2460		d = kmem_alloc(sz, KM_SLEEP);
2461
2462		if (ddi_copyin(data, d, sz, mode)) {
2463			err = EFAULT;
2464			break;
2465		}
2466
2467		err = raid_change((md_raid_params_t *)d, lockp);
2468		break;
2469	}
2470
2471	case MD_IOCRESET:
2472	{
2473		if (! (mode & FWRITE))
2474			return (EACCES);
2475
2476		sz = sizeof (md_i_reset_t);
2477		d = kmem_alloc(sz, KM_SLEEP);
2478
2479		if (ddi_copyin(data, d, sz, mode)) {
2480			err = EFAULT;
2481			break;
2482		}
2483
2484		err = raid_reset((md_i_reset_t *)d);
2485		break;
2486	}
2487
2488	case MD_IOCGET_DEVS:
2489	{
2490		if (! (mode & FREAD))
2491			return (EACCES);
2492
2493		sz = sizeof (md_getdevs_params_t);
2494		d = kmem_alloc(sz, KM_SLEEP);
2495
2496		if (ddi_copyin(data, d, sz, mode)) {
2497			err = EFAULT;
2498			break;
2499		}
2500
2501		err = raid_getdevs(d, mode, lockp);
2502		break;
2503	}
2504
2505	case MD_IOCSETREGEN:
2506	{
2507		if (! (mode & FWRITE))
2508			return (EACCES);
2509
2510		sz = sizeof (md_regen_param_t);
2511		d = kmem_alloc(sz, KM_SLEEP);
2512
2513		if (ddi_copyin(data, d, sz, mode)) {
2514			err = EFAULT;
2515			break;
2516		}
2517
2518		err = raid_regen((md_regen_param_t *)d, lockp);
2519		break;
2520	}
2521
2522	case MD_IOCPROBE_DEV:
2523	{
2524		md_probedev_impl_t	*p = NULL;
2525		md_probedev_t		*ph = NULL;
2526		daemon_queue_t		*hdr = NULL;
2527		int			i;
2528		size_t			sz1 = 0;
2529
2530
2531		if (! (mode & FREAD))
2532			return (EACCES);
2533
2534		sz = sizeof (md_probedev_t);
2535
2536		d = kmem_alloc(sz, KM_SLEEP);
2537
2538		/* now copy in the data */
2539		if (ddi_copyin(data, d, sz, mode)) {
2540			err = EFAULT;
2541			goto free_mem;
2542		}
2543
2544		/*
2545		 * Sanity test the args. Test name should have the keyword
2546		 * probe.
2547		 */
2548		p = kmem_alloc(sizeof (md_probedev_impl_t), KM_SLEEP);
2549		p->probe_sema = NULL;
2550		p->probe_mx = NULL;
2551		p->probe.mnum_list = (uint64_t)NULL;
2552
2553		ph = (md_probedev_t *)d;
2554		p->probe.nmdevs = ph->nmdevs;
2555		(void) strcpy(p->probe.test_name, ph->test_name);
2556		bcopy(&ph->md_driver, &(p->probe.md_driver),
2557		    sizeof (md_driver_t));
2558
2559		if ((p->probe.nmdevs < 1) ||
2560		    (strstr(p->probe.test_name, "probe") == NULL)) {
2561			err = EINVAL;
2562			goto free_mem;
2563		}
2564
2565		sz1 = sizeof (minor_t) * p->probe.nmdevs;
2566
2567		p->probe.mnum_list = (uint64_t)(uintptr_t)kmem_alloc(sz1,
2568		    KM_SLEEP);
2569
2570		if (ddi_copyin((caddr_t)(uintptr_t)ph->mnum_list,
2571		    (caddr_t)(uintptr_t)p->probe.mnum_list, sz1, mode)) {
2572			err = EFAULT;
2573			goto free_mem;
2574		}
2575
2576		if (err = md_init_probereq(p, &hdr))
2577			goto free_mem;
2578
2579		/*
2580		 * put the request on the queue and wait.
2581		 */
2582
2583		daemon_request_new(&md_ff_daemonq, md_probe_one, hdr, REQ_NEW);
2584
2585		(void) IOLOCK_RETURN(0, lockp);
2586		/* wait for the events to occur */
2587		for (i = 0; i < p->probe.nmdevs; i++) {
2588			sema_p(PROBE_SEMA(p));
2589		}
2590		while (md_ioctl_lock_enter() == EINTR)
2591			;
2592
2593		/*
2594		 * clean up. The hdr list is freed in the probe routines
2595		 * since the list is NULL by the time we get here.
2596		 */
2597free_mem:
2598		if (p) {
2599			if (p->probe_sema != NULL) {
2600				sema_destroy(PROBE_SEMA(p));
2601				kmem_free(p->probe_sema, sizeof (ksema_t));
2602			}
2603			if (p->probe_mx != NULL) {
2604				mutex_destroy(PROBE_MX(p));
2605				kmem_free(p->probe_mx, sizeof (kmutex_t));
2606			}
2607			if (p->probe.mnum_list)
2608				kmem_free((caddr_t)(uintptr_t)
2609				    p->probe.mnum_list, sz1);
2610
2611			kmem_free(p, sizeof (md_probedev_impl_t));
2612		}
2613		break;
2614	}
2615
2616	default:
2617		return (ENOTTY);
2618	}
2619
2620	/*
2621	 * copyout and free any args
2622	 */
2623	if (sz != 0) {
2624		if (err == 0) {
2625			if (ddi_copyout(d, data, sz, mode) != 0) {
2626				err = EFAULT;
2627			}
2628		}
2629		kmem_free(d, sz);
2630	}
2631	return (err);
2632}
2633
2634/*
2635 * NAME:	md_raid_ioctl
2636 * DESCRIPTION: RAID metadevice IOCTL operations entry point.
2637 * PARAMETERS:	md_dev64_t dev - RAID device identifier
2638 *		int	  cmd  - IOCTL command to be executed
2639 *		void	*data  - pointer to IOCTL data structure
2640 *		int	 mode  - either FREAD or FWRITE
2641 *		IOLOCK *lockp  - IOCTL read/write lock
2642 *
2643 * LOCKS:	none
2644 *
2645 */
2646int
2647md_raid_ioctl(
2648	dev_t		dev,
2649	int		cmd,
2650	void		*data,
2651	int		mode,
2652	IOLOCK		*lockp
2653)
2654{
2655	minor_t		mnum = getminor(dev);
2656	mr_unit_t	*un;
2657	int		err = 0;
2658
2659	/* handle admin ioctls */
2660	if (mnum == MD_ADM_MINOR)
2661		return (raid_admin_ioctl(cmd, data, mode, lockp));
2662
2663	/* check unit */
2664	if ((MD_MIN2SET(mnum) >= md_nsets) ||
2665	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
2666	    ((un = MD_UNIT(mnum)) == NULL))
2667		return (ENXIO);
2668
2669	/* is this a supported ioctl? */
2670	err = md_check_ioctl_against_unit(cmd, un->c);
2671	if (err != 0) {
2672		return (err);
2673	}
2674
2675	/* dispatch ioctl */
2676	switch (cmd) {
2677
2678	case DKIOCINFO:
2679	{
2680		struct dk_cinfo *p;
2681
2682		if (! (mode & FREAD))
2683			return (EACCES);
2684
2685		p = kmem_alloc(sizeof (*p), KM_SLEEP);
2686
2687		get_info(p, mnum);
2688		if (ddi_copyout((caddr_t)p, data, sizeof (*p), mode) != 0)
2689			err = EFAULT;
2690
2691		kmem_free(p, sizeof (*p));
2692		return (err);
2693	}
2694
2695	case DKIOCGMEDIAINFO:
2696	{
2697		struct dk_minfo	p;
2698
2699		if (! (mode & FREAD))
2700			return (EACCES);
2701
2702		get_minfo(&p, mnum);
2703		if (ddi_copyout(&p, data, sizeof (struct dk_minfo), mode) != 0)
2704			err = EFAULT;
2705
2706		return (err);
2707	}
2708
2709	case DKIOCGGEOM:
2710	{
2711		struct dk_geom	*p;
2712
2713		if (! (mode & FREAD))
2714			return (EACCES);
2715
2716		p = kmem_alloc(sizeof (*p), KM_SLEEP);
2717
2718		if ((err = raid_get_geom(un, p)) == 0) {
2719			if (ddi_copyout((caddr_t)p, data, sizeof (*p),
2720			    mode) != 0)
2721				err = EFAULT;
2722		}
2723
2724		kmem_free(p, sizeof (*p));
2725		return (err);
2726	}
2727
2728	case DKIOCGVTOC:
2729	{
2730		struct vtoc	*vtoc;
2731
2732		if (! (mode & FREAD))
2733			return (EACCES);
2734
2735		vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP);
2736		if ((err = raid_get_vtoc(un, vtoc)) != 0) {
2737			kmem_free(vtoc, sizeof (*vtoc));
2738			return (err);
2739		}
2740
2741		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2742			if (ddi_copyout(vtoc, data, sizeof (*vtoc), mode))
2743				err = EFAULT;
2744		}
2745#ifdef _SYSCALL32
2746		else {
2747			struct vtoc32	*vtoc32;
2748
2749			vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP);
2750
2751			vtoctovtoc32((*vtoc), (*vtoc32));
2752			if (ddi_copyout(vtoc32, data, sizeof (*vtoc32), mode))
2753				err = EFAULT;
2754			kmem_free(vtoc32, sizeof (*vtoc32));
2755		}
2756#endif /* _SYSCALL32 */
2757
2758		kmem_free(vtoc, sizeof (*vtoc));
2759		return (err);
2760	}
2761
2762	case DKIOCSVTOC:
2763	{
2764		struct vtoc	*vtoc;
2765
2766		if (! (mode & FWRITE))
2767			return (EACCES);
2768
2769		vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP);
2770		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2771			if (ddi_copyin(data, vtoc, sizeof (*vtoc), mode)) {
2772				err = EFAULT;
2773			}
2774		}
2775#ifdef _SYSCALL32
2776		else {
2777			struct vtoc32	*vtoc32;
2778
2779			vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP);
2780
2781			if (ddi_copyin(data, vtoc32, sizeof (*vtoc32), mode)) {
2782				err = EFAULT;
2783			} else {
2784				vtoc32tovtoc((*vtoc32), (*vtoc));
2785			}
2786			kmem_free(vtoc32, sizeof (*vtoc32));
2787		}
2788#endif /* _SYSCALL32 */
2789
2790		if (err == 0)
2791			err = raid_set_vtoc(un, vtoc);
2792
2793		kmem_free(vtoc, sizeof (*vtoc));
2794		return (err);
2795	}
2796
2797	case DKIOCGEXTVTOC:
2798	{
2799		struct extvtoc	*extvtoc;
2800
2801		if (! (mode & FREAD))
2802			return (EACCES);
2803
2804		extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP);
2805		if ((err = raid_get_extvtoc(un, extvtoc)) != 0) {
2806			kmem_free(extvtoc, sizeof (*extvtoc));
2807			return (err);
2808		}
2809
2810		if (ddi_copyout(extvtoc, data, sizeof (*extvtoc), mode))
2811			err = EFAULT;
2812
2813		kmem_free(extvtoc, sizeof (*extvtoc));
2814		return (err);
2815	}
2816
2817	case DKIOCSEXTVTOC:
2818	{
2819		struct extvtoc	*extvtoc;
2820
2821		if (! (mode & FWRITE))
2822			return (EACCES);
2823
2824		extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP);
2825		if (ddi_copyin(data, extvtoc, sizeof (*extvtoc), mode)) {
2826			err = EFAULT;
2827		}
2828
2829		if (err == 0)
2830			err = raid_set_extvtoc(un, extvtoc);
2831
2832		kmem_free(extvtoc, sizeof (*extvtoc));
2833		return (err);
2834	}
2835
2836	case DKIOCGAPART:
2837	{
2838		struct dk_map	dmp;
2839
2840		if ((err = raid_get_cgapart(un, &dmp)) != 0) {
2841			return (err);
2842		}
2843
2844		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2845			if (ddi_copyout((caddr_t)&dmp, data, sizeof (dmp),
2846			    mode) != 0)
2847				err = EFAULT;
2848		}
2849#ifdef _SYSCALL32
2850		else {
2851			struct dk_map32 dmp32;
2852
2853			dmp32.dkl_cylno = dmp.dkl_cylno;
2854			dmp32.dkl_nblk = dmp.dkl_nblk;
2855
2856			if (ddi_copyout((caddr_t)&dmp32, data, sizeof (dmp32),
2857			    mode) != 0)
2858				err = EFAULT;
2859		}
2860#endif /* _SYSCALL32 */
2861
2862		return (err);
2863	}
2864	case DKIOCGETEFI:
2865	{
2866		/*
2867		 * This one can be done centralized,
2868		 * no need to put in the same code for all types of metadevices
2869		 */
2870		return (md_dkiocgetefi(mnum, data, mode));
2871	}
2872
2873	case DKIOCSETEFI:
2874	{
2875		/*
2876		 * This one can be done centralized,
2877		 * no need to put in the same code for all types of metadevices
2878		 */
2879		return (md_dkiocsetefi(mnum, data, mode));
2880	}
2881
2882	case DKIOCPARTITION:
2883	{
2884		return (md_dkiocpartition(mnum, data, mode));
2885	}
2886
2887	default:
2888		return (ENOTTY);
2889	}
2890}
2891
2892/*
2893 * rename/exchange named service entry points and support functions follow.
2894 * Most functions are handled generically, except for raid-specific locking
2895 * and checking
2896 */
2897
2898/*
2899 * NAME:	raid_may_renexch_self
2900 * DESCRIPTION: support routine for rename check ("MDRNM_CHECK") named service
2901 * PARAMETERS:	mr_unit_t	*un - unit struct of raid unit to be renamed
2902 *		mdi_unit_t	*ui - in-core unit struct of same raid unit
2903 *		md_rentxn_t	*rtxnp - rename transaction state
2904 *
2905 * LOCKS:	none
2906 *
2907 */
2908static int
2909raid_may_renexch_self(
2910	mr_unit_t	*un,
2911	mdi_unit_t	*ui,
2912	md_rentxn_t	*rtxnp)
2913{
2914	minor_t	from_min;
2915	minor_t	to_min;
2916	bool_t	toplevel;
2917	bool_t	related;
2918
2919	from_min = rtxnp->from.mnum;
2920	to_min = rtxnp->to.mnum;
2921
2922	if (!un || !ui) {
2923		(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
2924		    from_min);
2925		return (EINVAL);
2926	}
2927
2928	ASSERT(!(MD_CAPAB(un) & MD_CAN_META_CHILD));
2929	if (MD_CAPAB(un) & MD_CAN_META_CHILD) {
2930		(void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
2931		return (EINVAL);
2932	}
2933
2934	if (MD_PARENT(un) == MD_MULTI_PARENT) {
2935		(void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
2936		return (EINVAL);
2937	}
2938
2939	toplevel = !MD_HAS_PARENT(MD_PARENT(un));
2940
2941	/* we're related if trying to swap with our parent */
2942	related = (!toplevel) && (MD_PARENT(un) == to_min);
2943
2944	switch (rtxnp->op) {
2945	case MDRNOP_EXCHANGE:
2946
2947		if (!related) {
2948			(void) mdmderror(&rtxnp->mde,
2949			    MDE_RENAME_TARGET_UNRELATED, to_min);
2950			return (EINVAL);
2951		}
2952
2953		break;
2954
2955	case MDRNOP_RENAME:
2956		/*
2957		 * if from is top-level and is open, then the kernel is using
2958		 * the md_dev64_t.
2959		 */
2960
2961		if (toplevel && md_unit_isopen(ui)) {
2962			(void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
2963			    from_min);
2964			return (EBUSY);
2965		}
2966		break;
2967
2968	default:
2969		(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
2970		    from_min);
2971		return (EINVAL);
2972	}
2973
2974	return (0);	/* ok */
2975}
2976
2977/*
2978 * NAME:	raid_rename_check
2979 * DESCRIPTION: ("MDRNM_CHECK") rename/exchange named service entry point
2980 * PARAMETERS:	md_rendelta_t	*delta - describes changes to be made to this
2981 *					 raid device for rename transaction
2982 *		md_rentxn_t	*rtxnp - rename transaction state
2983 *
2984 * LOCKS:	none
2985 *
2986 */
2987intptr_t
2988raid_rename_check(
2989	md_rendelta_t	*delta,
2990	md_rentxn_t	*rtxnp)
2991{
2992	int		 err	= 0;
2993	int		 column;
2994	mr_unit_t	*un;
2995
2996	ASSERT(delta);
2997	ASSERT(rtxnp);
2998	ASSERT(delta->unp);
2999	ASSERT(delta->uip);
3000
3001	if (!delta || !rtxnp || !delta->unp || !delta->uip) {
3002		(void) mdsyserror(&rtxnp->mde, EINVAL);
3003		return (EINVAL);
3004	}
3005
3006	un = (mr_unit_t *)delta->unp;
3007
3008	for (column = 0; column < un->un_totalcolumncnt; column++) {
3009		rcs_state_t	colstate;
3010
3011		colstate = un->un_column[column].un_devstate;
3012
3013		if (colstate & RCS_LAST_ERRED) {
3014			(void) mdmderror(&rtxnp->mde, MDE_RAID_LAST_ERRED,
3015			    md_getminor(delta->dev));
3016			return (EINVAL);
3017		}
3018
3019		if (colstate & RCS_INIT_ERRED) {
3020			(void) mdmderror(&rtxnp->mde, MDE_RAID_DOI,
3021			    md_getminor(delta->dev));
3022			return (EINVAL);
3023		}
3024
3025		/* How did we get this far before detecting this? */
3026		if (colstate & RCS_RESYNC) {
3027			(void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
3028			    md_getminor(delta->dev));
3029			return (EBUSY);
3030		}
3031
3032		if (colstate & RCS_ERRED) {
3033			(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
3034			    md_getminor(delta->dev));
3035			return (EINVAL);
3036		}
3037
3038		if (!(colstate & RCS_OKAY)) {
3039			(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
3040			    md_getminor(delta->dev));
3041			return (EINVAL);
3042		}
3043
3044		if (HOTSPARED(un, column)) {
3045			(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
3046			    md_getminor(delta->dev));
3047			return (EINVAL);
3048		}
3049	}
3050
3051	/* self does additional checks */
3052	if (delta->old_role == MDRR_SELF) {
3053		err = raid_may_renexch_self((mr_unit_t *)delta->unp,
3054		    delta->uip, rtxnp);
3055	}
3056	return (err);
3057}
3058
3059/*
3060 * NAME:	raid_rename_lock
3061 * DESCRIPTION: ("MDRNM_LOCK") rename/exchange named service entry point
3062 * PARAMETERS:	md_rendelta_t	*delta - describes changes to be made to this
3063 *					 raid device for rename transaction
3064 *		md_rentxn_t	*rtxnp - rename transaction state
3065 *
3066 * LOCKS:	io and unit locks (taken explicitly *not* via ioctl wrappers)
3067 *
3068 */
3069intptr_t
3070raid_rename_lock(
3071	md_rendelta_t	*delta,
3072	md_rentxn_t	*rtxnp)
3073{
3074	minor_t		mnum;
3075
3076	ASSERT(delta);
3077	ASSERT(rtxnp);
3078
3079	mnum = md_getminor(delta->dev);
3080	if (mnum == rtxnp->to.mnum && rtxnp->op == MDRNOP_RENAME) {
3081		return (0);
3082	}
3083
3084	ASSERT(delta->uip);
3085	if (!delta->uip) {
3086		(void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, mnum);
3087		return (ENODEV);
3088	}
3089
3090	ASSERT(delta->unp);
3091	if (!delta->unp) {
3092
3093		return (ENODEV);
3094	}
3095
3096	ASSERT(!IO_WRITER_HELD(delta->unp));
3097	(void) md_io_writerlock(delta->uip);
3098	ASSERT(IO_WRITER_HELD(delta->unp));
3099
3100
3101	ASSERT(!UNIT_WRITER_HELD(delta->unp));
3102	(void) md_unit_writerlock(delta->uip);
3103	ASSERT(UNIT_WRITER_HELD(delta->unp));
3104
3105	return (0);
3106}
3107
3108/*
3109 * NAME:	raid_rename_unlock
3110 * DESCRIPTION: ("MDRNM_UNLOCK") rename/exchange named service entry point
3111 * PARAMETERS:	md_rendelta_t	*delta - describes changes to be made to this
3112 *					 raid device for rename transaction
3113 *		md_rentxn_t	*rtxnp - rename transaction state
3114 *
3115 * LOCKS:	drops io and unit locks
3116 *
3117 */
3118/* ARGSUSED */
3119void
3120raid_rename_unlock(
3121	md_rendelta_t	*delta,
3122	md_rentxn_t	*rtxnp)
3123{
3124	mr_unit_t	*un = (mr_unit_t *)delta->unp;
3125	minor_t		mnum = MD_SID(un);
3126	int		col;
3127
3128	ASSERT(delta);
3129	ASSERT(delta->unp);
3130	ASSERT(delta->uip);
3131
3132	ASSERT(UNIT_WRITER_HELD(delta->unp));
3133	md_unit_writerexit(delta->uip);
3134	ASSERT(!UNIT_WRITER_HELD(delta->unp));
3135
3136	if (! (delta->txn_stat.role_swapped) || ! (delta->txn_stat.is_open)) {
3137		goto out;
3138	}
3139	if (raid_internal_open(mnum, (FREAD | FWRITE),
3140	    OTYP_LYR, MD_OFLG_ISINIT) == 0) {
3141		for (col = 0; col < un->un_totalcolumncnt; col++) {
3142			if (un->un_column[col].un_devstate & RCS_OKAY)
3143				(void) init_pw_area(un,
3144				    un->un_column[col].un_dev,
3145				    un->un_column[col].un_pwstart, col);
3146		}
3147		(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
3148	}
3149
3150out:
3151	ASSERT(IO_WRITER_HELD(delta->unp));
3152	md_io_writerexit(delta->uip);
3153	ASSERT(!IO_WRITER_HELD(delta->unp));
3154}
3155/* end of rename/exchange named service and support functions */
3156