/* md.c revision 7656:2621e50fdf4a */
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * Md - is the meta-disk driver.   It sits below the UFS file system
29 * but above the 'real' disk drivers, xy, id, sd etc.
30 *
31 * To the UFS software, md looks like a normal driver, since it has
32 * the normal kinds of entries in the bdevsw and cdevsw arrays. So
33 * UFS accesses md in the usual ways.  In particular, the strategy
34 * routine, mdstrategy(), gets called by fbiwrite(), ufs_getapage(),
35 * and ufs_writelbn().
36 *
37 * Md maintains an array of minor devices (meta-partitions).   Each
38 * meta partition stands for a matrix of real partitions, in rows
39 * which are not necessarily of equal length.	Md maintains a table,
40 * with one entry for each meta-partition,  which lists the rows and
41 * columns of actual partitions, and the job of the strategy routine
42 * is to translate from the meta-partition device and block numbers
43 * known to UFS into the actual partitions' device and block numbers.
44 *
45 * See below, in mdstrategy(), mdreal(), and mddone() for details of
46 * this translation.
47 */
48
49/*
50 * Driver for Virtual Disk.
51 */
52
53#include <sys/user.h>
54#include <sys/sysmacros.h>
55#include <sys/conf.h>
56#include <sys/stat.h>
57#include <sys/errno.h>
58#include <sys/param.h>
59#include <sys/systm.h>
60#include <sys/file.h>
61#include <sys/open.h>
62#include <sys/dkio.h>
63#include <sys/vtoc.h>
64#include <sys/cmn_err.h>
65#include <sys/ddi.h>
66#include <sys/sunddi.h>
67#include <sys/debug.h>
68#include <sys/utsname.h>
69#include <sys/lvm/mdvar.h>
70#include <sys/lvm/md_names.h>
71#include <sys/lvm/md_mddb.h>
72#include <sys/lvm/md_sp.h>
73#include <sys/types.h>
74#include <sys/kmem.h>
75#include <sys/cladm.h>
76#include <sys/priv_names.h>
77#include <sys/modhash.h>
78
79#ifndef	lint
80char 		_depends_on[] = "strmod/rpcmod";
81#endif	/* lint */
82int		md_init_debug	= 0;	/* module binding debug */
83
84/*
85 * Tunable to turn off the failfast behavior.
86 */
87int		md_ff_disable = 0;
88
89/*
90 * dynamically allocated list of non FF driver names - needs to
91 * be freed when md is detached.
92 */
93char	**non_ff_drivers = NULL;
94
95md_krwlock_t	md_unit_array_rw;	/* protects all unit arrays */
96md_krwlock_t	nm_lock;		/* protects all the name spaces */
97
98md_resync_t	md_cpr_resync;
99
100extern char	svm_bootpath[];
101#define	SVM_PSEUDO_STR	"/pseudo/md@0:"
102
103#define		VERSION_LENGTH	6
104#define		VERSION		"1.0"
105
106/*
107 * Keep track of possible 'orphan' entries in the name space
108 */
109int		*md_nm_snarfed = NULL;
110
111/*
112 * Global tunable giving the percentage of free space left in replica during
113 * conversion of non-devid style replica to devid style replica.
114 */
115int		md_conv_perc = MDDB_DEVID_CONV_PERC;
116
117#ifdef	DEBUG
118/* debug code to verify framework exclusion guarantees */
119int		md_in;
120kmutex_t	md_in_mx;			/* used to md global stuff */
121#define	IN_INIT		0x01
122#define	IN_FINI		0x02
123#define	IN_ATTACH	0x04
124#define	IN_DETACH	0x08
125#define	IN_OPEN		0x10
126#define	MD_SET_IN(x) {						\
127	mutex_enter(&md_in_mx);					\
128	if (md_in)						\
129		debug_enter("MD_SET_IN exclusion lost");	\
130	if (md_in & x)						\
131		debug_enter("MD_SET_IN already set");		\
132	md_in |= x;						\
133	mutex_exit(&md_in_mx);					\
134}
135
136#define	MD_CLR_IN(x) {						\
137	mutex_enter(&md_in_mx);					\
138	if (md_in & ~(x))					\
139		debug_enter("MD_CLR_IN exclusion lost");	\
140	if (!(md_in & x))					\
141		debug_enter("MD_CLR_IN already clr");		\
142	md_in &= ~x;						\
143	mutex_exit(&md_in_mx);					\
144}
145#else	/* DEBUG */
146#define	MD_SET_IN(x)
147#define	MD_CLR_IN(x)
148#endif	/* DEBUG */
149hrtime_t savetime1, savetime2;
150
151
152/*
153 * list things protected by md_mx even if they aren't
154 * used in this file.
155 */
156kmutex_t	md_mx;			/* used to md global stuff */
157kcondvar_t	md_cv;			/* md_status events */
158int		md_status = 0;		/* global status for the meta-driver */
159int		md_num_daemons = 0;
160int		md_ioctl_cnt = 0;
161int		md_mtioctl_cnt = 0;	/* multithreaded ioctl cnt */
162uint_t		md_mdelay = 10;		/* variable so can be patched */
163
164int		(*mdv_strategy_tstpnt)(buf_t *, int, void*);
165
166major_t		md_major, md_major_targ;
167
168unit_t		md_nunits = MD_MAXUNITS;
169set_t		md_nsets = MD_MAXSETS;
170int		md_nmedh = 0;
171char		*md_med_trans_lst = NULL;
172md_set_t	md_set[MD_MAXSETS];
173md_set_io_t	md_set_io[MD_MAXSETS];
174
175md_krwlock_t	hsp_rwlp;		/* protects hot_spare_interface */
176md_krwlock_t	ni_rwlp;		/* protects notify_interface */
177md_ops_t	**md_ops = NULL;
178ddi_modhandle_t	*md_mods = NULL;
179md_ops_t	*md_opslist;
180clock_t		md_hz;
181md_event_queue_t	*md_event_queue = NULL;
182
183int		md_in_upgrade;
184int		md_keep_repl_state;
185int		md_devid_destroy;
186
187/* for sending messages thru a door to userland */
188door_handle_t	mdmn_door_handle = NULL;
189int		mdmn_door_did = -1;
190
191dev_info_t		*md_devinfo = NULL;
192
193md_mn_nodeid_t	md_mn_mynode_id = ~0u;	/* My node id (for multi-node sets) */
194
195static	uint_t		md_ocnt[OTYPCNT];
196
197static int		mdinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
198static int		mdattach(dev_info_t *, ddi_attach_cmd_t);
199static int		mddetach(dev_info_t *, ddi_detach_cmd_t);
200static int		mdopen(dev_t *, int, int, cred_t *);
201static int		mdclose(dev_t, int, int, cred_t *);
202static int		mddump(dev_t, caddr_t, daddr_t, int);
203static int		mdread(dev_t, struct uio *, cred_t *);
204static int		mdwrite(dev_t, struct uio *, cred_t *);
205static int		mdaread(dev_t, struct aio_req *, cred_t *);
206static int		mdawrite(dev_t, struct aio_req *, cred_t *);
207static int		mdioctl(dev_t, int, intptr_t, int, cred_t *, int *);
208static int		mdprop_op(dev_t, dev_info_t *,
209				ddi_prop_op_t, int, char *, caddr_t, int *);
210
/*
 * Character/block driver entry points for md.  Entry points that md
 * does not implement (print, devmap, mmap, segmap) are stubbed with
 * nulldev/nodev cast to the appropriate signature.  Installed in
 * md_devops below.
 */
static struct cb_ops md_cb_ops = {
	mdopen,			/* open */
	mdclose,		/* close */
	mdstrategy,		/* strategy */
				/* print routine -- none yet */
	(int(*)(dev_t, char *))nulldev,
	mddump,			/* dump */
	mdread,			/* read */
	mdwrite,		/* write */
	mdioctl,		/* ioctl */
				/* devmap */
	(int(*)(dev_t, devmap_cookie_t, offset_t, size_t, size_t *,
			uint_t))nodev,
				/* mmap */
	(int(*)(dev_t, off_t, int))nodev,
				/* segmap */
	(int(*)(dev_t, off_t, struct as *, caddr_t *, off_t, unsigned,
		unsigned, unsigned, cred_t *))nodev,
	nochpoll,		/* poll */
	mdprop_op,		/* prop_op */
	0,			/* streamtab */
	(D_64BIT|D_MP|D_NEW),	/* driver compatibility flag */
	CB_REV,			/* cb_ops version */
	mdaread,		/* aread */
	mdawrite,		/* awrite */
};
237
/*
 * Device operations for the md pseudo-driver; points the framework at
 * the attach/detach/info routines below and at md_cb_ops above.
 */
static struct dev_ops md_devops = {
	DEVO_REV,		/* dev_ops version */
	0,			/* device reference count */
	mdinfo,			/* info routine */
	nulldev,		/* identify routine */
	nulldev,		/* probe - not defined */
	mdattach,		/* attach routine */
	mddetach,		/* detach routine */
	nodev,			/* reset - not defined */
	&md_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev,			/* power management */
	ddi_quiesce_not_needed,		/* quiesce */
};
252
253/*
254 * loadable module wrapper
255 */
256#include <sys/modctl.h>
257
/* Loadable-module linkage: a single driver module (md_devops). */
static struct modldrv modldrv = {
	&mod_driverops,			/* type of module -- a pseudodriver */
	"Solaris Volume Manager base module", /* name of the module */
	&md_devops,			/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};
269
270
271/* md_medd.c */
272extern	void	med_init(void);
273extern	void	med_fini(void);
274extern  void	md_devid_cleanup(set_t, uint_t);
275
276/* md_names.c */
277extern void			*lookup_entry(struct nm_next_hdr *, set_t,
278					side_t, mdkey_t, md_dev64_t, int);
279extern struct nm_next_hdr	*get_first_record(set_t, int, int);
280extern int			remove_entry(struct nm_next_hdr *,
281					side_t, mdkey_t, int);
282
283int		md_maxphys	= 0;	/* maximum io size in bytes */
284#define		MD_MAXBCOUNT	(1024 * 1024)
285unsigned	md_maxbcount	= 0;	/* maximum physio size in bytes */
286
287/*
288 * Some md ioctls trigger io framework device tree operations.  An
289 * example is md ioctls that call md_resolve_bydevid(): which uses the
290 * io framework to resolve a devid. Such operations result in acquiring
291 * io framework locks (like ndi_devi_enter() of "/") while holding
292 * driver locks (like md_unit_writerlock()).
293 *
294 * The prop_op(9E) entry point is called from the devinfo driver with
295 * an active ndi_devi_enter of "/". To avoid deadlock, md's prop_op
296 * implementation must avoid taking a lock that is held per above md
297 * ioctl description: i.e. mdprop_op(9E) can't call md_unit_readerlock()
298 * without risking deadlock.
299 *
300 * To service "size" requests without risking deadlock, we maintain a
301 * "mnum->nblocks" sizemap (protected by a short-term global mutex).
302 */
303static kmutex_t		md_nblocks_mutex;
304static mod_hash_t	*md_nblocksmap;		/* mnum -> nblocks */
305int			md_nblocksmap_size = 512;
306
307/*
308 * Maintain "mnum->nblocks" sizemap for mdprop_op use:
309 *
310 * Create: any code that establishes a unit's un_total_blocks needs the
311 * following type of call to establish nblocks for mdprop_op():
312 *	md_nblocks_set(mnum, un->c.un_total_blocks);"
313 *	NOTE: locate via cscope md_create_minor_node/md_create_unit_incore
314 *		...or  "MD_UNIT..*="
315 *
316 * Change: any code that changes a unit's un_total_blocks needs the
317 * following type of call to sync nblocks for mdprop_op():
318 *	md_nblocks_set(mnum, un->c.un_total_blocks);"
319 *	NOTE: locate via cscope for "un_total_blocks[ \t]*="
320 *
321 * Destroy: any code that deletes a unit needs the following type of call
322 * to sync nblocks for mdprop_op():
323 *	md_nblocks_set(mnum, -1ULL);
324 *	NOTE: locate via cscope md_remove_minor_node/md_destroy_unit_incore
325 *		...or  "MD_UNIT..*="
326 */
327void
328md_nblocks_set(minor_t mnum, uint64_t nblocks)
329{
330	mutex_enter(&md_nblocks_mutex);
331	if (nblocks == -1ULL)
332		(void) mod_hash_destroy(md_nblocksmap,
333		    (mod_hash_key_t)(intptr_t)mnum);
334	else
335		(void) mod_hash_replace(md_nblocksmap,
336		    (mod_hash_key_t)(intptr_t)mnum,
337		    (mod_hash_val_t)(intptr_t)nblocks);
338	mutex_exit(&md_nblocks_mutex);
339}
340
341/* get the size of a mnum from "mnum->nblocks" sizemap */
342uint64_t
343md_nblocks_get(minor_t mnum)
344{
345	mod_hash_val_t	hv;
346
347	mutex_enter(&md_nblocks_mutex);
348	if (mod_hash_find(md_nblocksmap,
349	    (mod_hash_key_t)(intptr_t)mnum, &hv) == 0) {
350		mutex_exit(&md_nblocks_mutex);
351		return ((uint64_t)(intptr_t)hv);
352	}
353	mutex_exit(&md_nblocks_mutex);
354	return (0);
355}
356
357/* allocate/free dynamic space associated with driver globals */
358void
359md_global_alloc_free(int alloc)
360{
361	set_t	s;
362
363	if (alloc) {
364		/* initialize driver global locks */
365		cv_init(&md_cv, NULL, CV_DEFAULT, NULL);
366		mutex_init(&md_mx, NULL, MUTEX_DEFAULT, NULL);
367		rw_init(&md_unit_array_rw.lock, NULL, RW_DEFAULT, NULL);
368		rw_init(&nm_lock.lock, NULL, RW_DEFAULT, NULL);
369		rw_init(&ni_rwlp.lock, NULL, RW_DRIVER, NULL);
370		rw_init(&hsp_rwlp.lock, NULL, RW_DRIVER, NULL);
371		mutex_init(&md_cpr_resync.md_resync_mutex, NULL,
372		    MUTEX_DEFAULT, NULL);
373		mutex_init(&md_nblocks_mutex, NULL, MUTEX_DEFAULT, NULL);
374
375		/* initialize per set driver global locks */
376		for (s = 0; s < MD_MAXSETS; s++) {
377			/* initialize per set driver globals locks */
378			mutex_init(&md_set[s].s_dbmx,
379			    NULL, MUTEX_DEFAULT, NULL);
380			mutex_init(&md_set_io[s].md_io_mx,
381			    NULL, MUTEX_DEFAULT, NULL);
382			cv_init(&md_set_io[s].md_io_cv,
383			    NULL, CV_DEFAULT, NULL);
384		}
385	} else {
386		/* destroy per set driver global locks */
387		for (s = 0; s < MD_MAXSETS; s++) {
388			cv_destroy(&md_set_io[s].md_io_cv);
389			mutex_destroy(&md_set_io[s].md_io_mx);
390			mutex_destroy(&md_set[s].s_dbmx);
391		}
392
393		/* destroy driver global locks */
394		mutex_destroy(&md_nblocks_mutex);
395		mutex_destroy(&md_cpr_resync.md_resync_mutex);
396		rw_destroy(&hsp_rwlp.lock);
397		rw_destroy(&ni_rwlp.lock);
398		rw_destroy(&nm_lock.lock);
399		rw_destroy(&md_unit_array_rw.lock);
400		mutex_destroy(&md_mx);
401		cv_destroy(&md_cv);
402	}
403}
404
/*
 * Loadable-module entry point: allocate the driver-global locks,
 * initialize tunables that have not already been patched, and install
 * the module.  On mod_install() failure the global allocations are
 * undone before returning the error.
 */
int
_init(void)
{
	set_t	s;
	int	err;

	MD_SET_IN(IN_INIT);

	/* allocate dynamic space associated with driver globals */
	md_global_alloc_free(1);

	/* initialize driver globals */
	md_major = ddi_name_to_major("md");
	md_hz = drv_usectohz(NUM_USEC_IN_SEC);

	/* initialize tunable globals */
	if (md_maxphys == 0)		/* maximum io size in bytes */
		md_maxphys = maxphys;
	if (md_maxbcount == 0)		/* maximum physio size in bytes */
		md_maxbcount = MD_MAXBCOUNT;

	/* initialize per set driver globals */
	for (s = 0; s < MD_MAXSETS; s++)
		md_set_io[s].io_state = MD_SET_ACTIVE;

	/*
	 * NOTE: the framework does not currently guarantee exclusion
	 * between _init and attach after calling mod_install.
	 */
	MD_CLR_IN(IN_INIT);
	if ((err = mod_install(&modlinkage))) {
		/* install failed: free the dynamic space allocated above */
		MD_SET_IN(IN_INIT);
		md_global_alloc_free(0);	/* free dynamic space */
		MD_CLR_IN(IN_INIT);
	}
	return (err);
}
442
443int
444_fini(void)
445{
446	int	err;
447
448	/*
449	 * NOTE: the framework currently does not guarantee exclusion
450	 * with attach until after mod_remove returns 0.
451	 */
452	if ((err = mod_remove(&modlinkage)))
453		return (err);
454
455	MD_SET_IN(IN_FINI);
456	md_global_alloc_free(0);	/* free dynamic space */
457	MD_CLR_IN(IN_FINI);
458	return (err);
459}
460
461int
462_info(struct modinfo *modinfop)
463{
464	return (mod_info(&modlinkage, modinfop));
465}
466
467/* ARGSUSED */
468static int
469mdattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
470{
471	int	len;
472	unit_t	i;
473	size_t	sz;
474	char	ver[VERSION_LENGTH];
475	char	**maj_str_array;
476	char	*str, *str2;
477
478	MD_SET_IN(IN_ATTACH);
479	md_in_upgrade = 0;
480	md_keep_repl_state = 0;
481	md_devid_destroy = 0;
482
483	if (cmd != DDI_ATTACH) {
484		MD_CLR_IN(IN_ATTACH);
485		return (DDI_FAILURE);
486	}
487
488	if (md_devinfo != NULL) {
489		MD_CLR_IN(IN_ATTACH);
490		return (DDI_FAILURE);
491	}
492
493	mddb_init();
494
495	if (md_start_daemons(TRUE)) {
496		MD_CLR_IN(IN_ATTACH);
497		mddb_unload();		/* undo mddb_init() allocations */
498		return (DDI_FAILURE);
499	}
500
501	/* clear the halted state */
502	md_clr_status(MD_GBL_HALTED);
503
504	/* see if the diagnostic switch is on */
505	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
506	    DDI_PROP_DONTPASS, "md_init_debug", 0))
507		md_init_debug++;
508
509	/* see if the failfast disable switch is on */
510	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
511	    DDI_PROP_DONTPASS, "md_ff_disable", 0))
512		md_ff_disable++;
513
514	/* try and get the md_nmedh property */
515	md_nmedh = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
516	    DDI_PROP_DONTPASS, "md_nmedh", MED_DEF_HOSTS);
517	if ((md_nmedh <= 0) || (md_nmedh > MED_MAX_HOSTS))
518		md_nmedh = MED_DEF_HOSTS;
519
520	/* try and get the md_med_trans_lst property */
521	len = 0;
522	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN,
523	    0, "md_med_trans_lst", NULL, &len) != DDI_PROP_SUCCESS ||
524	    len == 0) {
525		md_med_trans_lst = md_strdup("tcp");
526	} else {
527		md_med_trans_lst = kmem_zalloc((size_t)len, KM_SLEEP);
528		if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
529		    0, "md_med_trans_lst", md_med_trans_lst, &len) !=
530		    DDI_PROP_SUCCESS) {
531			kmem_free(md_med_trans_lst, (size_t)len);
532			md_med_trans_lst = md_strdup("tcp");
533		}
534	}
535
536	/*
537	 * Must initialize the internal data structures before the
538	 * any possible calls to 'goto attach_failure' as _fini
539	 * routine references them.
540	 */
541	med_init();
542
543	md_ops = (md_ops_t **)kmem_zalloc(
544	    sizeof (md_ops_t *) * MD_NOPS, KM_SLEEP);
545	md_mods = (ddi_modhandle_t *)kmem_zalloc(
546	    sizeof (ddi_modhandle_t) * MD_NOPS, KM_SLEEP);
547
548	/* try and get the md_xlate property */
549	/* Should we only do this if upgrade? */
550	len = sizeof (char) * 5;
551	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
552	    0, "md_xlate_ver", ver, &len) == DDI_PROP_SUCCESS) {
553		if (strcmp(ver, VERSION) == 0) {
554			len = 0;
555			if (ddi_prop_op(DDI_DEV_T_ANY, dip,
556			    PROP_LEN_AND_VAL_ALLOC, 0, "md_xlate",
557			    (caddr_t)&md_tuple_table, &len) !=
558			    DDI_PROP_SUCCESS) {
559				if (md_init_debug)
560					cmn_err(CE_WARN,
561					    "md_xlate ddi_prop_op failed");
562				goto attach_failure;
563			} else {
564				md_tuple_length =
565				    len/(2 * ((int)sizeof (dev32_t)));
566				md_in_upgrade = 1;
567			}
568
569			/* Get target's name to major table */
570			if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY,
571			    dip, DDI_PROP_DONTPASS,
572			    "md_targ_nm_table", &maj_str_array,
573			    &md_majortab_len) != DDI_PROP_SUCCESS) {
574				md_majortab_len = 0;
575				if (md_init_debug)
576					cmn_err(CE_WARN, "md_targ_nm_table "
577					    "ddi_prop_lookup_string_array "
578					    "failed");
579				goto attach_failure;
580			}
581
582			md_major_tuple_table =
583			    (struct md_xlate_major_table *)
584			    kmem_zalloc(md_majortab_len *
585			    sizeof (struct md_xlate_major_table), KM_SLEEP);
586
587			for (i = 0; i < md_majortab_len; i++) {
588				/* Getting major name */
589				str = strchr(maj_str_array[i], ' ');
590				if (str == NULL)
591					continue;
592				*str = '\0';
593				md_major_tuple_table[i].drv_name =
594				    md_strdup(maj_str_array[i]);
595
596				/* Simplified atoi to get major number */
597				str2 = str + 1;
598				md_major_tuple_table[i].targ_maj = 0;
599				while ((*str2 >= '0') && (*str2 <= '9')) {
600					md_major_tuple_table[i].targ_maj *= 10;
601					md_major_tuple_table[i].targ_maj +=
602					    *str2++ - '0';
603				}
604				*str = ' ';
605			}
606			ddi_prop_free((void *)maj_str_array);
607		} else {
608			if (md_init_debug)
609				cmn_err(CE_WARN, "md_xlate_ver is incorrect");
610			goto attach_failure;
611		}
612	}
613
614	/*
615	 * Check for properties:
616	 * 	md_keep_repl_state and md_devid_destroy
617	 * and set globals if these exist.
618	 */
619	md_keep_repl_state = ddi_getprop(DDI_DEV_T_ANY, dip,
620	    0, "md_keep_repl_state", 0);
621
622	md_devid_destroy = ddi_getprop(DDI_DEV_T_ANY, dip,
623	    0, "md_devid_destroy", 0);
624
625	if (MD_UPGRADE)
626		md_major_targ = md_targ_name_to_major("md");
627	else
628		md_major_targ = 0;
629
630	/* allocate admin device node */
631	if (ddi_create_priv_minor_node(dip, "admin", S_IFCHR,
632	    MD_ADM_MINOR, DDI_PSEUDO, 0, NULL, PRIV_SYS_CONFIG, 0640))
633		goto attach_failure;
634
635	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
636	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_SUCCESS)
637		goto attach_failure;
638
639	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip,
640	    "ddi-abrwrite-supported", 1) != DDI_SUCCESS)
641		goto attach_failure;
642
643	/* these could have been cleared by a detach */
644	md_nunits = MD_MAXUNITS;
645	md_nsets = MD_MAXSETS;
646
647	sz = sizeof (void *) * MD_MAXUNITS;
648	if (md_set[0].s_un == NULL)
649		md_set[0].s_un = kmem_zalloc(sz, KM_SLEEP);
650	if (md_set[0].s_ui == NULL)
651		md_set[0].s_ui = kmem_zalloc(sz, KM_SLEEP);
652
653	md_devinfo = dip;
654
655	/*
656	 * Only allocate device node for root mirror metadevice.
657	 * Don't pre-allocate unnecessary device nodes (thus slowing down a
658	 * boot when we attach).
659	 * We can't read the mddbs in attach.  The mddbs will be read
660	 * by metainit during the boot process when it is doing the
661	 * auto-take processing and any other minor nodes will be
662	 * allocated at that point.
663	 *
664	 * There are two scenarios to be aware of here:
665	 * 1) when we are booting from a mirrored root we need the root
666	 *    metadevice to exist very early (during vfs_mountroot processing)
667	 * 2) we need all of the nodes to be created so that any mnttab entries
668	 *    will succeed (handled by metainit reading the mddb during boot).
669	 */
670	if (strncmp(SVM_PSEUDO_STR, svm_bootpath, sizeof (SVM_PSEUDO_STR) - 1)
671	    == 0) {
672		char *p;
673		int mnum = 0;
674
675		/*
676		 * The svm_bootpath string looks something like
677		 * /pseudo/md@0:0,150,blk where 150 is the minor number
678		 * in this example so we need to set the pointer p onto
679		 * the first digit of the minor number and convert it
680		 * from ascii.
681		 */
682		for (p = svm_bootpath + sizeof (SVM_PSEUDO_STR) + 1;
683		    *p >= '0' && *p <= '9'; p++) {
684			mnum *= 10;
685			mnum += *p - '0';
686		}
687
688		if (md_create_minor_node(0, mnum)) {
689			kmem_free(md_set[0].s_un, sz);
690			kmem_free(md_set[0].s_ui, sz);
691			goto attach_failure;
692		}
693	}
694
695	/* create the hash to store the meta device sizes */
696	md_nblocksmap = mod_hash_create_idhash("md_nblocksmap",
697	    md_nblocksmap_size, mod_hash_null_valdtor);
698
699	MD_CLR_IN(IN_ATTACH);
700	return (DDI_SUCCESS);
701
702attach_failure:
703	/*
704	 * Use our own detach routine to toss any stuff we allocated above.
705	 * NOTE: detach will call md_halt to free the mddb_init allocations.
706	 */
707	MD_CLR_IN(IN_ATTACH);
708	if (mddetach(dip, DDI_DETACH) != DDI_SUCCESS)
709		cmn_err(CE_WARN, "detach from attach failed");
710	return (DDI_FAILURE);
711}
712
/*
 * mddetach:
 *	Detach the md pseudo-driver.  Refuses to detach unless the
 *	driver is (or can automatically be) halted; then frees the
 *	allocations made in mdattach and removes the minor nodes and
 *	properties.  Also used by mdattach's failure path to unwind a
 *	partial attach.
 */
/* ARGSUSED */
static int
mddetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	extern int	check_active_locators();
	set_t		s;
	size_t		sz;
	int		len;

	MD_SET_IN(IN_DETACH);

	/* check command */
	if (cmd != DDI_DETACH) {
		MD_CLR_IN(IN_DETACH);
		return (DDI_FAILURE);
	}

	/*
	 * if we have not already halted yet we have no active config
	 * then automatically initiate a halt so we can detach.
	 */
	if (!(md_get_status() & MD_GBL_HALTED)) {
		if (check_active_locators() == 0) {
			/*
			 * NOTE: a successful md_halt will have done the
			 * mddb_unload to free allocations done in mddb_init
			 */
			if (md_halt(MD_NO_GBL_LOCKS_HELD)) {
				cmn_err(CE_NOTE, "md:detach: "
				    "Could not halt Solaris Volume Manager");
				MD_CLR_IN(IN_DETACH);
				return (DDI_FAILURE);
			}
		}

		/* fail detach if we have not halted */
		if (!(md_get_status() & MD_GBL_HALTED)) {
			MD_CLR_IN(IN_DETACH);
			return (DDI_FAILURE);
		}
	}

	/* must be in halted state, this will be cleared on next attach */
	ASSERT(md_get_status() & MD_GBL_HALTED);

	/* cleanup attach allocations and initializations */
	md_major_targ = 0;

	/* free the per-set unit and unit-incore arrays from mdattach */
	sz = sizeof (void *) * md_nunits;
	for (s = 0; s < md_nsets; s++) {
		if (md_set[s].s_un != NULL) {
			kmem_free(md_set[s].s_un, sz);
			md_set[s].s_un = NULL;
		}

		if (md_set[s].s_ui != NULL) {
			kmem_free(md_set[s].s_ui, sz);
			md_set[s].s_ui = NULL;
		}
	}
	md_nunits = 0;
	md_nsets = 0;
	md_nmedh = 0;

	/* free the NULL-terminated list of non-failfast driver names */
	if (non_ff_drivers != NULL) {
		int	i;

		for (i = 0; non_ff_drivers[i] != NULL; i++)
			kmem_free(non_ff_drivers[i],
			    strlen(non_ff_drivers[i]) + 1);

		/* free i+1 entries because there is a null entry at list end */
		kmem_free(non_ff_drivers, (i + 1) * sizeof (char *));
		non_ff_drivers = NULL;
	}

	if (md_med_trans_lst != NULL) {
		kmem_free(md_med_trans_lst, strlen(md_med_trans_lst) + 1);
		md_med_trans_lst = NULL;
	}

	if (md_mods != NULL) {
		kmem_free(md_mods, sizeof (ddi_modhandle_t) * MD_NOPS);
		md_mods = NULL;
	}

	if (md_ops != NULL) {
		kmem_free(md_ops, sizeof (md_ops_t *) * MD_NOPS);
		md_ops = NULL;
	}

	/* free the upgrade translation tables acquired in mdattach */
	if (MD_UPGRADE) {
		len = md_tuple_length * (2 * ((int)sizeof (dev32_t)));
		md_in_upgrade = 0;
		md_xlate_free(len);
		md_majortab_free();
	}

	/*
	 * Undo what we did in mdattach, freeing resources
	 * and removing things we installed.  The system
	 * framework guarantees we are not active with this devinfo
	 * node in any other entry points at this time.
	 */
	ddi_prop_remove_all(dip);
	ddi_remove_minor_node(dip, NULL);

	med_fini();

	mod_hash_destroy_idhash(md_nblocksmap);

	md_devinfo = NULL;

	MD_CLR_IN(IN_DETACH);
	return (DDI_SUCCESS);
}
829
830
831/*
832 * Given the device number return the devinfo pointer
833 * given to md via md_attach
834 */
835/*ARGSUSED*/
836static int
837mdinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
838{
839	int		error = DDI_FAILURE;
840
841	switch (infocmd) {
842	case DDI_INFO_DEVT2DEVINFO:
843		if (md_devinfo) {
844			*result = (void *)md_devinfo;
845			error = DDI_SUCCESS;
846		}
847		break;
848
849	case DDI_INFO_DEVT2INSTANCE:
850		*result = (void *)0;
851		error = DDI_SUCCESS;
852		break;
853	}
854	return (error);
855}
856
857/*
858 * property operation routine.  return the number of blocks for the partition
859 * in question or forward the request to the property facilities.
860 */
861static int
862mdprop_op(
863	dev_t dev,		/* device number associated with device */
864	dev_info_t *dip,	/* device info struct for this device */
865	ddi_prop_op_t prop_op,	/* property operator */
866	int mod_flags,		/* property flags */
867	char *name,		/* name of property */
868	caddr_t valuep,		/* where to put property value */
869	int *lengthp)		/* put length of property here */
870{
871	return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
872	    name, valuep, lengthp, md_nblocks_get(getminor(dev))));
873}
874
875static void
876snarf_user_data(set_t setno)
877{
878	mddb_recid_t		recid;
879	mddb_recstatus_t	status;
880
881	recid = mddb_makerecid(setno, 0);
882	while ((recid = mddb_getnextrec(recid, MDDB_USER, 0)) > 0) {
883		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
884			continue;
885
886		status = mddb_getrecstatus(recid);
887		if (status == MDDB_STALE)
888			continue;
889
890		if (status == MDDB_NODATA) {
891			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
892			continue;
893		}
894
895		ASSERT(status == MDDB_OK);
896
897		mddb_setrecprivate(recid, MD_PRV_GOTIT);
898	}
899}
900
/*
 * Report (via cmn_err) that the state database lacks space for device
 * relocation information: print the total/needed block counts, then
 * list each live replica that is smaller than the required size along
 * with the number of blocks it would need to grow by.
 *
 * s	- the mddb set whose usage is being reported
 * blks	- number of blocks the failed allocation asked for
 */
static void
md_print_block_usage(mddb_set_t *s, uint_t blks)
{
	uint_t		ib;
	int		li;
	mddb_mb_ic_t	*mbip;
	uint_t		max_blk_needed;
	mddb_lb_t	*lbp;
	mddb_sidelocator_t	*slp;
	int		drv_index;
	md_splitname	sn;
	char		*name;
	char		*suffix;
	size_t		prefixlen;
	size_t		suffixlen;
	int		alloc_sz;


	max_blk_needed = s->s_totalblkcnt - s->s_freeblkcnt + blks;

	cmn_err(CE_WARN, "Blocks in Metadevice State Database: %d\n"
	    "            Additional Blocks Needed:            %d\n\n"
	    "            Increase size of following replicas for\n"
	    "            device relocatability by deleting listed\n"
	    "            replica and re-adding replica with\n"
	    "            increased size (see metadb(1M)):\n"
	    "                Replica                   Increase By",
	    s->s_totalblkcnt, (blks - s->s_freeblkcnt));

	lbp = s->s_lbp;

	for (li = 0; li < lbp->lb_loccnt; li++) {
		if (lbp->lb_locators[li].l_flags & MDDB_F_DELETED)
			continue;
		/* sum the block counts of this locator's master blocks */
		ib = 0;
		for (mbip = s->s_mbiarray[li]; mbip != NULL;
		    mbip = mbip->mbi_next) {
			ib += (uint_t)mbip->mbi_mddb_mb.mb_blkcnt;
		}
		if (ib == 0)
			continue;
		if (ib < max_blk_needed) {
			slp = &lbp->lb_sidelocators[s->s_sideno][li];
			drv_index = slp->l_drvnm_index;
			mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno,
			    &sn);
			prefixlen = SPN_PREFIX(&sn).pre_len;
			suffixlen = SPN_SUFFIX(&sn).suf_len;
			/* assemble "<prefix>/<suffix>\0" by hand */
			alloc_sz = (int)(prefixlen + suffixlen + 2);
			name = (char *)kmem_alloc(alloc_sz, KM_SLEEP);
			(void) strncpy(name, SPN_PREFIX(&sn).pre_data,
			    prefixlen);
			name[prefixlen] = '/';
			suffix = name + (prefixlen + 1);
			(void) strncpy(suffix, SPN_SUFFIX(&sn).suf_data,
			    suffixlen);
			/* explicit NUL: strncpy does not terminate here */
			name[prefixlen + suffixlen + 1] = '\0';
			cmn_err(CE_WARN,
			    "  %s (%s:%d:%d)   %d blocks",
			    name, lbp->lb_drvnm[drv_index].dn_data,
			    slp->l_mnum, lbp->lb_locators[li].l_blkno,
			    (max_blk_needed - ib));
			kmem_free(name, alloc_sz);
		}
	}
}
967
968/*
969 * md_create_minor_node:
970 *	Create the minor device for the given set and un_self_id.
971 *
972 * Input:
973 *	setno	- set number
974 *	mnum	- selfID of unit
975 *
976 * Output:
977 *	None.
978 *
979 * Returns 0 for success, 1 for failure.
980 *
981 * Side-effects:
982 *	None.
983 */
984int
985md_create_minor_node(set_t setno, minor_t mnum)
986{
987	char		name[20];
988
989	/* Check for valid arguments */
990	if (setno >= MD_MAXSETS || MD_MIN2UNIT(mnum) >= MD_MAXUNITS)
991		return (1);
992
993	(void) snprintf(name, 20, "%u,%u,blk",
994	    (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
995
996	if (ddi_create_minor_node(md_devinfo, name, S_IFBLK,
997	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
998		return (1);
999
1000	(void) snprintf(name, 20, "%u,%u,raw",
1001	    (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
1002
1003	if (ddi_create_minor_node(md_devinfo, name, S_IFCHR,
1004	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
1005		return (1);
1006
1007	return (0);
1008}
1009
1010/*
1011 * For a given key check if it is an orphaned record.
1012 * The following conditions are used to determine an orphan.
1013 * 1. The device associated with that key is not a metadevice.
1014 * 2. If DEVID_STYLE then the physical device does not have a device Id
1015 * associated with it.
1016 *
1017 * If a key does not have an entry in the devid namespace it could be
1018 * a device that does not support device ids. Hence the record is not
1019 * deleted.
1020 */
1021
1022static int
1023md_verify_orphaned_record(set_t setno, mdkey_t key)
1024{
1025	md_dev64_t	odev; /* orphaned dev */
1026	mddb_set_t	*s;
1027	side_t		side = 0;
1028	struct nm_next_hdr	*did_nh = NULL;
1029
1030	s = (mddb_set_t *)md_set[setno].s_db;
1031	if ((did_nh = get_first_record(setno, 1,  (NM_DEVID | NM_NOTSHARED)))
1032	    == NULL)
1033		return (0);
1034	/*
1035	 * If devid style is set then get the dev_t using MD_NOTRUST_DEVT
1036	 */
1037	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE) {
1038		odev = md_getdevnum(setno, side, key, MD_NOTRUST_DEVT);
1039		if ((odev == NODEV64) || (md_getmajor(odev) == md_major))
1040			return (0);
1041		if (lookup_entry(did_nh, setno, side, key, odev, NM_DEVID) ==
1042		    NULL)
1043			return (1);
1044	}
1045	return (0);
1046}
1047
1048int
1049md_snarf_db_set(set_t setno, md_error_t *ep)
1050{
1051	int			err = 0;
1052	int			i;
1053	mddb_recid_t		recid;
1054	mddb_type_t		drvrid;
1055	mddb_recstatus_t	status;
1056	md_ops_t		*ops;
1057	uint_t			privat;
1058	mddb_set_t		*s;
1059	uint_t			cvt_blks;
1060	struct nm_next_hdr	*nh;
1061	mdkey_t			key = MD_KEYWILD;
1062	side_t			side = 0;
1063	int			size;
1064	int			devid_flag;
1065	int			retval;
1066	uint_t			un;
1067	int			un_next_set = 0;
1068
1069	md_haltsnarf_enter(setno);
1070
1071	mutex_enter(&md_mx);
1072	if (md_set[setno].s_status & MD_SET_SNARFED) {
1073		mutex_exit(&md_mx);
1074		md_haltsnarf_exit(setno);
1075		return (0);
1076	}
1077	mutex_exit(&md_mx);
1078
1079	if (! (md_get_status() & MD_GBL_DAEMONS_LIVE)) {
1080		if (md_start_daemons(TRUE)) {
1081			if (ep != NULL)
1082				(void) mdsyserror(ep, ENXIO);
1083			err = -1;
1084			goto out;
1085		}
1086	}
1087
1088
1089	/*
1090	 * Load the devid name space if it exists
1091	 */
1092	(void) md_load_namespace(setno, NULL, NM_DEVID);
1093	if (!md_load_namespace(setno, ep, 0L)) {
1094		/*
1095		 * Unload the devid namespace
1096		 */
1097		(void) md_unload_namespace(setno, NM_DEVID);
1098		err = -1;
1099		goto out;
1100	}
1101
1102	/*
1103	 * If replica is in non-devid state, convert if:
1104	 * 	- not in probe during upgrade (md_keep_repl_state = 0)
1105	 * 	- enough space available in replica
1106	 *	- local set
1107	 *	- not a multi-node diskset
1108	 *	- clustering is not present (for non-local set)
1109	 */
1110	s = (mddb_set_t *)md_set[setno].s_db;
1111	devid_flag = 0;
1112	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE) && !md_keep_repl_state)
1113		devid_flag = 1;
1114	if (cluster_bootflags & CLUSTER_CONFIGURED)
1115		if (setno != MD_LOCAL_SET)
1116			devid_flag = 0;
1117	if (MD_MNSET_SETNO(setno))
1118		devid_flag = 0;
1119	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
1120		devid_flag = 0;
1121
1122	/*
1123	 * if we weren't devid style before and md_keep_repl_state=1
1124	 * we need to stay non-devid
1125	 */
1126	if ((md_keep_repl_state == 1) &&
1127	    ((s->s_lbp->lb_flags & MDDB_DEVID_STYLE) == 0))
1128		devid_flag = 0;
1129	if (devid_flag) {
1130		/*
1131		 * Determine number of free blocks needed to convert
1132		 * entire replica to device id format - locator blocks
1133		 * and namespace.
1134		 */
1135		cvt_blks = 0;
1136		if (mddb_lb_did_convert(s, 0, &cvt_blks) != 0) {
1137			if (ep != NULL)
1138				(void) mdsyserror(ep, EIO);
1139			err = -1;
1140			goto out;
1141
1142		}
1143		cvt_blks += md_nm_did_chkspace(setno);
1144
1145		/* add MDDB_DEVID_CONV_PERC% */
1146		if ((md_conv_perc > 0) && (md_conv_perc <= 100)) {
1147			cvt_blks = cvt_blks * (100 + md_conv_perc) / 100;
1148		}
1149
1150		if (cvt_blks <= s->s_freeblkcnt) {
1151			if (mddb_lb_did_convert(s, 1, &cvt_blks) != 0) {
1152				if (ep != NULL)
1153					(void) mdsyserror(ep, EIO);
1154				err = -1;
1155				goto out;
1156			}
1157
1158		} else {
1159			/*
1160			 * Print message that replica can't be converted for
1161			 * lack of space.   No failure - just continue to
1162			 * run without device ids.
1163			 */
1164			cmn_err(CE_WARN,
1165			    "Unable to add Solaris Volume Manager device "
1166			    "relocation data.\n"
1167			    "          To use device relocation feature:\n"
1168			    "          - Increase size of listed replicas\n"
1169			    "          - Reboot");
1170			md_print_block_usage(s, cvt_blks);
1171			cmn_err(CE_WARN,
1172			    "Loading set without device relocation data.\n"
1173			    "          Solaris Volume Manager disk movement "
1174			    "not tracked in local set.");
1175		}
1176	}
1177
1178	/*
1179	 * go through and load any modules referenced in
1180	 * data base
1181	 */
1182	recid = mddb_makerecid(setno, 0);
1183	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
1184		status = mddb_getrecstatus(recid);
1185		if (status == MDDB_STALE) {
1186			if (! (md_get_setstatus(setno) & MD_SET_STALE)) {
1187				md_set_setstatus(setno, MD_SET_STALE);
1188				cmn_err(CE_WARN,
1189				    "md: state database is stale");
1190			}
1191		} else if (status == MDDB_NODATA) {
1192			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
1193			continue;
1194		}
1195		drvrid = mddb_getrectype1(recid);
1196		if (drvrid < MDDB_FIRST_MODID)
1197			continue;
1198		if (md_loadsubmod(setno, md_getshared_name(setno, drvrid),
1199		    drvrid) < 0) {
1200			cmn_err(CE_NOTE, "md: could not load misc/%s",
1201			    md_getshared_name(setno, drvrid));
1202		}
1203	}
1204
1205	if (recid < 0)
1206		goto out;
1207
1208	snarf_user_data(setno);
1209
1210	/*
1211	 * Initialize the md_nm_snarfed array
1212	 * this array is indexed by the key and
1213	 * is set by md_getdevnum during the snarf time
1214	 */
1215	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) != NULL) {
1216		size = (int)((((struct nm_rec_hdr *)nh->nmn_record)->
1217		    r_next_key) * (sizeof (int)));
1218		md_nm_snarfed = (int *)kmem_zalloc(size, KM_SLEEP);
1219	}
1220
1221	/*
1222	 * go through and snarf until nothing gets added
1223	 */
1224	do {
1225		i = 0;
1226		for (ops = md_opslist; ops != NULL; ops = ops->md_next) {
1227			if (ops->md_snarf != NULL) {
1228				retval = ops->md_snarf(MD_SNARF_DOIT, setno);
1229				if (retval == -1) {
1230					err = -1;
1231					/* Don't know the failed unit */
1232					(void) mdmderror(ep, MDE_RR_ALLOC_ERROR,
1233					    0);
1234					(void) md_halt_set(setno, MD_HALT_ALL);
1235					(void) mddb_unload_set(setno);
1236					md_haltsnarf_exit(setno);
1237					return (err);
1238				} else {
1239					i += retval;
1240				}
1241			}
1242		}
1243	} while (i);
1244
1245	/*
1246	 * Set the first available slot and availability
1247	 */
1248	md_set[setno].s_un_avail = 0;
1249	for (un = 0; un < MD_MAXUNITS; un++) {
1250		if (md_set[setno].s_un[un] != NULL) {
1251			continue;
1252		} else {
1253			if (!un_next_set) {
1254				md_set[setno].s_un_next = un;
1255				un_next_set = 1;
1256			}
1257			md_set[setno].s_un_avail++;
1258		}
1259	}
1260
1261	md_set_setstatus(setno, MD_SET_SNARFED);
1262
1263	recid = mddb_makerecid(setno, 0);
1264	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
1265		privat = mddb_getrecprivate(recid);
1266		if (privat & MD_PRV_COMMIT) {
1267			if (mddb_commitrec(recid)) {
1268				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
1269					md_set_setstatus(setno, MD_SET_STALE);
1270					cmn_err(CE_WARN,
1271					    "md: state database is stale");
1272				}
1273			}
1274			mddb_setrecprivate(recid, MD_PRV_GOTIT);
1275		}
1276	}
1277
1278	/* Deletes must happen after all the commits */
1279	recid = mddb_makerecid(setno, 0);
1280	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
1281		privat = mddb_getrecprivate(recid);
1282		if (privat & MD_PRV_DELETE) {
1283			if (mddb_deleterec(recid)) {
1284				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
1285					md_set_setstatus(setno, MD_SET_STALE);
1286					cmn_err(CE_WARN,
1287					    "md: state database is stale");
1288				}
1289				mddb_setrecprivate(recid, MD_PRV_GOTIT);
1290			}
1291			recid = mddb_makerecid(setno, 0);
1292		}
1293	}
1294
1295	/*
1296	 * go through and clean up records until nothing gets cleaned up.
1297	 */
1298	do {
1299		i = 0;
1300		for (ops = md_opslist; ops != NULL; ops = ops->md_next)
1301			if (ops->md_snarf != NULL)
1302				i += ops->md_snarf(MD_SNARF_CLEANUP, setno);
1303	} while (i);
1304
1305	if (md_nm_snarfed != NULL &&
1306	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
1307		/*
1308		 * go thru and cleanup the namespace and the device id
1309		 * name space
1310		 */
1311		for (key = 1;
1312		    key < ((struct nm_rec_hdr *)nh->nmn_record)->r_next_key;
1313		    key++) {
1314			/*
1315			 * Is the entry an 'orphan'?
1316			 */
1317			if (lookup_entry(nh, setno, side, key, NODEV64, 0L) !=
1318			    NULL) {
1319				/*
1320				 * If the value is not set then apparently
1321				 * it is not part of the current configuration,
1322				 * remove it this can happen when system panic
1323				 * between the primary name space update and
1324				 * the device id name space update
1325				 */
1326				if (md_nm_snarfed[key] == 0) {
1327					if (md_verify_orphaned_record(setno,
1328					    key) == 1)
1329						(void) remove_entry(nh,
1330						    side, key, 0L);
1331				}
1332			}
1333		}
1334	}
1335
1336	if (md_nm_snarfed != NULL) {
1337		/*
1338		 * Done and free the memory
1339		 */
1340		kmem_free(md_nm_snarfed, size);
1341		md_nm_snarfed = NULL;
1342	}
1343
1344	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE &&
1345	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
1346		/*
1347		 * if the destroy flag has been set and
1348		 * the MD_SET_DIDCLUP bit is not set in
1349		 * the set's status field, cleanup the
1350		 * entire device id namespace
1351		 */
1352		if (md_devid_destroy &&
1353		    !(md_get_setstatus(setno) & MD_SET_DIDCLUP)) {
1354			(void) md_devid_cleanup(setno, 1);
1355			md_set_setstatus(setno, MD_SET_DIDCLUP);
1356		} else
1357			(void) md_devid_cleanup(setno, 0);
1358	}
1359
1360	/*
1361	 * clear single threading on snarf, return success or error
1362	 */
1363out:
1364	md_haltsnarf_exit(setno);
1365	return (err);
1366}
1367
1368void
1369get_minfo(struct dk_minfo *info, minor_t mnum)
1370{
1371	md_unit_t	*un;
1372	mdi_unit_t	*ui;
1373
1374	info->dki_capacity = 0;
1375	info->dki_lbsize = 0;
1376	info->dki_media_type = 0;
1377
1378	if ((ui = MDI_UNIT(mnum)) == NULL) {
1379		return;
1380	}
1381	un = (md_unit_t *)md_unit_readerlock(ui);
1382	info->dki_capacity = un->c.un_total_blocks;
1383	md_unit_readerexit(ui);
1384	info->dki_lbsize = DEV_BSIZE;
1385	info->dki_media_type = DK_UNKNOWN;
1386}
1387
1388
1389void
1390get_info(struct dk_cinfo *info, minor_t mnum)
1391{
1392	/*
1393	 * Controller Information
1394	 */
1395	info->dki_ctype = DKC_MD;
1396	info->dki_cnum = ddi_get_instance(ddi_get_parent(md_devinfo));
1397	(void) strcpy(info->dki_cname,
1398	    ddi_get_name(ddi_get_parent(md_devinfo)));
1399	/*
1400	 * Unit Information
1401	 */
1402	info->dki_unit = mnum;
1403	info->dki_slave = 0;
1404	(void) strcpy(info->dki_dname, ddi_driver_name(md_devinfo));
1405	info->dki_flags = 0;
1406	info->dki_partition = 0;
1407	info->dki_maxtransfer = (ushort_t)(md_maxphys / DEV_BSIZE);
1408
1409	/*
1410	 * We can't get from here to there yet
1411	 */
1412	info->dki_addr = 0;
1413	info->dki_space = 0;
1414	info->dki_prio = 0;
1415	info->dki_vec = 0;
1416}
1417
1418/*
1419 * open admin device
1420 */
1421static int
1422mdadminopen(
1423	int	flag,
1424	int	otyp)
1425{
1426	int	err = 0;
1427
1428	/* single thread */
1429	mutex_enter(&md_mx);
1430
1431	/* check type and flags */
1432	if ((otyp != OTYP_CHR) && (otyp != OTYP_LYR)) {
1433		err = EINVAL;
1434		goto out;
1435	}
1436	if (((flag & FEXCL) && (md_status & MD_GBL_OPEN)) ||
1437	    (md_status & MD_GBL_EXCL)) {
1438		err = EBUSY;
1439		goto out;
1440	}
1441
1442	/* count and flag open */
1443	md_ocnt[otyp]++;
1444	md_status |= MD_GBL_OPEN;
1445	if (flag & FEXCL)
1446		md_status |= MD_GBL_EXCL;
1447
1448	/* unlock return success */
1449out:
1450	mutex_exit(&md_mx);
1451	return (err);
1452}
1453
1454/*
1455 * open entry point
1456 */
1457static int
1458mdopen(
1459	dev_t		*dev,
1460	int		flag,
1461	int		otyp,
1462	cred_t		*cred_p)
1463{
1464	minor_t		mnum = getminor(*dev);
1465	unit_t		unit = MD_MIN2UNIT(mnum);
1466	set_t		setno = MD_MIN2SET(mnum);
1467	mdi_unit_t	*ui = NULL;
1468	int		err = 0;
1469	md_parent_t	parent;
1470
1471	/* dispatch admin device opens */
1472	if (mnum == MD_ADM_MINOR)
1473		return (mdadminopen(flag, otyp));
1474
1475	/* lock, check status */
1476	rw_enter(&md_unit_array_rw.lock, RW_READER);
1477
1478tryagain:
1479	if (md_get_status() & MD_GBL_HALTED)  {
1480		err = ENODEV;
1481		goto out;
1482	}
1483
1484	/* check minor */
1485	if ((setno >= md_nsets) || (unit >= md_nunits)) {
1486		err = ENXIO;
1487		goto out;
1488	}
1489
1490	/* make sure we're snarfed */
1491	if ((md_get_setstatus(MD_LOCAL_SET) & MD_SET_SNARFED) == 0) {
1492		if (md_snarf_db_set(MD_LOCAL_SET, NULL) != 0) {
1493			err = ENODEV;
1494			goto out;
1495		}
1496	}
1497	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) {
1498		err = ENODEV;
1499		goto out;
1500	}
1501
1502	/* check unit */
1503	if ((ui = MDI_UNIT(mnum)) == NULL) {
1504		err = ENXIO;
1505		goto out;
1506	}
1507
1508	/*
1509	 * The softpart open routine may do an I/O during the open, in
1510	 * which case the open routine will set the OPENINPROGRESS flag
1511	 * and drop all locks during the I/O.  If this thread sees
1512	 * the OPENINPROGRESS flag set, if should wait until the flag
1513	 * is reset before calling the driver's open routine.  It must
1514	 * also revalidate the world after it grabs the unit_array lock
1515	 * since the set may have been released or the metadevice cleared
1516	 * during the sleep.
1517	 */
1518	if (MD_MNSET_SETNO(setno)) {
1519		mutex_enter(&ui->ui_mx);
1520		if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
1521			rw_exit(&md_unit_array_rw.lock);
1522			cv_wait(&ui->ui_cv, &ui->ui_mx);
1523			rw_enter(&md_unit_array_rw.lock, RW_READER);
1524			mutex_exit(&ui->ui_mx);
1525			goto tryagain;
1526		}
1527		mutex_exit(&ui->ui_mx);
1528	}
1529
1530	/* Test if device is openable */
1531	if ((ui->ui_tstate & MD_NOTOPENABLE) != 0) {
1532		err = ENXIO;
1533		goto out;
1534	}
1535
1536	/* don't allow opens w/WRITE flag if stale */
1537	if ((flag & FWRITE) && (md_get_setstatus(setno) & MD_SET_STALE)) {
1538		err = EROFS;
1539		goto out;
1540	}
1541
1542	/* don't allow writes to subdevices */
1543	parent = md_get_parent(md_expldev(*dev));
1544	if ((flag & FWRITE) && MD_HAS_PARENT(parent)) {
1545		err = EROFS;
1546		goto out;
1547	}
1548
1549	/* open underlying driver */
1550	if (md_ops[ui->ui_opsindex]->md_open != NULL) {
1551		if ((err = (*md_ops[ui->ui_opsindex]->md_open)
1552		    (dev, flag, otyp, cred_p, 0)) != 0)
1553			goto out;
1554	}
1555
1556	/* or do it ourselves */
1557	else {
1558		/* single thread */
1559		(void) md_unit_openclose_enter(ui);
1560		err = md_unit_incopen(mnum, flag, otyp);
1561		md_unit_openclose_exit(ui);
1562		if (err != 0)
1563			goto out;
1564	}
1565
1566	/* unlock, return status */
1567out:
1568	rw_exit(&md_unit_array_rw.lock);
1569	return (err);
1570}
1571
1572/*
1573 * close admin device
1574 */
1575static int
1576mdadminclose(
1577	int	otyp)
1578{
1579	int	i;
1580	int	err = 0;
1581
1582	/* single thread */
1583	mutex_enter(&md_mx);
1584
1585	/* check type and flags */
1586	if ((otyp < 0) || (otyp >= OTYPCNT)) {
1587		err = EINVAL;
1588		goto out;
1589	} else if (md_ocnt[otyp] == 0) {
1590		err = ENXIO;
1591		goto out;
1592	}
1593
1594	/* count and flag closed */
1595	if (otyp == OTYP_LYR)
1596		md_ocnt[otyp]--;
1597	else
1598		md_ocnt[otyp] = 0;
1599	md_status &= ~MD_GBL_OPEN;
1600	for (i = 0; (i < OTYPCNT); ++i)
1601		if (md_ocnt[i] != 0)
1602			md_status |= MD_GBL_OPEN;
1603	if (! (md_status & MD_GBL_OPEN))
1604		md_status &= ~MD_GBL_EXCL;
1605
1606	/* unlock return success */
1607out:
1608	mutex_exit(&md_mx);
1609	return (err);
1610}
1611
1612/*
1613 * close entry point
1614 */
1615static int
1616mdclose(
1617	dev_t		dev,
1618	int		flag,
1619	int		otyp,
1620	cred_t		*cred_p)
1621{
1622	minor_t		mnum = getminor(dev);
1623	set_t		setno = MD_MIN2SET(mnum);
1624	unit_t		unit = MD_MIN2UNIT(mnum);
1625	mdi_unit_t	*ui = NULL;
1626	int		err = 0;
1627
1628	/* dispatch admin device closes */
1629	if (mnum == MD_ADM_MINOR)
1630		return (mdadminclose(otyp));
1631
1632	/* check minor */
1633	if ((setno >= md_nsets) || (unit >= md_nunits) ||
1634	    ((ui = MDI_UNIT(mnum)) == NULL)) {
1635		err = ENXIO;
1636		goto out;
1637	}
1638
1639	/* close underlying driver */
1640	if (md_ops[ui->ui_opsindex]->md_close != NULL) {
1641		if ((err = (*md_ops[ui->ui_opsindex]->md_close)
1642		    (dev, flag, otyp, cred_p, 0)) != 0)
1643			goto out;
1644	}
1645
1646	/* or do it ourselves */
1647	else {
1648		/* single thread */
1649		(void) md_unit_openclose_enter(ui);
1650		err = md_unit_decopen(mnum, otyp);
1651		md_unit_openclose_exit(ui);
1652		if (err != 0)
1653			goto out;
1654	}
1655
1656	/* return success */
1657out:
1658	return (err);
1659}
1660
1661
1662/*
1663 * This routine performs raw read operations.  It is called from the
1664 * device switch at normal priority.
1665 *
1666 * The main catch is that the *uio struct which is passed to us may
1667 * specify a read which spans two buffers, which would be contiguous
1668 * on a single partition,  but not on a striped partition. This will
1669 * be handled by mdstrategy.
1670 */
1671/*ARGSUSED*/
1672static int
1673mdread(dev_t dev, struct uio *uio, cred_t *credp)
1674{
1675	minor_t		mnum;
1676	mdi_unit_t	*ui;
1677	int		error;
1678
1679	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1680	    (MD_MIN2SET(mnum) >= md_nsets) ||
1681	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1682	    ((ui = MDI_UNIT(mnum)) == NULL))
1683		return (ENXIO);
1684
1685	if (md_ops[ui->ui_opsindex]->md_read  != NULL)
1686		return ((*md_ops[ui->ui_opsindex]->md_read)
1687		    (dev, uio, credp));
1688
1689	if ((error = md_chk_uio(uio)) != 0)
1690		return (error);
1691
1692	return (physio(mdstrategy, NULL, dev, B_READ, md_minphys, uio));
1693}
1694
1695/*
1696 * This routine performs async raw read operations.  It is called from the
1697 * device switch at normal priority.
1698 *
1699 * The main catch is that the *aio struct which is passed to us may
1700 * specify a read which spans two buffers, which would be contiguous
1701 * on a single partition,  but not on a striped partition. This will
1702 * be handled by mdstrategy.
1703 */
1704/*ARGSUSED*/
1705static int
1706mdaread(dev_t dev, struct aio_req *aio, cred_t *credp)
1707{
1708	minor_t		mnum;
1709	mdi_unit_t	*ui;
1710	int		error;
1711
1712
1713	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1714	    (MD_MIN2SET(mnum) >= md_nsets) ||
1715	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1716	    ((ui = MDI_UNIT(mnum)) == NULL))
1717		return (ENXIO);
1718
1719	if (md_ops[ui->ui_opsindex]->md_aread  != NULL)
1720		return ((*md_ops[ui->ui_opsindex]->md_aread)
1721		    (dev, aio, credp));
1722
1723	if ((error = md_chk_uio(aio->aio_uio)) != 0)
1724		return (error);
1725
1726	return (aphysio(mdstrategy, anocancel, dev, B_READ, md_minphys, aio));
1727}
1728
1729/*
1730 * This routine performs raw write operations.	It is called from the
1731 * device switch at normal priority.
1732 *
1733 * The main catch is that the *uio struct which is passed to us may
1734 * specify a write which spans two buffers, which would be contiguous
1735 * on a single partition,  but not on a striped partition. This is
1736 * handled by mdstrategy.
1737 *
1738 */
1739/*ARGSUSED*/
1740static int
1741mdwrite(dev_t dev, struct uio *uio, cred_t *credp)
1742{
1743	minor_t		mnum;
1744	mdi_unit_t	*ui;
1745	int		error;
1746
1747	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1748	    (MD_MIN2SET(mnum) >= md_nsets) ||
1749	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1750	    ((ui = MDI_UNIT(mnum)) == NULL))
1751		return (ENXIO);
1752
1753	if (md_ops[ui->ui_opsindex]->md_write  != NULL)
1754		return ((*md_ops[ui->ui_opsindex]->md_write)
1755		    (dev, uio, credp));
1756
1757	if ((error = md_chk_uio(uio)) != 0)
1758		return (error);
1759
1760	return (physio(mdstrategy, NULL, dev, B_WRITE, md_minphys, uio));
1761}
1762
1763/*
1764 * This routine performs async raw write operations.  It is called from the
1765 * device switch at normal priority.
1766 *
1767 * The main catch is that the *aio struct which is passed to us may
1768 * specify a write which spans two buffers, which would be contiguous
1769 * on a single partition,  but not on a striped partition. This is
1770 * handled by mdstrategy.
1771 *
1772 */
1773/*ARGSUSED*/
1774static int
1775mdawrite(dev_t dev, struct aio_req *aio, cred_t *credp)
1776{
1777	minor_t		mnum;
1778	mdi_unit_t	*ui;
1779	int		error;
1780
1781
1782	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1783	    (MD_MIN2SET(mnum) >= md_nsets) ||
1784	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1785	    ((ui = MDI_UNIT(mnum)) == NULL))
1786		return (ENXIO);
1787
1788	if (md_ops[ui->ui_opsindex]->md_awrite  != NULL)
1789		return ((*md_ops[ui->ui_opsindex]->md_awrite)
1790		    (dev, aio, credp));
1791
1792	if ((error = md_chk_uio(aio->aio_uio)) != 0)
1793		return (error);
1794
1795	return (aphysio(mdstrategy, anocancel, dev, B_WRITE, md_minphys, aio));
1796}
1797
1798int
1799mdstrategy(struct buf *bp)
1800{
1801	minor_t		mnum;
1802	mdi_unit_t	*ui;
1803
1804	ASSERT((bp->b_flags & B_DONE) == 0);
1805
1806	if (panicstr)
1807		md_clr_status(MD_GBL_DAEMONS_LIVE);
1808
1809	if (((mnum = getminor(bp->b_edev)) == MD_ADM_MINOR) ||
1810	    (MD_MIN2SET(mnum) >= md_nsets) ||
1811	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1812	    ((ui = MDI_UNIT(mnum)) == NULL)) {
1813		bp->b_flags |= B_ERROR;
1814		bp->b_error = ENXIO;
1815		bp->b_resid = bp->b_bcount;
1816		biodone(bp);
1817		return (0);
1818	}
1819
1820	bp->b_flags &= ~(B_ERROR | B_DONE);
1821	if (md_ops[ui->ui_opsindex]->md_strategy  != NULL) {
1822		(*md_ops[ui->ui_opsindex]->md_strategy) (bp, 0, NULL);
1823	} else {
1824		(void) errdone(ui, bp, ENXIO);
1825	}
1826	return (0);
1827}
1828
1829/*
1830 * Return true if the ioctl is allowed to be multithreaded.
1831 * All the ioctls with MN are sent only from the message handlers through
1832 * rpc.mdcommd, which (via it's own locking mechanism) takes care that not two
1833 * ioctl for the same metadevice are issued at the same time.
1834 * So we are safe here.
1835 * The other ioctls do not mess with any metadevice structures and therefor
1836 * are harmless too, if called multiple times at the same time.
1837 */
1838static boolean_t
1839is_mt_ioctl(int cmd) {
1840
1841	switch (cmd) {
1842	case MD_IOCGUNIQMSGID:
1843	case MD_IOCGVERSION:
1844	case MD_IOCISOPEN:
1845	case MD_MN_SET_MM_OWNER:
1846	case MD_MN_SET_STATE:
1847	case MD_MN_SUSPEND_WRITES:
1848	case MD_MN_ALLOCATE_HOTSPARE:
1849	case MD_MN_SET_SETFLAGS:
1850	case MD_MN_GET_SETFLAGS:
1851	case MD_MN_MDDB_OPTRECFIX:
1852	case MD_MN_MDDB_PARSE:
1853	case MD_MN_MDDB_BLOCK:
1854	case MD_MN_DB_USERREQ:
1855	case MD_IOC_SPSTATUS:
1856	case MD_MN_COMMD_ERR:
1857	case MD_MN_SET_COMMD_RUNNING:
1858	case MD_MN_RESYNC:
1859	case MD_MN_SETSYNC:
1860	case MD_MN_POKE_HOTSPARES:
1861		return (1);
1862	default:
1863		return (0);
1864	}
1865}
1866
1867/*
1868 * This routine implements the ioctl calls for the Virtual Disk System.
1869 * It is called from the device switch at normal priority.
1870 */
1871/* ARGSUSED */
1872static int
1873mdioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cred_p,
1874	int *rval_p)
1875{
1876	minor_t		mnum = getminor(dev);
1877	mdi_unit_t	*ui;
1878	IOLOCK		lock;
1879	int		err;
1880
1881	/*
1882	 * For multinode disksets  number of ioctls are allowed to be
1883	 * multithreaded.
1884	 * A fundamental assumption made in this implementation is that
1885	 * ioctls either do not interact with other md structures  or the
1886	 * ioctl to the admin device can only occur if the metadevice
1887	 * device is open. i.e. avoid a race between metaclear and the
1888	 * progress of a multithreaded ioctl.
1889	 */
1890
1891	if (!is_mt_ioctl(cmd) && md_ioctl_lock_enter() == EINTR) {
1892		return (EINTR);
1893	}
1894
1895	/*
1896	 * initialize lock tracker
1897	 */
1898	IOLOCK_INIT(&lock);
1899
1900	/* Flag to indicate that MD_GBL_IOCTL_LOCK is not acquired */
1901
1902	if (is_mt_ioctl(cmd)) {
1903		/* increment the md_mtioctl_cnt */
1904		mutex_enter(&md_mx);
1905		md_mtioctl_cnt++;
1906		mutex_exit(&md_mx);
1907		lock.l_flags |= MD_MT_IOCTL;
1908	}
1909
1910	/*
1911	 * this has been added to prevent notification from re-snarfing
1912	 * so metaunload will work.  It may interfere with other modules
1913	 * halt process.
1914	 */
1915	if (md_get_status() & (MD_GBL_HALTED | MD_GBL_DAEMONS_DIE))
1916		return (IOLOCK_RETURN(ENXIO, &lock));
1917
1918	/*
1919	 * admin device ioctls
1920	 */
1921	if (mnum == MD_ADM_MINOR) {
1922		err = md_admin_ioctl(md_expldev(dev), cmd, (void *) data,
1923		    mode, &lock);
1924	}
1925
1926	/*
1927	 * metadevice ioctls
1928	 */
1929	else if ((MD_MIN2SET(mnum) >= md_nsets) ||
1930	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1931	    ((ui = MDI_UNIT(mnum)) == NULL)) {
1932		err = ENXIO;
1933	} else if (md_ops[ui->ui_opsindex]->md_ioctl == NULL) {
1934		err = ENOTTY;
1935	} else {
1936		err = (*md_ops[ui->ui_opsindex]->md_ioctl)
1937		    (dev, cmd, (void *) data, mode, &lock);
1938	}
1939
1940	/*
1941	 * drop any locks we grabbed
1942	 */
1943	return (IOLOCK_RETURN_IOCTLEND(err, &lock));
1944}
1945
1946static int
1947mddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1948{
1949	minor_t		mnum;
1950	set_t		setno;
1951	mdi_unit_t	*ui;
1952
1953	if ((mnum = getminor(dev)) == MD_ADM_MINOR)
1954		return (ENXIO);
1955
1956	setno = MD_MIN2SET(mnum);
1957
1958	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits) ||
1959	    ((ui = MDI_UNIT(mnum)) == NULL))
1960		return (ENXIO);
1961
1962
1963	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
1964		return (ENXIO);
1965
1966	if (md_ops[ui->ui_opsindex]->md_dump  != NULL)
1967		return ((*md_ops[ui->ui_opsindex]->md_dump)
1968		    (dev, addr, blkno, nblk));
1969
1970	return (ENXIO);
1971}
1972
1973/*
1974 * Metadevice unit number dispatcher
1975 * When this routine is called it will scan the
1976 * incore unit array and return the avail slot
1977 * hence the unit number to the caller
1978 *
1979 * Return -1 if there is nothing available
1980 */
1981unit_t
1982md_get_nextunit(set_t setno)
1983{
1984	unit_t	un, start;
1985
1986	/*
1987	 * If nothing available
1988	 */
1989	if (md_set[setno].s_un_avail == 0) {
1990		return (MD_UNITBAD);
1991	}
1992
1993	mutex_enter(&md_mx);
1994	start = un = md_set[setno].s_un_next;
1995
1996	/* LINTED: E_CONSTANT_CONDITION */
1997	while (1) {
1998		if (md_set[setno].s_un[un] == NULL) {
1999			/*
2000			 * Advance the starting index for the next
2001			 * md_get_nextunit call
2002			 */
2003			if (un == MD_MAXUNITS - 1) {
2004				md_set[setno].s_un_next = 0;
2005			} else {
2006				md_set[setno].s_un_next = un + 1;
2007			}
2008			break;
2009		}
2010
2011		un = ((un == MD_MAXUNITS - 1) ? 0 : un + 1);
2012
2013		if (un == start) {
2014			un = MD_UNITBAD;
2015			break;
2016		}
2017
2018	}
2019
2020	mutex_exit(&md_mx);
2021	return (un);
2022}
2023