md_subr.c revision 8452:89d32dfdae6e
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * Driver for Virtual Disk.
29 */
30#include <sys/param.h>
31#include <sys/systm.h>
32#include <sys/buf.h>
33#include <sys/conf.h>
34#include <sys/user.h>
35#include <sys/uio.h>
36#include <sys/proc.h>
37#include <sys/t_lock.h>
38#include <sys/dkio.h>
39#include <sys/kmem.h>
40#include <sys/debug.h>
41#include <sys/cmn_err.h>
42#include <sys/sysmacros.h>
43#include <sys/types.h>
44#include <sys/mkdev.h>
45#include <sys/vtoc.h>
46#include <sys/open.h>
47#include <sys/file.h>
48#include <vm/page.h>
49#include <sys/callb.h>
50#include <sys/disp.h>
51#include <sys/modctl.h>
52#include <sys/errno.h>
53#include <sys/door.h>
54#include <sys/lvm/mdmn_commd.h>
55#include <sys/lvm/md_hotspares.h>
56
57#include <sys/lvm/mdvar.h>
58#include <sys/lvm/md_names.h>
59
60#include <sys/ddi.h>
61#include <sys/proc.h>
62#include <sys/sunddi.h>
63#include <sys/esunddi.h>
64
65#include <sys/sysevent.h>
66#include <sys/sysevent/eventdefs.h>
67
68#include <sys/sysevent/svm.h>
69#include <sys/lvm/md_basic.h>
70
71
72/*
73 * Machine specific Hertz is kept here
74 */
75extern clock_t			md_hz;
76
77/*
78 * Externs.
79 */
80extern int			(*mdv_strategy_tstpnt)(buf_t *, int, void*);
81extern major_t			md_major;
82extern unit_t			md_nunits;
83extern set_t			md_nsets;
84extern md_set_t			md_set[];
85extern md_set_io_t		md_set_io[];
86extern md_ops_t			**md_ops;
87extern md_ops_t			*md_opslist;
88extern ddi_modhandle_t		*md_mods;
89extern dev_info_t		*md_devinfo;
90
91extern md_krwlock_t		md_unit_array_rw;
92extern kmutex_t			md_mx;
93extern kcondvar_t		md_cv;
94
95extern md_krwlock_t		hsp_rwlp;
96extern md_krwlock_t		ni_rwlp;
97
98extern int			md_num_daemons;
99extern int			md_status;
100extern int			md_ioctl_cnt;
101extern int			md_mtioctl_cnt;
102
103extern struct metatransops	metatransops;
104extern md_event_queue_t		*md_event_queue;
105extern md_resync_t		md_cpr_resync;
106extern int			md_done_daemon_threads;
107extern int			md_ff_daemon_threads;
108
109
110extern mddb_set_t	*mddb_setenter(set_t setno, int flag, int *errorcodep);
111extern void		mddb_setexit(mddb_set_t *s);
112extern void		*lookup_entry(struct nm_next_hdr *, set_t,
113				side_t, mdkey_t, md_dev64_t, int);
114extern struct nm_next_hdr	*get_first_record(set_t, int, int);
115
116struct mdq_anchor	md_done_daemon; /* done request queue */
117struct mdq_anchor	md_mstr_daemon; /* mirror error, WOW requests */
118struct mdq_anchor	md_mhs_daemon;	/* mirror hotspare requests queue */
119struct mdq_anchor	md_hs_daemon;	/* raid hotspare requests queue */
120struct mdq_anchor	md_ff_daemonq;	/* failfast request queue */
121struct mdq_anchor	md_mirror_daemon; /* mirror owner queue */
122struct mdq_anchor	md_mirror_io_daemon; /* mirror owner i/o queue */
123struct mdq_anchor	md_mirror_rs_daemon; /* mirror resync done queue */
124struct mdq_anchor	md_sp_daemon;	/* soft-part error daemon queue */
125struct mdq_anchor	md_mto_daemon;	/* mirror timeout daemon queue */
126
127int md_done_daemon_threads = 1;	/* threads for md_done_daemon requestq */
128int md_mstr_daemon_threads = 1;	/* threads for md_mstr_daemon requestq */
129int md_mhs_daemon_threads = 1;	/* threads for md_mhs_daemon requestq */
130int md_hs_daemon_threads = 1;	/* threads for md_hs_daemon requestq */
131int md_ff_daemon_threads = 3;	/* threads for md_ff_daemon requestq */
132int md_mirror_daemon_threads = 1; /* threads for md_mirror_daemon requestq */
133int md_sp_daemon_threads = 1;	/* threads for md_sp_daemon requestq */
134int md_mto_daemon_threads = 1;	/* threads for md_mto_daemon requestq */
135
136#ifdef DEBUG
137/* Flag to switch on debug messages */
138int md_release_reacquire_debug = 0;	/* debug flag */
139#endif
140
141/*
142 *
143 * md_daemon_queues is a table of pointers to the request queues and the
144 * number of threads associated with each request queue.
145 * When the number of threads is set to 1, the order of execution is
146 * sequential.
147 * The number of threads for each queue has been defined as a global
148 * variable to enable kernel tuning.
149 *
150 */
151
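/*
 * A minimal tuning illustration (the variable is the failfast thread count
 * defined above; the value 5 is arbitrary): the per-queue thread counts can
 * be changed from /etc/system, e.g.
 *
 *	set md:md_ff_daemon_threads = 5
 *
 * A count of 1 preserves sequential execution for that queue.
 */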
152#define	MD_DAEMON_QUEUES 11
153
154md_requestq_entry_t md_daemon_queues[MD_DAEMON_QUEUES] = {
155	{&md_done_daemon, &md_done_daemon_threads},
156	{&md_mstr_daemon, &md_mstr_daemon_threads},
157	{&md_hs_daemon, &md_hs_daemon_threads},
158	{&md_ff_daemonq, &md_ff_daemon_threads},
159	{&md_mirror_daemon, &md_mirror_daemon_threads},
160	{&md_mirror_io_daemon, &md_mirror_daemon_threads},
161	{&md_mirror_rs_daemon, &md_mirror_daemon_threads},
162	{&md_sp_daemon, &md_sp_daemon_threads},
163	{&md_mhs_daemon, &md_mhs_daemon_threads},
164	{&md_mto_daemon, &md_mto_daemon_threads},
165	{0, 0}
166};
167
168/*
169 * Number of times a message is retried before issuing a warning to the operator
170 */
171#define	MD_MN_WARN_INTVL	10
172
173/*
174 * Setting the retry count to one (pre-decremented) so that we actually do no
175 * retries when committing/deleting an mddb rec. The underlying disk driver
176 * does several retries to check if the disk is really dead or not, so there
177 * is no reason for us to retry on top of the driver's retries.
178 */
179
180uint_t			md_retry_cnt = 1; /* global so it can be patched */
181
182/*
183 * How many times to try to do the door_ki_upcall() in mdmn_ksend_message.
184 * Again, made patchable here should it prove useful.
185 */
186uint_t			md_send_retry_limit = 30;
187
188/*
189 * Bug # 1212146
190 * Before this change the user had to pass in a short aligned buffer because of
191 * problems in some underlying device drivers.  This problem seems to have been
192 * corrected in the underlying drivers so we will default to not requiring any
193 * alignment.  If the user needs to check for a specific alignment,
194 * md_uio_alignment_mask may be set in /etc/system to accomplish this.  To get
195 * the behavior before this fix, set md_uio_alignment_mask to 1; to check for
196 * word alignment, set it to 3; for double-word alignment, set it to 7;
197 * and so on.
198 *
199 * [Other part of fix is in function md_chk_uio()]
200 */
201static int		md_uio_alignment_mask = 0;
202
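/*
 * Illustrative only (the value is an example): to have md_chk_uio() enforce
 * word (4-byte) alignment on raw I/O buffers, /etc/system could contain:
 *
 *	set md:md_uio_alignment_mask = 3
 */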
203/*
204 * for md_dev64_t translation
205 */
206struct md_xlate_table		*md_tuple_table;
207struct md_xlate_major_table	*md_major_tuple_table;
208int				md_tuple_length;
209uint_t				md_majortab_len;
210
211/* Function declarations */
212
213static int md_create_probe_rqlist(md_probedev_impl_t *plist,
214			daemon_queue_t **hdr, intptr_t (*probe_test)());
215
216/*
217 * manipulate global status
218 */
219void
220md_set_status(int bits)
221{
222	mutex_enter(&md_mx);
223	md_status |= bits;
224	mutex_exit(&md_mx);
225}
226
227void
228md_clr_status(int bits)
229{
230	mutex_enter(&md_mx);
231	md_status &= ~bits;
232	mutex_exit(&md_mx);
233}
234
235int
236md_get_status()
237{
238	int result;
239	mutex_enter(&md_mx);
240	result = md_status;
241	mutex_exit(&md_mx);
242	return (result);
243}
244
245void
246md_set_setstatus(set_t setno, int bits)
247{
248	ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
249
250	mutex_enter(&md_mx);
251	md_set[setno].s_status |= bits;
252	mutex_exit(&md_mx);
253}
254
255void
256md_clr_setstatus(set_t setno, int bits)
257{
258	ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
259
260	mutex_enter(&md_mx);
261	md_set[setno].s_status &= ~bits;
262	mutex_exit(&md_mx);
263}
264
265uint_t
266md_get_setstatus(set_t setno)
267{
268	uint_t result;
269
270	ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
271
272	mutex_enter(&md_mx);
273	result = md_set[setno].s_status;
274	mutex_exit(&md_mx);
275	return (result);
276}
277
278/*
279 * md_unit_readerlock_common:
280 * -------------------------
281 * Mark the given unit as having a reader reference. Block until any
282 * writer references have been released.
283 *
284 * Input:
285 *	ui		unit reference
286 *	lock_held	0 => ui_mx needs to be grabbed
287 *			1 => ui_mx already held
288 * Output:
289 *	mm_unit_t corresponding to unit structure
290 *	ui->ui_readercnt incremented
291 */
292static void *
293md_unit_readerlock_common(mdi_unit_t *ui, int lock_held)
294{
295	uint_t	flag = MD_UL_WRITER | MD_UL_WANABEWRITER;
296
297	if (!lock_held)
298		mutex_enter(&ui->ui_mx);
299	while (ui->ui_lock & flag) {
300		if (panicstr) {
301			if (ui->ui_lock & MD_UL_WRITER)
302				panic("md: writer lock is held");
303			break;
304		}
305		cv_wait(&ui->ui_cv, &ui->ui_mx);
306	}
307	ui->ui_readercnt++;
308	if (!lock_held)
309		mutex_exit(&ui->ui_mx);
310	return (MD_UNIT(ui->ui_link.ln_id));
311}
312
313void *
314md_unit_readerlock(mdi_unit_t *ui)
315{
316	return (md_unit_readerlock_common(ui, 0));
317}
318
319/*
320 * md_unit_writerlock_common:
321 * -------------------------
322 * Acquire a unique writer reference. Causes previous readers to drain.
323 * Blocks if a writer reference already exists or if a previous reader/writer
324 * dropped the lock to allow a ksend_message to be dispatched.
325 *
326 * Input:
327 *	ui		unit reference
328 *	lock_held	0 => grab ui_mx
329 *			1 => ui_mx already held on entry
330 * Output:
331 *	mm_unit_t reference
332 */
333static void *
334md_unit_writerlock_common(mdi_unit_t *ui, int lock_held)
335{
336	uint_t	flag = MD_UL_WRITER;
337
338	if (panicstr)
339		panic("md: writer lock not allowed");
340
341	if (!lock_held)
342		mutex_enter(&ui->ui_mx);
343
344	while ((ui->ui_lock & flag) || (ui->ui_readercnt != 0)) {
345		ui->ui_wanabecnt++;
346		ui->ui_lock |= MD_UL_WANABEWRITER;
347		cv_wait(&ui->ui_cv, &ui->ui_mx);
348		if (--ui->ui_wanabecnt == 0)
349			ui->ui_lock &= ~MD_UL_WANABEWRITER;
350	}
351	ui->ui_lock |= MD_UL_WRITER;
352	ui->ui_owner = curthread;
353
354	if (!lock_held)
355		mutex_exit(&ui->ui_mx);
356	return (MD_UNIT(ui->ui_link.ln_id));
357}
358
359void *
360md_unit_writerlock(mdi_unit_t *ui)
361{
362	return (md_unit_writerlock_common(ui, 0));
363}
364
365/*
366 * md_unit_readerexit_common:
367 * -------------------------
368 * Release the readerlock for the specified unit. If the reader count reaches
369 * zero and there are waiting writers (MD_UL_WANABEWRITER set) wake them up.
370 *
371 * Input:
372 *	ui		unit reference
373 *	lock_held	0 => ui_mx needs to be acquired
374 *			1 => ui_mx already held
375 */
376static void
377md_unit_readerexit_common(mdi_unit_t *ui, int lock_held)
378{
379	if (!lock_held)
380		mutex_enter(&ui->ui_mx);
381	ASSERT((ui->ui_lock & MD_UL_WRITER) == 0);
382	ASSERT(ui->ui_readercnt != 0);
383	ui->ui_readercnt--;
384	if ((ui->ui_wanabecnt != 0) && (ui->ui_readercnt == 0))
385		cv_broadcast(&ui->ui_cv);
386
387	if (!lock_held)
388		mutex_exit(&ui->ui_mx);
389}
390
391void
392md_unit_readerexit(mdi_unit_t *ui)
393{
394	md_unit_readerexit_common(ui, 0);
395}
396
397/*
398 * md_unit_writerexit_common:
399 * -------------------------
400 * Release the writerlock currently held on the unit. Wake any threads waiting
401 * on becoming reader or writer (MD_UL_WANABEWRITER set).
402 *
403 * Input:
404 *	ui		unit reference
405 *	lock_held	0 => ui_mx to be acquired
406 *			1 => ui_mx already held
407 */
408static void
409md_unit_writerexit_common(mdi_unit_t *ui, int lock_held)
410{
411	if (!lock_held)
412		mutex_enter(&ui->ui_mx);
413	ASSERT((ui->ui_lock & MD_UL_WRITER) != 0);
414	ASSERT(ui->ui_readercnt == 0);
415	ui->ui_lock &= ~MD_UL_WRITER;
416	ui->ui_owner = NULL;
417
418	cv_broadcast(&ui->ui_cv);
419	if (!lock_held)
420		mutex_exit(&ui->ui_mx);
421}
422
423void
424md_unit_writerexit(mdi_unit_t *ui)
425{
426	md_unit_writerexit_common(ui, 0);
427}
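/*
 * A minimal usage sketch (the calling module and unit variable are
 * hypothetical): read-only inspection of a unit is bracketed by the reader
 * lock, state changes by the writer lock:
 *
 *	md_unit_t *un = (md_unit_t *)md_unit_readerlock(ui);
 *	... inspect un ...
 *	md_unit_readerexit(ui);
 *
 *	un = (md_unit_t *)md_unit_writerlock(ui);
 *	... modify un ...
 *	md_unit_writerexit(ui);
 */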
428
429void *
430md_io_readerlock(mdi_unit_t *ui)
431{
432	md_io_lock_t	*io = ui->ui_io_lock;
433
434	ASSERT(io);  /* checks case where no io lock allocated */
435	mutex_enter(&io->io_mx);
436	while (io->io_lock & (MD_UL_WRITER | MD_UL_WANABEWRITER)) {
437		if (panicstr) {
438			if (io->io_lock & MD_UL_WRITER)
439				panic("md: writer lock is held");
440			break;
441		}
442		cv_wait(&io->io_cv, &io->io_mx);
443	}
444	io->io_readercnt++;
445	mutex_exit(&io->io_mx);
446	return (MD_UNIT(ui->ui_link.ln_id));
447}
448
449void *
450md_io_writerlock(mdi_unit_t *ui)
451{
452	md_io_lock_t	*io = ui->ui_io_lock;
453
454	ASSERT(io);  /* checks case where no io lock allocated */
455	if (panicstr)
456		panic("md: writer lock not allowed");
457
458	mutex_enter(&io->io_mx);
459	while ((io->io_lock & MD_UL_WRITER) || (io->io_readercnt != 0)) {
460		io->io_wanabecnt++;
461		io->io_lock |= MD_UL_WANABEWRITER;
462		cv_wait(&io->io_cv, &io->io_mx);
463		if (--io->io_wanabecnt == 0)
464			io->io_lock &= ~MD_UL_WANABEWRITER;
465	}
466	io->io_lock |= MD_UL_WRITER;
467	io->io_owner = curthread;
468
469	mutex_exit(&io->io_mx);
470	return (MD_UNIT(ui->ui_link.ln_id));
471}
472
473void
474md_io_readerexit(mdi_unit_t *ui)
475{
476	md_io_lock_t	*io = ui->ui_io_lock;
477
478	mutex_enter(&io->io_mx);
479	ASSERT((io->io_lock & MD_UL_WRITER) == 0);
480	ASSERT(io->io_readercnt != 0);
481	io->io_readercnt--;
482	if ((io->io_wanabecnt != 0) && (io->io_readercnt == 0)) {
483		cv_broadcast(&io->io_cv);
484	}
485	mutex_exit(&io->io_mx);
486}
487
488void
489md_io_writerexit(mdi_unit_t *ui)
490{
491	md_io_lock_t	*io = ui->ui_io_lock;
492
493	mutex_enter(&io->io_mx);
494	ASSERT((io->io_lock & MD_UL_WRITER) != 0);
495	ASSERT(io->io_readercnt == 0);
496	io->io_lock &= ~MD_UL_WRITER;
497	io->io_owner = NULL;
498
499	cv_broadcast(&io->io_cv);
500	mutex_exit(&io->io_mx);
501}
502
503/*
504 * Attempt to grab that set of locks defined as global.
505 * A mask containing the set of global locks that are owned upon
506 * entry is input.  Any additional global locks are then grabbed.
507 * This keeps the caller from having to know the set of global
508 * locks.
509 */
510static int
511md_global_lock_enter(int global_locks_owned_mask)
512{
513
514	/*
515	 * The current implementation has been verified by inspection
516	 * and test to be deadlock free.  If another global lock is
517	 * added, changing the algorithm used by this function should
518	 * be considered.  With more than 2 locks it is difficult to
519	 * guarantee that locks are being acquired in the correct order.
520	 * The safe approach would be to drop all of the locks that are
521	 * owned at function entry and then reacquire all of the locks
522	 * in the order defined by the lock hierarchy.
523	 */
524	mutex_enter(&md_mx);
525	if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
526		while ((md_mtioctl_cnt != 0) ||
527		    (md_status & MD_GBL_IOCTL_LOCK)) {
528			if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
529				mutex_exit(&md_mx);
530				return (EINTR);
531			}
532		}
533		md_status |= MD_GBL_IOCTL_LOCK;
534		md_ioctl_cnt++;
535	}
536	if (!(global_locks_owned_mask & MD_GBL_HS_LOCK)) {
537		while (md_status & MD_GBL_HS_LOCK) {
538			if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
539				md_status &= ~MD_GBL_IOCTL_LOCK;
540				mutex_exit(&md_mx);
541				return (EINTR);
542			}
543		}
544		md_status |= MD_GBL_HS_LOCK;
545	}
546	mutex_exit(&md_mx);
547	return (0);
548}
549
550/*
551 * Release the set of global locks that were grabbed in md_global_lock_enter
552 * that were not already owned by the calling thread.  The set of previously
553 * owned global locks is passed in as a mask parameter.
554 */
555static int
556md_global_lock_exit(int global_locks_owned_mask, int code,
557	int flags, mdi_unit_t *ui)
558{
559	mutex_enter(&md_mx);
560
561	/* If MT ioctl, decrement md_mtioctl_cnt */
562	if ((flags & MD_MT_IOCTL)) {
563		md_mtioctl_cnt--;
564	} else {
565		if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
566			/* clear the lock and decrement count */
567			ASSERT(md_ioctl_cnt == 1);
568			md_ioctl_cnt--;
569			md_status &= ~MD_GBL_IOCTL_LOCK;
570		}
571		if (!(global_locks_owned_mask & MD_GBL_HS_LOCK))
572			md_status &= ~MD_GBL_HS_LOCK;
573	}
574	if (flags & MD_READER_HELD)
575		md_unit_readerexit(ui);
576	if (flags & MD_WRITER_HELD)
577		md_unit_writerexit(ui);
578	if (flags & MD_IO_HELD)
579		md_io_writerexit(ui);
580	if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
581		rw_exit(&md_unit_array_rw.lock);
582	}
583	cv_broadcast(&md_cv);
584	mutex_exit(&md_mx);
585
586	return (code);
587}
588
589/*
590 * The two functions, md_ioctl_lock_enter and md_ioctl_lock_exit, make
591 * use of the md_global_lock_{enter|exit} functions to avoid duplication
592 * of code.  They rely upon the fact that the locks that are specified in
593 * the input mask are not acquired or freed.  If this algorithm changes
594 * as described in the block comment at the beginning of md_global_lock_enter
595 * then it will be necessary to change these 2 functions.  Otherwise these
596 * functions will be grabbing and holding global locks unnecessarily.
597 */
598int
599md_ioctl_lock_enter(void)
600{
601	/* grab only the ioctl lock */
602	return (md_global_lock_enter(~MD_GBL_IOCTL_LOCK));
603}
604
605/*
606 * If md_ioctl_lock_exit is being called at the end of an ioctl before
607 * returning to user space, then ioctl_end is set to 1.
608 * Otherwise, the ioctl lock is being dropped in the middle of handling
609 * an ioctl and will be reacquired before the end of the ioctl.
610 * Do not attempt to process the MN diskset mddb parse flags unless
611 * ioctl_end is true - otherwise a deadlock situation could arise.
612 */
613int
614md_ioctl_lock_exit(int code, int flags, mdi_unit_t *ui, int ioctl_end)
615{
616	int				ret_val;
617	uint_t				status;
618	mddb_set_t			*s;
619	int				i;
620	int				err;
621	md_mn_msg_mddb_parse_t		*mddb_parse_msg;
622	md_mn_kresult_t			*kresult;
623	mddb_lb_t			*lbp;
624	int				rval = 1;
625	int				flag;
626
627	/* release only the ioctl lock */
628	ret_val = md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);
629
630	/*
631	 * If md_ioctl_lock_exit is being called with a possible lock held
632	 * (ioctl_end is 0), then don't check the MN disksets since the
633	 * call to mddb_setenter may cause a lock ordering deadlock.
634	 */
635	if (!ioctl_end)
636		return (ret_val);
637
638	/*
639	 * Walk through disksets to see if there is a MN diskset that
640	 * has messages that need to be sent.  Set must be snarfed and
641	 * be a MN diskset in order to be checked.
642	 *
643	 * In a MN diskset, this routine may send messages to the
644	 * rpc.mdcommd in order to have the slave nodes re-parse parts
645	 * of the mddb.  Messages can only be sent with no locks held,
646	 * so if an mddb change occurred while the ioctl lock was held, this
647	 * routine must send the messages.
648	 */
649	for (i = 1; i < md_nsets; i++) {
650		status = md_get_setstatus(i);
651
652		/* Set must be snarfed and be a MN diskset */
653		if ((status & (MD_SET_SNARFED | MD_SET_MNSET)) !=
654		    (MD_SET_SNARFED | MD_SET_MNSET))
655			continue;
656
657		/* Grab set lock so that set can't change */
658		if ((s = mddb_setenter(i, MDDB_MUSTEXIST, &err)) == NULL)
659			continue;
660
661		lbp = s->s_lbp;
662
663		/* Re-get set status now that lock is held */
664		status = md_get_setstatus(i);
665
666		/*
667		 * If MN parsing block flag is set - continue to next set.
668		 *
669		 * If s_mn_parseflags_sending is non-zero, then another thread
670		 * is already sending a parse message, so just
671		 * release the set mutex.  If this ioctl had caused an mddb
672		 * change that results in a parse message to be generated,
673		 * the thread that is currently sending a parse message would
674		 * generate the additional parse message.
675		 *
676		 * If s_mn_parseflags_sending is zero then loop until
677		 * s_mn_parseflags is 0 (until there are no more
678		 * messages to send).
679		 * While s_mn_parseflags is non-zero,
680		 *	put snapshot of parse_flags in s_mn_parseflags_sending
681		 *	set s_mn_parseflags to zero
682		 *	release set mutex
683		 *	send message
684		 *	re-grab set mutex
685		 *	set s_mn_parseflags_sending to zero
686		 *
687		 * If set is STALE, send message with NO_LOG flag so that
688		 * rpc.mdcommd won't attempt to log message to non-writeable
689		 * replica.
690		 */
691		mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t),
692		    KM_SLEEP);
693		while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
694		    (s->s_mn_parseflags & MDDB_PARSE_MASK) &&
695		    (!(status & MD_SET_MNPARSE_BLK))) {
696
697			/* Grab snapshot of parse flags */
698			s->s_mn_parseflags_sending = s->s_mn_parseflags;
699			s->s_mn_parseflags = 0;
700
701			mutex_exit(&md_set[(s)->s_setno].s_dbmx);
702
703			/*
704			 * Send the message to the slaves to re-parse
705			 * the indicated portions of the mddb. Send the status
706			 * of the 50 mddbs in this set so that slaves know
707			 * which mddbs the master node thinks are 'good'.
708			 * Otherwise, a slave may reparse, but from the wrong
709			 * replica.
710			 */
711			mddb_parse_msg->msg_parse_flags =
712			    s->s_mn_parseflags_sending;
713
714			for (i = 0; i < MDDB_NLB; i++) {
715				mddb_parse_msg->msg_lb_flags[i] =
716				    lbp->lb_locators[i].l_flags;
717			}
718			kresult = kmem_zalloc(sizeof (md_mn_kresult_t),
719			    KM_SLEEP);
720			while (rval != 0) {
721				flag = 0;
722				if (status & MD_SET_STALE)
723					flag |= MD_MSGF_NO_LOG;
724				rval = mdmn_ksend_message(s->s_setno,
725				    MD_MN_MSG_MDDB_PARSE, flag, 0,
726				    (char *)mddb_parse_msg,
727				    sizeof (md_mn_msg_mddb_parse_t), kresult);
728				/* if the node hasn't yet joined, it's Ok. */
729				if ((!MDMN_KSEND_MSG_OK(rval, kresult)) &&
730				    (kresult->kmmr_comm_state !=
731				    MDMNE_NOT_JOINED)) {
732					mdmn_ksend_show_error(rval, kresult,
733					    "MD_MN_MSG_MDDB_PARSE");
734					cmn_err(CE_WARN, "md_ioctl_lock_exit: "
735					    "Unable to send mddb update "
736					    "message to other nodes in "
737					    "diskset %s\n", s->s_setname);
738					rval = 1;
739				}
740			}
741			kmem_free(kresult, sizeof (md_mn_kresult_t));
742
743			/*
744			 * Re-grab mutex to clear sending field and to
745			 * see if another parse message needs to be generated.
746			 */
747			mutex_enter(&md_set[(s)->s_setno].s_dbmx);
748			s->s_mn_parseflags_sending = 0;
749		}
750		kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
751		mutex_exit(&md_set[(s)->s_setno].s_dbmx);
752	}
753	return (ret_val);
754}
755
756/*
757 * Called when in an ioctl and need readerlock.
758 */
759void *
760md_ioctl_readerlock(IOLOCK *lock, mdi_unit_t *ui)
761{
762	ASSERT(lock != NULL);
763	lock->l_ui = ui;
764	lock->l_flags |= MD_READER_HELD;
765	return (md_unit_readerlock_common(ui, 0));
766}
767
768/*
769 * Called when in an ioctl and need writerlock.
770 */
771void *
772md_ioctl_writerlock(IOLOCK *lock, mdi_unit_t *ui)
773{
774	ASSERT(lock != NULL);
775	lock->l_ui = ui;
776	lock->l_flags |= MD_WRITER_HELD;
777	return (md_unit_writerlock_common(ui, 0));
778}
779
780void *
781md_ioctl_io_lock(IOLOCK *lock, mdi_unit_t *ui)
782{
783	ASSERT(lock != NULL);
784	lock->l_ui = ui;
785	lock->l_flags |= MD_IO_HELD;
786	return (md_io_writerlock(ui));
787}
788
789void
790md_ioctl_readerexit(IOLOCK *lock)
791{
792	ASSERT(lock != NULL);
793	lock->l_flags &= ~MD_READER_HELD;
794	md_unit_readerexit(lock->l_ui);
795}
796
797void
798md_ioctl_writerexit(IOLOCK *lock)
799{
800	ASSERT(lock != NULL);
801	lock->l_flags &= ~MD_WRITER_HELD;
802	md_unit_writerexit(lock->l_ui);
803}
804
805void
806md_ioctl_io_exit(IOLOCK *lock)
807{
808	ASSERT(lock != NULL);
809	lock->l_flags &= ~MD_IO_HELD;
810	md_io_writerexit(lock->l_ui);
811}
812
813/*
814 * md_ioctl_releaselocks:
815 * --------------------
816 * Release the unit locks that are held and stop subsequent
817 * md_unit_reader/writerlock calls from progressing. This allows the caller
818 * to send messages across the cluster when running in a multinode
819 * environment.
820 * ioctl originated locks (via md_ioctl_readerlock/md_ioctl_writerlock) are
821 * allowed to progress as normal. This is required as these typically are
822 * invoked by the message handler that may be called while a unit lock is
823 * marked as released.
824 *
825 * On entry:
826 *	variety of unit locks may be held including ioctl lock
827 *
828 * On exit:
829 *      locks released and unit structure updated to prevent subsequent reader/
830 *      writer locks being acquired until md_ioctl_reacquirelocks is called
831 */
832void
833md_ioctl_releaselocks(int code, int flags, mdi_unit_t *ui)
834{
835	/* This actually releases the locks. */
836	(void) md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);
837}
838
839/*
840 * md_ioctl_reacquirelocks:
841 * ----------------------
842 * Reacquire the locks that were held when md_ioctl_releaselocks
843 * was called.
844 *
845 * On entry:
846 *      No unit locks held
847 * On exit:
848 *	locks held that were held at md_ioctl_releaselocks time including
849 *	the ioctl lock.
850 */
851void
852md_ioctl_reacquirelocks(int flags, mdi_unit_t *ui)
853{
854	if (flags & MD_MT_IOCTL) {
855		mutex_enter(&md_mx);
856		md_mtioctl_cnt++;
857		mutex_exit(&md_mx);
858	} else {
859		while (md_ioctl_lock_enter() == EINTR)
860			;
861	}
862	if (flags & MD_ARRAY_WRITER) {
863		rw_enter(&md_unit_array_rw.lock, RW_WRITER);
864	} else if (flags & MD_ARRAY_READER) {
865		rw_enter(&md_unit_array_rw.lock, RW_READER);
866	}
867	if (ui != (mdi_unit_t *)NULL) {
868		if (flags & MD_IO_HELD) {
869			(void) md_io_writerlock(ui);
870		}
871
872		mutex_enter(&ui->ui_mx);
873		if (flags & MD_READER_HELD) {
874			(void) md_unit_readerlock_common(ui, 1);
875		} else if (flags & MD_WRITER_HELD) {
876			(void) md_unit_writerlock_common(ui, 1);
877		}
878		/* Wake up any blocked readerlock() calls */
879		cv_broadcast(&ui->ui_cv);
880		mutex_exit(&ui->ui_mx);
881	}
882}
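/*
 * A minimal usage sketch (hypothetical MN ioctl handler): the unit and global
 * locks are dropped around a cluster message and then reacquired in the same
 * shape before the handler continues:
 *
 *	md_ioctl_releaselocks(0, lockp->l_flags, ui);
 *	rval = mdmn_ksend_message(setno, msgtype, 0, 0,
 *	    (char *)data, size, kresult);
 *	md_ioctl_reacquirelocks(lockp->l_flags, ui);
 */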
883
884void
885md_ioctl_droplocks(IOLOCK *lock)
886{
887	mdi_unit_t	*ui;
888	int		flags;
889
890	ASSERT(lock != NULL);
891	ui = lock->l_ui;
892	flags = lock->l_flags;
893	if (flags & MD_READER_HELD) {
894		lock->l_flags &= ~MD_READER_HELD;
895		md_unit_readerexit(ui);
896	}
897	if (flags & MD_WRITER_HELD) {
898		lock->l_flags &= ~MD_WRITER_HELD;
899		md_unit_writerexit(ui);
900	}
901	if (flags & MD_IO_HELD) {
902		lock->l_flags &= ~MD_IO_HELD;
903		md_io_writerexit(ui);
904	}
905	if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
906		lock->l_flags &= ~(MD_ARRAY_WRITER | MD_ARRAY_READER);
907		rw_exit(&md_unit_array_rw.lock);
908	}
909}
910
911void
912md_array_writer(IOLOCK *lock)
913{
914	ASSERT(lock != NULL);
915	lock->l_flags |= MD_ARRAY_WRITER;
916	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
917}
918
919void
920md_array_reader(IOLOCK *lock)
921{
922	ASSERT(lock != NULL);
923	lock->l_flags |= MD_ARRAY_READER;
924	rw_enter(&md_unit_array_rw.lock, RW_READER);
925}
926
927/*
928 * Called when in an ioctl and need opencloselock.
929 * Sets flags in lockp for READER_HELD.
930 */
931void *
932md_ioctl_openclose_enter(IOLOCK *lockp, mdi_unit_t *ui)
933{
934	void	*un;
935
936	ASSERT(lockp != NULL);
937	mutex_enter(&ui->ui_mx);
938	while (ui->ui_lock & MD_UL_OPENORCLOSE)
939		cv_wait(&ui->ui_cv, &ui->ui_mx);
940	ui->ui_lock |= MD_UL_OPENORCLOSE;
941
942	/* Maintain mutex across the readerlock call */
943	lockp->l_ui = ui;
944	lockp->l_flags |= MD_READER_HELD;
945	un = md_unit_readerlock_common(ui, 1);
946	mutex_exit(&ui->ui_mx);
947
948	return (un);
949}
950
951/*
952 * Clears reader lock using md_ioctl instead of md_unit
953 * and updates lockp.
954 */
955void
956md_ioctl_openclose_exit(IOLOCK *lockp)
957{
958	mdi_unit_t	*ui;
959
960	ASSERT(lockp != NULL);
961	ui = lockp->l_ui;
962	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
963
964	md_ioctl_readerexit(lockp);
965
966	mutex_enter(&ui->ui_mx);
967	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
968
969	cv_broadcast(&ui->ui_cv);
970	mutex_exit(&ui->ui_mx);
971}
972
973/*
974 * Clears reader lock using md_ioctl instead of md_unit
975 * and updates lockp.
976 * Does not acquire or release the ui_mx lock since the calling
977 * routine has already acquired this lock.
978 */
979void
980md_ioctl_openclose_exit_lh(IOLOCK *lockp)
981{
982	mdi_unit_t	*ui;
983
984	ASSERT(lockp != NULL);
985	ui = lockp->l_ui;
986	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
987
988	lockp->l_flags &= ~MD_READER_HELD;
989	md_unit_readerexit_common(lockp->l_ui, 1);
990
991	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
992	cv_broadcast(&ui->ui_cv);
993}
994
995void *
996md_unit_openclose_enter(mdi_unit_t *ui)
997{
998	void	*un;
999
1000	mutex_enter(&ui->ui_mx);
1001	while (ui->ui_lock & (MD_UL_OPENORCLOSE))
1002		cv_wait(&ui->ui_cv, &ui->ui_mx);
1003	ui->ui_lock |= MD_UL_OPENORCLOSE;
1004
1005	/* Maintain mutex across the readerlock call */
1006	un = md_unit_readerlock_common(ui, 1);
1007	mutex_exit(&ui->ui_mx);
1008
1009	return (un);
1010}
1011
1012void
1013md_unit_openclose_exit(mdi_unit_t *ui)
1014{
1015	md_unit_readerexit(ui);
1016
1017	mutex_enter(&ui->ui_mx);
1018	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
1019	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
1020
1021	cv_broadcast(&ui->ui_cv);
1022	mutex_exit(&ui->ui_mx);
1023}
1024
1025/*
1026 * Drop the openclose and readerlocks without acquiring or
1027 * releasing the ui_mx lock since the calling routine has
1028 * already acquired this lock.
1029 */
1030void
1031md_unit_openclose_exit_lh(mdi_unit_t *ui)
1032{
1033	md_unit_readerexit_common(ui, 1);
1034	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
1035	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
1036	cv_broadcast(&ui->ui_cv);
1037}
1038
1039int
1040md_unit_isopen(
1041	mdi_unit_t	*ui
1042)
1043{
1044	int		isopen;
1045
1046	/* check status */
1047	mutex_enter(&ui->ui_mx);
1048	isopen = ((ui->ui_lock & MD_UL_OPEN) ? 1 : 0);
1049	mutex_exit(&ui->ui_mx);
1050	return (isopen);
1051}
1052
1053int
1054md_unit_incopen(
1055	minor_t		mnum,
1056	int		flag,
1057	int		otyp
1058)
1059{
1060	mdi_unit_t	*ui = MDI_UNIT(mnum);
1061	int		err = 0;
1062
1063	/* check type and flags */
1064	ASSERT(ui != NULL);
1065	mutex_enter(&ui->ui_mx);
1066	if ((otyp < 0) || (otyp >= OTYPCNT)) {
1067		err = EINVAL;
1068		goto out;
1069	}
1070	if (((flag & FEXCL) && (ui->ui_lock & MD_UL_OPEN)) ||
1071	    (ui->ui_lock & MD_UL_EXCL)) {
1072		err = EBUSY;
1073		goto out;
1074	}
1075
1076	/* count and flag open */
1077	ui->ui_ocnt[otyp]++;
1078	ui->ui_lock |= MD_UL_OPEN;
1079	if (flag & FEXCL)
1080		ui->ui_lock |= MD_UL_EXCL;
1081
1082	/* setup kstat, return success */
1083	mutex_exit(&ui->ui_mx);
1084	md_kstat_init(mnum);
1085	return (0);
1086
1087	/* return error */
1088out:
1089	mutex_exit(&ui->ui_mx);
1090	return (err);
1091}
1092
1093int
1094md_unit_decopen(
1095	minor_t		mnum,
1096	int		otyp
1097)
1098{
1099	mdi_unit_t	*ui = MDI_UNIT(mnum);
1100	int		err = 0;
1101	unsigned	i;
1102
1103	/* check type and flags */
1104	ASSERT(ui != NULL);
1105	mutex_enter(&ui->ui_mx);
1106	if ((otyp < 0) || (otyp >= OTYPCNT)) {
1107		err = EINVAL;
1108		goto out;
1109	} else if (ui->ui_ocnt[otyp] == 0) {
1110		err = ENXIO;
1111		goto out;
1112	}
1113
1114	/* count and flag closed */
1115	if (otyp == OTYP_LYR)
1116		ui->ui_ocnt[otyp]--;
1117	else
1118		ui->ui_ocnt[otyp] = 0;
1119	ui->ui_lock &= ~MD_UL_OPEN;
1120	for (i = 0; (i < OTYPCNT); ++i)
1121		if (ui->ui_ocnt[i] != 0)
1122			ui->ui_lock |= MD_UL_OPEN;
1123	if (! (ui->ui_lock & MD_UL_OPEN))
1124		ui->ui_lock &= ~MD_UL_EXCL;
1125
1126	/* teardown kstat, return success */
1127	if (! (ui->ui_lock & MD_UL_OPEN)) {
1128		mutex_exit(&ui->ui_mx);
1129		md_kstat_destroy(mnum);
1130		return (0);
1131	}
1132
1133	/* return success */
1134out:
1135	mutex_exit(&ui->ui_mx);
1136	return (err);
1137}
1138
1139md_dev64_t
1140md_xlate_targ_2_mini(md_dev64_t targ_devt)
1141{
1142	dev32_t		mini_32_devt, targ_32_devt;
1143	int		i;
1144
1145	/*
1146	 * check to see if we're in an upgrade situation
1147	 * if we are not in upgrade just return the input device
1148	 */
1149
1150	if (!MD_UPGRADE)
1151		return (targ_devt);
1152
1153	targ_32_devt = md_cmpldev(targ_devt);
1154
1155	i = 0;
1156	while (i != md_tuple_length) {
1157		if (md_tuple_table[i].targ_devt == targ_32_devt) {
1158			mini_32_devt = md_tuple_table[i].mini_devt;
1159			return (md_expldev((md_dev64_t)mini_32_devt));
1160		}
1161		i++;
1162	}
1163	return (NODEV64);
1164}
1165
1166md_dev64_t
1167md_xlate_mini_2_targ(md_dev64_t mini_devt)
1168{
1169	dev32_t		mini_32_devt, targ_32_devt;
1170	int		i;
1171
1172	if (!MD_UPGRADE)
1173		return (mini_devt);
1174
1175	mini_32_devt = md_cmpldev(mini_devt);
1176
1177	i = 0;
1178	while (i != md_tuple_length) {
1179		if (md_tuple_table[i].mini_devt == mini_32_devt) {
1180			targ_32_devt = md_tuple_table[i].targ_devt;
1181			return (md_expldev((md_dev64_t)targ_32_devt));
1182		}
1183		i++;
1184	}
1185	return (NODEV64);
1186}
1187
1188void
1189md_xlate_free(int size)
1190{
1191	kmem_free(md_tuple_table, size);
1192}
1193
1194char *
1195md_targ_major_to_name(major_t maj)
1196{
1197	char *drv_name = NULL;
1198	int	i;
1199
1200	if (!MD_UPGRADE)
1201		return (ddi_major_to_name(maj));
1202
1203	for (i = 0; i < md_majortab_len; i++) {
1204		if (md_major_tuple_table[i].targ_maj == maj) {
1205			drv_name = md_major_tuple_table[i].drv_name;
1206			break;
1207		}
1208	}
1209	return (drv_name);
1210}
1211
1212major_t
1213md_targ_name_to_major(char *drv_name)
1214{
1215	major_t maj;
1216	int	i;
1217
1218	maj = md_getmajor(NODEV64);
1219	if (!MD_UPGRADE)
1220		return (ddi_name_to_major(drv_name));
1221
1222	for (i = 0; i < md_majortab_len; i++) {
1223		if ((strcmp(md_major_tuple_table[i].drv_name,
1224		    drv_name)) == 0) {
1225			maj = md_major_tuple_table[i].targ_maj;
1226			break;
1227		}
1228	}
1229
1230	return (maj);
1231}
1232
1233void
1234md_majortab_free()
1235{
1236	size_t	sz;
1237	int	i;
1238
1239	for (i = 0; i < md_majortab_len; i++) {
1240		freestr(md_major_tuple_table[i].drv_name);
1241	}
1242
1243	sz = md_majortab_len * sizeof (struct md_xlate_major_table);
1244	kmem_free(md_major_tuple_table, sz);
1245}
1246
1247/* functions return a pointer to a function which returns an intptr_t */
1248
1249intptr_t (*
1250md_get_named_service(md_dev64_t dev, int modindex, char *name,
1251	intptr_t (*Default)()))()
1252{
1253	mdi_unit_t		*ui;
1254	md_named_services_t	*sp;
1255	int			i;
1256
1257	/*
1258	 * Return the first named service found.
1259	 * Use this path when it is known that there is only
1260	 * one named service possible (e.g., hotspare interface)
1261	 */
1262	if ((dev == NODEV64) && (modindex == ANY_SERVICE)) {
1263		for (i = 0; i < MD_NOPS; i++) {
1264			if (md_ops[i] == NULL) {
1265				continue;
1266			}
1267			sp = md_ops[i]->md_services;
1268			if (sp == NULL)
1269				continue;
1270			while (sp->md_service != NULL) {
1271				if (strcmp(name, sp->md_name) == 0)
1272					return (sp->md_service);
1273				sp++;
1274			}
1275		}
1276		return (Default);
1277	}
1278
1279	/*
1280	 * Return the named service for the given modindex.
1281	 * This is used if there are multiple possible named services
1282	 * and each one needs to be called (e.g., poke hotspares)
1283	 */
1284	if (dev == NODEV64) {
1285		if (modindex >= MD_NOPS)
1286			return (Default);
1287
1288		if (md_ops[modindex] == NULL)
1289			return (Default);
1290
1291		sp = md_ops[modindex]->md_services;
1292		if (sp == NULL)
1293			return (Default);
1294
1295		while (sp->md_service != NULL) {
1296			if (strcmp(name, sp->md_name) == 0)
1297				return (sp->md_service);
1298			sp++;
1299		}
1300		return (Default);
1301	}
1302
1303	/*
1304	 * Return the named service for this md_dev64_t
1305	 */
1306	if (md_getmajor(dev) != md_major)
1307		return (Default);
1308
1309	if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) ||
1310	    (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits))
1311		return (NULL);
1312
1313
1314	if ((ui = MDI_UNIT(md_getminor(dev))) == NULL)
1315		return (NULL);
1316
1317	sp = md_ops[ui->ui_opsindex]->md_services;
1318	if (sp == NULL)
1319		return (Default);
1320	while (sp->md_service != NULL) {
1321		if (strcmp(name, sp->md_name) == 0)
1322			return (sp->md_service);
1323		sp++;
1324	}
1325	return (Default);
1326}
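/*
 * A minimal usage sketch (the service name shown is illustrative): a module
 * looks up a peer's entry point by name, falling back to a supplied default
 * when no module registers that service:
 *
 *	intptr_t	(*svc)();
 *
 *	svc = md_get_named_service(NODEV64, ANY_SERVICE,
 *	    "poke hotspares", (intptr_t (*)())NULL);
 *	if (svc != NULL)
 *		(void) (*svc)();
 */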
1327
1328/*
1329 * md_daemon callback routine
1330 */
1331boolean_t
1332callb_md_cpr(void *arg, int code)
1333{
1334	callb_cpr_t *cp = (callb_cpr_t *)arg;
1335	int ret = 0;				/* assume success */
1336
1337	mutex_enter(cp->cc_lockp);
1338
1339	switch (code) {
1340	case CB_CODE_CPR_CHKPT:
1341		/*
1342		 * Check for active resync threads
1343		 */
1344		mutex_enter(&md_cpr_resync.md_resync_mutex);
1345		if ((md_cpr_resync.md_mirror_resync > 0) ||
1346		    (md_cpr_resync.md_raid_resync > 0)) {
1347			mutex_exit(&md_cpr_resync.md_resync_mutex);
1348			cmn_err(CE_WARN, "There are Solaris Volume Manager "
1349			    "synchronization threads running.");
1350			cmn_err(CE_WARN, "Please try system suspension at "
1351			    "a later time.");
1352			ret = -1;
1353			break;
1354		}
1355		mutex_exit(&md_cpr_resync.md_resync_mutex);
1356
1357		cp->cc_events |= CALLB_CPR_START;
1358		while (!(cp->cc_events & CALLB_CPR_SAFE))
1359			/* cv_timedwait() returns -1 if it times out. */
1360			if ((ret = cv_timedwait(&cp->cc_callb_cv, cp->cc_lockp,
1361			    lbolt + CPR_KTHREAD_TIMEOUT_SEC * hz)) == -1)
1362				break;
1363			break;
1364
1365	case CB_CODE_CPR_RESUME:
1366		cp->cc_events &= ~CALLB_CPR_START;
1367		cv_signal(&cp->cc_stop_cv);
1368		break;
1369	}
1370	mutex_exit(cp->cc_lockp);
1371	return (ret != -1);
1372}
1373
1374void
1375md_daemon(int pass_thru, mdq_anchor_t *anchor)
1376{
1377	daemon_queue_t  *dq;
1378	callb_cpr_t	cprinfo;
1379
1380	if (pass_thru && (md_get_status() & MD_GBL_DAEMONS_LIVE))
1381		return;
1382	/*
1383	 * Register cpr callback
1384	 */
1385	CALLB_CPR_INIT(&cprinfo, &anchor->a_mx, callb_md_cpr, "md_daemon");
1386
1387	/*CONSTCOND*/
1388	while (1) {
1389		mutex_enter(&anchor->a_mx);
1390		while ((dq = anchor->dq.dq_next) == &(anchor->dq)) {
1391			if (pass_thru) {
1392				/*
1393				 * CALLB_CPR_EXIT Will do
1394				 * mutex_exit(&anchor->a_mx)
1395				 */
1396				CALLB_CPR_EXIT(&cprinfo);
1397				return;
1398			}
1399			if (md_get_status() & MD_GBL_DAEMONS_DIE) {
1400				mutex_exit(&anchor->a_mx);
1401				mutex_enter(&md_mx);
1402				md_num_daemons--;
1403				mutex_exit(&md_mx);
1404				/*
1405				 * CALLB_CPR_EXIT will do
1406				 * mutex_exit(&anchor->a_mx)
1407				 */
1408				mutex_enter(&anchor->a_mx);
1409				CALLB_CPR_EXIT(&cprinfo);
1410				thread_exit();
1411			}
1412			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1413			cv_wait(&anchor->a_cv, &anchor->a_mx);
1414			CALLB_CPR_SAFE_END(&cprinfo, &anchor->a_mx);
1415		}
1416		dq->dq_prev->dq_next = dq->dq_next;
1417		dq->dq_next->dq_prev = dq->dq_prev;
1418		dq->dq_prev = dq->dq_next = NULL;
1419		anchor->dq.qlen--;
1420		mutex_exit(&anchor->a_mx);
1421		(*(dq->dq_call))(dq);
1422	}
1423	/*NOTREACHED*/
1424}
1425
1426/*
1427 * daemon_request:
1428 *
1429 * Adds requests to appropriate requestq which is
1430 * anchored by *anchor.
1431 * The request is the first element of a doubly linked circular list.
1432 * When the request is a single element, the forward and backward
1433 * pointers MUST point to the element itself.
1434 */
1435
1436void
1437daemon_request(mdq_anchor_t *anchor, void (*func)(),
1438				daemon_queue_t *request, callstyle_t style)
1439{
1440	daemon_queue_t *rqtp;
1441	int i = 0;
1442
1443	rqtp = request;
1444	if (style == REQ_OLD) {
1445		ASSERT((rqtp->dq_next == NULL) && (rqtp->dq_prev == NULL));
1446		/* set it to the new style */
1447		rqtp->dq_prev = rqtp->dq_next = rqtp;
1448	}
1449	ASSERT((rqtp->dq_next != NULL) && (rqtp->dq_prev != NULL));
1450
1451	/* scan the list and add the function to each element */
1452
1453	do {
1454		rqtp->dq_call = func;
1455		i++;
1456		rqtp = rqtp->dq_next;
1457	} while (rqtp != request);
1458
1459	/* save pointer to tail of the request list */
1460	rqtp = request->dq_prev;
1461
1462	mutex_enter(&anchor->a_mx);
1463	/* stats */
1464	anchor->dq.qlen += i;
1465	anchor->dq.treqs += i;
1466	anchor->dq.maxq_len = (anchor->dq.qlen > anchor->dq.maxq_len) ?
1467	    anchor->dq.qlen : anchor->dq.maxq_len;
1468
1469	/* now add the list to request queue */
1470	request->dq_prev = anchor->dq.dq_prev;
1471	rqtp->dq_next = &anchor->dq;
1472	anchor->dq.dq_prev->dq_next = request;
1473	anchor->dq.dq_prev = rqtp;
1474	cv_broadcast(&anchor->a_cv);
1475	mutex_exit(&anchor->a_mx);
1476}
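/*
 * A minimal usage sketch (the handler and request structure are
 * hypothetical): a single-element request queued in the old style has NULL
 * links, which daemon_request() converts to a one-entry circular list before
 * appending it to the anchor and waking a daemon thread:
 *
 *	req->dq.dq_next = req->dq.dq_prev = NULL;
 *	daemon_request(&md_done_daemon, my_done_handler,
 *	    (daemon_queue_t *)req, REQ_OLD);
 */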
1477
1478void
1479mddb_commitrec_wrapper(mddb_recid_t recid)
1480{
1481	int sent_log = 0;
1482	uint_t retry = md_retry_cnt;
1483	set_t	setno;
1484
1485	while (mddb_commitrec(recid)) {
1486		if (! sent_log) {
1487			cmn_err(CE_WARN,
1488			    "md: state database commit failed");
1489			sent_log = 1;
1490		}
1491		delay(md_hz);
1492
1493		/*
1494		 * Setting the retry count to one (pre-decremented) so that we
1495		 * actually do no retries when committing/deleting an mddb rec.
1496		 * The underlying disk driver does several retries to check
1497		 * if the disk is really dead or not, so there
1498		 * is no reason for us to retry on top of the driver's retries.
1499		 */
1500
1501		if (--retry == 0) {
1502			setno = mddb_getsetnum(recid);
1503			if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
1504				panic(
1505				    "md: Panic due to lack of DiskSuite state\n"
1506				    " database replicas. Fewer than 50%% of "
1507				    "the total were available,\n so panic to "
1508				    "ensure data integrity.");
1509			} else {
1510				panic("md: state database problem");
1511			}
1512			/*NOTREACHED*/
1513		}
1514	}
1515}
1516
1517void
1518mddb_commitrecs_wrapper(mddb_recid_t *recids)
1519{
1520	int sent_log = 0;
1521	uint_t retry = md_retry_cnt;
1522	set_t	setno;
1523
1524	while (mddb_commitrecs(recids)) {
1525		if (! sent_log) {
1526			cmn_err(CE_WARN,
1527			    "md: state database commit failed");
1528			sent_log = 1;
1529		}
1530		delay(md_hz);
1531
1532		/*
1533		 * Setting the retry count to one (pre-decremented) so that we
1534		 * actually do no retries when committing/deleting an mddb rec.
1535		 * The underlying disk driver does several retries to check
1536		 * if the disk is really dead or not, so there
1537		 * is no reason for us to retry on top of the driver's retries.
1538		 */
1539
1540		if (--retry == 0) {
1541			/*
1542			 * since all the records are part of the same set
1543			 * use the first one to get setno
1544			 */
1545			setno = mddb_getsetnum(*recids);
1546			if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
1547				panic(
1548				    "md: Panic due to lack of DiskSuite state\n"
1549				    " database replicas. Fewer than 50%% of "
1550				    "the total were available,\n so panic to "
1551				    "ensure data integrity.");
1552			} else {
1553				panic("md: state database problem");
1554			}
1555			/*NOTREACHED*/
1556		}
1557	}
1558}
1559
1560void
1561mddb_deleterec_wrapper(mddb_recid_t recid)
1562{
1563	int sent_log = 0;
1564	uint_t retry = md_retry_cnt;
1565	set_t	setno;
1566
1567	while (mddb_deleterec(recid)) {
1568		if (! sent_log) {
1569			cmn_err(CE_WARN,
1570			    "md: state database delete failed");
1571			sent_log = 1;
1572		}
1573		delay(md_hz);
1574
1575		/*
1576		 * Setting the retry count to one (pre-decremented) so that we
1577		 * actually do no retries when committing/deleting an mddb rec.
1578		 * The underlying disk driver does several retries to check
1579		 * if the disk is really dead or not, so there
1580		 * is no reason for us to retry on top of the driver's retries.
1581		 */
1582
1583		if (--retry == 0) {
1584			setno = mddb_getsetnum(recid);
1585			if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
1586				panic(
1587				    "md: Panic due to lack of DiskSuite state\n"
1588				    " database replicas. Fewer than 50%% of "
1589				    "the total were available,\n so panic to "
1590				    "ensure data integrity.");
1591			} else {
1592				panic("md: state database problem");
1593			}
1594			/*NOTREACHED*/
1595		}
1596	}
1597}
1598
1599/*
1600 * md_holdset_enter is called in order to hold the set in its
1601 * current state (loaded, unloaded, snarfed, unsnarfed, etc)
1602 * until md_holdset_exit is called.  This is used by the mirror
1603 * code to mark the set as HOLD so that the set won't be
1604 * unloaded while hotspares are being allocated in check_4_hotspares.
1605 * The original fix to the mirror code to hold the set was to call
1606 * md_haltsnarf_enter, but this will block all ioctls and ioctls
1607 * must work for a MN diskset while hotspares are allocated.
1608 */
1609void
1610md_holdset_enter(set_t setno)
1611{
1612	mutex_enter(&md_mx);
1613	while (md_set[setno].s_status & MD_SET_HOLD)
1614		cv_wait(&md_cv, &md_mx);
1615	md_set[setno].s_status |= MD_SET_HOLD;
1616	mutex_exit(&md_mx);
1617}
1618
1619void
1620md_holdset_exit(set_t setno)
1621{
1622	mutex_enter(&md_mx);
1623	md_set[setno].s_status &= ~MD_SET_HOLD;
1624	cv_broadcast(&md_cv);
1625	mutex_exit(&md_mx);
1626}
1627
1628/*
1629 * Returns a 0 if this thread marked the set as HOLD (success),
1630 * returns a -1 if set was already marked HOLD (failure).
1631 * Used by the release_set code to see if set is marked HOLD.
1632 * HOLD is set by a daemon when hotspares are being allocated
1633 * to mirror units.
1634 */
1635int
1636md_holdset_testandenter(set_t setno)
1637{
1638	mutex_enter(&md_mx);
1639	if (md_set[setno].s_status & MD_SET_HOLD) {
1640		mutex_exit(&md_mx);
1641		return (-1);
1642	}
1643	md_set[setno].s_status |= MD_SET_HOLD;
1644	mutex_exit(&md_mx);
1645	return (0);
1646}
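/*
 * Illustrative pairing: a caller that marks the set held, either by blocking
 * in md_holdset_enter() or by a successful (zero) return from
 * md_holdset_testandenter(), must later drop the hold with
 * md_holdset_exit(setno), otherwise later holders and the release_set path
 * will continue to wait or back off.
 */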
1647
1648void
1649md_haltsnarf_enter(set_t setno)
1650{
1651	mutex_enter(&md_mx);
1652	while (md_set[setno].s_status & MD_SET_SNARFING)
1653		cv_wait(&md_cv, &md_mx);
1654
1655	md_set[setno].s_status |= MD_SET_SNARFING;
1656	mutex_exit(&md_mx);
1657}
1658
1659void
1660md_haltsnarf_exit(set_t setno)
1661{
1662	mutex_enter(&md_mx);
1663	md_set[setno].s_status &= ~MD_SET_SNARFING;
1664	cv_broadcast(&md_cv);
1665	mutex_exit(&md_mx);
1666}
1667
1668void
1669md_haltsnarf_wait(set_t setno)
1670{
1671	mutex_enter(&md_mx);
1672	while (md_set[setno].s_status & MD_SET_SNARFING)
1673		cv_wait(&md_cv, &md_mx);
1674	mutex_exit(&md_mx);
1675}
1676
1677/*
1678 * ASSUMED that the md_unit_array_rw WRITER lock is held.
1679 */
1680int
1681md_halt_set(set_t setno, enum md_haltcmd cmd)
1682{
1683	int	i, err;
1684
1685	if (md_set[setno].s_un == NULL || md_set[setno].s_ui == NULL) {
1686		return (0);
1687	}
1688
1689	if ((cmd == MD_HALT_CHECK) || (cmd == MD_HALT_ALL)) {
1690		for (i = 0; i < MD_NOPS; i++) {
1691			if (md_ops[i] == NULL)
1692				continue;
1693			if ((*(md_ops[i]->md_halt))(MD_HALT_CLOSE, setno)) {
1694				for (--i; i > 0; --i) {
1695					if (md_ops[i] == NULL)
1696						continue;
1697					(void) (*(md_ops[i]->md_halt))
1698					    (MD_HALT_OPEN, setno);
1699				}
1700				return (EBUSY);
1701			}
1702		}
1703
1704		for (i = 0; i < MD_NOPS; i++) {
1705			if (md_ops[i] == NULL)
1706				continue;
1707			if ((*(md_ops[i]->md_halt))(MD_HALT_CHECK, setno)) {
1708				for (i = 0; i < MD_NOPS; i++) {
1709					if (md_ops[i] == NULL)
1710						continue;
1711					(void) (*(md_ops[i]->md_halt))
1712					    (MD_HALT_OPEN, setno);
1713				}
1714				return (EBUSY);
1715			}
1716		}
1717	}
1718
1719	if ((cmd == MD_HALT_DOIT) || (cmd == MD_HALT_ALL)) {
1720		for (i = 0; i < MD_NOPS; i++) {
1721			if (md_ops[i] == NULL)
1722				continue;
1723			err = (*(md_ops[i]->md_halt))(MD_HALT_DOIT, setno);
1724			if (err != 0)
1725				cmn_err(CE_NOTE,
1726				    "md: halt failed for %s, error %d",
1727				    md_ops[i]->md_driver.md_drivername, err);
1728		}
1729
1730		/*
1731		 * Unload the devid namespace if it is loaded
1732		 */
1733		md_unload_namespace(setno, NM_DEVID);
1734		md_unload_namespace(setno, 0L);
1735		md_clr_setstatus(setno, MD_SET_SNARFED);
1736	}
1737
1738	return (0);
1739}
1740
1741int
1742md_halt(int global_locks_owned_mask)
1743{
1744	set_t			i, j;
1745	int			err;
1746	int			init_queues;
1747	md_requestq_entry_t	*rqp;
1748	md_ops_t		**pops, *ops, *lops;
1749	ddi_modhandle_t		mod;
1750	char			*name;
1751
1752	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
1753
1754	/*
1755	 * Grab all of the global locks that are not
1756	 * already owned to ensure that there isn't another
1757	 * thread trying to access a global resource
1758	 * while the halt is in progress
1759	 */
1760	if (md_global_lock_enter(global_locks_owned_mask) == EINTR)
1761		return (EINTR);
1762
1763	for (i = 0; i < md_nsets; i++)
1764		md_haltsnarf_enter(i);
1765
1766	/*
1767	 * Kill the daemon threads.
1768	 */
1769	init_queues = ((md_get_status() & MD_GBL_DAEMONS_LIVE) ? FALSE : TRUE);
1770	md_clr_status(MD_GBL_DAEMONS_LIVE);
1771	md_set_status(MD_GBL_DAEMONS_DIE);
1772
1773	rqp = &md_daemon_queues[0];
1774	i = 0;
1775	while (!NULL_REQUESTQ_ENTRY(rqp)) {
1776		cv_broadcast(&rqp->dispq_headp->a_cv);
1777		rqp = &md_daemon_queues[++i];
1778	}
1779
1780	mutex_enter(&md_mx);
1781	while (md_num_daemons != 0) {
1782		mutex_exit(&md_mx);
1783		delay(md_hz);
1784		mutex_enter(&md_mx);
1785	}
1786	mutex_exit(&md_mx);
1787	md_clr_status(MD_GBL_DAEMONS_DIE);
1788
1789	for (i = 0; i < md_nsets; i++)
1790		/*
1791		 * Only call into md_halt_set if s_un / s_ui are both set.
1792		 * If they are NULL this set hasn't been accessed, so it's
1793		 * pointless performing the call.
1794		 */
1795		if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
1796			if (md_halt_set(i, MD_HALT_CHECK)) {
1797				if (md_start_daemons(init_queues))
1798					cmn_err(CE_WARN,
1799					    "md: restart of daemon threads "
1800					    "failed");
1801				for (j = 0; j < md_nsets; j++)
1802					md_haltsnarf_exit(j);
1803
1804				return (md_global_lock_exit(
1805				    global_locks_owned_mask, EBUSY,
1806				    MD_ARRAY_WRITER, NULL));
1807			}
1808		}
1809
1810	/*
1811	 * if we get here we are going to do it
1812	 */
1813	for (i = 0; i < md_nsets; i++) {
1814		/*
1815		 * Only call into md_halt_set if s_un / s_ui are both set.
1816		 * If they are NULL this set hasn't been accessed, so it's
1817		 * pointless performing the call.
1818		 */
1819		if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
1820			err = md_halt_set(i, MD_HALT_DOIT);
1821			if (err != 0)
1822				cmn_err(CE_NOTE,
1823				    "md: halt failed set %u, error %d",
1824				    (unsigned)i, err);
1825		}
1826	}
1827
1828	/*
1829	 * issue a halt unload to each module to indicate that it
1830	 * is about to be unloaded.  Each module is called once; the set
1831	 * argument has no meaning at this point in time.
1832	 */
1833	for (i = 0; i < MD_NOPS; i++) {
1834		if (md_ops[i] == NULL)
1835			continue;
1836		err = (*(md_ops[i]->md_halt))(MD_HALT_UNLOAD, 0);
1837		if (err != 0)
1838			cmn_err(CE_NOTE,
1839			    "md: halt failed for %s, error %d",
1840			    md_ops[i]->md_driver.md_drivername, err);
1841	}
1842
1843	/* ddi_modclose the submodules */
1844	for (i = 0; i < MD_NOPS; i++) {
1845		/* skip if not open */
1846		if ((md_ops[i] == NULL) || (md_mods[i] == NULL))
1847			continue;
1848
1849		/* find and unlink from md_opslist */
1850		ops = md_ops[i];
1851		mod = md_mods[i];
1852		pops = &md_opslist;
1853		for (lops = *pops; lops;
1854		    pops = &lops->md_next, lops = *pops) {
1855			if (lops == ops) {
1856				*pops = ops->md_next;
1857				ops->md_next = NULL;
1858				break;
1859			}
1860		}
1861
1862		/* uninitialize */
1863		name = ops->md_driver.md_drivername;
1864		md_ops[i] = NULL;
1865		md_mods[i] = NULL;
1866		ops->md_selfindex = 0;
1867		ops->md_driver.md_drivername[0] = '\0';
1868		rw_destroy(&ops->md_link_rw.lock);
1869
1870		/* close */
1871		err = ddi_modclose(mod);
1872		if (err != 0)
1873			cmn_err(CE_NOTE,
1874			    "md: halt close failed for %s, error %d",
1875			    name ? name : "UNKNOWN", err);
1876	}
1877
1878	/* Unload the database */
1879	mddb_unload();
1880
1881	md_set_status(MD_GBL_HALTED);	/* we are ready to be unloaded */
1882
1883	for (i = 0; i < md_nsets; i++)
1884		md_haltsnarf_exit(i);
1885
1886	return (md_global_lock_exit(global_locks_owned_mask, 0,
1887	    MD_ARRAY_WRITER, NULL));
1888}
1889
1890/*
1891 * md_layered_open() is an internal routine only for SVM modules.
1892 * So the input device will be a md_dev64_t, because all SVM modules internally
1893 * work with that device type.
1894 * ddi routines on the other hand work with dev_t. So, if we call any ddi
1895 * routines from here we first have to convert that device into a dev_t.
1896 */
1897
1898int
1899md_layered_open(
1900	minor_t		mnum,
1901	md_dev64_t	*dev,
1902	int		md_oflags
1903)
1904{
1905	int		flag = (FREAD | FWRITE);
1906	cred_t		*cred_p = kcred;
1907	major_t		major;
1908	int		err;
1909	dev_t		ddi_dev = md_dev64_to_dev(*dev);
1910
1911	if (ddi_dev == NODEV)
1912		return (ENODEV);
1913
1914	major = getmajor(ddi_dev);
1915
1916	/* metadevice */
1917	if (major == md_major) {
1918		mdi_unit_t	*ui;
1919
1920		/* open underlying driver */
1921		mnum = getminor(ddi_dev);
1922
1923		ui = MDI_UNIT(mnum);
1924		if (md_ops[ui->ui_opsindex]->md_open != NULL) {
1925			int ret = (*md_ops[ui->ui_opsindex]->md_open)(&ddi_dev,
1926			    flag, OTYP_LYR, cred_p, md_oflags);
1927			/*
1928			 * As open() may change the device,
1929			 * send this info back to the caller.
1930			 */
1931			*dev = md_expldev(ddi_dev);
1932			return (ret);
1933		}
1934
1935		/* or do it ourselves */
1936		(void) md_unit_openclose_enter(ui);
1937		err = md_unit_incopen(mnum, flag, OTYP_LYR);
1938		md_unit_openclose_exit(ui);
1939		/* convert our ddi_dev back to the dev we were given */
1940		*dev = md_expldev(ddi_dev);
1941		return (err);
1942	}
1943
1944	/*
1945	 * Open the regular device; since open() may change the dev_t, give the
1946	 * new dev_t back to the caller.
1947	 */
1948	err = dev_lopen(&ddi_dev, flag, OTYP_LYR, cred_p);
1949	*dev = md_expldev(ddi_dev);
1950	return (err);
1951}
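/*
 * A minimal usage sketch (the caller context is hypothetical and MD_OFLG_NULL
 * is assumed to be the "no special open flags" value): a submodule opens one
 * of its components by passing the 64-bit device by reference, since the open
 * may translate it:
 *
 *	md_dev64_t tmpdev = component_dev;
 *	err = md_layered_open(mnum, &tmpdev, MD_OFLG_NULL);
 *	component_dev = tmpdev;
 */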
1952
1953/*
1954 * md_layered_close() is an internal routine only for SVM modules.
1955 * So the input device will be a md_dev64_t, because all SVM modules internally
1956 * work with that device type.
1957 * ddi routines on the other hand work with dev_t. So, if we call any ddi
1958 * routines from here we first have to convert that device into a dev_t.
1959 */
1960void
1961md_layered_close(
1962	md_dev64_t	dev,
1963	int		md_cflags
1964)
1965{
1966	int		flag = (FREAD | FWRITE);
1967	cred_t		*cred_p = kcred;
1968	dev_t		ddi_dev = md_dev64_to_dev(dev);
1969	major_t		major = getmajor(ddi_dev);
1970	minor_t		mnum = getminor(ddi_dev);
1971
1972	/* metadevice */
1973	if (major == md_major) {
1974		mdi_unit_t	*ui = MDI_UNIT(mnum);
1975
1976		/* close underlying driver */
1977		if (md_ops[ui->ui_opsindex]->md_close != NULL) {
1978			(*md_ops[ui->ui_opsindex]->md_close)
1979			    (ddi_dev, flag, OTYP_LYR, cred_p, md_cflags);
1980			return;
1981		}
1982
1983		/* or do it ourselves */
1984		(void) md_unit_openclose_enter(ui);
1985		(void) md_unit_decopen(mnum, OTYP_LYR);
1986		md_unit_openclose_exit(ui);
1987		return;
1988	}
1989
1990	/* close regular device */
1991	(void) dev_lclose(ddi_dev, flag, OTYP_LYR, cred_p);
1992}
1993
1994/*
1995 * saves a little code in mdstrategy
1996 */
1997int
1998errdone(mdi_unit_t *ui, struct buf *bp, int err)
1999{
2000	if ((bp->b_error = err) != 0)
2001		bp->b_flags |= B_ERROR;
2002	else
2003		bp->b_resid = bp->b_bcount;
2004	md_unit_readerexit(ui);
2005	md_biodone(bp);
2006	return (1);
2007}
2008
2009static int	md_write_label = 0;
2010
2011int
2012md_checkbuf(mdi_unit_t *ui, md_unit_t *un, buf_t *bp)
2013{
2014	diskaddr_t endblk;
2015	set_t	setno = MD_UN2SET(un);
2016
2017	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
2018	    (! (bp->b_flags & B_READ)))
2019		return (errdone(ui, bp, EROFS));
2020	/*
2021	 * Check early for unreasonable block number.
2022	 *
2023	 * b_blkno is defined as a daddr_t, which is typedef'd to a long.
2024	 * A problem occurs if b_blkno has bit 31 set and un_total_blocks
2025	 * doesn't; b_blkno is then compared as a negative number, which is
2026	 * always less than a positive one.
2027	 */
2028	if ((u_longlong_t)bp->b_lblkno > (u_longlong_t)un->c.un_total_blocks)
2029		return (errdone(ui, bp, EINVAL));
2030
2031	if (bp->b_lblkno == un->c.un_total_blocks)
2032		return (errdone(ui, bp, 0));
2033
2034	/*
2035	 * make sure we don't clobber any labels
2036	 */
2037	if ((bp->b_lblkno == 0) && (! (bp->b_flags & B_READ)) &&
2038	    (un->c.un_flag & MD_LABELED) && (! md_write_label)) {
2039		cmn_err(CE_NOTE, "md: %s: write to label",
2040		    md_shortname(getminor(bp->b_edev)));
2041		return (errdone(ui, bp, EINVAL));
2042	}
2043
2044	bp->b_resid = 0;
2045	endblk = (diskaddr_t)(bp->b_lblkno +
2046	    howmany(bp->b_bcount, DEV_BSIZE) - 1);
2047
2048	if (endblk > (un->c.un_total_blocks - 1)) {
2049		bp->b_resid = dbtob(endblk - (un->c.un_total_blocks - 1));
2050		endblk = un->c.un_total_blocks - 1;
2051		bp->b_bcount -= bp->b_resid;
2052	}
2053	return (0);
2054}
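
/*
 * Illustrative worked example (not part of the original source): if a unit
 * has un_total_blocks = 100 and a write arrives with b_lblkno = 98 and
 * b_bcount = 4 * DEV_BSIZE, then endblk = 98 + 4 - 1 = 101, which is past
 * the last valid block (99).  md_checkbuf() trims the request, leaving
 * b_resid = dbtob(2) and b_bcount = 2 * DEV_BSIZE, so that only blocks 98
 * and 99 are transferred.
 */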
2055
2056/*
2057 * init_requestq: initializes the request queue and creates its threads.
2058 *	return value =  0  : invalid request queue entry or num_threads is 0
2059 *		     =  n  : n is the number of threads created.
2060 */
2061
2062int
2063init_requestq(
2064	md_requestq_entry_t *rq, /* request queue info */
2065	void (*threadfn)(),	 /* function to start the thread */
2066	caddr_t threadfn_args,	 /* args to the function */
2067	int pri,		 /* thread priority */
2068	int init_queue)		 /* flag to init queues */
2069{
2070	struct mdq_anchor *rqhead;
2071	int	i;
2072	int	num_threads;
2073
2074
2075	num_threads = *(rq->num_threadsp);
2076	rqhead = rq->dispq_headp;
2077
2078	if (NULL_REQUESTQ_ENTRY(rq) || num_threads == 0)
2079		return (0);
2080
2081	if (init_queue) {
2082		rqhead->dq.maxq_len = 0;
2083		rqhead->dq.treqs = 0;
2084		rqhead->dq.dq_next = &rqhead->dq;
2085		rqhead->dq.dq_prev = &rqhead->dq;
2086		cv_init(&rqhead->a_cv, NULL, CV_DEFAULT, NULL);
2087		mutex_init(&rqhead->a_mx, NULL, MUTEX_DEFAULT, NULL);
2088	}
2089	for (i = 0; i < num_threads; i++) {
2090		(void) thread_create(NULL, 0, threadfn, threadfn_args, 0, &p0,
2091		    TS_RUN, pri);
2092	}
2093	return (i);
2094}
2095
2096static void
2097start_daemon(struct mdq_anchor *q)
2098{
2099	md_daemon(0, q);
2100	ASSERT(0);
2101}
2102
2103/*
2104 * Creates all the md daemons.
2105 * Global:
2106 *	md_num_daemons is set to number of daemons.
2107 *	MD_GBL_DAEMONS_LIVE flag set to indicate the daemons are active.
2108 *
2109 * Return value: 0  success
2110 *		 1  failure
2111 */
2112int
2113md_start_daemons(int init_queue)
2114{
2115	md_requestq_entry_t	*rqp;
2116	int	cnt;
2117	int	i;
2118	int	retval = 0;
2119
2120
2121	if (md_get_status() & MD_GBL_DAEMONS_LIVE) {
2122		return (retval);
2123	}
2124	md_clr_status(MD_GBL_DAEMONS_DIE);
2125
2126	rqp = &md_daemon_queues[0];
2127	i = 0;
2128	while (!NULL_REQUESTQ_ENTRY(rqp)) {
2129		cnt = init_requestq(rqp, start_daemon,
2130		    (caddr_t)rqp->dispq_headp, minclsyspri, init_queue);
2131
2132		if (cnt && cnt != *rqp->num_threadsp) {
2133			retval = 1;
2134			break;
2135		}
2136		/*
2137		 * initialize variables
2138		 */
2139		md_num_daemons += cnt;
2140		rqp = &md_daemon_queues[++i];
2141	}
2142
2143	md_set_status(MD_GBL_DAEMONS_LIVE);
2144	return (retval);
2145}
2146
2147int
2148md_loadsubmod(set_t setno, char *name, int drvrid)
2149{
2150	ddi_modhandle_t	mod;
2151	md_ops_t	**pops, *ops;
2152	int		i, err;
2153
2154	/*
2155	 * See if the submodule is already ddi_modopened. If not, i is the
2156	 * index of the next empty slot.
2157	 */
2158	for (i = 0; md_ops[i] != NULL; i++) {
2159		if (strncmp(name, md_ops[i]->md_driver.md_drivername,
2160		    MD_DRIVERNAMELEN) == 0)
2161			return (i);
2162
2163		if (i == (MD_NOPS - 1))
2164			return (-1);
2165	}
2166
2167	if (drvrid < 0) {
2168		/* Do not try to add any records to the DB when stale. */
2169		if (md_get_setstatus(setno) & MD_SET_STALE)
2170			return (-1);
2171		drvrid = md_setshared_name(setno, name, 0L);
2172	}
2173
2174	if (drvrid < 0)
2175		return (-1);
2176
2177	/* open and import the md_ops of the submodules */
2178	mod = ddi_modopen(name, KRTLD_MODE_FIRST, &err);
2179	if (mod == NULL) {
2180		cmn_err(CE_WARN, "md_loadsubmod: "
2181		    "unable to ddi_modopen %s, error %d\n", name, err);
2182		return (-1);
2183	}
2184	pops = ddi_modsym(mod, "md_interface_ops", &err);
2185	if (pops == NULL) {
2186		cmn_err(CE_WARN, "md_loadsubmod: "
2187		    "unable to import md_interface_ops from %s, error %d\n",
2188		    name, err);
2189		(void) ddi_modclose(mod);
2190		return (-1);
2191	}
2192
2193	/* ddi_modsym returns pointer to md_interface_ops in submod */
2194	ops = *pops;
2195
2196	/* initialize */
2197	ops->md_selfindex = i;
2198	rw_init(&ops->md_link_rw.lock, NULL, RW_DEFAULT, NULL);
2199	(void) strncpy(ops->md_driver.md_drivername, name,
2200	    MD_DRIVERNAMELEN);
2201
2202	/* plumb */
2203	md_ops[i] = ops;
2204	md_mods[i] = mod;
2205	ops->md_next = md_opslist;
2206	md_opslist = ops;
2207
2208	/* return index */
2209	return (i);
2210}
2211
2212int
2213md_getmodindex(md_driver_t *driver, int dont_load, int db_notrequired)
2214{
2215	int	i;
2216	int	modindex;
2217	char	*name = driver->md_drivername;
2218	set_t	setno = driver->md_setno;
2219	int	drvid;
2220	int	local_dont_load;
2221
2222	if (setno >= md_nsets)
2223		return (-1);
2224
2225	for (i = 0; name[i] != 0; i++)
2226		if (i == (MD_DRIVERNAMELEN -1))
2227			return (-1);
2228
2229	/*
2230	 * If set is STALE, set local_dont_load to 1 since no records
2231	 * should be added to DB when stale.
2232	 */
2233	if (md_get_setstatus(setno) & MD_SET_STALE) {
2234		local_dont_load = 1;
2235	} else {
2236		local_dont_load = dont_load;
2237	}
2238
2239	/*
2240	 * Single thread ioctl module binding with respect to
2241	 * similar code executed in md_loadsubmod that is called
2242	 * from md_snarf_db_set (which is where that path does
2243	 * its md_haltsnarf_enter call).
2244	 */
2245	md_haltsnarf_enter(setno);
2246
2247	/* See if the submodule is already ddi_modopened. */
2248	for (i = 0; md_ops[i] != NULL; i++) {
2249		if (strncmp(name, md_ops[i]->md_driver.md_drivername,
2250		    MD_DRIVERNAMELEN) == 0) {
2251			if (! local_dont_load &&
2252			    (md_getshared_key(setno, name) == MD_KEYBAD)) {
2253				if (md_setshared_name(setno, name, 0L)
2254				    == MD_KEYBAD) {
2255					if (!db_notrequired)
2256						goto err;
2257				}
2258			}
2259			md_haltsnarf_exit(setno);
2260			return (i);
2261		}
2262
2263		if (i == (MD_NOPS -1))
2264			break;
2265	}
2266
2267	if (local_dont_load)
2268		goto err;
2269
2270	drvid = ((db_notrequired) ? 0 : (int)md_getshared_key(setno, name));
2271
2272	/* ddi_modopen the submodule */
2273	modindex = md_loadsubmod(setno, name, drvid);
2274	if (modindex < 0)
2275		goto err;
2276
2277	if (md_ops[modindex]->md_snarf != NULL)
2278		(*(md_ops[modindex]->md_snarf))(MD_SNARF_DOIT, setno);
2279
2280	md_haltsnarf_exit(setno);
2281	return (modindex);
2282
2283err:	md_haltsnarf_exit(setno);
2284	return (-1);
2285}
2286
2287void
2288md_call_strategy(buf_t *bp, int flags, void *private)
2289{
2290	mdi_unit_t	*ui;
2291
2292	if (mdv_strategy_tstpnt)
2293		if ((*mdv_strategy_tstpnt)(bp, flags, private) != 0)
2294			return;
2295	if (getmajor(bp->b_edev) != md_major) {
2296		(void) bdev_strategy(bp);
2297		return;
2298	}
2299
2300	flags = (flags & MD_STR_PASSEDON) | MD_STR_NOTTOP;
2301	ui = MDI_UNIT(getminor(bp->b_edev));
2302	ASSERT(ui != NULL);
2303	(*md_ops[ui->ui_opsindex]->md_strategy)(bp, flags, private);
2304}
2305
2306/*
2307 * md_call_ioctl:
2308 * -------------
2309 * Issue the specified ioctl to the device associated with the given md_dev64_t
2310 *
2311 * Arguments:
2312 *	dev	- underlying device [md_dev64_t]
2313 *	cmd	- ioctl to perform
2314 *	data	- arguments / result location
2315 *	mode	- read/write/layered ioctl
2316 *	lockp	- lock reference
2317 *
2318 * Returns:
2319 *	0	success
2320 *	!=0	Failure (error code)
2321 */
2322int
2323md_call_ioctl(md_dev64_t dev, int cmd, void *data, int mode, IOLOCK *lockp)
2324{
2325	dev_t		device = md_dev64_to_dev(dev);
2326	int		rval;
2327	mdi_unit_t	*ui;
2328
2329	/*
2330	 * See if device is a metadevice. If not call cdev_ioctl(), otherwise
2331	 * call the ioctl entry-point in the metadevice.
2332	 */
2333	if (md_getmajor(dev) != md_major) {
2334		int	rv;
2335		rval = cdev_ioctl(device, cmd, (intptr_t)data, mode,
2336		    ddi_get_cred(), &rv);
2337	} else {
2338		ui = MDI_UNIT(md_getminor(dev));
2339		ASSERT(ui != NULL);
2340		rval = (*md_ops[ui->ui_opsindex]->md_ioctl)(device, cmd, data,
2341		    mode, lockp);
2342	}
2343	return (rval);
2344}
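
/*
 * Illustrative usage sketch (not from the original source): a submodule
 * holding the hypothetical md_dev64_t "comp_dev" for an underlying
 * component might query its media info roughly as follows:
 *
 *	struct dk_minfo	minfo;
 *
 *	if (md_call_ioctl(comp_dev, DKIOCGMEDIAINFO, &minfo,
 *	    FKIOCTL | FREAD, lockp) == 0) {
 *		... use minfo.dki_capacity ...
 *	}
 *
 * For a metadevice the call is routed to the submodule's md_ioctl entry
 * point; for any other device it goes through cdev_ioctl().
 */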
2345
2346void
2347md_rem_link(set_t setno, int id, krwlock_t *rw, md_link_t **head)
2348{
2349	md_link_t	*next;
2350	md_link_t	**pprev;
2351
2352	rw_enter(rw, RW_WRITER);
2353
2354	next = *head;
2355	pprev = head;
2356	while (next) {
2357		if ((next->ln_setno == setno) && (next->ln_id == id)) {
2358			*pprev = next->ln_next;
2359			rw_exit(rw);
2360			return;
2361		}
2362		pprev = &next->ln_next;
2363		next = next->ln_next;
2364	}
2365
2366	rw_exit(rw);
2367}
2368
2369int
2370md_dev_exists(md_dev64_t dev)
2371{
2372
2373	if (dev == NODEV64)
2374		return (0);
2375
2376	if (strcmp(ddi_major_to_name(md_getmajor(dev)), "md") != 0)
2377		return (1);
2378
2379	if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) ||
2380	    (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits))
2381		return (0);
2382
2383	if (MDI_UNIT(md_getminor(dev)) != NULL)
2384		return (1);
2385
2386	return (0);
2387}
2388
2389md_parent_t
2390md_get_parent(md_dev64_t dev)
2391{
2392	md_unit_t	*un;
2393	mdi_unit_t	*ui;
2394	md_parent_t	parent;
2395
2396	if (md_getmajor(dev) != md_major)
2397		return (MD_NO_PARENT);
2398
2399	ui = MDI_UNIT(md_getminor(dev));
2400
2401	un = (md_unit_t *)md_unit_readerlock(ui);
2402	parent = un->c.un_parent;
2403	md_unit_readerexit(ui);
2404
2405	return (parent);
2406}
2407
2408void
2409md_set_parent(md_dev64_t dev, md_parent_t parent)
2410{
2411	md_unit_t	*un;
2412	mdi_unit_t	*ui;
2413
2414	if (md_getmajor(dev) != md_major)
2415		return;
2416
2417	ui = MDI_UNIT(md_getminor(dev));
2418
2419	un = (md_unit_t *)md_unit_readerlock(ui);
2420	un->c.un_parent = parent;
2421	md_unit_readerexit(ui);
2422}
2423
2424void
2425md_reset_parent(md_dev64_t dev)
2426{
2427	md_unit_t	*un;
2428	mdi_unit_t	*ui;
2429
2430	if (md_getmajor(dev) != md_major)
2431		return;
2432
2433	ui = MDI_UNIT(md_getminor(dev));
2434
2435	un = (md_unit_t *)md_unit_readerlock(ui);
2436	un->c.un_parent = MD_NO_PARENT;
2437	md_unit_readerexit(ui);
2438}
2439
2440
2441static intptr_t (*hot_spare_interface)() = (intptr_t (*)())NULL;
2442
2443int
2444md_hot_spare_ifc(
2445	hs_cmds_t	cmd,
2446	mddb_recid_t	id,
2447	u_longlong_t	size,
2448	int		labeled,
2449	mddb_recid_t	*hs_id,
2450	mdkey_t		*key,
2451	md_dev64_t	*dev,
2452	diskaddr_t	*sblock)
2453{
2454	int		err;
2455
2456	/*
2457	 * RW lock on hot_spare_interface. We don't want it to change from
2458	 * underneath us. If hot_spare_interface is NULL we're going to
2459	 * need to set it. So we need to upgrade to a WRITER lock. If that
2460	 * doesn't work, we drop the lock and reenter as WRITER. This leaves
2461	 * a small hole during which hot_spare_interface could be modified
2462	 * so we check it for NULL again. What a pain. Then if still null
2463	 * load from md_get_named_service.
2464	 */
2465
2466	rw_enter(&hsp_rwlp.lock, RW_READER);
2467	if (hot_spare_interface == NULL) {
2468		if (rw_tryupgrade(&hsp_rwlp.lock) == 0) {
2469			rw_exit(&hsp_rwlp.lock);
2470			rw_enter(&hsp_rwlp.lock, RW_WRITER);
2471			if (hot_spare_interface != NULL) {
2472				err = ((*hot_spare_interface)
2473				    (cmd, id, size, labeled, hs_id, key, dev,
2474				    sblock));
2475				rw_exit(&hsp_rwlp.lock);
2476				return (err);
2477			}
2478		}
2479		hot_spare_interface = md_get_named_service(NODEV64, ANY_SERVICE,
2480		    "hot spare interface", 0);
2481		rw_downgrade(&hsp_rwlp.lock);
2482	}
2483
2484	if (hot_spare_interface == NULL) {
2485		cmn_err(CE_WARN, "md: no hotspare interface");
2486		rw_exit(&hsp_rwlp.lock);
2487		return (0);
2488	}
2489
2490	err = ((*hot_spare_interface)
2491	    (cmd, id, size, labeled, hs_id, key, dev, sblock));
2492	rw_exit(&hsp_rwlp.lock);
2493	return (err);
2494}
2495
2496void
2497md_clear_hot_spare_interface()
2498{
2499	rw_enter(&hsp_rwlp.lock, RW_WRITER);
2500	hot_spare_interface = NULL;
2501	rw_exit(&hsp_rwlp.lock);
2502}
2503
2504
2505static intptr_t (*notify_interface)() = (intptr_t (*)())NULL;
2506
2507int
2508md_notify_interface(
2509	md_event_cmds_t cmd,
2510	md_tags_t	tag,
2511	set_t		set,
2512	md_dev64_t	dev,
2513	md_event_type_t event
2514)
2515{
2516	int		err;
2517
2518	if (md_event_queue == NULL)
2519		return (0);
2520	rw_enter(&ni_rwlp.lock, RW_READER);
2521	if (notify_interface == NULL) {
2522		if (rw_tryupgrade(&ni_rwlp.lock) == 0) {
2523			rw_exit(&ni_rwlp.lock);
2524			rw_enter(&ni_rwlp.lock, RW_WRITER);
2525			if (notify_interface != NULL) {
2526				err = ((*notify_interface)
2527				    (cmd, tag, set, dev, event));
2528				rw_exit(&ni_rwlp.lock);
2529				return (err);
2530			}
2531		}
2532		notify_interface = md_get_named_service(NODEV64, ANY_SERVICE,
2533		    "notify interface", 0);
2534		rw_downgrade(&ni_rwlp.lock);
2535	}
2536	if (notify_interface == NULL) {
2537		cmn_err(CE_WARN, "md: no notify interface");
2538		rw_exit(&ni_rwlp.lock);
2539		return (0);
2540	}
2541	err = ((*notify_interface)(cmd, tag, set, dev, event));
2542	rw_exit(&ni_rwlp.lock);
2543	return (err);
2544}
2545
2546char *
2547obj2devname(uint32_t tag, uint_t setno, md_dev64_t dev)
2548{
2549	char		*setname;
2550	char		name[MD_MAX_CTDLEN];
2551	minor_t		mnum = md_getminor(dev);
2552	major_t		maj = md_getmajor(dev);
2553	int		rtn = 0;
2554
2555	/*
2556	 * Verify that the passed md_dev64_t refers to a valid metadevice.
2557	 * If it doesn't we can make no assumptions as to what the device
2558	 * name is. Return NULL in these cases.
2559	 */
2560	if (((maj != md_major) || (MD_MIN2UNIT(mnum) >= md_nunits)) ||
2561	    (MD_MIN2SET(mnum) >= md_nsets)) {
2562		return (NULL);
2563	}
2564
2565	setname = NULL;
2566	name[0] = '\0';
2567	switch (tag) {
2568	case SVM_TAG_HSP:
2569		if (setno == 0) {
2570			rtn = snprintf(name, sizeof (name), "hsp%u",
2571			    (unsigned)MD_MIN2UNIT(mnum));
2572		} else {
2573			setname = mddb_getsetname(setno);
2574			if (setname != NULL) {
2575				rtn = snprintf(name, sizeof (name), "%s/hsp%u",
2576				    setname, (unsigned)MD_MIN2UNIT(mnum));
2577			}
2578		}
2579		break;
2580	case SVM_TAG_DRIVE:
2581		(void) sprintf(name, "drive");
2582		break;
2583	case SVM_TAG_HOST:
2584		(void) sprintf(name, "host");
2585		break;
2586	case SVM_TAG_SET:
2587		rtn = snprintf(name, sizeof (name), "%s",
2588		    mddb_getsetname(setno));
2589		if ((name[0] == '\0') || (rtn >= sizeof (name))) {
2590			(void) sprintf(name, "diskset");
2591			rtn = 0;
2592		}
2593		break;
2594	default:
2595		rtn = snprintf(name, sizeof (name), "%s", md_shortname(mnum));
2596		break;
2597	}
2598
2599	/* Check if we got any rubbish for any of the snprintf's */
2600	if ((name[0] == '\0') || (rtn >= sizeof (name))) {
2601		return (NULL);
2602	}
2603
2604	return (md_strdup(name));
2605}
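
/*
 * Illustrative examples (not from the original source) of names built by
 * obj2devname(): a hot spare pool with unit 2 yields "hsp2" in the local
 * set and "<setname>/hsp2" in a named set, while unrecognized tags fall
 * through to md_shortname(), e.g. "d10" or "<setname>/d10".
 */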
2606
2607/* Sysevent subclass and mdnotify event type pairs */
2608struct node {
2609	char		*se_ev;
2610	md_event_type_t	md_ev;
2611};
2612
2613/*
2614 * Table must be sorted in case-sensitive ascending order of
2615 * the sysevent values.
2616 */
2617static struct node ev_table[] = {
2618	{ ESC_SVM_ADD,			EQ_ADD },
2619	{ ESC_SVM_ATTACH,		EQ_ATTACH },
2620	{ ESC_SVM_ATTACHING,		EQ_ATTACHING },
2621	{ ESC_SVM_CHANGE,		EQ_CHANGE },
2622	{ ESC_SVM_CREATE,		EQ_CREATE },
2623	{ ESC_SVM_DELETE,		EQ_DELETE },
2624	{ ESC_SVM_DETACH,		EQ_DETACH },
2625	{ ESC_SVM_DETACHING,		EQ_DETACHING },
2626	{ ESC_SVM_DRIVE_ADD,		EQ_DRIVE_ADD },
2627	{ ESC_SVM_DRIVE_DELETE,		EQ_DRIVE_DELETE },
2628	{ ESC_SVM_ENABLE,		EQ_ENABLE },
2629	{ ESC_SVM_ERRED,		EQ_ERRED },
2630	{ ESC_SVM_EXCHANGE,		EQ_EXCHANGE },
2631	{ ESC_SVM_GROW,			EQ_GROW },
2632	{ ESC_SVM_HS_CHANGED,		EQ_HS_CHANGED },
2633	{ ESC_SVM_HS_FREED,		EQ_HS_FREED },
2634	{ ESC_SVM_HOST_ADD,		EQ_HOST_ADD },
2635	{ ESC_SVM_HOST_DELETE,		EQ_HOST_DELETE },
2636	{ ESC_SVM_HOTSPARED,		EQ_HOTSPARED },
2637	{ ESC_SVM_INIT_FAILED,		EQ_INIT_FAILED },
2638	{ ESC_SVM_INIT_FATAL,		EQ_INIT_FATAL },
2639	{ ESC_SVM_INIT_START,		EQ_INIT_START },
2640	{ ESC_SVM_INIT_SUCCESS,		EQ_INIT_SUCCESS },
2641	{ ESC_SVM_IOERR,		EQ_IOERR },
2642	{ ESC_SVM_LASTERRED,		EQ_LASTERRED },
2643	{ ESC_SVM_MEDIATOR_ADD,		EQ_MEDIATOR_ADD },
2644	{ ESC_SVM_MEDIATOR_DELETE,	EQ_MEDIATOR_DELETE },
2645	{ ESC_SVM_OFFLINE,		EQ_OFFLINE },
2646	{ ESC_SVM_OK,			EQ_OK },
2647	{ ESC_SVM_ONLINE,		EQ_ONLINE },
2648	{ ESC_SVM_OPEN_FAIL,		EQ_OPEN_FAIL },
2649	{ ESC_SVM_REGEN_DONE,		EQ_REGEN_DONE },
2650	{ ESC_SVM_REGEN_FAILED,		EQ_REGEN_FAILED },
2651	{ ESC_SVM_REGEN_START,		EQ_REGEN_START },
2652	{ ESC_SVM_RELEASE,		EQ_RELEASE },
2653	{ ESC_SVM_REMOVE,		EQ_REMOVE },
2654	{ ESC_SVM_RENAME_DST,		EQ_RENAME_DST },
2655	{ ESC_SVM_RENAME_SRC,		EQ_RENAME_SRC },
2656	{ ESC_SVM_REPLACE,		EQ_REPLACE },
2657	{ ESC_SVM_RESYNC_DONE,		EQ_RESYNC_DONE },
2658	{ ESC_SVM_RESYNC_FAILED,	EQ_RESYNC_FAILED },
2659	{ ESC_SVM_RESYNC_START,		EQ_RESYNC_START },
2660	{ ESC_SVM_RESYNC_SUCCESS,	EQ_RESYNC_SUCCESS },
2661	{ ESC_SVM_TAKEOVER,		EQ_TAKEOVER }
2662};
2663
2664static md_tags_t md_tags[] = {
2665	TAG_UNK,
2666	TAG_METADEVICE,
2667	TAG_UNK,
2668	TAG_UNK,
2669	TAG_UNK,
2670	TAG_UNK,
2671	TAG_REPLICA,
2672	TAG_HSP,
2673	TAG_HS,
2674	TAG_SET,
2675	TAG_DRIVE,
2676	TAG_HOST,
2677	TAG_MEDIATOR
2678};
2679
2680md_event_type_t
2681ev_get(char *subclass)
2682{
2683	int	high, mid, low, p;
2684
2685	low = 0;
2686	high = (sizeof (ev_table) / sizeof (ev_table[0])) - 1;
2687	while (low <= high) {
2688		mid = (high + low) / 2;
2689		p = strcmp(subclass, ev_table[mid].se_ev);
2690		if (p == 0) {
2691			return (ev_table[mid].md_ev);
2692		} else if (p < 0) {
2693			high = mid - 1;
2694		} else {
2695			low = mid + 1;
2696		}
2697	}
2698
2699	return (EQ_EMPTY);
2700}
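
/*
 * Illustrative example (not from the original source): because ev_table
 * is kept sorted, ev_get() can binary search it, so ev_get(ESC_SVM_IOERR)
 * returns EQ_IOERR, while a subclass with no table entry returns EQ_EMPTY.
 */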
2701
2702/*
2703 * Log mdnotify event
2704 */
2705void
2706do_mdnotify(char *se_subclass, uint32_t tag, set_t setno, md_dev64_t devid)
2707{
2708	md_event_type_t	ev_type;
2709	md_tags_t	md_tag;
2710
2711	/* Translate sysevent into mdnotify event */
2712	ev_type = ev_get(se_subclass);
2713
2714	if (tag >= (sizeof (md_tags) / sizeof (md_tags[0]))) {
2715		md_tag = TAG_UNK;
2716	} else {
2717		md_tag = md_tags[tag];
2718	}
2719
2720	NOTIFY_MD(md_tag, setno, devid, ev_type);
2721}
2722
2723/*
2724 * Log SVM sys events
2725 */
2726void
2727svm_gen_sysevent(
2728	char		*se_class,
2729	char		*se_subclass,
2730	uint32_t	tag,
2731	set_t		setno,
2732	md_dev64_t	devid
2733)
2734{
2735	nvlist_t		*attr_list;
2736	sysevent_id_t		eid;
2737	int			err = DDI_SUCCESS;
2738	char			*devname;
2739	extern dev_info_t	*md_devinfo;
2740
2741	/* Raise the mdnotify event before anything else */
2742	do_mdnotify(se_subclass, tag, setno, devid);
2743
2744	if (md_devinfo == NULL) {
2745		return;
2746	}
2747
2748	err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_NOSLEEP);
2749
2750	if (err == DDI_SUCCESS) {
2751		/* Add the version number */
2752		err = nvlist_add_uint32(attr_list, SVM_VERSION_NO,
2753		    (uint32_t)SVM_VERSION);
2754		if (err != DDI_SUCCESS) {
2755			goto fail;
2756		}
2757
2758		/* Add the tag attribute */
2759		err = nvlist_add_uint32(attr_list, SVM_TAG, (uint32_t)tag);
2760		if (err != DDI_SUCCESS) {
2761			goto fail;
2762		}
2763
2764		/* Add the set number attribute */
2765		err = nvlist_add_uint32(attr_list, SVM_SET_NO, (uint32_t)setno);
2766		if (err != DDI_SUCCESS) {
2767			goto fail;
2768		}
2769
2770		/* Add the device id attribute */
2771		err = nvlist_add_uint64(attr_list, SVM_DEV_ID, (uint64_t)devid);
2772		if (err != DDI_SUCCESS) {
2773			goto fail;
2774		}
2775
2776		/* Add the device name attribute */
2777		devname = obj2devname(tag, setno, devid);
2778		if (devname != NULL) {
2779			err = nvlist_add_string(attr_list, SVM_DEV_NAME,
2780			    devname);
2781			freestr(devname);
2782		} else {
2783			err = nvlist_add_string(attr_list, SVM_DEV_NAME,
2784			    "unspecified");
2785		}
2786		if (err != DDI_SUCCESS) {
2787			goto fail;
2788		}
2789
2790		/* Attempt to post event */
2791		err = ddi_log_sysevent(md_devinfo, DDI_VENDOR_SUNW, se_class,
2792		    se_subclass, attr_list, &eid, DDI_SLEEP);
2793
2794		nvlist_free(attr_list);
2795		if (err != DDI_SUCCESS) {
2796			cmn_err(CE_WARN, "Failed to log event for %s, %s,"
2797			    " err=%x", se_class, se_subclass, err);
2798		}
2799	}
2800
2801	return;
2802
2803fail:
2804	nvlist_free(attr_list);
2805	cmn_err(CE_WARN, "Failed to setup attributes for event %s, %s, err=%x",
2806	    se_class, se_subclass, err);
2807}
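
/*
 * Illustrative usage sketch (not from the original source): a submodule
 * that has grown metadevice "mnum" in set "setno" could raise the matching
 * sysevent (and, via do_mdnotify(), the mdnotify event) with something
 * like:
 *
 *	svm_gen_sysevent(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE,
 *	    setno, md_makedevice(md_major, mnum));
 */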
2808
2809void
2810md_clear_named_service()
2811{
2812	rw_enter(&ni_rwlp.lock, RW_WRITER);
2813	notify_interface = NULL;
2814	rw_exit(&ni_rwlp.lock);
2815}
2816
2817void
2818md_create_unit_incore(minor_t mnum, md_ops_t *ops, int alloc_lock)
2819{
2820	mdi_unit_t	*ui;
2821	set_t		setno = MD_MIN2SET(mnum);
2822
2823	ui = (mdi_unit_t *)kmem_zalloc(sizeof (mdi_unit_t), KM_SLEEP);
2824	ui->ui_opsindex = ops->md_selfindex;
2825
2826	/* initialize all the incore conditional variables */
2827	mutex_init(&ui->ui_mx, NULL, MUTEX_DEFAULT, NULL);
2828	cv_init(&ui->ui_cv, NULL, CV_DEFAULT, NULL);
2829
2830	if (alloc_lock) {
2831		ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP);
2832		mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL);
2833		cv_init(&ui->ui_io_lock->io_cv, NULL, CV_DEFAULT, NULL);
2834		mutex_init(&ui->ui_io_lock->io_list_mutex, NULL,
2835		    MUTEX_DEFAULT, NULL);
2836		ui->ui_io_lock->io_list_front = NULL;
2837		ui->ui_io_lock->io_list_back = NULL;
2838	}
2839	if (! (md_get_setstatus(setno) & MD_SET_SNARFING)) {
2840		rw_enter(&md_unit_array_rw.lock, RW_WRITER);
2841		MDI_VOIDUNIT(mnum) = (void *) ui;
2842		rw_exit(&md_unit_array_rw.lock);
2843	} else
2844		MDI_VOIDUNIT(mnum) = (void *) ui;
2845
2846	rw_enter(&ops->md_link_rw.lock, RW_WRITER);
2847	ui->ui_link.ln_next = ops->md_head;
2848	ui->ui_link.ln_setno = setno;
2849	ui->ui_link.ln_id = mnum;
2850	ops->md_head = &ui->ui_link;
2851	/* setup the unavailable field */
2852#if defined(_ILP32)
2853	if (((md_unit_t *)MD_UNIT(mnum))->c.un_revision & MD_64BIT_META_DEV) {
2854		ui->ui_tstate |= MD_64MD_ON_32KERNEL;
2855		cmn_err(CE_NOTE, "d%d is unavailable because 64 bit "
2856		    "metadevices are not accessible on a 32 bit kernel",
2857		    mnum);
2858	}
2859#endif
2860
2861	rw_exit(&ops->md_link_rw.lock);
2862}
2863
2864void
2865md_destroy_unit_incore(minor_t mnum, md_ops_t *ops)
2866{
2867	mdi_unit_t	*ui;
2868
2869	/*
2870	 * ASSUMPTION: md_unit_array_rw WRITER lock is held.
2871	 */
2872	ui = MDI_UNIT(mnum);
2873	if (ui == NULL)
2874		return;
2875
2876	md_rem_link(MD_MIN2SET(mnum), mnum, &ops->md_link_rw.lock,
2877	    &ops->md_head);
2878
2879	/* destroy the io lock if one is being used */
2880	if (ui->ui_io_lock) {
2881		mutex_destroy(&ui->ui_io_lock->io_mx);
2882		cv_destroy(&ui->ui_io_lock->io_cv);
2883		kmem_free(ui->ui_io_lock, sizeof (md_io_lock_t));
2884	}
2885
2886	/* teardown kstat */
2887	md_kstat_destroy(mnum);
2888
2889	/* destroy all the incore conditional variables */
2890	mutex_destroy(&ui->ui_mx);
2891	cv_destroy(&ui->ui_cv);
2892
2893	kmem_free(ui, sizeof (mdi_unit_t));
2894	MDI_VOIDUNIT(mnum) = (void *) NULL;
2895}
2896
2897void
2898md_rem_names(sv_dev_t *sv, int nsv)
2899{
2900	int	i, s;
2901	int	max_sides;
2902
2903	if (nsv == 0)
2904		return;
2905
2906	/* All entries removed are in the same diskset */
2907	if (md_get_setstatus(sv[0].setno) & MD_SET_MNSET)
2908		max_sides = MD_MNMAXSIDES;
2909	else
2910		max_sides = MD_MAXSIDES;
2911
2912	for (i = 0; i < nsv; i++)
2913		for (s = 0; s < max_sides; s++)
2914			(void) md_remdevname(sv[i].setno, s, sv[i].key);
2915}
2916
2917/*
2918 * Checks user args before we get into physio - returns 0 if ok, else an errno.
2919 * We do a lot of checking against illegal arguments here because some of the
2920 * real disk drivers don't like certain kinds of arguments (e.g. xy doesn't
2921 * like an odd-address user buffer).  Those drivers catch bad arguments in
2922 * xxread and xxwrite.  But since the meta-driver calls their strategy routines
2923 * directly, two bad scenarios might happen:
2924 *	1. the real strategy doesn't like it and panics.
2925 *	2. the real strategy doesn't like it and sets B_ERROR.
2926 *
2927 * The second case is no better than the first one, since the meta-driver
2928 * will treat it as a media error and off-line the mirror metapartition.
2929 * (Too bad there is no way to tell which error it is.)
2930 *
2931 */
2932int
2933md_chk_uio(struct uio *uio)
2934{
2935	int	i;
2936	struct iovec *iov;
2937
2938	/*
2939	 * Check for negative or not block-aligned offset
2940	 */
2941	if ((uio->uio_loffset < 0) ||
2942	    ((uio->uio_loffset & (DEV_BSIZE - 1)) != 0)) {
2943		return (EINVAL);
2944	}
2945	iov = uio->uio_iov;
2946	i = uio->uio_iovcnt;
2947
2948	while (i--) {
2949		if ((iov->iov_len & (DEV_BSIZE - 1)) != 0)
2950			return (EINVAL);
2951		/*
2952		 * Bug # 1212146
2953		 * The default is to not check alignment, but we can now check
2954		 * for a larger number of alignments if desired.
2955		 */
2956		if ((uintptr_t)(iov->iov_base) & md_uio_alignment_mask)
2957			return (EINVAL);
2958		iov++;
2959	}
2960	return (0);
2961}
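
/*
 * Illustrative example (not from the original source): a uio with
 * uio_loffset = 3, or with an iovec whose iov_len is not a multiple of
 * DEV_BSIZE, is rejected with EINVAL; a request whose offset and lengths
 * are DEV_BSIZE aligned (and whose buffer addresses satisfy
 * md_uio_alignment_mask) passes.
 */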
2962
2963char *
2964md_shortname(
2965	minor_t		mnum
2966)
2967{
2968	static char	buf[MAXPATHLEN];
2969	char		*devname;
2970	char		*invalid = " (Invalid minor number %u) ";
2971	char		*metaname;
2972	mdc_unit_t	*un;
2973	side_t		side;
2974	set_t		setno = MD_MIN2SET(mnum);
2975	unit_t		unit = MD_MIN2UNIT(mnum);
2976
2977	if ((un = MD_UNIT(mnum)) == NULL) {
2978		(void) snprintf(buf, sizeof (buf), invalid, mnum);
2979		return (buf);
2980	}
2981
2982	/*
2983	 * If unit is not a friendly name unit, derive the name from the
2984	 * minor number.
2985	 */
2986	if ((un->un_revision & MD_FN_META_DEV) == 0) {
2987		/* This is a traditional metadevice */
2988		if (setno == MD_LOCAL_SET) {
2989			(void) snprintf(buf, sizeof (buf), "d%u",
2990			    (unsigned)unit);
2991		} else {
2992			(void) snprintf(buf, sizeof (buf), "%s/d%u",
2993			    mddb_getsetname(setno), (unsigned)unit);
2994		}
2995		return (buf);
2996	}
2997
2998	/*
2999	 * It is a friendly name metadevice, so we need to get its name.
3000	 */
3001	side = mddb_getsidenum(setno);
3002	devname = (char *)kmem_alloc(MAXPATHLEN, KM_SLEEP);
3003	if (md_getdevname(setno, side, MD_KEYWILD,
3004	    md_makedevice(md_major, mnum), devname, MAXPATHLEN) == 0) {
3005		/*
3006		 * md_getdevname has given us either /dev/md/dsk/<metaname>
3007		 * or /dev/md/<setname>/dsk/<metname> depending on whether
3008		 * or not we are in the local set.  Thus, we'll pull the
3009		 * metaname from this string.
3010		 */
3011		if ((metaname = strrchr(devname, '/')) == NULL) {
3012			(void) snprintf(buf, sizeof (buf), invalid, mnum);
3013			goto out;
3014		}
3015		metaname++;	/* move past slash */
3016		if (setno == MD_LOCAL_SET) {
3017			/* No set name. */
3018			(void) snprintf(buf, sizeof (buf), "%s", metaname);
3019		} else {
3020			/* Include setname */
3021			(void) snprintf(buf, sizeof (buf), "%s/%s",
3022			    mddb_getsetname(setno), metaname);
3023		}
3024	} else {
3025		/* We couldn't find the name. */
3026		(void) snprintf(buf, sizeof (buf), invalid, mnum);
3027	}
3028
3029out:
3030	kmem_free(devname, MAXPATHLEN);
3031	return (buf);
3032}
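
/*
 * Illustrative example (not from the original source): for a traditional
 * (non friendly-name) metadevice with unit 10, md_shortname() returns
 * "d10" in the local set and "<setname>/d10" in a named diskset; for a
 * friendly-name unit it returns the name recorded in the namespace
 * records instead.
 */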
3033
3034char *
3035md_devname(
3036	set_t		setno,
3037	md_dev64_t	dev,
3038	char		*buf,
3039	size_t		size
3040)
3041{
3042	static char	mybuf[MD_MAX_CTDLEN];
3043	int		err;
3044
3045	if (buf == NULL) {
3046		buf = mybuf;
3047		size = sizeof (mybuf);
3048	} else {
3049		ASSERT(size >= MD_MAX_CTDLEN);
3050	}
3051
3052	err = md_getdevname_common(setno, mddb_getsidenum(setno),
3053	    0, dev, buf, size, MD_NOWAIT_LOCK);
3054	if (err) {
3055		if (err == ENOENT) {
3056			(void) sprintf(buf, "(Unavailable)");
3057		} else {
3058			(void) sprintf(buf, "(%u.%u)",
3059			    md_getmajor(dev), md_getminor(dev));
3060		}
3061	}
3062
3063	return (buf);
3064}
3065void
3066md_minphys(buf_t *pb)
3067{
3068	extern unsigned md_maxbcount;
3069
3070	if (pb->b_bcount > md_maxbcount)
3071		pb->b_bcount = md_maxbcount;
3072}
3073
3074void
3075md_bioinit(struct buf *bp)
3076{
3077	ASSERT(bp);
3078
3079	bioinit(bp);
3080	bp->b_back = bp;
3081	bp->b_forw = bp;
3082	bp->b_flags = B_BUSY;	/* initialize flags */
3083}
3084
3085void
3086md_bioreset(struct buf *bp)
3087{
3088	ASSERT(bp);
3089
3090	bioreset(bp);
3091	bp->b_back = bp;
3092	bp->b_forw = bp;
3093	bp->b_flags = B_BUSY;	/* initialize flags */
3094}
3095
3096/*
3097 * md_bioclone is needed as long as the real bioclone only takes a daddr_t
3098 * as block number.
3099 * We simply call bioclone with all input parameters but blkno, and set the
3100 * correct blkno afterwards.
3101 * Caveat Emptor: bp_mem must not be NULL!
3102 */
3103buf_t *
3104md_bioclone(buf_t *bp, off_t off, size_t len, dev_t dev, diskaddr_t blkno,
3105		int (*iodone)(buf_t *), buf_t *bp_mem, int sleep)
3106{
3107	(void) bioclone(bp, off, len, dev, 0, iodone, bp_mem, sleep);
3108	bp_mem->b_lblkno = blkno;
3109	return (bp_mem);
3110}
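
/*
 * Illustrative usage sketch (not from the original source): a submodule
 * cloning the parent buf "pb" onto a child buf aimed at a component might
 * do something like the following, where "comp_dev", "comp_blkno",
 * "child" and "child_done" are hypothetical:
 *
 *	cb = md_bioclone(pb, 0, pb->b_bcount, md_dev64_to_dev(comp_dev),
 *	    comp_blkno, child_done, &child->cb_buf, KM_NOSLEEP);
 *
 * The 64-bit block number is applied only after bioclone() has run.
 */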
3111
3112
3113/*
3114 * kstat stuff
3115 */
3116void
3117md_kstat_init_ui(
3118	minor_t		 mnum,
3119	mdi_unit_t	*ui
3120)
3121{
3122	if ((ui != NULL) && (ui->ui_kstat == NULL)) {
3123		set_t	setno = MD_MIN2SET(mnum);
3124		unit_t  unit = MD_MIN2UNIT(mnum);
3125		char	module[KSTAT_STRLEN];
3126		char	*p = module;
3127
3128		if (setno != MD_LOCAL_SET) {
3129			char	buf[64];
3130			char	*s = buf;
3131			char	*e = module + sizeof (module) - 4;
3132
3133			(void) sprintf(buf, "%u", setno);
3134			while ((p < e) && (*s != '\0'))
3135				*p++ = *s++;
3136			*p++ = '/';
3137		}
3138		*p++ = 'm';
3139		*p++ = 'd';
3140		*p = '\0';
3141		if ((ui->ui_kstat = kstat_create(module, unit, NULL, "disk",
3142		    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) {
3143			ui->ui_kstat->ks_lock = &ui->ui_mx;
3144			kstat_install(ui->ui_kstat);
3145		}
3146	}
3147}
3148
3149void
3150md_kstat_init(
3151	minor_t		mnum
3152)
3153{
3154	md_kstat_init_ui(mnum, MDI_UNIT(mnum));
3155}
3156
3157void
3158md_kstat_destroy_ui(
3159	mdi_unit_t	*ui
3160)
3161{
3162	/*
3163	 * kstat_delete() interface has its own locking mechanism and
3164	 * does not allow holding of kstat lock (ks_lock).
3165	 * Note: ks_lock == ui_mx from the md_kstat_init_ui().
3166	 */
3167	if ((ui != NULL) && (ui->ui_kstat != NULL)) {
3168		kstat_delete(ui->ui_kstat);
3169		ui->ui_kstat = NULL;
3170	}
3171}
3172
3173void
3174md_kstat_destroy(
3175	minor_t		mnum
3176)
3177{
3178	md_kstat_destroy_ui(MDI_UNIT(mnum));
3179}
3180
3181/*
3182 * In the following routines, the unit mutex is held before checking the
3183 * validity of ui_kstat.  This is done to make sure that we don't trip over
3184 * a NULL ui_kstat.
3185 */
3186
3187void
3188md_kstat_waitq_enter(
3189	mdi_unit_t	*ui
3190)
3191{
3192	mutex_enter(&ui->ui_mx);
3193	if (ui->ui_kstat != NULL)
3194		kstat_waitq_enter(KSTAT_IO_PTR(ui->ui_kstat));
3195	mutex_exit(&ui->ui_mx);
3196}
3197
3198void
3199md_kstat_waitq_to_runq(
3200	mdi_unit_t	*ui
3201)
3202{
3203	mutex_enter(&ui->ui_mx);
3204	if (ui->ui_kstat != NULL)
3205		kstat_waitq_to_runq(KSTAT_IO_PTR(ui->ui_kstat));
3206	mutex_exit(&ui->ui_mx);
3207}
3208
3209void
3210md_kstat_waitq_exit(
3211	mdi_unit_t	*ui
3212)
3213{
3214	mutex_enter(&ui->ui_mx);
3215	if (ui->ui_kstat != NULL)
3216		kstat_waitq_exit(KSTAT_IO_PTR(ui->ui_kstat));
3217	mutex_exit(&ui->ui_mx);
3218}
3219
3220void
3221md_kstat_runq_enter(
3222	mdi_unit_t	*ui
3223)
3224{
3225	mutex_enter(&ui->ui_mx);
3226	if (ui->ui_kstat != NULL)
3227		kstat_runq_enter(KSTAT_IO_PTR(ui->ui_kstat));
3228	mutex_exit(&ui->ui_mx);
3229}
3230
3231void
3232md_kstat_runq_exit(
3233	mdi_unit_t	*ui
3234)
3235{
3236	mutex_enter(&ui->ui_mx);
3237	if (ui->ui_kstat != NULL)
3238		kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
3239	mutex_exit(&ui->ui_mx);
3240}
3241
3242void
3243md_kstat_done(
3244	mdi_unit_t	*ui,
3245	buf_t		*bp,
3246	int		war
3247)
3248{
3249	size_t  n_done;
3250
3251	/* check for end of device */
3252	if ((bp->b_resid != 0) && (! (bp->b_flags & B_ERROR))) {
3253		n_done = bp->b_bcount;
3254	} else if (bp->b_bcount < bp->b_resid) {
3255		n_done = 0;
3256	} else {
3257		n_done = bp->b_bcount - bp->b_resid;
3258	}
3259
3260	/* do accounting */
3261	mutex_enter(&ui->ui_mx);
3262	if (ui->ui_kstat != NULL) {
3263		if ((! war) && (bp->b_flags & B_READ)) {
3264			KSTAT_IO_PTR(ui->ui_kstat)->reads++;
3265			KSTAT_IO_PTR(ui->ui_kstat)->nread += n_done;
3266		} else {
3267			KSTAT_IO_PTR(ui->ui_kstat)->writes++;
3268			KSTAT_IO_PTR(ui->ui_kstat)->nwritten += n_done;
3269		}
3270		kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
3271	}
3272	mutex_exit(&ui->ui_mx);
3273}
3274
3275pid_t
3276md_getpid()
3277{
3278	pid_t valuep;
3279	if (drv_getparm(PPID, (pid_t *)&valuep) != 0) {
3280		ASSERT(0);
3281		return ((pid_t)0);
3282	} else {
3283		ASSERT(valuep);
3284		return (valuep);
3285	}
3286}
3287
3288
3289proc_t *
3290md_getproc()
3291{
3292	proc_t  *valuep;
3293	if (drv_getparm(UPROCP, (proc_t **)&valuep) != 0) {
3294		ASSERT(0);
3295		return ((proc_t *)NULL);
3296	} else {
3297		ASSERT(valuep);
3298		return (valuep);
3299	}
3300}
3301
3302extern kmutex_t pidlock;
3303
3304/*
3305 * This checks to see if a process/pid pair is still running.  For the
3306 * disk set lock, when both pid and proc are zero the lock is not
3307 * currently held.
3308 */
3309int
3310md_checkpid(pid_t pid, proc_t *proc)
3311{
3312	int	retval = 1;
3313
3314	if (pid == 0 && proc == NULL)
3315		return (0);
3316
3317	mutex_enter(&pidlock);
3318	if (prfind(pid)  != proc)
3319		retval = 0;
3320	mutex_exit(&pidlock);
3321	return (retval);
3322}
3323
3324/*
3325 * NAME: md_init_probereq
3326 *
3327 * DESCRIPTION: initializes a probe request. Parcels out the mnums such that
3328 *		they can be dispatched to multiple daemon threads.
3329 *
3330 * PARAMETERS: struct md_probedev_impl *p	pointer to the ioctl input
3331 *
3332 * RETURN VALUE: Returns errno
3333 *
3334 */
3335
3336int
3337md_init_probereq(struct md_probedev_impl *p, daemon_queue_t **hdrpp)
3338{
3339	int		err = 0;
3340	int		modindx;
3341	intptr_t	(*probe_test)();
3342
3343	/*
3344	 * Initialize the semaphores and mutex
3345	 * for the request
3346	 */
3347
3348	p->probe_sema = kmem_alloc(sizeof (ksema_t), KM_SLEEP);
3349
3350	p->probe_mx = kmem_alloc(sizeof (kmutex_t), KM_SLEEP);
3351	sema_init(PROBE_SEMA(p), 0, NULL, SEMA_DRIVER, NULL);
3352	mutex_init(PROBE_MX(p), NULL, MUTEX_DEFAULT, NULL);
3353
3354	modindx = md_getmodindex(&(p->probe.md_driver), 1, 1);
3355	probe_test = md_get_named_service(NODEV64, modindx,
3356	    p->probe.test_name, 0);
3357	if (probe_test == NULL) {
3358		err = EINVAL;
3359		goto err_out;
3360	}
3361
3362	err = md_create_probe_rqlist(p, hdrpp, probe_test);
3363err_out:
3364	return (err);
3365}
3366
3367/*
3368 * NAME: md_probe_one
3369 *
3370 * DESCRIPTION: Generic routine for probing disks. This is called from the
3371 *		daemon.
3372 *
3373 * PARAMETERS: probe_req_t	*reqp	pointer to the probe request structure.
3374 *
3375 */
3376
3377void
3378md_probe_one(probe_req_t *reqp)
3379{
3380	mdi_unit_t		*ui;
3381	md_probedev_impl_t	*p;
3382	int			err = 0;
3383
3384	p = (md_probedev_impl_t *)reqp->private_handle;
3385	/*
3386	 * Validate the unit while holding the global ioctl lock, then
3387	 * obtain the unit_writerlock. Once the writerlock has been obtained
3388	 * we can release the global lock. As long as we hold one of these
3389	 * locks this will prevent a metaclear operation being performed
3390	 * on the metadevice because metaclear takes the readerlock (via
3391	 * openclose lock).
3392	 */
3393	while (md_ioctl_lock_enter() == EINTR)
3394		;
3395	ui = MDI_UNIT(reqp->mnum);
3396	if (ui != NULL) {
3397		(void) md_unit_writerlock_common(ui, 0);
3398		(void) md_ioctl_lock_exit(0, 0, 0, FALSE);
3399		err = (*reqp->probe_fcn)(ui, reqp->mnum);
3400		md_unit_writerexit(ui);
3401	} else {
3402		(void) md_ioctl_lock_exit(0, 0, 0, FALSE);
3403	}
3404
3405	/* update the info in the probe structure */
3406
3407	mutex_enter(PROBE_MX(p));
3408	if (err != 0) {
3409		cmn_err(CE_NOTE, "md_probe_one: err %d mnum %d\n", err,
3410		    reqp->mnum);
3411		(void) mdsyserror(&(p->probe.mde), err);
3412	}
3413
3414	mutex_exit(PROBE_MX(p));
3415	sema_v(PROBE_SEMA(p));
3416
3417	kmem_free(reqp, sizeof (probe_req_t));
3418}
3419char *
3420md_strdup(char *cp)
3421{
3422	char *new_cp = NULL;
3423
3424	new_cp = kmem_alloc(strlen(cp) + 1, KM_SLEEP);
3425
3426	return (strcpy(new_cp, cp));
3427}
3428
3429void
3430freestr(char *cp)
3431{
3432	kmem_free(cp, strlen(cp) + 1);
3433}
3434
3435/*
3436 * Validate the list and skip invalid devices. Then create
3437 * a doubly linked circular list of devices to probe.
3438 * The hdr points to the head and tail of this list.
3439 */
3440
3441static int
3442md_create_probe_rqlist(md_probedev_impl_t *plist, daemon_queue_t **hdr,
3443			intptr_t (*probe_test)())
3444{
3445	int i, err, nodevcnt;
3446	probe_req_t *tp;
3447	daemon_queue_t *hp;
3448	minor_t mnum;
3449
3450	nodevcnt = 0;
3451
3452	hp = NULL;
3453
3454	for (i = 0; i <  plist->probe.nmdevs; i++) {
3455		mnum = ((minor_t *)(uintptr_t)(plist->probe.mnum_list))[i];
3456		if (MDI_UNIT(mnum) == NULL) {
3457			cmn_err(CE_WARN, "md: Cannot probe %s since it does "
3458			    "not exist", md_shortname(mnum));
3459			nodevcnt++;
3460			continue;
3461		}
3462		tp = kmem_alloc(sizeof (probe_req_t), KM_SLEEP);
3463		tp->mnum = mnum;
3464		tp->private_handle = (void *)plist;
3465		tp->probe_fcn = probe_test;
3466		if (hp == NULL) {
3467			hp = (daemon_queue_t *)tp;
3468			hp->dq_prev = hp->dq_next = (daemon_queue_t *)tp;
3469		} else {
3470			tp->dq.dq_next = hp;
3471			tp->dq.dq_prev = hp->dq_prev;
3472			hp->dq_prev->dq_next = (daemon_queue_t *)tp;
3473			hp->dq_prev = (daemon_queue_t *)tp;
3474		}
3475	}
3476
3477	*hdr = hp;
3478	if (nodevcnt > 0)
3479		plist->probe.nmdevs -= nodevcnt;
3480
3481	/*
3482	 * If there are no devices to be probed because they were
3483	 * incorrect, then return an error.
3484	 */
3485	err = (plist->probe.nmdevs == 0) ? ENODEV : 0;
3486
3487	return (err);
3488}
3489
3490/*
3491 * This routine increments the I/O count for set I/O operations.  This
3492 * value is used to determine if an I/O can be done.  If a release is in
3493 * progress this will return an error and cause the I/O to be errored.
3494 */
3495int
3496md_inc_iocount(set_t setno)
3497{
3498	int	rc = 0;
3499
3500	if (setno == 0)
3501		return (0);
3502
3503	mutex_enter(&md_set_io[setno].md_io_mx);
3504	if (!(md_set_io[setno].io_state & MD_SET_ACTIVE)) {
3505		rc = EIO;
3506		goto out;
3507	}
3508
3509	ASSERT(md_set_io[setno].io_cnt >= 0);
3510	md_set_io[setno].io_cnt++;
3511
3512out:	mutex_exit(&md_set_io[setno].md_io_mx);
3513	return (rc);
3514}
3515
3516void
3517md_inc_iocount_noblock(set_t setno)
3518{
3519
3520	if (setno == 0)
3521		return;
3522
3523	mutex_enter(&md_set_io[setno].md_io_mx);
3524	md_set_io[setno].io_cnt++;
3525	mutex_exit(&md_set_io[setno].md_io_mx);
3526}
3527void
3528md_dec_iocount(set_t setno)
3529{
3530
3531	if (setno == 0)
3532		return;
3533
3534	mutex_enter(&md_set_io[setno].md_io_mx);
3535	md_set_io[setno].io_cnt--;
3536	ASSERT(md_set_io[setno].io_cnt >= 0);
3537	if ((md_set_io[setno].io_state & MD_SET_RELEASE) &&
3538	    (md_set_io[setno].io_cnt == 0))
3539		cv_broadcast(&md_set_io[setno].md_io_cv);
3540	mutex_exit(&md_set_io[setno].md_io_mx);
3541}
3542
3543int
3544md_isblock_setio(set_t setno)
3545{
3546	int	rc = 0;
3547
3548	if (setno == 0)
3549		return (0);
3550
3551	mutex_enter(&md_set_io[setno].md_io_mx);
3552	if (md_set_io[setno].io_state & MD_SET_RELEASE)
3553		rc = 1;
3554
3555	mutex_exit(&md_set_io[setno].md_io_mx);
3556	return (rc);
3557}
3558
3559int
3560md_block_setio(set_t setno)
3561{
3562	int	rc = 0;
3563
3564	if (setno == 0)
3565		return (1);
3566
3567	mutex_enter(&md_set_io[setno].md_io_mx);
3568	md_set_io[setno].io_state = MD_SET_RELEASE;
3569
3570	while (md_set_io[setno].io_cnt > 0) {
3571		cv_wait(&md_set_io[setno].md_io_cv,
3572		    &md_set_io[setno].md_io_mx);
3573	}
3574	rc = 1;
3575
3576
3577	ASSERT(md_set_io[setno].io_cnt == 0);
3578	mutex_exit(&md_set_io[setno].md_io_mx);
3579
3580	return (rc);
3581}
3582
3583void
3584md_clearblock_setio(set_t setno)
3585{
3586	if (setno == 0)
3587		return;
3588
3589	mutex_enter(&md_set_io[setno].md_io_mx);
3590	md_set_io[setno].io_state = MD_SET_ACTIVE;
3591	mutex_exit(&md_set_io[setno].md_io_mx);
3592}
3593
3594void
3595md_unblock_setio(set_t setno)
3596{
3597	if (setno == 0)
3598		return;
3599
3600	mutex_enter(&md_set_io[setno].md_io_mx);
3601#ifdef DEBUG
3602	if (md_set_io[setno].io_cnt != 0) {
3603		cmn_err(CE_NOTE, "set %d count was %ld at take",
3604		    setno, md_set_io[setno].io_cnt);
3605	}
3606#endif /* DEBUG */
3607
3608	md_set_io[setno].io_state = MD_SET_ACTIVE;
3609	md_set_io[setno].io_cnt = 0;
3610	mutex_exit(&md_set_io[setno].md_io_mx);
3611}
3612
3613/*
3614 * Test and set version of the md_block_setio.
3615 * Set the io_state to keep new I/O from being issued.
3616 * If there is I/O currently in progress, then set io_state to active
3617 * and return failure.  Otherwise, return a 1 for success.
3618 *
3619 * Used in a MN diskset since the commd must be suspended before
3620 * this node can attempt to withdraw from a diskset.  But, with commd
3621 * suspended, I/O may have been issued that can never finish until
3622 * commd is resumed (allocation of hotspare, etc). So, if I/O is
3623 * outstanding after diskset io_state is marked RELEASE, then set diskset
3624 * io_state back to ACTIVE and return failure.
3625 */
3626int
3627md_tas_block_setio(set_t setno)
3628{
3629	int	rc;
3630
3631	if (setno == 0)
3632		return (1);
3633
3634	mutex_enter(&md_set_io[setno].md_io_mx);
3635	md_set_io[setno].io_state = MD_SET_RELEASE;
3636
3637	if (md_set_io[setno].io_cnt > 0) {
3638		md_set_io[setno].io_state = MD_SET_ACTIVE;
3639		rc = 0;
3640	} else {
3641		rc = 1;
3642	}
3643
3644	mutex_exit(&md_set_io[setno].md_io_mx);
3645
3646	return (rc);
3647}
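
/*
 * Illustrative usage sketch (not from the original source): a withdraw
 * path in a multi-node diskset could use the test-and-set variant so it
 * never sleeps while rpc.mdcommd is suspended:
 *
 *	if (md_tas_block_setio(setno) == 0)
 *		return (EBUSY);
 *
 * i.e. fail the withdraw if any set I/O is still outstanding rather than
 * waiting for it to drain as md_block_setio() would.
 */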
3648
3649void
3650md_biodone(struct buf *pb)
3651{
3652	minor_t	mnum;
3653	set_t	setno;
3654	mdi_unit_t	*ui;
3655
3656	mnum = getminor(pb->b_edev);
3657	setno = MD_MIN2SET(mnum);
3658
3659	if (setno == 0) {
3660		biodone(pb);
3661		return;
3662	}
3663
3664#ifdef DEBUG
3665	ui = MDI_UNIT(mnum);
3666	if (!md_unit_isopen(ui))
3667		cmn_err(CE_NOTE, "io after close on %s\n", md_shortname(mnum));
3668#endif /* DEBUG */
3669
3670	/*
3671	 * Handle the local diskset
3672	 */
3673	if (md_set_io[setno].io_cnt > 0)
3674		md_dec_iocount(setno);
3675
3676#ifdef DEBUG
3677	/*
3678	 * This is being done after the lock is dropped, so there
3679	 * are cases where it may be invalid.  It is advisory.
3680	 */
3681	if (md_set_io[setno].io_state & MD_SET_RELEASE) {
3682		/* Only display this error once for this metadevice */
3683		if ((ui->ui_tstate & MD_RELEASE_IOERR_DONE) == 0) {
3684			cmn_err(CE_NOTE,
3685			    "I/O to %s attempted during set RELEASE\n",
3686			    md_shortname(mnum));
3687			ui->ui_tstate |= MD_RELEASE_IOERR_DONE;
3688		}
3689	}
3690#endif /* DEBUG */
3691
3692	biodone(pb);
3693}
3694
3695
3696/*
3697 * Driver special private devt handling routine
3698 * INPUT:  md_dev64_t
3699 * OUTPUT: dev_t, 32 bit on a 32 bit kernel, 64 bit on a 64 bit kernel.
3700 */
3701dev_t
3702md_dev64_to_dev(md_dev64_t dev)
3703{
3704	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
3705	minor_t minor = (minor_t)(dev & MAXMIN64);
3706
3707	return (makedevice(major, minor));
3708
3709}
3710
3711/*
3712 * Driver private makedevice routine
3713 * INPUT:  major_t major, minor_t minor
3714 * OUTPUT: md_dev64_t, no matter if on 32 bit or 64 bit kernel.
3715 */
3716md_dev64_t
3717md_makedevice(major_t major, minor_t minor)
3718{
3719	return (((md_dev64_t)major << NBITSMINOR64) | minor);
3720
3721}
3722
3723
3724/*
3725 * Driver private devt md_getmajor routine
3726 * INPUT:  dev	a 64 bit container holding either a 32 bit or a 64 bit device
3727 * OUTPUT: the appropriate major number
3728 */
3729major_t
3730md_getmajor(md_dev64_t dev)
3731{
3732	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
3733
3734	if (major == 0) {
3735		/* Here we were given a 32bit dev */
3736		major = (major_t)(dev >> NBITSMINOR32) & MAXMAJ32;
3737	}
3738	return (major);
3739}
3740
3741/*
3742 * Driver private devt md_getminor routine
3743 * INPUT:  dev	a 64 bit container holding either a 32 bit or a 64 bit device
3744 * OUTPUT: the appropriate minor number
3745 */
3746minor_t
3747md_getminor(md_dev64_t dev)
3748{
3749	minor_t minor;
3750	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
3751
3752	if (major == 0) {
3753		/* Here we were given a 32bit dev */
3754		minor = (minor_t)(dev & MAXMIN32);
3755	} else {
3756		minor = (minor_t)(dev & MAXMIN64);
3757	}
3758	return (minor);
3759}
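
/*
 * Illustrative example (not from the original source) of how these helpers
 * round-trip: given a minor number "mnum",
 *
 *	md_dev64_t d = md_makedevice(md_major, mnum);
 *
 * satisfies md_getmajor(d) == md_major and md_getminor(d) == mnum, and
 * md_dev64_to_dev(d) yields the dev_t to hand to ddi routines.
 */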
3760
3761int
3762md_check_ioctl_against_unit(int cmd, mdc_unit_t c)
3763{
3764	/*
3765	 * If the metadevice is an old style device, it has a vtoc,
3766	 *	in that case all reading EFI ioctls are not applicable.
3767	 * If the metadevice has an EFI label, reading vtoc and geom ioctls
3768	 *	are not supposed to work.
3769	 */
3770	switch (cmd) {
3771		case DKIOCGGEOM:
3772		case DKIOCGAPART:
3773			/* if > 2 TB then fail */
3774			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3775				return (ENOTSUP);
3776			}
3777			break;
3778		case DKIOCGVTOC:
3779			/* if > 2 TB then fail */
3780			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3781				return (ENOTSUP);
3782			}
3783
3784			/* if > 1 TB but < 2TB return overflow */
3785			if (c.un_revision & MD_64BIT_META_DEV) {
3786				return (EOVERFLOW);
3787			}
3788			break;
3789		case DKIOCGEXTVTOC:
3790			/* if > 2 TB then fail */
3791			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3792				return (ENOTSUP);
3793			}
3794			break;
3795		case DKIOCGETEFI:
3796		case DKIOCPARTITION:
3797			if ((c.un_flag & MD_EFILABEL) == 0) {
3798				return (ENOTSUP);
3799			}
3800			break;
3801
3802		case DKIOCSETEFI:
3803		/* setting an EFI label should always be ok */
3804			return (0);
3805
3806		case DKIOCSVTOC:
3807			/* if > 2 TB then fail */
3808			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3809				return (ENOTSUP);
3810			}
3811
3812			/* if > 1 TB but < 2TB return overflow */
3813			if (c.un_revision & MD_64BIT_META_DEV) {
3814				return (EOVERFLOW);
3815			}
3816			break;
3817		case DKIOCSEXTVTOC:
3818			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3819				return (ENOTSUP);
3820			}
3821			break;
3822	}
3823	return (0);
3824}
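
/*
 * Illustrative example (not from the original source): for a 64-bit
 * metadevice (MD_64BIT_META_DEV, i.e. between 1 TB and 2 TB), DKIOCGVTOC
 * fails with EOVERFLOW, steering callers to DKIOCGEXTVTOC, which succeeds;
 * once the unit exceeds MD_MAX_BLKS_FOR_EXTVTOC (2 TB) the VTOC ioctls
 * fail with ENOTSUP and, assuming the unit carries an EFI label, only the
 * EFI ioctls remain usable.
 */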
3825
3826/*
3827 * md_vtoc_to_efi_record()
3828 * Input:  record id of the vtoc record
3829 * Output: record id of the efi record
3830 * Function:
3831 *	- reads the  volume name from the vtoc record
3832 *	- converts the volume name to a format, libefi understands
3833 *	- creates a new record of size MD_EFI_PARTNAME_BYTES
3834 *	- stores the volname in that record,
3835 *	- commits that record
3836 *	- returns the recid of the efi record.
3837 * Caveat Emptor:
3838 *	The calling routine must do something like
3839 *	- un->c.un_vtoc_id = md_vtoc_to_efi_record(vtoc_recid)
3840 *	- commit(un)
3841 *	- delete(vtoc_recid)
3842 *	in order to keep the mddb consistent in case of a panic in the middle.
3843 * Errors:
3844 *	- returns 0 on any error
3845 */
3846mddb_recid_t
3847md_vtoc_to_efi_record(mddb_recid_t vtoc_recid, set_t setno)
3848{
3849	struct vtoc	*vtoc;
3850	ushort_t	*v;
3851	mddb_recid_t	efi_recid;
3852	int		i;
3853
3854	if (mddb_getrecstatus(vtoc_recid) != MDDB_OK) {
3855		return (0);
3856	}
3857	vtoc = (struct vtoc *)mddb_getrecaddr(vtoc_recid);
3858	efi_recid = mddb_createrec(MD_EFI_PARTNAME_BYTES, MDDB_EFILABEL, 0,
3859	    MD_CRO_32BIT, setno);
3860	if (efi_recid < 0) {
3861		return (0);
3862	}
3863	v = (ushort_t *)mddb_getrecaddr(efi_recid);
3864
3865	/* This for loop reads, converts and writes */
3866	for (i = 0; i < LEN_DKL_VVOL; i++) {
3867		v[i] = LE_16((uint16_t)vtoc->v_volume[i]);
3868	}
3869	/* commit the new record */
3870	mddb_commitrec_wrapper(efi_recid);
3871
3872	return (efi_recid);
3873}
3874
3875/*
3876 * Send a kernel message.
3877 * The caller has to provide an allocated result structure.
3878 * If the door handler disappears we retry, emitting warnings every so often.
3879 *
3880 * The recipient argument is almost always unused, and is therefore typically
3881 * set to zero, as zero is an invalid cluster nodeid.  The exceptions are the
3882 * marking and clearing of the DRL from a node that is not currently the
3883 * owner.  In these cases, the recipient argument will be the nodeid of the
3884 * mirror owner, and MD_MSGF_DIRECTED will be set in the flags.  Non-owner
3885 * nodes will not receive these messages.
3886 *
3887 * For the case where md_mn_is_commd_present() is false, we rely on the
3888 * "result" having been kmem_zalloc()ed which, in effect, sets MDMNE_NULL for
3889 * kmmr_comm_state making MDMN_KSEND_MSG_OK() result in 0.
3890 */
3891int
3892mdmn_ksend_message(
3893	set_t		setno,
3894	md_mn_msgtype_t	type,
3895	uint_t		flags,
3896	md_mn_nodeid_t	recipient,
3897	char		*data,
3898	int		size,
3899	md_mn_kresult_t	*result)
3900{
3901	door_arg_t	da;
3902	md_mn_kmsg_t	*kmsg;
3903	uint_t		send_try_cnt = 0;
3904	uint_t		retry_noise_cnt = 0;
3905	int		rval;
3906	k_sigset_t	oldmask, newmask;
3907
3908	if (size > MDMN_MAX_KMSG_DATA)
3909		return (ENOMEM);
3910	kmsg = kmem_zalloc(sizeof (md_mn_kmsg_t), KM_SLEEP);
3911	kmsg->kmsg_flags = flags;
3912	kmsg->kmsg_setno = setno;
3913	kmsg->kmsg_recipient = recipient;
3914	kmsg->kmsg_type	= type;
3915	kmsg->kmsg_size	= size;
3916	bcopy(data, &(kmsg->kmsg_data), size);
3917
3918	/*
3919	 * Wait for the door handle to be established.
3920	 */
3921	while (mdmn_door_did == -1) {
3922		if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
3923			cmn_err(CE_WARN, "door handle not yet ready. "
3924			    "Check if /usr/lib/lvm/mddoors is running");
3925		}
3926		delay(md_hz);
3927	}
3928
3929	/*
3930	 * If MD_MSGF_BLK_SIGNAL is set, mask out all signals so that we
3931	 * do not fail if the user process receives a signal while we're
3932	 * active in the door interface.
3933	 */
3934	if (flags & MD_MSGF_BLK_SIGNAL) {
3935		sigfillset(&newmask);
3936		sigreplace(&newmask, &oldmask);
3937	}
3938
3939	/*
3940	 * If message failed with an RPC_FAILURE when rpc.mdcommd had
3941	 * been gracefully shutdown (md_mn_is_commd_present returns FALSE)
3942	 * then don't retry the message anymore.  If message
3943	 * failed due to any other reason, then retry up to md_send_retry_limit
3944	 * times which should allow a shutting down system time to
3945	 * notify the kernel of a graceful shutdown of rpc.mdcommd.
3946	 *
3947	 * Caller of this routine will need to check the md_mn_commd_present
3948	 * flag and the failure error in order to determine whether to panic
3949	 * or not.  If md_mn_commd_present is set to 0 and failure error
3950	 * is RPC_FAILURE, the calling routine should not panic since the
3951	 * system is in the process of being shutdown.
3952	 *
3953	 */
3954
3955	retry_noise_cnt = send_try_cnt = 0;
3956	while (md_mn_is_commd_present_lite()) {
3957		/*
3958		 * data_ptr and data_size are initialized here because on
3959		 * return from the upcall, they contain data duplicated from
3960		 * rbuf and rsize.  This causes subsequent upcalls to fail.
3961		 */
3962		da.data_ptr = (char *)(kmsg);
3963		da.data_size = sizeof (md_mn_kmsg_t);
3964		da.desc_ptr = NULL;
3965		da.desc_num = 0;
3966		da.rbuf = (char *)result;
3967		da.rsize = sizeof (*result);
3968
3969		while ((rval = door_ki_upcall_limited(mdmn_door_handle, &da,
3970		    NULL, SIZE_MAX, 0)) != 0) {
3971			if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
3972				if (rval == EAGAIN)  {
3973					cmn_err(CE_WARN,
3974					    "md: door_upcall failed. "
3975					    "Check if mddoors is running.");
3976				} else if (rval == EINTR) {
3977					cmn_err(CE_WARN,
3978					    "md: door_upcall failed. "
3979					    "Check if rpc.mdcommd is running.");
3980				} else {
3981					cmn_err(CE_WARN,
3982					    "md: door_upcall failed. "
3983					    "Returned %d",
3984					    rval);
3985				}
3986			}
3987			if (++send_try_cnt >= md_send_retry_limit)
3988				break;
3989
3990			delay(md_hz);
3991
3992			/*
3993			 * data_ptr and data_size are re-initialized here
3994			 * because on return from the upcall, they contain
3995			 * data duplicated from rbuf and rsize.  This causes
3996			 * subsequent upcalls to fail.
3997			 */
3998			da.data_ptr = (char *)(kmsg);
3999			da.data_size = sizeof (md_mn_kmsg_t);
4000			da.desc_ptr = NULL;
4001			da.desc_num = 0;
4002			da.rbuf = (char *)result;
4003			da.rsize = sizeof (*result);
4004		}
4005
4006
4007		/*
4008		 * If:
4009		 * - the send succeeded (MDMNE_ACK)
4010		 * - we had an MDMNE_RPC_FAIL and commd is now gone
4011		 *   (note: since the outer loop is commd-dependent,
4012		 *   checking MDMN_RPC_FAIL here is meaningless)
4013		 * - we were told not to retry
4014		 * - we exceeded the RPC failure send limit
4015		 * punch out of the outer loop prior to the delay()
4016		 */
4017		if (result->kmmr_comm_state == MDMNE_ACK ||
4018		    (flags & MD_MSGF_KSEND_NORETRY) ||
4019		    (++send_try_cnt % md_send_retry_limit) == 0 ||
4020		    !md_mn_is_commd_present())
4021			break;
4022		delay(md_hz);
4023	}
4024
4025	if (flags & MD_MSGF_BLK_SIGNAL) {
4026		sigreplace(&oldmask, (k_sigset_t *)NULL);
4027	}
4028	kmem_free(kmsg, sizeof (md_mn_kmsg_t));
4029
4030	return (0);
4031}
4032
4033/*
4034 * Called to propagate the capability of a metadevice to all nodes in the set.
4035 *
4036 * On entry, lockp is set if the function has been called from within an ioctl.
4037 *
4038 * IOLOCK_RETURN_RELEASE, which drops the md_ioctl_lock is called in this
4039 * routine to enable other mdioctls to enter the kernel while this
4040 * thread of execution waits on the completion of mdmn_ksend_message. When
4041 * the message is completed the thread continues and md_ioctl_lock must be
4042 * reacquired.  Even though md_ioctl_lock is interruptable, we choose to
4043 * ignore EINTR as we must not return without acquiring md_ioctl_lock.
4044 */
4045
4046int
4047mdmn_send_capability_message(minor_t mnum, volcap_t vc, IOLOCK *lockp)
4048{
4049	md_mn_msg_setcap_t	msg;
4050	md_mn_kresult_t		*kres;
4051	mdi_unit_t		*ui = MDI_UNIT(mnum);
4052	int			ret;
4053	k_sigset_t		oldmask, newmask;
4054
4055	(void) strncpy((char *)&msg.msg_setcap_driver,
4056	    md_ops[ui->ui_opsindex]->md_driver.md_drivername, MD_DRIVERNAMELEN);
4057	msg.msg_setcap_mnum = mnum;
4058	msg.msg_setcap_set = vc.vc_set;
4059
4060	if (lockp)
4061		IOLOCK_RETURN_RELEASE(0, lockp);
4062	kres = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
4063
4064	/*
4065	 * Mask signals for the mdmn_ksend_message call.  This keeps the door
4066	 * interface from failing if the user process receives a signal while
4067	 * in mdmn_ksend_message.
4068	 */
4069	sigfillset(&newmask);
4070	sigreplace(&newmask, &oldmask);
4071	ret = (mdmn_ksend_message(MD_MIN2SET(mnum), MD_MN_MSG_SET_CAP,
4072	    MD_MSGF_NO_LOG, 0, (char *)&msg, sizeof (md_mn_msg_setcap_t),
4073	    kres));
4074	sigreplace(&oldmask, (k_sigset_t *)NULL);
4075
4076	if (!MDMN_KSEND_MSG_OK(ret, kres)) {
4077		mdmn_ksend_show_error(ret, kres, "MD_MN_MSG_SET_CAP");
4078		ret = EIO;
4079	}
4080	kmem_free(kres, sizeof (md_mn_kresult_t));
4081
4082	if (lockp) {
4083		IOLOCK_RETURN_REACQUIRE(lockp);
4084	}
4085	return (ret);
4086}
4087
4088/*
4089 * Called to clear all of the transient capabilities for a metadevice when
4090 * it is not open on any node in the cluster.
4091 * Called from close for mirror and sp.
4092 */
4093
4094void
4095mdmn_clear_all_capabilities(minor_t mnum)
4096{
4097	md_isopen_t	clumsg;
4098	int		ret;
4099	md_mn_kresult_t	*kresult;
4100	volcap_t	vc;
4101	k_sigset_t	oldmask, newmask;
4102
4103	clumsg.dev = md_makedevice(md_major, mnum);
4104	clumsg.mde = mdnullerror;
4105	/*
4106	 * The check open message doesn't have to be logged, nor should the
4107	 * result be stored in the MCT. We want an up-to-date state.
4108	 */
4109	kresult = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
4110
4111	/*
4112	 * Mask signals for the mdmn_ksend_message call.  This keeps the door
4113	 * interface from failing if the user process receives a signal while
4114	 * in mdmn_ksend_message.
4115	 */
4116	sigfillset(&newmask);
4117	sigreplace(&newmask, &oldmask);
4118	ret = mdmn_ksend_message(MD_MIN2SET(mnum),
4119	    MD_MN_MSG_CLU_CHECK,
4120	    MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT, 0,
4121	    (char *)&clumsg, sizeof (clumsg), kresult);
4122	sigreplace(&oldmask, (k_sigset_t *)NULL);
4123
4124	if ((ret == 0) && (kresult->kmmr_exitval == 0)) {
4125		/*
4126		 * Not open on any node, clear all capabilities, eg ABR and
4127		 * DMR
4128		 */
4129		vc.vc_set = 0;
4130		(void) mdmn_send_capability_message(mnum, vc, NULL);
4131	}
4132	kmem_free(kresult, sizeof (md_mn_kresult_t));
4133}
4134
4135/*
4136 * mdmn_ksend_show_error:
4137 * ---------------------
4138 * Called to display the error contents of a failing mdmn_ksend_message() result
4139 *
4140 * Input:
4141 *	rv	- return value from mdmn_ksend_message()
4142 *	kres	- pointer to result structure filled in by mdmn_ksend_message
4143 *	s	- Informative message to identify failing condition (e.g.
4144 *		  "Ownership change"). This string will be displayed with
4145 *		  cmn_err(CE_WARN, "%s *FAILED*",...) to alert the system
4146 *		  administrator.
4147 */
4148void
4149mdmn_ksend_show_error(int rv, md_mn_kresult_t *kres, const char *s)
4150{
4151	if (rv == 0) {
4152		cmn_err(CE_WARN, "%s *FAILED*", s);
4153		cmn_err(CE_CONT, "exit_val = %d, comm_state = %d, failing_node"
4154		    " = %d", kres->kmmr_exitval, kres->kmmr_comm_state,
4155		    kres->kmmr_failing_node);
4156	} else {
4157		cmn_err(CE_WARN, "%s *FAILED*, return value = %d", s, rv);
4158	}
4159}
4160
4161/*
4162 * CPR callback routine for the resync thread. If requested to suspend, we
4163 * mark the commd as not being present.
4164 */
4165boolean_t
4166callb_md_mrs_cpr(void *arg, int code)
4167{
4168	callb_cpr_t *cp = (callb_cpr_t *)arg;
4169	int ret = 0;				/* assume success */
4170
4171	mutex_enter(cp->cc_lockp);
4172
4173	switch (code) {
4174	case CB_CODE_CPR_CHKPT:
4175		/*
4176		 * Mark the rpc.mdcommd as no longer present. We are trying to
4177		 * suspend the system and so we should expect RPC failures to
4178		 * occur.
4179		 */
4180		md_mn_clear_commd_present();
4181		cp->cc_events |= CALLB_CPR_START;
4182		while (!(cp->cc_events & CALLB_CPR_SAFE))
4183			/* cv_timedwait() returns -1 if it times out. */
4184			if ((ret = cv_timedwait(&cp->cc_callb_cv, cp->cc_lockp,
4185			    lbolt + CPR_KTHREAD_TIMEOUT_SEC * hz)) == -1)
4186				break;
4187		break;
4188
4189	case CB_CODE_CPR_RESUME:
4190		cp->cc_events &= ~CALLB_CPR_START;
4191		cv_signal(&cp->cc_stop_cv);
4192		break;
4193	}
4194	mutex_exit(cp->cc_lockp);
4195	return (ret != -1);
4196}
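
/*
 * Illustrative sketch (not part of the driver source): how a resync daemon
 * thread might register callb_md_mrs_cpr so that checkpoint/resume (CPR)
 * events are handled while it runs.  The mutex, the thread name and the
 * placement of the safe point below are assumptions for illustration; the
 * real registration is performed by the mirror resync code.
 */
static void
example_resync_daemon(void)
{
	callb_cpr_t	cprinfo;
	kmutex_t	cpr_mx;

	mutex_init(&cpr_mx, NULL, MUTEX_DEFAULT, NULL);
	CALLB_CPR_INIT(&cprinfo, &cpr_mx, callb_md_mrs_cpr, "example_resync");

	/* ... resync work would run here ... */

	mutex_enter(&cpr_mx);
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	/* safe point: a CPR checkpoint may suspend the thread here */
	CALLB_CPR_SAFE_END(&cprinfo, &cpr_mx);
	mutex_exit(&cpr_mx);

	mutex_enter(&cpr_mx);
	CALLB_CPR_EXIT(&cprinfo);	/* deregisters and drops cpr_mx */
	mutex_destroy(&cpr_mx);
}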
4197
4198
4199void
4200md_rem_hspname(set_t setno, mdkey_t n_key)
4201{
4202	int	s;
4203	int	max_sides;
4204
4205
4206	/* All entries removed are in the same diskset */
4207	if (md_get_setstatus(setno) & MD_SET_MNSET)
4208		max_sides = MD_MNMAXSIDES;
4209	else
4210		max_sides = MD_MAXSIDES;
4211
4212	for (s = 0; s < max_sides; s++)
4213		(void) md_remdevname(setno, s, n_key);
4214}
4215
4216
4217int
4218md_rem_selfname(minor_t selfid)
4219{
4220	int	s;
4221	set_t	setno = MD_MIN2SET(selfid);
4222	int	max_sides;
4223	md_dev64_t	dev;
4224	struct nm_next_hdr	*nh;
4225	struct nm_name	*n;
4226	mdkey_t key;
4227
4228	/*
4229	 * Get the key since the remove routine expects it.
4230	 */
4231	dev = md_makedevice(md_major, selfid);
4232	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
4233		return (ENOENT);
4234	}
4235
4236	if ((n = (struct nm_name *)lookup_entry(nh, setno, MD_SIDEWILD,
4237	    MD_KEYWILD, dev, 0L)) == NULL) {
4238		return (ENOENT);
4239	}
4240
4241	/* All entries removed are in the same diskset */
4242	key = n->n_key;
4243	if (md_get_setstatus(setno) & MD_SET_MNSET)
4244		max_sides = MD_MNMAXSIDES;
4245	else
4246		max_sides = MD_MAXSIDES;
4247
4248	for (s = 0; s < max_sides; s++)
4249		(void) md_remdevname(setno, s, key);
4250
4251	return (0);
4252}
4253
4254void
4255md_upd_set_unnext(set_t setno, unit_t un)
4256{
4257	if (un < md_set[setno].s_un_next) {
4258		md_set[setno].s_un_next = un;
4259	}
4260}
4261
4262struct hot_spare_pool *
4263find_hot_spare_pool(set_t setno, int hsp_id)
4264{
4265	hot_spare_pool_t *hsp;
4266
4267	hsp = (hot_spare_pool_t *)md_set[setno].s_hsp;
4268	while (hsp != NULL) {
4269		if (hsp->hsp_self_id == hsp_id)
4270			return (hsp);
4271		hsp = hsp->hsp_next;
4272	}
4273
4274	return ((hot_spare_pool_t *)0);
4275}
4276
4277/*
4278 * md_create_taskq:
4279 *
4280 * Create a kernel taskq for the given set/unit combination. This is typically
4281 * used to complete a RR_CLEAN request when the callee is unable to obtain the
4282 * mutex / condvar access required to update the DRL safely.
4283 */
4284void *
4285md_create_taskq(set_t setno, minor_t mnum)
4286{
4287	char			name[20];
4288	ddi_taskq_t		*tqp;
4289
4290	(void) snprintf(name, 20, "%d/d%d", setno, MD_MIN2UNIT(mnum));
4291
4292	tqp = ddi_taskq_create(md_devinfo, name, 1, TASKQ_DEFAULTPRI, 0);
4293
4294	return ((void *)tqp);
4295}
4296
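/*
 * Illustrative sketch (not part of the driver source): how a caller might use
 * the taskq returned by md_create_taskq() to run a deferred RR_CLEAN-style
 * request and then tear the taskq down.  The worker function, its argument
 * and the dispatching routine below are hypothetical.
 */
static void
example_rr_clean_worker(void *arg)
{
	/* the deferred DRL update would run here, with arg describing it */
}

static void
example_dispatch_rr_clean(set_t setno, minor_t mnum, void *arg)
{
	ddi_taskq_t	*tqp;

	tqp = (ddi_taskq_t *)md_create_taskq(setno, mnum);
	if (tqp == NULL)
		return;

	/* DDI_SLEEP allows the dispatch to wait for taskq resources */
	(void) ddi_taskq_dispatch(tqp, example_rr_clean_worker, arg,
	    DDI_SLEEP);

	/* wait for the dispatched task to complete, then free the taskq */
	ddi_taskq_wait(tqp);
	ddi_taskq_destroy(tqp);
}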