/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Soft partitioning metadevice driver (md_sp).
 *
 * This file contains the primary operations of the soft partitioning
 * metadevice driver.  This includes all routines for normal operation
 * (open/close/read/write).  Please see mdvar.h for a definition of the
 * metadevice operations vector (md_ops_t).  This driver is loosely
 * based on the stripe driver (md_stripe).
 *
 * All metadevice administration is done through the use of ioctls.
 * As such, all administrative routines appear in sp_ioctl.c.
 *
 * Soft partitions are represented both in-core and in the metadb with a
 * unit structure.  The soft partition-specific information in the unit
 * structure includes the following information:
 *	- Device information (md_dev64_t & md key) about the device on which
 *	  the soft partition is built.
 *	- Soft partition status information.
 *	- The size of the soft partition and number of extents used to
 *	  make up that size.
 *	- An array of extents which define virtual/physical offset
 *	  mappings and lengths for each extent.
 *
 * Typical soft partition operation proceeds as follows:
 *	- The unit structure is fetched from the metadb and placed into
 *	  an in-core array (as with other metadevices).  This operation
 *	  is performed via sp_build_incore( ) and takes place during
 *	  "snarfing" (when all metadevices are brought in-core at
 *	  once) and when a new soft partition is created.
 *	- A soft partition is opened via sp_open( ).  At open time the
 *	  soft partition unit structure is verified against the soft
 *	  partition on-disk structures.  Additionally, the soft partition
 *	  status is checked (only soft partitions in the OK state may be
 *	  opened).
 *	- Soft partition I/O is performed via md_sp_strategy( ) which relies
 *	  on a support routine, sp_mapbuf( ), to do most of the work.
 *	  sp_mapbuf( ) maps a buffer to a particular extent via a binary
 *	  search of the extent array in the soft partition unit structure.
 *	  Once a translation has been performed, the I/O is passed down
 *	  to the next layer, which may be another metadevice or a physical
 *	  disk.  Since a soft partition may contain multiple, non-contiguous
 *	  extents, a single I/O may have to be fragmented.
 *	- Soft partitions are closed using sp_close( ).
 *
 */
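
/*
 * Illustration (hypothetical numbers, not from any real configuration):
 * a 30-block soft partition built from two non-contiguous extents of
 * the underlying device might carry the extent array
 *
 *	ext	un_voff	un_poff	un_len
 *	  0	      0	    100	     10
 *	  1	     10	    250	     20
 *
 * A read of virtual blocks 5..14 is then split into two fragments:
 * physical blocks 105..109 (extent 0) and 250..254 (extent 1).
 */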

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_sp.h>
#include <sys/lvm/md_convert.h>
#include <sys/lvm/md_notify.h>
#include <sys/lvm/md_crc.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>

md_ops_t		sp_md_ops;
#ifndef	lint
char			_depends_on[] = "drv/md";
md_ops_t		*md_interface_ops = &sp_md_ops;
#endif

extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];

extern int		md_status;
extern major_t		md_major;
extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_sp_daemon;
extern kmutex_t		md_mx;
extern kcondvar_t	md_cv;
extern md_krwlock_t	md_unit_array_rw;
extern clock_t		md_hz;

static kmem_cache_t	*sp_parent_cache = NULL;
static kmem_cache_t	*sp_child_cache = NULL;
static void		sp_send_stat_ok(mp_unit_t *);
static void		sp_send_stat_err(mp_unit_t *);
/*
 * FUNCTION:	sp_parent_constructor()
 * INPUT:	none.
 * OUTPUT:	ps	- parent save structure initialized.
 * RETURNS:	int	- 0 (always).
 * PURPOSE:	initialize parent save structure.
 */
/*ARGSUSED1*/
static int
sp_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_spps_t *)p)->ps_mx,
	    NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

static void
sp_parent_init(md_spps_t *ps)
{
	bzero(ps, offsetof(md_spps_t, ps_mx));
}

/*ARGSUSED1*/
static void
sp_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_spps_t *)p)->ps_mx);
}

/*
 * FUNCTION:	sp_child_constructor()
 * INPUT:	none.
 * OUTPUT:	cs	- child save structure initialized.
 * RETURNS:	int	- 0 (always).
 * PURPOSE:	initialize child save structure.
 */
/*ARGSUSED1*/
static int
sp_child_constructor(void *p, void *d1, int d2)
{
	bioinit(&((md_spcs_t *)p)->cs_buf);
	return (0);
}

static void
sp_child_init(md_spcs_t *cs)
{
	cs->cs_mdunit = 0;
	cs->cs_ps = NULL;
	md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
sp_child_destructor(void *p, void *d)
{
	biofini(&((md_spcs_t *)p)->cs_buf);
}

/*
 * FUNCTION:	sp_run_queue()
 * INPUT:	none.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	run the md_daemon to clean up memory pool.
 */
/*ARGSUSED*/
static void
sp_run_queue(void *d)
{
	if (!(md_status & MD_GBL_DAEMONS_LIVE))
		md_daemon(1, &md_done_daemon);
}

/*
 * FUNCTION:	sp_build_incore()
 * INPUT:	p		- ptr to unit structure.
 *		snarfing	- flag to tell us we are snarfing.
 * OUTPUT:	none.
 * RETURNS:	int	- 0 (always).
 * PURPOSE:	place unit structure into in-core unit array (keyed from
 *		minor number).
 */
int
sp_build_incore(void *p, int snarfing)
{
	mp_unit_t	*un = (mp_unit_t *)p;
	minor_t		mnum;
	set_t		setno;
	md_dev64_t	tmpdev;

	mnum = MD_SID(un);

	if (MD_UNIT(mnum) != NULL)
		return (0);

	MD_STATUS(un) = 0;

	if (snarfing) {
		/*
		 * if we are snarfing, we get the device information
		 * from the metadb record (using the metadb key for
		 * that device).
		 */
		setno = MD_MIN2SET(mnum);

		tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
		    un->un_key, MD_NOTRUST_DEVT);
		un->un_dev = tmpdev;
	}

	/* place various information in the in-core data structures */
	md_nblocks_set(mnum, un->c.un_total_blocks);
	MD_UNIT(mnum) = un;

	return (0);
}

/*
 * FUNCTION:	reset_sp()
 * INPUT:	un		- unit structure to be reset/removed.
 *		mnum		- minor number to be reset/removed.
 *		removing	- flag to tell us if we are removing
 *				  permanently or just resetting in-core
 *				  structures.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	used to either simply reset in-core structures or to
 *		permanently remove metadevices from the metadb.
 */
void
reset_sp(mp_unit_t *un, minor_t mnum, int removing)
{
	sv_dev_t	*sv;
	mddb_recid_t	vtoc_id;

	/* clean up in-core structures */
	md_destroy_unit_incore(mnum, &sp_md_ops);

	md_nblocks_set(mnum, -1ULL);
	MD_UNIT(mnum) = NULL;

	/*
	 * Attempt release of minor node
	 */
	md_remove_minor_node(mnum);

	if (!removing)
		return;

	/* we are removing the soft partition from the metadb */

	/*
	 * Save off device information so we can get to
	 * it after we do the mddb_deleterec().
	 */
	sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
	sv->setno = MD_MIN2SET(mnum);
	sv->key = un->un_key;
	vtoc_id = un->c.un_vtoc_id;

	/*
	 * Remove self from the namespace
	 */
	if (un->c.un_revision & MD_FN_META_DEV) {
		(void) md_rem_selfname(un->c.un_self_id);
	}

	/* Remove the unit structure */
	mddb_deleterec_wrapper(un->c.un_record_id);

	if (vtoc_id)
		mddb_deleterec_wrapper(vtoc_id);

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE,
	    MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));

	/*
	 * remove the underlying device name from the metadb.  if other
	 * soft partitions are built on this device, this will simply
	 * decrease the reference count for this device.  otherwise the
	 * name record for this device will be removed from the metadb.
	 */
	md_rem_names(sv, 1);
	kmem_free(sv, sizeof (sv_dev_t));
}

/*
 * FUNCTION:	sp_send_stat_msg
 * INPUT:	un	- unit reference
 *		status	- status to be sent to master node
 *			MD_SP_OK - soft-partition is now OK
 *			MD_SP_ERR	"	"	 errored
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	send a soft-partition status change to the master node. If the
 *		message succeeds we simply return. If it fails we panic as the
 *		cluster-wide view of the metadevices is now inconsistent.
 * CALLING CONTEXT:
 *	Blockable. No locks can be held.
 */
static void
sp_send_stat_msg(mp_unit_t *un, sp_status_t status)
{
	md_mn_msg_sp_setstat_t	sp_msg;
	md_mn_kresult_t	*kres;
	set_t		setno = MD_UN2SET(un);
	int		rval;
	const char	*str = (status == MD_SP_ERR) ? "MD_SP_ERR" : "MD_SP_OK";

	sp_msg.sp_setstat_mnum = MD_SID(un);
	sp_msg.sp_setstat_status = status;

	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

	rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG,
	    0, (char *)&sp_msg, sizeof (sp_msg), kres);

	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
		mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2");
		/* If we're shutting down already, pause things here. */
		if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
			while (!md_mn_is_commd_present()) {
				delay(md_hz);
			}
		}
		/*
		 * Panic as we are now in an inconsistent state.
		 */
		cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n",
		    md_shortname(MD_SID(un)), str);
	}

	kmem_free(kres, sizeof (md_mn_kresult_t));
}

/*
 * FUNCTION:	sp_finish_error
 * INPUT:	ps	- parent save structure for errored I/O.
 *		lock_held	- set if the unit readerlock is held
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	report a driver error
 */
static void
sp_finish_error(md_spps_t *ps, int lock_held)
{
	struct buf	*pb = ps->ps_bp;
	mdi_unit_t	*ui = ps->ps_ui;
	md_dev64_t	un_dev;			/* underlying device */
	md_dev64_t	md_dev = md_expldev(pb->b_edev); /* metadev in error */
	char		*str;

	un_dev = md_expldev(ps->ps_un->un_dev);
	/* set error type */
	if (pb->b_flags & B_READ) {
		str = "read";
	} else {
		str = "write";
	}

	SPPS_FREE(sp_parent_cache, ps);
	pb->b_flags |= B_ERROR;

	md_kstat_done(ui, pb, 0);

	if (lock_held) {
		md_unit_readerexit(ui);
	}
	md_biodone(pb);

	cmn_err(CE_WARN, "md: %s: %s error on %s",
	    md_shortname(md_getminor(md_dev)), str,
	    md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0));
}

/*
 * FUNCTION:	sp_xmit_ok
 * INPUT:	dq	- daemon queue referencing ps structure
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	send a message to the master node in a multi-owner diskset to
 *		update all attached nodes' view of the soft-part to be
 *		MD_SP_OK.
 * CALLING CONTEXT:
 *	Blockable. No unit lock held.
 */
static void
sp_xmit_ok(daemon_queue_t *dq)
{
	md_spps_t	*ps = (md_spps_t *)dq;

	/* Send a MD_MN_MSG_SP_SETSTAT2 to the master */
	sp_send_stat_msg(ps->ps_un, MD_SP_OK);

	/*
	 * Successfully transmitted ok state to all nodes, now release this
	 * parent structure.
	 */
	SPPS_FREE(sp_parent_cache, ps);
}

/*
 * FUNCTION:	sp_xmit_error
 * INPUT:	dq	- daemon queue referencing failing ps structure
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	send a message to the master node in a multi-owner diskset to
 *		update all attached nodes' view of the soft-part to be
 *		MD_SP_ERR.
 * CALLING CONTEXT:
 *	Blockable. No unit lock held.
 */
static void
sp_xmit_error(daemon_queue_t *dq)
{
	md_spps_t	*ps = (md_spps_t *)dq;

	/* Send a MD_MN_MSG_SP_SETSTAT2 to the master */
	sp_send_stat_msg(ps->ps_un, MD_SP_ERR);

	/*
	 * Successfully transmitted error state to all nodes, now release this
	 * parent structure.
	 */
	SPPS_FREE(sp_parent_cache, ps);
}

static void
sp_send_stat_ok(mp_unit_t *un)
{
	minor_t		mnum = MD_SID(un);
	md_spps_t	*ps;

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);
	ps->ps_un = un;
	ps->ps_ui = MDI_UNIT(mnum);

	daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps,
	    REQ_OLD);
}

static void
sp_send_stat_err(mp_unit_t *un)
{
	minor_t		mnum = MD_SID(un);
	md_spps_t	*ps;

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);
	ps->ps_un = un;
	ps->ps_ui = MDI_UNIT(mnum);

	daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps,
	    REQ_OLD);
}

/*
 * FUNCTION:	sp_error()
 * INPUT:	ps	- parent save structure for errored I/O.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	report a driver error.
 * CALLING CONTEXT:
 *	Interrupt - non-blockable
 */
static void
sp_error(md_spps_t *ps)
{
	set_t		setno = MD_UN2SET(ps->ps_un);

	/*
	 * Drop the mutex associated with this request before (potentially)
	 * enqueuing the free onto a separate thread. We have to release the
	 * mutex before destroying the parent structure.
	 */
	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
		if (MUTEX_HELD(&ps->ps_mx)) {
			mutex_exit(&ps->ps_mx);
		}
	} else {
		/*
		 * this should only ever happen if we are panicking,
		 * since DONTFREE is only set on the parent if panicstr
		 * is non-NULL.
		 */
		ASSERT(panicstr);
	}

	/*
	 * For a multi-owner set we need to send a message to the master so
	 * that all nodes get the errored status when we first encounter it.
	 * To avoid deadlocking when multiple soft-partitions encounter an
	 * error on one physical unit we drop the unit readerlock before
	 * enqueueing the request. That way we can service any messages that
	 * require a writerlock to be held. Additionally, to avoid deadlocking
	 * when at the bottom of a metadevice stack and a higher level mirror
	 * has multiple requests outstanding on this soft-part, we clone the
	 * ps that failed and pass the error back up the stack to release the
	 * reference that this i/o may have in the higher-level metadevice.
	 * The other nodes in the cluster just have to modify the soft-part
	 * status and we do not need to block the i/o completion for this.
	 */
	if (MD_MNSET_SETNO(setno)) {
		md_spps_t	*err_ps;
		err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
		sp_parent_init(err_ps);

		err_ps->ps_un = ps->ps_un;
		err_ps->ps_ui = ps->ps_ui;

		md_unit_readerexit(ps->ps_ui);

		daemon_request(&md_sp_daemon, sp_xmit_error,
		    (daemon_queue_t *)err_ps, REQ_OLD);

		sp_finish_error(ps, 0);

		return;
	} else {
		ps->ps_un->un_status = MD_SP_ERR;
	}

	/* Flag the error */
	sp_finish_error(ps, 1);
}

/*
 * FUNCTION:	sp_mapbuf()
 * INPUT:	un	- unit structure for soft partition we are doing
 *			  I/O on.
 *		voff	- virtual offset in soft partition to map.
 *		bcount	- # of bytes in the I/O.
 * OUTPUT:	bp	- translated buffer to be passed down to next layer.
 * RETURNS:	1	- request must be fragmented, more work to do,
 *		0	- request satisfied, no more work to do
 *		-1	- error
 * PURPOSE:	Map the virtual offset in the soft partition (passed
 *		in via voff) to the "physical" offset on whatever the soft
 *		partition is built on top of.  We do this by doing a binary
 *		search of the extent array in the soft partition unit
 *		structure.  Once the current extent is found, we do the
 *		translation, determine if the I/O will cross extent
 *		boundaries (if so, we have to fragment the I/O), then
 *		fill in the buf structure to be passed down to the next layer.
 */
static int
sp_mapbuf(
	mp_unit_t	*un,
	sp_ext_offset_t	voff,
	sp_ext_length_t	bcount,
	buf_t		*bp
)
{
	int		lo, mid, hi, found, more;
	size_t		new_bcount;
	sp_ext_offset_t new_blkno;
	sp_ext_offset_t	new_offset;
	sp_ext_offset_t	ext_endblk;
	md_dev64_t	new_edev;
	extern unsigned	md_maxphys;

	found = 0;
	lo = 0;
	hi = un->un_numexts - 1;

	/*
	 * do a binary search to find the extent that contains the
	 * starting offset.  after this loop, mid contains the index
	 * of the correct extent.
	 */
	while (lo <= hi && !found) {
		mid = (lo + hi) / 2;
		/* is the starting offset contained within the mid-ext? */
		if (voff >= un->un_ext[mid].un_voff &&
		    voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len)
			found = 1;
		else if (voff < un->un_ext[mid].un_voff)
			hi = mid - 1;
		else	/* voff is beyond the end of the mid extent */
			lo = mid + 1;
	}

	if (!found) {
		cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff);
		return (-1);
	}

	/* translate to underlying physical offset/device */
	new_offset = voff - un->un_ext[mid].un_voff;
	new_blkno = un->un_ext[mid].un_poff + new_offset;
	new_edev = un->un_dev;

	/* determine if we need to break the I/O into fragments */
	ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len;
	if (voff + btodb(bcount) > ext_endblk) {
		new_bcount = dbtob(ext_endblk - voff);
		more = 1;
	} else {
		new_bcount = bcount;
		more = 0;
	}

	/* only break up the I/O if we're not built on another metadevice */
	if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) {
		new_bcount = md_maxphys;
		more = 1;
	}
	if (bp != (buf_t *)NULL) {
		/* do bp updates */
		bp->b_bcount = new_bcount;
		bp->b_lblkno = new_blkno;
		bp->b_edev = md_dev64_to_dev(new_edev);
	}
	return (more);
}
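
/*
 * Illustration (hypothetical numbers): a 3 MB read that starts 500 KB
 * before the end of its extent is clipped to 500 KB and returns
 * more == 1; the caller re-invokes sp_mapbuf() at the next virtual
 * offset until more == 0.  If the underlying device is a physical
 * disk rather than another metadevice, fragments are additionally
 * limited to md_maxphys bytes.
 */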

/*
 * FUNCTION:	sp_validate()
 * INPUT:	un	- unit structure to be validated.
 * OUTPUT:	none.
 * RETURNS:	0	- soft partition ok.
 *		-1	- error.
 * PURPOSE:	called on open to sanity check the soft partition.  In
 *		order to open a soft partition:
 *		- it must have at least one extent
 *		- the extent info in core and on disk must match
 *		- it may not be in an intermediate state (which would
 *		  imply that a two-phase commit was interrupted)
 *
 *		If the extent checking fails (B_ERROR returned from the read
 *		strategy call) _and_ we're a multi-owner diskset, we send a
 *		message to the master so that all nodes inherit the same view
 *		of the soft partition.
 *		If we are checking a soft-part that is marked as in error, and
 *		we can actually read and validate the watermarks, we send a
 *		message to the master node to clear the error.
 */
static int
sp_validate(mp_unit_t *un)
{
	uint_t		ext;
	struct buf	*buf;
	sp_ext_length_t	len;
	mp_watermark_t	*wm;
	set_t		setno;
	int		reset_error = 0;

	setno = MD_UN2SET(un);

	/* sanity check unit structure components ?? */
	if (un->un_status != MD_SP_OK) {
		if (un->un_status != MD_SP_ERR) {
			cmn_err(CE_WARN, "md: %s: open failed, soft partition "
			    "status is %u.",
			    md_shortname(MD_SID(un)),
			    un->un_status);
			return (-1);
		} else {
			cmn_err(CE_WARN, "md: %s: open of soft partition "
			    "in Errored state.",
			    md_shortname(MD_SID(un)));
			reset_error = 1;
		}
	}

	if (un->un_numexts == 0) {
		cmn_err(CE_WARN, "md: %s: open failed, soft partition does "
		    "not have any extents.", md_shortname(MD_SID(un)));
		return (-1);
	}

	len = 0LL;
	for (ext = 0; ext < un->un_numexts; ext++) {

		/* tally extent lengths to check total size */
		len += un->un_ext[ext].un_len;

		/* allocate buffer for watermark */
		buf = getrbuf(KM_SLEEP);

		/* read watermark */
		buf->b_flags = B_READ;
		buf->b_edev = md_dev64_to_dev(un->un_dev);
		buf->b_iodone = NULL;
		buf->b_proc = NULL;
		buf->b_bcount = sizeof (mp_watermark_t);
		buf->b_lblkno = un->un_ext[ext].un_poff - 1;
		buf->b_bufsize = sizeof (mp_watermark_t);
		buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t),
		    KM_SLEEP);
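		/*
		 * Each extent is preceded on disk by its watermark, so
		 * b_lblkno above points at the block immediately before
		 * the extent start (un_poff - 1).
		 */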

		/*
		 * make the call non-blocking so that it is not affected
		 * by a set take.
		 */
		md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL);
		(void) biowait(buf);

		if (buf->b_flags & B_ERROR) {
			cmn_err(CE_WARN, "md: %s: open failed, could not "
			    "read watermark at block %llu for extent %u, "
			    "error %d.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, buf->b_error);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);

			/*
			 * If we're a multi-owner diskset we send a message
			 * indicating that this soft-part has an invalid
			 * extent to the master node. This ensures a consistent
			 * view of the soft-part across the cluster.
			 */
			if (MD_MNSET_SETNO(setno)) {
				sp_send_stat_err(un);
			}
			return (-1);
		}

		wm = (mp_watermark_t *)buf->b_un.b_addr;

		/* make sure the checksum is correct first */
		if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
		    (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u does not have a "
			    "valid checksum 0x%08x.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_checksum);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		if (wm->wm_magic != MD_SP_MAGIC) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u does not have a "
			    "valid watermark magic number, expected 0x%x, "
			    "found 0x%x.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/* make sure sequence number matches the current extent */
		if (wm->wm_seq != ext) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u has invalid "
			    "sequence number %u.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_seq);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/* make sure watermark length matches unit structure */
		if (wm->wm_length != un->un_ext[ext].un_len) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u has inconsistent "
			    "length, expected %llu, found %llu.",
			    md_shortname(MD_SID(un)), buf->b_lblkno,
			    ext, un->un_ext[ext].un_len,
			    (u_longlong_t)wm->wm_length);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/*
		 * make sure the type is a valid soft partition and not
		 * a free extent or the end.
		 */
		if (wm->wm_type != EXTTYP_ALLOC) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u is not marked "
			    "as in-use, type = %u.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_type);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}
		/* free up buffer */
		kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
		freerbuf(buf);
	}

	if (len != un->un_length) {
		cmn_err(CE_WARN, "md: %s: open failed, computed length "
		    "%llu != expected length %llu.", md_shortname(MD_SID(un)),
		    len, un->un_length);
		return (-1);
	}

	/*
	 * If we're a multi-owner set _and_ reset_error is set, we should clear
	 * the error condition on all nodes in the set. Use SP_SETSTAT2 with
	 * MD_SP_OK.
	 */
	if (MD_MNSET_SETNO(setno) && reset_error) {
		sp_send_stat_ok(un);
	}
	return (0);
}

/*
 * FUNCTION:	sp_done()
 * INPUT:	child_buf	- buffer attached to child save structure.
 *				  this is the buffer on which I/O has just
 *				  completed.
 * OUTPUT:	none.
 * RETURNS:	0	- success.
 *		1	- error.
 * PURPOSE:	called on I/O completion.
 */
static int
sp_done(struct buf *child_buf)
{
	struct buf	*parent_buf;
	mdi_unit_t	*ui;
	md_spps_t	*ps;
	md_spcs_t	*cs;

	/* find the child save structure to which this buffer belongs */
	cs = (md_spcs_t *)((caddr_t)child_buf -
	    (sizeof (md_spcs_t) - sizeof (buf_t)));
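	/*
	 * cs_buf is the last member of md_spcs_t, so stepping back by
	 * (sizeof (md_spcs_t) - sizeof (buf_t)) from the embedded buf
	 * recovers the enclosing child save structure.
	 */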
	/* now get the parent save structure */
	ps = cs->cs_ps;
	parent_buf = ps->ps_bp;

	mutex_enter(&ps->ps_mx);
	/* pass any errors back up to the parent */
	if (child_buf->b_flags & B_ERROR) {
		ps->ps_flags |= MD_SPPS_ERROR;
		parent_buf->b_error = child_buf->b_error;
	}
	/* mapout, if needed */
	if (child_buf->b_flags & B_REMAPPED)
		bp_mapout(child_buf);

	ps->ps_frags--;
	if (ps->ps_frags != 0) {
		/*
		 * if this parent has more children, we just free the
		 * child and return.
		 */
		kmem_cache_free(sp_child_cache, cs);
		mutex_exit(&ps->ps_mx);
		return (1);
	}
	/* there are no more children */
	kmem_cache_free(sp_child_cache, cs);
	if (ps->ps_flags & MD_SPPS_ERROR) {
		sp_error(ps);
		return (1);
	}
	ui = ps->ps_ui;
	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
		mutex_exit(&ps->ps_mx);
	} else {
		/*
		 * this should only ever happen if we are panicking,
		 * since DONTFREE is only set on the parent if panicstr
		 * is non-NULL.
		 */
		ASSERT(panicstr);
	}
	SPPS_FREE(sp_parent_cache, ps);
	md_kstat_done(ui, parent_buf, 0);
	md_unit_readerexit(ui);
	md_biodone(parent_buf);
	return (0);
}

/*
 * FUNCTION:	md_sp_strategy()
 * INPUT:	parent_buf	- parent buffer
 *		flag		- flags
 *		private		- private data
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	Soft partitioning I/O strategy.  Performs the main work
 *		needed to do I/O to a soft partition.  The basic
 *		algorithm is as follows:
 *			- Allocate a child save structure to keep track
 *			  of the I/O we are going to pass down.
 *			- Map the I/O to the correct extent in the soft
 *			  partition (see sp_mapbuf()).
 *			- bioclone() the buffer and pass it down the
 *			  stack using md_call_strategy.
 *			- If the I/O needs to be split across extents,
 *			  repeat the above steps until all fragments
 *			  are finished.
 */
static void
md_sp_strategy(buf_t *parent_buf, int flag, void *private)
{
	md_spps_t	*ps;
	md_spcs_t	*cs;
	int		more;
	mp_unit_t	*un;
	mdi_unit_t	*ui;
	size_t		current_count;
	off_t		current_offset;
	sp_ext_offset_t	current_blkno;
	buf_t		*child_buf;
	set_t		setno = MD_MIN2SET(getminor(parent_buf->b_edev));
	int		strat_flag = flag;

	/*
	 * When doing IO to a multi owner meta device, check if set is halted.
	 * We do this check without the needed lock held, for performance
	 * reasons.
	 * If an IO just slips through while the set is locked via an
	 * MD_MN_SUSPEND_SET, we don't care about it.
	 * Only check for suspension if we are a top-level i/o request
	 * (MD_STR_NOTTOP is cleared in 'flag').
	 */
	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
	    (MD_SET_HALTED | MD_SET_MNSET)) {
		if ((flag & MD_STR_NOTTOP) == 0) {
			mutex_enter(&md_mx);
			/* Here we loop until the set is no longer halted */
			while (md_set[setno].s_status & MD_SET_HALTED) {
				cv_wait(&md_cv, &md_mx);
			}
			mutex_exit(&md_mx);
		}
	}

	ui = MDI_UNIT(getminor(parent_buf->b_edev));

	md_kstat_waitq_enter(ui);

	un = (mp_unit_t *)md_unit_readerlock(ui);

	if ((flag & MD_NOBLOCK) == 0) {
		if (md_inc_iocount(setno) != 0) {
			parent_buf->b_flags |= B_ERROR;
			parent_buf->b_error = ENXIO;
			parent_buf->b_resid = parent_buf->b_bcount;
			md_kstat_waitq_exit(ui);
			md_unit_readerexit(ui);
			biodone(parent_buf);
			return;
		}
	} else {
		md_inc_iocount_noblock(setno);
	}

	if (!(flag & MD_STR_NOTTOP)) {
		if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) {
			md_kstat_waitq_exit(ui);
			return;
		}
	}

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the parent.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = parent_buf;
	ps->ps_addr = parent_buf->b_un.b_addr;

	current_count = parent_buf->b_bcount;
	current_blkno = (sp_ext_offset_t)parent_buf->b_blkno;
	current_offset = 0;

	/*
	 * if we are at the top and we are panicking,
	 * we don't free in order to save state.
	 */
	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL))
		ps->ps_flags |= MD_SPPS_DONTFREE;

	md_kstat_waitq_to_runq(ui);

	ps->ps_frags++;

	/*
	 * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this
	 * metadevice.
	 */
	if (ui->ui_tstate & MD_ABR_CAP)
		strat_flag |= MD_STR_ABR;

	/*
	 * this loop does the main work of an I/O.  we allocate a
	 * child save for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, and clone
	 * the new I/O to pass down the stack.  repeat until we've
	 * taken care of the entire buf that was passed to us.
	 */
	do {
		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
		sp_child_init(cs);
		child_buf = &cs->cs_buf;
		cs->cs_ps = ps;

		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
		if (more == -1) {
			parent_buf->b_flags |= B_ERROR;
			parent_buf->b_error = EIO;
			md_kstat_done(ui, parent_buf, 0);
			md_unit_readerexit(ui);
			md_biodone(parent_buf);
			kmem_cache_free(sp_parent_cache, ps);
			return;
		}

		child_buf = md_bioclone(parent_buf, current_offset,
		    child_buf->b_bcount, child_buf->b_edev,
		    child_buf->b_blkno, sp_done, child_buf,
		    KM_NOSLEEP);
		/* calculate new offset, counts, etc... */
		current_offset += child_buf->b_bcount;
		current_count -= child_buf->b_bcount;
		current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));

		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		md_call_strategy(child_buf, strat_flag, private);
	} while (more);

	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) {
		while (!(ps->ps_flags & MD_SPPS_DONE)) {
			md_daemon(1, &md_done_daemon);
		}
		kmem_cache_free(sp_parent_cache, ps);
	}
}

/*
 * FUNCTION:	sp_directed_read()
 * INPUT:	mnum	- minor number
 *		vdr	- vol_directed_rd_t from user
 *		mode	- access mode for copying data out.
 * OUTPUT:	none.
 * RETURNS:	0	- success
 *		Exxxxx	- failure error-code
 * PURPOSE:	Construct the necessary sub-device i/o requests to perform the
 *		directed read as requested by the user. This is essentially the
 *		same as md_sp_strategy() with the exception being that the
 *		underlying 'md_call_strategy' is replaced with an ioctl call.
 */
int
sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode)
{
	md_spps_t	*ps;
	md_spcs_t	*cs;
	int		more;
	mp_unit_t	*un;
	mdi_unit_t	*ui;
	size_t		current_count;
	off_t		current_offset;
	sp_ext_offset_t	current_blkno;
	buf_t		*child_buf, *parent_buf;
	void		*kbuffer;
	vol_directed_rd_t	cvdr;
	caddr_t		userbuf;
	offset_t	useroff;
	int		ret = 0;

	ui = MDI_UNIT(mnum);

	md_kstat_waitq_enter(ui);

	bzero(&cvdr, sizeof (cvdr));

	un = (mp_unit_t *)md_unit_readerlock(ui);

	/*
	 * Construct a parent_buf header which reflects the user-supplied
	 * request.
	 */

	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
	if (kbuffer == NULL) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
		md_kstat_waitq_exit(ui);
		md_unit_readerexit(ui);
		return (ENOMEM);
	}

	parent_buf = getrbuf(KM_NOSLEEP);
	if (parent_buf == NULL) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
		md_kstat_waitq_exit(ui);
		md_unit_readerexit(ui);
		kmem_free(kbuffer, vdr->vdr_nbytes);
		return (ENOMEM);
	}
	parent_buf->b_un.b_addr = kbuffer;
	parent_buf->b_flags = B_READ;
	parent_buf->b_bcount = vdr->vdr_nbytes;
	parent_buf->b_lblkno = lbtodb(vdr->vdr_offset);
	parent_buf->b_edev = un->un_dev;

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the parent.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = parent_buf;
	ps->ps_addr = parent_buf->b_un.b_addr;

	current_count = parent_buf->b_bcount;
	current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno;
	current_offset = 0;

	md_kstat_waitq_to_runq(ui);

	ps->ps_frags++;
	vdr->vdr_bytesread = 0;

	/*
	 * this loop does the main work of an I/O.  we allocate a
	 * child save for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, and clone
	 * the new I/O to pass down the stack.  repeat until we've
	 * taken care of the entire buf that was passed to us.
	 */
	do {
		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
		sp_child_init(cs);
		child_buf = &cs->cs_buf;
		cs->cs_ps = ps;

		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
		if (more == -1) {
			ret = EIO;
			vdr->vdr_flags |= DKV_DMR_SHORT;
			kmem_cache_free(sp_child_cache, cs);
			goto err_out;
		}

		cvdr.vdr_flags = vdr->vdr_flags;
		cvdr.vdr_side = vdr->vdr_side;
		cvdr.vdr_nbytes = child_buf->b_bcount;
		cvdr.vdr_offset = ldbtob(child_buf->b_lblkno);
		/* Work out where we are in the allocated buffer */
		useroff = (offset_t)(uintptr_t)kbuffer;
		useroff = useroff + (offset_t)current_offset;
		cvdr.vdr_data = (void *)(uintptr_t)useroff;
		child_buf = md_bioclone(parent_buf, current_offset,
		    child_buf->b_bcount, child_buf->b_edev,
		    child_buf->b_blkno, NULL,
		    child_buf, KM_NOSLEEP);
		/* calculate new offset, counts, etc... */
		current_offset += child_buf->b_bcount;
		current_count -= child_buf->b_bcount;
		current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));

		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr,
		    (mode | FKIOCTL), NULL);
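		/*
		 * (FKIOCTL marks the ioctl as kernel-originated, so the
		 * target driver treats cvdr as a kernel address instead
		 * of copying it in from user space.)
		 */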

		/*
		 * Free the child structure as we've finished with it.
		 * Normally this would be done by sp_done() but we're just
		 * using md_bioclone() to segment the transfer and we never
		 * issue a strategy request so the iodone will not be called.
		 */
		kmem_cache_free(sp_child_cache, cs);
		if (ret == 0) {
			/* copyout the returned data to vdr_data + offset */
			userbuf = (caddr_t)kbuffer;
			userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer;
			if (ddi_copyout(userbuf, vdr->vdr_data,
			    cvdr.vdr_bytesread, mode)) {
				ret = EFAULT;
				goto err_out;
			}
			vdr->vdr_bytesread += cvdr.vdr_bytesread;
		} else {
			goto err_out;
		}
	} while (more);

	/*
	 * Update the user-supplied vol_directed_rd_t structure with the
	 * contents of the last issued child request.
	 */
	vdr->vdr_flags = cvdr.vdr_flags;
	vdr->vdr_side = cvdr.vdr_side;
	bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME);

err_out:
	if (ret != 0) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
	}
	if (vdr->vdr_bytesread != vdr->vdr_nbytes) {
		vdr->vdr_flags |= DKV_DMR_SHORT;
	}
	kmem_cache_free(sp_parent_cache, ps);
	kmem_free(kbuffer, vdr->vdr_nbytes);
	freerbuf(parent_buf);
	md_unit_readerexit(ui);
	return (ret);
}

/*
 * FUNCTION:	sp_snarf()
 * INPUT:	cmd	- snarf cmd.
 *		setno	- set number.
 * OUTPUT:	none.
 * RETURNS:	1	- soft partitions were snarfed.
 *		0	- no soft partitions were snarfed.
 * PURPOSE:	Snarf soft partition metadb records into their in-core
 *		structures.  This routine is called at "snarf time" when
 *		md loads and gets all metadevice records into memory.
 *		The basic algorithm is simply to walk the soft partition
 *		records in the metadb and call the soft partitioning
 *		build_incore routine to set up the in-core structures.
 */
static int
sp_snarf(md_snarfcmd_t cmd, set_t setno)
{
	mp_unit_t	*un;
	mddb_recid_t	recid;
	int		gotsomething;
	int		all_sp_gotten;
	mddb_type_t	rec_type;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	mp_unit_t	*big_un;
	mp_unit32_od_t	*small_un;
	size_t		newreqsize;

	if (cmd == MD_SNARF_CLEANUP)
		return (0);

	all_sp_gotten = 1;
	gotsomething = 0;

	/* get the record type */
	rec_type = (mddb_type_t)md_getshared_key(setno,
	    sp_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	/*
	 * walk soft partition records in the metadb and call
	 * sp_build_incore to build in-core structures.
	 */
	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
		/* if we've already gotten this record, go to the next one */
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		dep->de_flags = MDDB_F_SOFTPART;
		rbp = dep->de_rb;

		switch (rbp->rb_revision) {
		case MDDB_REV_RB:
		case MDDB_REV_RBFN:
			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
				/*
				 * This is an old, small record that hasn't
				 * been converted yet.  Before we create an
				 * in-core metadevice from it we have to
				 * convert it to a big record.
				 */
				small_un =
				    (mp_unit32_od_t *)mddb_getrecaddr(recid);
				newreqsize = sizeof (mp_unit_t) +
				    ((small_un->un_numexts - 1) *
				    sizeof (struct mp_ext));
				big_un = (mp_unit_t *)kmem_zalloc(newreqsize,
				    KM_SLEEP);
				softpart_convert((caddr_t)small_un,
				    (caddr_t)big_un, SMALL_2_BIG);
				kmem_free(small_un, dep->de_reqsize);
				dep->de_rb_userdata = big_un;
				dep->de_reqsize = newreqsize;
				rbp->rb_private |= MD_PRV_CONVD;
				un = big_un;
			} else {
				/* Record has already been converted */
				un = (mp_unit_t *)mddb_getrecaddr(recid);
			}
			un->c.un_revision &= ~MD_64BIT_META_DEV;
			break;
		case MDDB_REV_RB64:
		case MDDB_REV_RB64FN:
			/* Large device */
			un = (mp_unit_t *)mddb_getrecaddr(recid);
			un->c.un_revision |= MD_64BIT_META_DEV;
			un->c.un_flag |= MD_EFILABEL;
			break;
		}
		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);

		/*
		 * Create minor node for snarfed entry.
		 */
		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));

		if (MD_UNIT(MD_SID(un)) != NULL) {
			/* unit is already in-core */
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		all_sp_gotten = 0;
		if (sp_build_incore((void *)un, 1) == 0) {
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
			md_create_unit_incore(MD_SID(un), &sp_md_ops, 0);
			gotsomething = 1;
		}
	}

	if (!all_sp_gotten)
		return (gotsomething);
	/* double-check records */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0)
		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);

	return (0);
}

/*
 * FUNCTION:	sp_halt()
 * INPUT:	cmd	- halt cmd.
 *		setno	- set number.
 * RETURNS:	0	- success.
 *		1	- err.
 * PURPOSE:	Perform driver halt operations.  As with stripe, we
 *		support MD_HALT_CHECK and MD_HALT_DOIT.  The first
 *		does a check to see if halting can be done safely
 *		(no open soft partitions), the second cleans up and
 *		shuts down the driver.
 */
static int
sp_halt(md_haltcmd_t cmd, set_t setno)
{
	int		i;
	mdi_unit_t	*ui;
	minor_t		mnum;

	if (cmd == MD_HALT_CLOSE)
		return (0);

	if (cmd == MD_HALT_OPEN)
		return (0);

	if (cmd == MD_HALT_UNLOAD)
		return (0);

	if (cmd == MD_HALT_CHECK) {
		for (i = 0; i < md_nunits; i++) {
			mnum = MD_MKMIN(setno, i);
			if ((ui = MDI_UNIT(mnum)) == NULL)
				continue;
			if (ui->ui_opsindex != sp_md_ops.md_selfindex)
				continue;
			if (md_unit_isopen(ui))
				return (1);
		}
		return (0);
	}

	if (cmd != MD_HALT_DOIT)
		return (1);

	for (i = 0; i < md_nunits; i++) {
		mnum = MD_MKMIN(setno, i);
		if ((ui = MDI_UNIT(mnum)) == NULL)
			continue;
		if (ui->ui_opsindex != sp_md_ops.md_selfindex)
			continue;
		reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0);
	}

	return (0);
}

/*
 * FUNCTION:	sp_open_dev()
 * INPUT:	un	- unit structure.
 *		oflags	- open flags.
 * OUTPUT:	none.
 * RETURNS:	0		- success.
 *		non-zero	- err.
 * PURPOSE:	open underlying device via md_layered_open.
 */
static int
sp_open_dev(mp_unit_t *un, int oflags)
{
	minor_t		mnum = MD_SID(un);
	int		err;
	md_dev64_t	tmpdev;
	set_t		setno = MD_MIN2SET(MD_SID(un));
	side_t		side = mddb_getsidenum(setno);

	tmpdev = un->un_dev;
	/*
	 * Do the open by device id if the underlying device is regular
	 */
	if ((md_getmajor(tmpdev) != md_major) &&
	    md_devid_found(setno, side, un->un_key) == 1) {
		tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key);
	}
	err = md_layered_open(mnum, &tmpdev, oflags);
	un->un_dev = tmpdev;

	if (err)
		return (ENXIO);

	return (0);
}

/*
 * FUNCTION:	sp_open()
 * INPUT:	dev		- device to open.
 *		flag		- pass-through flag.
 *		otyp		- pass-through open type.
 *		cred_p		- credentials.
 *		md_oflags	- open flags.
 * OUTPUT:	none.
 * RETURNS:	0		- success.
 *		non-zero	- err.
 * PURPOSE:	open a soft partition.
 */
/* ARGSUSED */
static int
sp_open(
	dev_t		*dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p,
	int		md_oflags
)
{
	minor_t		mnum = getminor(*dev);
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mp_unit_t	*un;
	int		err = 0;
	set_t		setno;

	/*
	 * When doing an open of a multi owner metadevice, check to see if this
	 * node is a starting node and if a reconfig cycle is underway.
	 * If so, the system isn't sufficiently set up to handle the
	 * open (which involves I/O during sp_validate), so fail with ENXIO.
	 */
	setno = MD_MIN2SET(mnum);
	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
		return (ENXIO);
	}

	/* grab necessary locks */
	un = (mp_unit_t *)md_unit_openclose_enter(ui);
	setno = MD_UN2SET(un);

	/* open underlying device, if necessary */
	if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
		if ((err = sp_open_dev(un, md_oflags)) != 0)
			goto out;

		if (MD_MNSET_SETNO(setno)) {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV)) {
				/*
				 * Don't call sp_validate while
				 * unit_openclose lock is held.  So, actually
				 * open the device, drop openclose lock,
				 * call sp_validate, reacquire openclose lock,
				 * and close the device.  If sp_validate
				 * succeeds, then device will be re-opened.
				 */
				if ((err = md_unit_incopen(mnum, flag,
				    otyp)) != 0)
					goto out;

				mutex_enter(&ui->ui_mx);
				ui->ui_lock |= MD_UL_OPENINPROGRESS;
				mutex_exit(&ui->ui_mx);
				md_unit_openclose_exit(ui);
				if (otyp != OTYP_LYR)
					rw_exit(&md_unit_array_rw.lock);

				err = sp_validate(un);

				if (otyp != OTYP_LYR)
					rw_enter(&md_unit_array_rw.lock,
					    RW_READER);
				(void) md_unit_openclose_enter(ui);
				(void) md_unit_decopen(mnum, otyp);
				mutex_enter(&ui->ui_mx);
				ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
				cv_broadcast(&ui->ui_cv);
				mutex_exit(&ui->ui_mx);
				/*
				 * Should be in the same state as before
				 * the sp_validate.
				 */
				if (err != 0) {
					/* close the device opened above */
					md_layered_close(un->un_dev, md_oflags);
					err = EIO;
					goto out;
				}
			}
			/*
			 * As we're a multi-owner metadevice we need to ensure
			 * that all nodes have the same idea of the status.
			 * sp_validate() will mark the device as errored (if
			 * it cannot read the watermark) or ok (if it was
			 * previously errored but the watermark is now valid).
			 * This code-path is only entered on the non-probe open
			 * so we will maintain the errored state during a probe
			 * call. This means the sys-admin must metarecover -m
			 * to reset the soft-partition error.
			 */
		} else {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV) &&
			    (err = sp_validate(un)) != 0) {
				/* close the device opened above */
				md_layered_close(un->un_dev, md_oflags);
				err = EIO;
				goto out;
			} else {
				/*
				 * we succeeded in validating the on disk
				 * format versus the in core, so reset the
				 * status if it's in error
				 */
				if (un->un_status == MD_SP_ERR) {
					un->un_status = MD_SP_OK;
				}
			}
		}
	}

	/* count open */
	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
		goto out;

out:
	md_unit_openclose_exit(ui);
	return (err);
}

/*
 * FUNCTION:	sp_close()
 * INPUT:	dev		- device to close.
 *		flag		- pass-through flag.
 *		otyp		- pass-through type.
 *		cred_p		- credentials.
 *		md_cflags	- close flags.
 * OUTPUT:	none.
 * RETURNS:	0		- success.
 *		non-zero	- err.
 * PURPOSE:	close a soft partition.
 */
/* ARGSUSED */
static int
sp_close(
	dev_t		dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p,
	int		md_cflags
)
{
	minor_t		mnum = getminor(dev);
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mp_unit_t	*un;
	int		err = 0;

	/* grab necessary locks */
	un = (mp_unit_t *)md_unit_openclose_enter(ui);

	/* count closed */
	if ((err = md_unit_decopen(mnum, otyp)) != 0)
		goto out;

	/* close devices, if necessary */
	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
		md_layered_close(un->un_dev, md_cflags);
	}

	/*
	 * If a MN set and transient capabilities (eg ABR/DMR) are set,
	 * clear these capabilities if this is the last close in
	 * the cluster
	 */
	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
	    (ui->ui_tstate & MD_ABR_CAP)) {
		md_unit_openclose_exit(ui);
		mdmn_clear_all_capabilities(mnum);
		return (0);
	}
	/* unlock, return success */
out:
	md_unit_openclose_exit(ui);
	return (err);
}

/* used in sp_dump routine */
static struct buf dumpbuf;

/*
 * FUNCTION:	sp_dump()
 * INPUT:	dev	- device to dump to.
 *		addr	- address to dump.
 *		blkno	- blkno on device.
 *		nblk	- number of blocks to dump.
 * OUTPUT:	none.
 * RETURNS:	result from bdev_dump.
 * PURPOSE:	This routine dumps memory to the disk.  It assumes that
 *		the memory has already been mapped into mainbus space.
 *		It is called at disk interrupt priority when the system
 *		is in trouble.
 *		NOTE: this function is defined using 32-bit arguments,
 *		but soft partitioning is internally 64-bit.  Arguments
 *		are cast where appropriate.
 */
static int
sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	mp_unit_t	*un;
	buf_t		*bp;
	sp_ext_length_t	nb;
	daddr_t		mapblk;
	int		result;
	int		more;
	int		saveresult = 0;

	/*
	 * Don't need to grab the unit lock, because nothing else is
	 * supposed to be happening.  Also, dump is not supposed to sleep.
	 */
	un = (mp_unit_t *)MD_UNIT(getminor(dev));

	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
		return (EINVAL);

	if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks)
		return (EINVAL);

	bp = &dumpbuf;
	nb = (sp_ext_length_t)dbtob(nblk);
	do {
		bzero((caddr_t)bp, sizeof (*bp));
		more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp);
		nblk = (int)(btodb(bp->b_bcount));
		mapblk = bp->b_blkno;
		result = bdev_dump(bp->b_edev, addr, mapblk, nblk);
		if (result)
			saveresult = result;

		nb -= bp->b_bcount;
		addr += bp->b_bcount;
		blkno += nblk;
	} while (more);

	return (saveresult);
}

static int
sp_imp_set(
	set_t	setno
)
{
	mddb_recid_t	recid;
	int		gotsomething;
	mddb_type_t	rec_type;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	mp_unit_t	*un64;
	mp_unit32_od_t	*un32;
	md_dev64_t	self_devt;
	minor_t		*self_id;	/* minor needs to be updated */
	md_parent_t	*parent_id;	/* parent needs to be updated */
	mddb_recid_t	*record_id;	/* record id needs to be updated */

	gotsomething = 0;

	rec_type = (mddb_type_t)md_getshared_key(setno,
	    sp_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		rbp = dep->de_rb;

		switch (rbp->rb_revision) {
		case MDDB_REV_RB:
		case MDDB_REV_RBFN:
			/*
			 * Small device
			 */
			un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid);
			self_id = &(un32->c.un_self_id);
			parent_id = &(un32->c.un_parent);
			record_id = &(un32->c.un_record_id);

			if (!md_update_minor(setno, mddb_getsidenum
			    (setno), un32->un_key))
				goto out;
			break;

		case MDDB_REV_RB64:
		case MDDB_REV_RB64FN:
			un64 = (mp_unit_t *)mddb_getrecaddr(recid);
			self_id = &(un64->c.un_self_id);
			parent_id = &(un64->c.un_parent);
			record_id = &(un64->c.un_record_id);

			if (!md_update_minor(setno, mddb_getsidenum
			    (setno), un64->un_key))
				goto out;
			break;
		}

		/*
		 * If this is a top level and a friendly name metadevice,
		 * update its minor in the namespace.
		 */
		if ((*parent_id == MD_NO_PARENT) &&
		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
		    (rbp->rb_revision == MDDB_REV_RB64FN))) {

			self_devt = md_makedevice(md_major, *self_id);
			if (!md_update_top_device_minor(setno,
			    mddb_getsidenum(setno), self_devt))
				goto out;
		}

		/*
		 * Update unit with the imported setno
		 */
		mddb_setrecprivate(recid, MD_PRV_GOTIT);

		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
		if (*parent_id != MD_NO_PARENT)
			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
		*record_id = MAKERECID(setno, DBID(*record_id));

		gotsomething = 1;
	}

out:
	return (gotsomething);
}

static md_named_services_t sp_named_services[] = {
	{NULL,					0}
};

md_ops_t sp_md_ops = {
	sp_open,		/* open */
	sp_close,		/* close */
	md_sp_strategy,		/* strategy */
	NULL,			/* print */
	sp_dump,		/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_sp_ioctl,		/* ioctl */
	sp_snarf,		/* snarf */
	sp_halt,		/* halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	sp_imp_set,		/* import set */
	sp_named_services
};

static void
init_init()
{
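	/*
	 * The child cache objects are sized so that the trailing cs_buf
	 * member can hold a full buf as reported by biosize(); see the
	 * matching offset arithmetic at the top of sp_done().
	 */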
	sp_parent_cache = kmem_cache_create("md_softpart_parent",
	    sizeof (md_spps_t), 0, sp_parent_constructor,
	    sp_parent_destructor, sp_run_queue, NULL, NULL, 0);
	sp_child_cache = kmem_cache_create("md_softpart_child",
	    sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0,
	    sp_child_constructor, sp_child_destructor, sp_run_queue,
	    NULL, NULL, 0);
}

static void
fini_uninit()
{
	kmem_cache_destroy(sp_parent_cache);
	kmem_cache_destroy(sp_child_cache);
	sp_parent_cache = sp_child_cache = NULL;
}

/* define the module linkage */
MD_PLUGIN_MISC_MODULE("soft partition module", init_init(), fini_uninit())
1852