md_mddb.c revision 8452:89d32dfdae6e
1226031Sstas/*
2226031Sstas * CDDL HEADER START
3226031Sstas *
4226031Sstas * The contents of this file are subject to the terms of the
5226031Sstas * Common Development and Distribution License (the "License").
6226031Sstas * You may not use this file except in compliance with the License.
7226031Sstas *
8226031Sstas * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9226031Sstas * or http://www.opensolaris.org/os/licensing.
10226031Sstas * See the License for the specific language governing permissions
11226031Sstas * and limitations under the License.
12226031Sstas *
13226031Sstas * When distributing Covered Code, include this CDDL HEADER in each
14226031Sstas * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15226031Sstas * If applicable, add the following below this CDDL HEADER, with the
16226031Sstas * fields enclosed by brackets "[]" replaced with your own identifying
17226031Sstas * information: Portions Copyright [yyyy] [name of copyright owner]
18226031Sstas *
19226031Sstas * CDDL HEADER END
20226031Sstas */
21226031Sstas
22226031Sstas/*
23226031Sstas * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24226031Sstas * Use is subject to license terms.
25226031Sstas */
26226031Sstas
27226031Sstas#include <sys/types.h>
28226031Sstas#include <sys/conf.h>
29226031Sstas#include <sys/time.h>
30226031Sstas#include <sys/uio.h>
31226031Sstas#include <sys/param.h>
32226031Sstas#include <sys/systm.h>
33226031Sstas#include <sys/systeminfo.h>
34226031Sstas#include <sys/sysmacros.h>
35226031Sstas#include <sys/buf.h>
36226031Sstas#include <sys/kmem.h>
37226031Sstas#include <sys/file.h>
38226031Sstas#include <sys/open.h>
39226031Sstas#include <sys/debug.h>
40226031Sstas#include <sys/stat.h>
41226031Sstas#include <sys/lvm/mdvar.h>
42226031Sstas#include <sys/lvm/md_crc.h>
43226031Sstas#include <sys/lvm/md_convert.h>
44226031Sstas#include <sys/types.h>
45226031Sstas#include <sys/kmem.h>
46226031Sstas#include <sys/lvm/mdmn_commd.h>
47226031Sstas#include <sys/cladm.h>
48226031Sstas
/*
 * Default mhd_mhiargs_t settings.
 * NOTE(review): the layout of mhd_mhiargs_t is declared outside this file,
 * so the meaning and units of the values below must be confirmed against
 * that declaration.
 */
mhd_mhiargs_t	defmhiargs = {
	1000,
	{ 6000, 6000, 30000 }
};
53226031Sstas
54226031Sstas#define	MDDB
55226031Sstas
56226031Sstas#include <sys/lvm/mdvar.h>
57226031Sstas#include <sys/lvm/mdmed.h>
58226031Sstas#include <sys/lvm/md_names.h>
59226031Sstas#include <sys/cred.h>
60226031Sstas#include <sys/ddi.h>
61226031Sstas#include <sys/sunddi.h>
62226031Sstas#include <sys/esunddi.h>
63226031Sstas
64226031Sstas#include <sys/sysevent/eventdefs.h>
65226031Sstas#include <sys/sysevent/svm.h>
66226031Sstas
/* Defined outside this file. */
extern char svm_bootpath[];

/*
 * File-scope tunables and their defaults.
 */
int			md_maxbootlist = MAXBOOTLIST;
static ulong_t		mddb_maxblocks = 0;	/* tune for small records */
static int		mddb_maxbufheaders = 50;
static uint_t		mddb_maxcopies = MDDB_NLB;

/*
 * If this is set, more detailed messages about DB init will be given, instead
 * of just the MDE_DB_NODB.
 */
static int		mddb_db_err_detail = 0;

/*
 * This lock is used to single-thread load/unload of all sets
 */
static kmutex_t		mddb_lock;

/*
 * You really do NOT want to change this boolean.
 * It can be VERY dangerous to do so.  Loss of
 * data may occur. USE AT YOUR OWN RISK!!!!
 */
static int		mddb_allow_half = 0;
/*
 * For mirrored root allow reboot with only half the replicas available
 * Flag inserted for Santa Fe project.
 */
int mirrored_root_flag;

/*
 * Character classification helpers.
 * ISWHITE is true for space, tab, carriage return and newline;
 * ISNUM is true for the characters '0' through '9'.
 * Both macros evaluate their argument more than once, so do not pass
 * an expression with side effects.
 */
#define	ISWHITE(c)	(((c) == ' ') || ((c) == '\t') || \
			    ((c) == '\r') || ((c) == '\n'))
#define	ISNUM(c)	(((c) >= '0') && ((c) <= '9'))

/* Address of the s_dbmx member of md_set[setno]. */
#define	SETMUTEX(setno)	(&md_set[setno].s_dbmx)

/* Objects defined elsewhere in the driver. */
extern md_krwlock_t	md_unit_array_rw;	/* md.c */
extern set_t		md_nsets;		/* md.c */
extern int		md_nmedh;		/* md.c */
extern md_set_t		md_set[];		/* md.c */
extern int		(*mdv_strategy_tstpnt)(buf_t *, int, void*);
extern dev_info_t	*md_devinfo;
extern int		md_init_debug;
extern int		md_status;
extern md_ops_t		*md_opslist;
extern md_krwlock_t	nm_lock;

/* Forward declaration; the definition is not in this part of the file. */
static int 		update_locatorblock(mddb_set_t *s, md_dev64_t dev,
				ddi_devid_t didptr, ddi_devid_t old_didptr);
116226031Sstas
/*
 * Defines for crc calculation for records
 * rec_crcgen generates a crc checksum for a record block
 * rec_crcchk checks the crc checksum for a record block
 *
 * Both are thin wrappers around rec_crcfunc(); REC_CRCGEN and REC_CRCCHK
 * are the values passed as its final (check) argument.  rec_crcgen
 * discards the return value, rec_crcchk passes it back to the caller.
 */
#define	REC_CRCGEN	0
#define	REC_CRCCHK	1
#define	rec_crcgen(s, dep, rbp) \
	(void) rec_crcfunc(s, dep, rbp, REC_CRCGEN)
#define	rec_crcchk(s, dep, rbp) \
	rec_crcfunc(s, dep, rbp, REC_CRCCHK)
128226031Sstas
129226031Sstas/*
130226031Sstas * During upgrade, SVM basically runs with the devt from the target
131226031Sstas * being upgraded.  Translations are made from the target devt to the
132226031Sstas * miniroot devt when writing data out to the disk.  This is done by
133226031Sstas * the following routines:
134226031Sstas *	wrtblklst
135226031Sstas *	writeblks
136226031Sstas *	readblklst
137226031Sstas *	readblks
138226031Sstas *	dt_read
139226031Sstas *
140226031Sstas * The following routines are used by the routines listed above and
141226031Sstas * expect a translated (aka miniroot) devt:
142226031Sstas *	getblks
143226031Sstas * 	getmasters
144226031Sstas *
145226031Sstas * Also, when calling any system routines, such as ddi_lyr_get_devid,
146226031Sstas * the translated (aka miniroot) devt must be used.
147226031Sstas *
148226031Sstas * By the same token, the major number and major name conversion operations
149226031Sstas * need to use the name_to_major file from the target system instead
150226031Sstas * of the name_to_major file on the miniroot.  So, calls to
151226031Sstas * ddi_name_to_major must be replaced with calls to md_targ_name_to_major
152226031Sstas * when running on an upgrade.  Same is true with calls to
153226031Sstas * ddi_major_to_name.
154226031Sstas */
155226031Sstas
156226031Sstas
157226031Sstas#ifndef MDDB_FAKE
158226031Sstas
159226031Sstasstatic int
160226031Sstasmddb_rwdata(
161226031Sstas	mddb_set_t	*s,	/* incore db set structure */
162226031Sstas	int		flag,	/* B_ASYNC, B_FAILFAST or 0 passed in here */
163226031Sstas	buf_t		*bp
164226031Sstas)
165226031Sstas{
166226031Sstas	int		err = 0;
167226031Sstas
168226031Sstas	bp->b_flags = (flag | B_BUSY) & (~B_ASYNC);
169226031Sstas
170226031Sstas	mutex_exit(SETMUTEX(s->s_setno));
171226031Sstas	if (mdv_strategy_tstpnt == NULL ||
172226031Sstas	    (*mdv_strategy_tstpnt)(bp, 0, NULL) == 0)
173226031Sstas		(void) bdev_strategy(bp);
174226031Sstas
175226031Sstas	if (flag & B_ASYNC) {
176226031Sstas		mutex_enter(SETMUTEX(s->s_setno));
177226031Sstas		return (0);
178226031Sstas	}
179226031Sstas
180226031Sstas	err = biowait(bp);
181226031Sstas	mutex_enter(SETMUTEX(s->s_setno));
182226031Sstas	return (err);
183226031Sstas}
184226031Sstas
185226031Sstasstatic void
186226031Sstassetidentifier(
187226031Sstas	mddb_set_t	*s,
188226031Sstas	identifier_t	*ident
189226031Sstas)
190226031Sstas{
191226031Sstas	if (s->s_setno == MD_LOCAL_SET)
192226031Sstas		(void) strcpy(&ident->serial[0], s->s_ident.serial);
193226031Sstas	else
194226031Sstas		ident->createtime = s->s_ident.createtime;
195226031Sstas}
196226031Sstas
197226031Sstasstatic int
198226031Sstascmpidentifier(
199226031Sstas	mddb_set_t	*s,
200226031Sstas	identifier_t	*ident
201226031Sstas)
202226031Sstas{
203226031Sstas	if (s->s_setno == MD_LOCAL_SET)
204226031Sstas		return (strcmp(ident->serial, s->s_ident.serial));
205226031Sstas	else
206226031Sstas		return (timercmp(&ident->createtime,
207226031Sstas		    /*CSTYLED*/
208226031Sstas		    &s->s_ident.createtime, !=));
209226031Sstas}
210226031Sstas
211226031Sstasstatic int
212226031Sstasmddb_devopen(
213226031Sstas	md_dev64_t	dev
214226031Sstas)
215226031Sstas{
216226031Sstas	dev_t		ddi_dev = md_dev64_to_dev(dev);
217226031Sstas
218226031Sstas	if (dev_lopen(&ddi_dev, FREAD|FWRITE, OTYP_LYR, kcred) == 0)
219226031Sstas		return (0);
220226031Sstas	return (1);
221226031Sstas}
222226031Sstas
223226031Sstasstatic void
224226031Sstasmddb_devclose(
225226031Sstas	md_dev64_t	dev
226226031Sstas)
227226031Sstas{
228226031Sstas	(void) dev_lclose(md_dev64_to_dev(dev), FREAD|FWRITE, OTYP_LYR, kcred);
229226031Sstas}
230226031Sstas
231226031Sstas/*
232226031Sstas * stripe_skip_ts
233226031Sstas *
234226031Sstas * Returns a list of fields to be skipped in the stripe record structure.
235226031Sstas * These fields are ms_timestamp in the component structure.
236226031Sstas * Used to skip these fields when calculating the checksum.
237226031Sstas */
238226031Sstasstatic crc_skip_t *
239226031Sstasstripe_skip_ts(void *un, uint_t revision)
240226031Sstas{
241226031Sstas	struct ms_row32_od	*small_mdr;
242226031Sstas	struct ms_row		*big_mdr;
243226031Sstas	uint_t			row, comp, ncomps, compoff;
244226031Sstas	crc_skip_t		*skip;
245226031Sstas	crc_skip_t		*skip_prev;
246226031Sstas	crc_skip_t		skip_start = {0, 0, 0};
247226031Sstas	ms_unit_t		*big_un;
248226031Sstas	ms_unit32_od_t		*small_un;
249226031Sstas	uint_t			rb_off = offsetof(mddb_rb32_t, rb_data[0]);
250226031Sstas
251226031Sstas	switch (revision) {
252226031Sstas	case MDDB_REV_RB:
253226031Sstas	case MDDB_REV_RBFN:
254226031Sstas		small_un = (ms_unit32_od_t *)un;
255226031Sstas		skip_prev = &skip_start;
256226031Sstas
257226031Sstas		if (small_un->un_nrows == 0)
258226031Sstas			return (NULL);
259226031Sstas		/*
260226031Sstas		 * walk through all rows to find the total number
261226031Sstas		 * of components
262226031Sstas		 */
263226031Sstas		small_mdr   = &small_un->un_row[0];
264226031Sstas		ncomps = 0;
265226031Sstas		for (row = 0; (row < small_un->un_nrows); row++) {
266226031Sstas			ncomps += small_mdr[row].un_ncomp;
267226031Sstas		}
268226031Sstas
269226031Sstas		/* Now walk through the components */
270226031Sstas		compoff = small_un->un_ocomp + rb_off;
271226031Sstas		for (comp = 0; (comp < ncomps); ++comp) {
272226031Sstas			uint_t	mdcp = compoff +
273226031Sstas			    (comp * sizeof (ms_comp32_od_t));
274226031Sstas			skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
275226031Sstas			    KM_SLEEP);
276226031Sstas			skip->skip_offset = mdcp +
277226031Sstas			    offsetof(ms_comp32_od_t, un_mirror.ms_timestamp);
278226031Sstas			skip->skip_size = sizeof (md_timeval32_t);
279226031Sstas			skip_prev->skip_next = skip;
280226031Sstas			skip_prev = skip;
281226031Sstas		}
282226031Sstas		break;
283226031Sstas	case MDDB_REV_RB64:
284226031Sstas	case MDDB_REV_RB64FN:
285226031Sstas		big_un = (ms_unit_t *)un;
286226031Sstas		skip_prev = &skip_start;
287226031Sstas
288226031Sstas		if (big_un->un_nrows == 0)
289226031Sstas			return (NULL);
290226031Sstas		/*
291226031Sstas		 * walk through all rows to find the total number
292226031Sstas		 * of components
293226031Sstas		 */
294226031Sstas		big_mdr   = &big_un->un_row[0];
295226031Sstas		ncomps = 0;
296226031Sstas		for (row = 0; (row < big_un->un_nrows); row++) {
297226031Sstas			ncomps += big_mdr[row].un_ncomp;
298226031Sstas		}
299226031Sstas
300226031Sstas		/* Now walk through the components */
301226031Sstas		compoff = big_un->un_ocomp + rb_off;
302226031Sstas		for (comp = 0; (comp < ncomps); ++comp) {
303226031Sstas			uint_t	mdcp = compoff +
304226031Sstas			    (comp * sizeof (ms_comp_t));
305226031Sstas			skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
306226031Sstas			    KM_SLEEP);
307226031Sstas			skip->skip_offset = mdcp +
308226031Sstas			    offsetof(ms_comp_t, un_mirror.ms_timestamp);
309226031Sstas			skip->skip_size = sizeof (md_timeval32_t);
310226031Sstas			skip_prev->skip_next = skip;
311226031Sstas			skip_prev = skip;
312226031Sstas		}
313226031Sstas		break;
314226031Sstas	}
315226031Sstas	/* Return the start of the list of fields to skip */
316226031Sstas	return (skip_start.skip_next);
317226031Sstas}
318226031Sstas
319226031Sstas/*
320226031Sstas * mirror_skip_ts
321226031Sstas *
322226031Sstas * Returns a list of fields to be skipped in the mirror record structure.
323226031Sstas * This includes un_last_read and sm_timestamp for each submirror
324226031Sstas * Used to skip these fields when calculating the checksum.
325226031Sstas */
326226031Sstasstatic crc_skip_t *
327226031Sstasmirror_skip_ts(uint_t revision)
328226031Sstas{
329226031Sstas	int		i;
330226031Sstas	crc_skip_t	*skip;
331226031Sstas	crc_skip_t	*skip_prev;
332226031Sstas	crc_skip_t	skip_start = {0, 0, 0};
333226031Sstas	uint_t		rb_off = offsetof(mddb_rb32_t, rb_data[0]);
334226031Sstas
335226031Sstas	skip_prev = &skip_start;
336226031Sstas
337226031Sstas	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
338226031Sstas	switch (revision) {
339226031Sstas	case MDDB_REV_RB:
340226031Sstas	case MDDB_REV_RBFN:
341226031Sstas		skip->skip_offset = offsetof(mm_unit32_od_t,
342226031Sstas		    un_last_read) + rb_off;
343226031Sstas		break;
344226031Sstas	case MDDB_REV_RB64:
345226031Sstas	case MDDB_REV_RB64FN:
346226031Sstas		skip->skip_offset = offsetof(mm_unit_t,
347226031Sstas		    un_last_read) + rb_off;
348226031Sstas		break;
349226031Sstas	}
350226031Sstas	skip->skip_size = sizeof (int);
351226031Sstas	skip_prev->skip_next = skip;
352226031Sstas	skip_prev = skip;
353226031Sstas
354226031Sstas	for (i = 0; i < NMIRROR; i++) {
355226031Sstas		skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
356226031Sstas		switch (revision) {
357226031Sstas		case MDDB_REV_RB:
358226031Sstas		case MDDB_REV_RBFN:
359226031Sstas			skip->skip_offset = offsetof(mm_unit32_od_t,
360226031Sstas			    un_sm[i].sm_timestamp) + rb_off;
361226031Sstas			break;
362226031Sstas		case MDDB_REV_RB64:
363226031Sstas		case MDDB_REV_RB64FN:
364226031Sstas			skip->skip_offset = offsetof(mm_unit_t,
365226031Sstas			    un_sm[i].sm_timestamp) + rb_off;
366226031Sstas			break;
367226031Sstas		}
368226031Sstas		skip->skip_size = sizeof (md_timeval32_t);
369226031Sstas		skip_prev->skip_next = skip;
370226031Sstas		skip_prev = skip;
371226031Sstas	}
372226031Sstas	/* Return the start of the list of fields to skip */
373226031Sstas	return (skip_start.skip_next);
374226031Sstas}
375226031Sstas
376226031Sstas/*
377226031Sstas * hotspare_skip_ts
378226031Sstas *
379226031Sstas * Returns a list of the timestamp fields in the hotspare record structure.
380226031Sstas * Used to skip these fields when calculating the checksum.
381226031Sstas */
382226031Sstasstatic crc_skip_t *
383226031Sstashotspare_skip_ts(uint_t revision)
384226031Sstas{
385226031Sstas	crc_skip_t	*skip;
386226031Sstas	uint_t		rb_off = offsetof(mddb_rb32_t, rb_data[0]);
387226031Sstas
388226031Sstas	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
389226031Sstas	switch (revision) {
390226031Sstas	case MDDB_REV_RB:
391226031Sstas	case MDDB_REV_RBFN:
392226031Sstas		skip->skip_offset = offsetof(hot_spare32_od_t, hs_timestamp) +
393226031Sstas		    rb_off;
394226031Sstas		break;
395226031Sstas	case MDDB_REV_RB64:
396226031Sstas	case MDDB_REV_RB64FN:
397226031Sstas		skip->skip_offset = offsetof(hot_spare_t, hs_timestamp) +
398226031Sstas		    rb_off;
399226031Sstas		break;
400226031Sstas	}
401226031Sstas	skip->skip_size = sizeof (md_timeval32_t);
402226031Sstas	return (skip);
403226031Sstas}
404226031Sstas
405226031Sstas/*
406226031Sstas * rec_crcfunc
407226031Sstas *
408226031Sstas * Calculate or check the checksum for a record
409226031Sstas * Calculate the crc if check == 0, Check the crc if check == 1
410226031Sstas *
411226031Sstas * Record block may be written by different nodes in a multi-owner diskset
412226031Sstas * (in case of master change), the function rec_crcchk excludes timestamp
413226031Sstas * fields in crc computation of record data.
414226031Sstas * Otherwise, timestamp fields will cause each node to have a different
415226031Sstas * checksum for same record block causing the exclusive-or of all record block
416226031Sstas * checksums and data block record sums to be non-zero after new master writes
417226031Sstas * at least one record block.
418226031Sstas */
419226031Sstasstatic uint_t
420226031Sstasrec_crcfunc(
421226031Sstas	mddb_set_t	*s,
422226031Sstas	mddb_de_ic_t	*dep,
423226031Sstas	mddb_rb32_t	*rbp,
424226031Sstas	int		check
425226031Sstas)
426226031Sstas{
427226031Sstas	crc_skip_t	*skip;
428226031Sstas	crc_skip_t	*skip_tail;
429226031Sstas	mddb_type_t	type = dep->de_type1;
430226031Sstas	uint_t		ret;
431226031Sstas
432226031Sstas	/*
433226031Sstas	 * Generate a list of the areas to be skipped when calculating
434226031Sstas	 * the checksum.
435226031Sstas	 * First skip rb_checksum, rb_private and rb_userdata.
436226031Sstas	 */
437226031Sstas	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
438226031Sstas	skip->skip_offset = offsetof(mddb_rb32_t, rb_checksum_fiddle);
439226031Sstas	skip->skip_size = 3 * sizeof (uint_t);
440226031Sstas	skip_tail = skip;
441226031Sstas	if (MD_MNSET_SETNO(s->s_setno)) {
442226031Sstas		/* For a MN set, skip rb_timestamp */
443226031Sstas		skip_tail = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
444226031Sstas		    KM_SLEEP);
445226031Sstas		skip_tail->skip_offset = offsetof(mddb_rb32_t, rb_timestamp);
446226031Sstas		skip_tail->skip_size = sizeof (md_timeval32_t);
447226031Sstas		skip->skip_next = skip_tail;
448226031Sstas
449226031Sstas		/* Now add a list of timestamps to be skipped */
450226031Sstas		if (type >= MDDB_FIRST_MODID) {
451226031Sstas			switch (dep->de_flags) {
452226031Sstas				case MDDB_F_STRIPE:
453226031Sstas					skip_tail->skip_next =
454226031Sstas					    stripe_skip_ts((void *)rbp->rb_data,
455226031Sstas					    rbp->rb_revision);
456226031Sstas					break;
457226031Sstas				case MDDB_F_MIRROR:
458226031Sstas					skip_tail->skip_next =
459226031Sstas					    mirror_skip_ts(rbp->rb_revision);
460226031Sstas					break;
461226031Sstas				case MDDB_F_HOTSPARE:
462226031Sstas					skip_tail->skip_next =
463226031Sstas					    hotspare_skip_ts(rbp->rb_revision);
464226031Sstas					break;
465226031Sstas				default:
466226031Sstas					break;
467226031Sstas			}
468226031Sstas		}
469226031Sstas	}
470226031Sstas
471226031Sstas	if (check) {
472226031Sstas		ret = crcchk(rbp, &rbp->rb_checksum, dep->de_recsize, skip);
473226031Sstas	} else {
474226031Sstas		crcgen(rbp, &rbp->rb_checksum, dep->de_recsize, skip);
475226031Sstas		ret = rbp->rb_checksum;
476226031Sstas	}
477226031Sstas	while (skip) {
478226031Sstas		crc_skip_t	*skip_save = skip;
479226031Sstas
480226031Sstas		skip = skip->skip_next;
481226031Sstas		kmem_free(skip_save, sizeof (crc_skip_t));
482226031Sstas	}
483226031Sstas	return (ret);
484226031Sstas}
485226031Sstas
486226031Sstasstatic mddb_bf_t *
487226031Sstasallocbuffer(
488226031Sstas	mddb_set_t	*s,
489226031Sstas	int		sleepflag
490226031Sstas)
491226031Sstas{
492226031Sstas	mddb_bf_t	*bfp;
493226031Sstas
494226031Sstas	while ((bfp = s->s_freebufhead) == NULL) {
495226031Sstas		if (sleepflag == MDDB_NOSLEEP)
496226031Sstas			return ((mddb_bf_t *)NULL);
497226031Sstas		++s->s_bufmisses;
498226031Sstas#ifdef	DEBUG
499226031Sstas		if (s->s_bufmisses == 1)
500226031Sstas			cmn_err(CE_NOTE,
501226031Sstas			    "md: mddb: set %u sleeping for buffer", s->s_setno);
502226031Sstas#endif
503226031Sstas		s->s_bufwakeup = 1;
504226031Sstas		cv_wait(&s->s_buf_cv, SETMUTEX(s->s_setno));
505226031Sstas	}
506226031Sstas	s->s_freebufhead = bfp->bf_next;
507226031Sstas	bzero((caddr_t)bfp, sizeof (*bfp));
508226031Sstas	bfp->bf_buf.b_back = bfp->bf_buf.b_forw = &bfp->bf_buf;
509226031Sstas	bfp->bf_buf.b_flags = B_BUSY;	/* initialize flags */
510226031Sstas	return (bfp);
511226031Sstas}
512226031Sstas
513226031Sstasstatic void
514226031Sstasfreebuffer(
515226031Sstas	mddb_set_t		*s,
516226031Sstas	mddb_bf_t	*bfp
517226031Sstas)
518226031Sstas{
519226031Sstas	bfp->bf_next = s->s_freebufhead;
520226031Sstas	s->s_freebufhead = bfp;
521226031Sstas	if (s->s_bufwakeup) {
522226031Sstas		cv_broadcast(&s->s_buf_cv);
523226031Sstas		s->s_bufwakeup = 0;
524226031Sstas	}
525226031Sstas}
526226031Sstas
527226031Sstas
528226031Sstasstatic void
529226031Sstasblkbusy(
530226031Sstas	mddb_set_t	*s,
531226031Sstas	mddb_block_t	blk
532226031Sstas)
533226031Sstas{
534226031Sstas	int		bit, byte;
535226031Sstas
536226031Sstas	s->s_freeblkcnt--;
537226031Sstas	byte = blk / 8;
538226031Sstas	bit = 1 << (blk & 7);
539226031Sstas	ASSERT(! (s->s_freebitmap[byte] & bit));
540226031Sstas	s->s_freebitmap[byte] |= bit;
541226031Sstas}
542226031Sstas
543226031Sstasstatic void
544226031Sstasblkfree(
545226031Sstas	mddb_set_t	*s,
546226031Sstas	mddb_block_t	blk
547226031Sstas)
548226031Sstas{
549226031Sstas	int		bit, byte;
550226031Sstas
551226031Sstas	s->s_freeblkcnt++;
552226031Sstas	byte = blk / 8;
553226031Sstas	bit = 1 << (blk & 7);
554226031Sstas	ASSERT(s->s_freebitmap[byte] & bit);
555226031Sstas	s->s_freebitmap[byte] &= ~bit;
556226031Sstas}
557226031Sstas
558226031Sstasstatic int
559226031Sstasblkcheck(
560226031Sstas	mddb_set_t	*s,
561226031Sstas	mddb_block_t	blk
562226031Sstas)
563226031Sstas{
564226031Sstas	int		bit, byte;
565226031Sstas
566226031Sstas	byte = blk / 8;
567226031Sstas	bit = 1 << (blk & 7);
568226031Sstas	return (s->s_freebitmap[byte] & bit);
569226031Sstas}
570226031Sstas
571226031Sstas/*
572226031Sstas * not fast but simple
573226031Sstas */
574226031Sstasstatic mddb_block_t
575226031Sstasgetfreeblks(
576226031Sstas	mddb_set_t	*s,
577226031Sstas	size_t		count
578226031Sstas)
579226031Sstas{
580226031Sstas	int		i;
581226031Sstas	size_t		contig;
582226031Sstas
583226031Sstas	contig = 0;
584226031Sstas	for (i = 0; i < s->s_totalblkcnt; i++) {
585226031Sstas		if (blkcheck(s, i)) {
586226031Sstas			contig = 0;
587226031Sstas		} else {
588226031Sstas			contig++;
589226031Sstas			if (contig == count) {
590226031Sstas				contig = i - count + 1;
591226031Sstas				for (i = (int)contig; i < contig + count; i++)
592226031Sstas					blkbusy(s, i);
593226031Sstas				return ((mddb_block_t)contig);
594226031Sstas			}
595226031Sstas		}
596226031Sstas	}
597226031Sstas	return (0);
598226031Sstas}
599226031Sstas
/*
 * computefreeblks
 *
 * Rebuild the free-block accounting of set s from its incore state:
 *   - work out how many blocks are usable (the smallest replica,
 *     capped at the maximum for the set type),
 *   - flag replicas that are too small to hold the highest block
 *     currently in use,
 *   - allocate the free bitmap if it does not exist yet, clear it,
 *     and mark every block that is in use as busy.
 *
 * s_freeblkcnt and s_totalblkcnt are both set to the usable block
 * count before the in-use blocks are marked; each blkbusy() call then
 * decrements s_freeblkcnt.
 */
static void
computefreeblks(
	mddb_set_t	*s
)
{
	mddb_db_t	*dbp;
	mddb_de_ic_t	*dep;
	int		i;
	int		minblks;
	int		freeblks;
	mddb_mb_ic_t	*mbip;
	mddb_lb_t	*lbp;
	mddb_block_t	maxblk;
	mddb_did_db_t	*did_dbp;
	int		nblks;

	minblks = 0;
	lbp = s->s_lbp;
	maxblk = 0;

	/*
	 * Determine the max number of blocks.
	 */
	nblks = (lbp->lb_flags & MDDB_MNSET) ? MDDB_MN_MAXBLKS : MDDB_MAXBLKS;
	/*
	 * go through and find highest logical block
	 * (directory blocks and the blocks of every directory entry)
	 */
	for (dbp = s->s_dbp; dbp != 0;	dbp = dbp->db_next) {
		if (dbp->db_blknum > maxblk)
			maxblk = dbp->db_blknum;
		for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
			for (i = 0; i < dep->de_blkcount; i++)
				if (dep->de_blks[i] > maxblk)
					maxblk = dep->de_blks[i];
	}

	/*
	 * Find the smallest replica.  Locators that are deleted or
	 * flagged MDDB_F_EMASTER are not considered.
	 */
	for (i = 0; i < lbp->lb_loccnt; i++) {
		mddb_locator_t	*lp = &lbp->lb_locators[i];

		if ((lp->l_flags & MDDB_F_DELETED) ||
		    (lp->l_flags & MDDB_F_EMASTER))
			continue;

		/* size of this replica: sum over its chain of master blocks */
		freeblks = 0;
		for (mbip = s->s_mbiarray[i]; mbip != NULL;
		    mbip = mbip->mbi_next) {
			freeblks += mbip->mbi_mddb_mb.mb_blkcnt;
		}
		/* this happens when there is no master blk */
		if (freeblks == 0)
			continue;

		/* replica cannot hold the highest block in use */
		if (freeblks <= maxblk) {
			lp->l_flags |= MDDB_F_TOOSMALL;
			lp->l_flags &= ~MDDB_F_ACTIVE;
		}

		if (freeblks < minblks || minblks == 0)
			minblks = freeblks;
	}
	/*
	 * set up reasonable freespace if no
	 * data bases exist
	 */
	if (minblks == 0)
		minblks = 100;
	if (minblks > nblks)
		minblks = nblks;
	s->s_freeblkcnt = minblks;
	s->s_totalblkcnt = minblks;
	/* one bit per block; allocated once, sized for the maximum */
	if (! s->s_freebitmapsize) {
		s->s_freebitmapsize = nblks / 8;
		s->s_freebitmap = (uchar_t *)kmem_zalloc(s->s_freebitmapsize,
		    KM_SLEEP);
	}
	bzero((caddr_t)s->s_freebitmap, s->s_freebitmapsize);

	/* locator block sectors */
	for (i = 0; i < s->s_lbp->lb_blkcnt; i++)
		blkbusy(s, i);

	/* locator name sectors */
	for (i = 0; i < s->s_lbp->lb_lnblkcnt; i++)
		blkbusy(s, (s->s_lbp->lb_lnfirstblk + i));

	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
		/* locator block device id information */
		for (i = 0; i < s->s_lbp->lb_didblkcnt; i++)
			blkbusy(s, (s->s_lbp->lb_didfirstblk + i));

		/* disk blocks containing actual device ids */
		did_dbp = s->s_did_icp->did_ic_dbp;
		while (did_dbp) {
			for (i = 0; i < did_dbp->db_blkcnt; i++) {
				blkbusy(s, did_dbp->db_firstblk + i);
			}
			did_dbp = did_dbp->db_next;
		}
	}

	/* Only use data tags if not a MN set */
	if (!(lbp->lb_flags & MDDB_MNSET)) {
		/* Found a bad tag, do NOT mark the data tag blks busy here */
		if (! (md_get_setstatus(s->s_setno) & MD_SET_BADTAG)) {
			for (i = 0; i < s->s_lbp->lb_dtblkcnt; i++)
				blkbusy(s, (s->s_lbp->lb_dtfirstblk + i));
		}
	}

	/* directory block/entry sectors */
	for (dbp = s->s_dbp; dbp != 0;	dbp = dbp->db_next) {
		blkbusy(s, dbp->db_blknum);
		for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
			for (i = 0; i < dep->de_blkcount; i++)
				blkbusy(s, dep->de_blks[i]);
	}
}
716226031Sstas
717226031Sstas/*
718226031Sstas * Add free space to the device id incore free list.
719226031Sstas * Called:
720226031Sstas *    - During startup when all devid blocks are temporarily placed on the
721226031Sstas *       free list
722226031Sstas *    - After a devid has been deleted via the metadb command.
723226031Sstas *    - When mddb_devid_free_get adds unused space from a disk block
724226031Sstas *       to free list
725226031Sstas */
726226031Sstasstatic int
727226031Sstasmddb_devid_free_add(
728226031Sstas	mddb_set_t *s,
729226031Sstas	uint_t firstblk,
730226031Sstas	uint_t offset,
731226031Sstas	uint_t length
732226031Sstas)
733226031Sstas{
734226031Sstas	mddb_did_free_t	*did_freep;
735226031Sstas
736226031Sstas	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
737226031Sstas		return (0);
738226031Sstas	}
739226031Sstas
740226031Sstas	did_freep = (mddb_did_free_t *)kmem_zalloc(sizeof (mddb_did_free_t),
741226031Sstas	    KM_SLEEP);
742226031Sstas	did_freep->free_blk = firstblk;
743226031Sstas	did_freep->free_offset = offset;
744226031Sstas	did_freep->free_length = length;
745226031Sstas	did_freep->free_next = s->s_did_icp->did_ic_freep;
746226031Sstas	s->s_did_icp->did_ic_freep = did_freep;
747226031Sstas
748226031Sstas	return (0);
749226031Sstas}
750226031Sstas
/*
 * Remove specific free space from the device id incore free list.
 * Called at startup (after all devid blocks have been placed on
 * free list) in order to remove the free space from the list that
 * contains actual devids.
 *
 * The first free list entry in block firstblk that fully contains the
 * range [offset, offset + length) is taken off the list.  Whatever
 * part of that entry lies before the range and whatever part lies
 * after it are put back as separate entries; parts of length 0 are
 * discarded.
 *
 * Returns 0 if area successfully removed.
 * Returns 1 if no matching area is found - so nothing removed.
 * Also returns 1 when the locator block is not flagged MDDB_DEVID_STYLE.
 */
static int
mddb_devid_free_delete(
	mddb_set_t *s,
	uint_t firstblk,
	uint_t offset,
	uint_t length
)
{
	int		block_found = 0;
	mddb_did_free_t	*did_freep1;		/* next free block */
	mddb_did_free_t	*did_freep2 = 0;	/* previous free block */
	mddb_did_free_t *did_freep_before;	/* area before offset, len */
	mddb_did_free_t	*did_freep_after;	/* area after offset, len */
	uint_t		old_length;

	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
		return (1);
	}

	/* find free block for this devid */
	did_freep1 = s->s_did_icp->did_ic_freep;
	while (did_freep1) {
		/*
		 * Look through free list of <block, offset, length> to
		 * find our entry in the free list.  Our entry should
		 * exist since the entire devid block was placed into
		 * this free list at startup.  This code is just removing
		 * the non-free (in-use) portions of the devid block so
		 * that the remaining linked list does indeed just
		 * contain a free list.
		 *
		 * Our entry has been found if
		 *   - the blocks match,
		 *   - the offset (starting address) in the free list is
		 *	less than or equal to the offset of our entry and
		 *   - the length+offset (ending address) in the free list is
		 *	greater than or equal to the length+offset of our
		 *	entry.
		 */
		if ((did_freep1->free_blk == firstblk) &&
		    (did_freep1->free_offset <= offset) &&
		    ((did_freep1->free_length + did_freep1->free_offset) >=
		    (length + offset))) {
			/* Have found our entry - remove from list */
			block_found = 1;
			/* the matched node is reused for the "before" area */
			did_freep_before = did_freep1;
			old_length = did_freep1->free_length;
			/* did_freep1 - pts to next free block */
			did_freep1 = did_freep1->free_next;
			if (did_freep2) {
				did_freep2->free_next = did_freep1;
			} else {
				s->s_did_icp->did_ic_freep = did_freep1;
			}

			/*
			 * did_freep_before points to area in block before
			 * offset, length.
			 */
			did_freep_before->free_length = offset -
			    did_freep_before->free_offset;
			/*
			 * did_freep_after points to area in block after
			 * offset, length.
			 */
			did_freep_after = (mddb_did_free_t *)kmem_zalloc
			    (sizeof (mddb_did_free_t), KM_SLEEP);
			did_freep_after->free_blk = did_freep_before->free_blk;
			did_freep_after->free_offset = offset + length;
			did_freep_after->free_length = old_length - length -
			    did_freep_before->free_length;
			/*
			 * Add before and after areas to free list
			 * If area before or after offset, length has length
			 * of 0, that entry is not added.
			 */
			if (did_freep_after->free_length) {
				did_freep_after->free_next = did_freep1;
				if (did_freep2) {
					did_freep2->free_next =
					    did_freep_after;
				} else {
					s->s_did_icp->did_ic_freep =
					    did_freep_after;
				}
				/*
				 * "after" is now the successor for the
				 * "before" area linked in below.
				 */
				did_freep1 = did_freep_after;
			} else {
				kmem_free(did_freep_after,
				    sizeof (mddb_did_free_t));
			}

			if (did_freep_before->free_length) {
				did_freep_before->free_next = did_freep1;
				if (did_freep2) {
					did_freep2->free_next =
					    did_freep_before;
				} else {
					s->s_did_icp->did_ic_freep =
					    did_freep_before;
				}
			} else {
				kmem_free(did_freep_before,
				    sizeof (mddb_did_free_t));
			}
			/* only the first matching entry is processed */
			break;
		} else {
			/* no match: advance, remembering the previous entry */
			did_freep2 = did_freep1;
			did_freep1 = did_freep1->free_next;
		}
	}
	if (block_found == 0) {
		return (1);
	} else {
		return (0);
	}
}
874226031Sstas
875226031Sstas/*
876226031Sstas * Find free space of devid length and remove free space from list.
877226031Sstas * Return a pointer to the previously free area.
878226031Sstas *
879226031Sstas * If there's not enough free space on the free list, get an empty
880226031Sstas * disk block, put the empty disk block on the did_ic_dbp linked list,
881226031Sstas * and add the disk block space not used for devid to the free list.
882226031Sstas *
883226031Sstas * Return pointer to address (inside disk block) of free area for devid.
884226031Sstas * Return 0 if error.
885226031Sstas */
886226031Sstasstatic caddr_t
887226031Sstasmddb_devid_free_get(
888226031Sstas	mddb_set_t *s,
889226031Sstas	uint_t len,
890226031Sstas	uint_t *blk,
891226031Sstas	uint_t *cnt,
892226031Sstas	uint_t *offset
893226031Sstas)
894226031Sstas{
895226031Sstas	mddb_did_free_t	*freep, *freep2;
896226031Sstas	mddb_did_db_t	*dbp;
897226031Sstas	uint_t		blk_cnt, blk_num;
898226031Sstas	ddi_devid_t	devid_ptr = NULL;
899226031Sstas
900226031Sstas	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
901226031Sstas		return (0);
902226031Sstas	}
903226031Sstas
904226031Sstas	freep = s->s_did_icp->did_ic_freep;
905226031Sstas	freep2 = (mddb_did_free_t *)NULL;
906226031Sstas	while (freep) {
907226031Sstas		/* found a free area - remove from free list */
908226031Sstas		if (len <= freep->free_length) {
909226031Sstas			*blk = freep->free_blk;
910226031Sstas			*offset = freep->free_offset;
911226031Sstas			/* find disk block pointer that contains free area */
912226031Sstas			dbp = s->s_did_icp->did_ic_dbp;
913226031Sstas			while (dbp) {
914226031Sstas				if (dbp->db_firstblk == *blk)
915226031Sstas					break;
916226031Sstas				else
917226031Sstas					dbp = dbp->db_next;
918226031Sstas			}
919226031Sstas			/*
920226031Sstas			 * If a disk block pointer can't be found - something
921226031Sstas			 * is wrong, so don't use this free space.
922226031Sstas			 */
923226031Sstas			if (dbp == NULL) {
924226031Sstas				freep2 = freep;
925226031Sstas				freep = freep->free_next;
926226031Sstas				continue;
927226031Sstas			}
928226031Sstas
929226031Sstas			devid_ptr = (ddi_devid_t)(dbp->db_ptr + *offset);
930226031Sstas			*cnt = dbp->db_blkcnt;
931226031Sstas
932226031Sstas			/* Update free list information */
933226031Sstas			freep->free_offset += len;
934226031Sstas			freep->free_length -= len;
935226031Sstas			if (freep->free_length == 0) {
936226031Sstas				if (freep2) {
937226031Sstas					freep2->free_next =
938226031Sstas					    freep->free_next;
939226031Sstas				} else {
940226031Sstas					s->s_did_icp->did_ic_freep =
941226031Sstas					    freep->free_next;
942226031Sstas				}
943226031Sstas				kmem_free(freep, sizeof (mddb_did_free_t));
944226031Sstas			}
945226031Sstas			break;
946226031Sstas		}
947226031Sstas		freep2 = freep;
948226031Sstas		freep = freep->free_next;
949226031Sstas	}
950226031Sstas
951226031Sstas	/* Didn't find a free spot */
952226031Sstas	if (freep == NULL) {
953226031Sstas		/* get free logical disk blk in replica */
954226031Sstas		blk_cnt = btodb(len + (MDDB_BSIZE - 1));
955226031Sstas		blk_num = getfreeblks(s, blk_cnt);
956226031Sstas		if (blk_num == 0)
957226031Sstas			return (0);
958226031Sstas
959226031Sstas		/* Add disk block to disk block linked list */
960226031Sstas		dbp = kmem_zalloc(sizeof (mddb_did_db_t), KM_SLEEP);
961226031Sstas		dbp->db_firstblk = blk_num;
962226031Sstas		dbp->db_blkcnt = blk_cnt;
963226031Sstas		dbp->db_ptr = (caddr_t)kmem_zalloc(dbtob(blk_cnt), KM_SLEEP);
964226031Sstas		dbp->db_next = s->s_did_icp->did_ic_dbp;
965226031Sstas		s->s_did_icp->did_ic_dbp = dbp;
966226031Sstas		devid_ptr = (ddi_devid_t)dbp->db_ptr;
967226031Sstas
968226031Sstas		/* Update return values */
969226031Sstas		*blk = blk_num;
970226031Sstas		*offset = 0;
971226031Sstas		*cnt = blk_cnt;
972226031Sstas
973226031Sstas		/* Add unused part of block to free list */
974226031Sstas		(void) mddb_devid_free_add(s, blk_num,
975226031Sstas		    len, (dbtob(blk_cnt) - len));
976226031Sstas	}
977226031Sstas
978226031Sstas	return ((caddr_t)devid_ptr);
979226031Sstas}
980226031Sstas
981226031Sstas/*
982226031Sstas * Add device id information for locator index to device id area in set.
983226031Sstas * Get free area to store device id from free list.   Update checksum
984226031Sstas * for mddb_did_blk.
985226031Sstas *
986226031Sstas * This routine does not write any data out to disk.
987226031Sstas * After this routine has been called, the routine, writelocall, should
988226031Sstas * be called to write both the locator block and device id area out
989226031Sstas * to disk.
990226031Sstas */
991226031Sstasstatic int
992226031Sstasmddb_devid_add(
993226031Sstas	mddb_set_t	*s,
994226031Sstas	uint_t		index,
995226031Sstas	ddi_devid_t	devid,
996226031Sstas	char		*minor_name
997226031Sstas)
998226031Sstas{
999226031Sstas	uint_t		devid_len;
1000226031Sstas	uint_t		blk, offset;
1001226031Sstas	ddi_devid_t	devid_ptr;
1002226031Sstas	mddb_did_info_t	*did_info;
1003226031Sstas	uint_t		blkcnt, i;
1004226031Sstas	mddb_did_blk_t	*did_blk;
1005226031Sstas
1006226031Sstas	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
1007226031Sstas		return (1);
1008226031Sstas	}
1009226031Sstas	if (strlen(minor_name) > (MDDB_MINOR_NAME_MAX - 1))
1010226031Sstas		return (1);
1011226031Sstas
1012226031Sstas	/* Check if device id has already been added */
1013226031Sstas	did_blk = s->s_did_icp->did_ic_blkp;
1014226031Sstas	did_info = &(did_blk->blk_info[index]);
1015226031Sstas	if (did_info->info_flags & MDDB_DID_EXISTS)
1016226031Sstas		return (0);
1017226031Sstas
1018226031Sstas	devid_len = ddi_devid_sizeof(devid);
1019226031Sstas	devid_ptr = (ddi_devid_t)mddb_devid_free_get(s,
1020226031Sstas	    devid_len, &blk, &blkcnt, &offset);
1021226031Sstas
1022226031Sstas	if (devid_ptr == NULL) {
1023226031Sstas		return (1);
1024226031Sstas	}
1025226031Sstas
1026226031Sstas	/* Copy devid into devid free area */
1027226031Sstas	for (i = 0; i < devid_len; i++)
1028226031Sstas		((char *)devid_ptr)[i] = ((char *)devid)[i];
1029226031Sstas
1030226031Sstas	/* Update mddb_did_info area for new device id */
1031226031Sstas	did_info->info_flags = MDDB_DID_EXISTS | MDDB_DID_VALID;
1032226031Sstas
1033226031Sstas	/*
1034226031Sstas	 * Only set UPDATED flag for non-replicated import cases.
1035226031Sstas	 * This allows the side locator driver name index to get
1036226031Sstas	 * updated in load_old_replicas.
1037226031Sstas	 */
1038226031Sstas	if (!(md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT))
1039226031Sstas		did_info->info_flags |= MDDB_DID_UPDATED;
1040226031Sstas
1041226031Sstas	did_info->info_firstblk = blk;
1042226031Sstas	did_info->info_blkcnt = blkcnt;
1043226031Sstas	did_info->info_offset = offset;
1044226031Sstas	did_info->info_length = devid_len;
1045226031Sstas	(void) strcpy(did_info->info_minor_name, minor_name);
1046226031Sstas	crcgen(devid_ptr, &did_info->info_checksum, devid_len, NULL);
1047226031Sstas
1048226031Sstas	/* Add device id pointer to did_ic_devid array */
1049226031Sstas	s->s_did_icp->did_ic_devid[index] = devid_ptr;
1050226031Sstas
1051226031Sstas	return (0);
1052226031Sstas}
1053226031Sstas
1054226031Sstas
1055226031Sstas/*
1056226031Sstas * Delete device id information for locator index from device id area in set.
1057226031Sstas * Add device id space to free area.
1058226031Sstas *
1059226031Sstas * This routine does not write any data out to disk.
1060226031Sstas * After this routine has been called, the routine, writelocall, should
1061226031Sstas * be called to write both the locator block and device id area out
1062226031Sstas * to disk.
1063226031Sstas */
1064226031Sstasstatic int
1065226031Sstasmddb_devid_delete(mddb_set_t *s, uint_t index)
1066226031Sstas{
1067226031Sstas	mddb_did_info_t	*did_info;
1068226031Sstas	mddb_did_blk_t	*did_blk;
1069226031Sstas
1070226031Sstas	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
1071226031Sstas		return (1);
1072226031Sstas	}
1073226031Sstas
1074226031Sstas	/* Get device id information from mddb_did_blk */
1075226031Sstas	did_blk = s->s_did_icp->did_ic_blkp;
1076226031Sstas	did_info = &(did_blk->blk_info[index]);
1077226031Sstas
1078226031Sstas	/*
1079226031Sstas	 * Ensure that the underlying device supports device ids
1080226031Sstas	 * before arbitrarily removing them.
1081226031Sstas	 */
1082226031Sstas	if (!(did_info->info_flags & MDDB_DID_EXISTS)) {
1083226031Sstas		return (1);
1084226031Sstas	}
1085226031Sstas
1086226031Sstas	/* Remove device id information from mddb_did_blk */
1087226031Sstas	did_info->info_flags = 0;
1088226031Sstas
1089226031Sstas	/* Remove device id from incore area */
1090226031Sstas	s->s_did_icp->did_ic_devid[index] = (ddi_devid_t)NULL;
1091226031Sstas
1092226031Sstas	/* Add new free space in disk block to free list */
1093226031Sstas	(void) mddb_devid_free_add(s, did_info->info_firstblk,
1094226031Sstas	    did_info->info_offset, did_info->info_length);
1095226031Sstas
1096226031Sstas	return (0);
1097226031Sstas}
1098226031Sstas
1099226031Sstas/*
1100226031Sstas * Check if there is a device id for a locator index.
1101226031Sstas *
1102226031Sstas * Caller of this routine should not free devid or minor_name since
1103226031Sstas * these will point to internal data structures that should not
1104226031Sstas * be freed.
1105226031Sstas */
1106226031Sstasstatic int
1107226031Sstasmddb_devid_get(
1108226031Sstas	mddb_set_t *s,
1109226031Sstas	uint_t index,
1110226031Sstas	ddi_devid_t *devid,
1111226031Sstas	char **minor_name
1112226031Sstas)
1113226031Sstas{
1114226031Sstas	mddb_did_info_t	*did_info;
1115226031Sstas
1116226031Sstas	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
1117226031Sstas		return (0);
1118226031Sstas	}
1119226031Sstas	did_info = &(s->s_did_icp->did_ic_blkp->blk_info[index]);
1120226031Sstas
1121226031Sstas	if (did_info->info_flags & MDDB_DID_EXISTS) {
1122226031Sstas		*devid = s->s_did_icp->did_ic_devid[index];
1123226031Sstas		*minor_name =
1124226031Sstas		    s->s_did_icp->did_ic_blkp->blk_info[index].info_minor_name;
1125226031Sstas		return (1);
1126226031Sstas	} else
1127226031Sstas		return (0);
1128226031Sstas
1129226031Sstas
1130226031Sstas}
1131226031Sstas
1132226031Sstas/*
1133226031Sstas * Check if device id is valid on current system.
1134226031Sstas * Needs devid, previously known dev_t and current minor_name.
1135226031Sstas *
1136226031Sstas * Success:
1137226031Sstas * 	Returns 0 if valid device id is found and updates
1138226031Sstas * 	dev_t if the dev_t associated with the device id is
1139226031Sstas *	different than dev_t.
1140226031Sstas * Failure:
1141226031Sstas * 	Returns 1 if device id not valid on current system.
1142226031Sstas */
1143226031Sstasstatic int
1144226031Sstasmddb_devid_validate(ddi_devid_t devid, md_dev64_t *dev, char *minor_name)
1145226031Sstas{
1146226031Sstas	int		retndevs;
1147226031Sstas	dev_t		*ddi_devs;
1148226031Sstas	int		devid_flag = 0;
1149226031Sstas	int 		cnt;
1150226031Sstas
1151226031Sstas	if (dev == 0)
1152226031Sstas		return (1);
1153226031Sstas	/*
1154226031Sstas	 * See if devid is valid in the current system.
1155226031Sstas	 * If so, set dev to match the devid.
1156226031Sstas	 */
1157226031Sstas	if (ddi_lyr_devid_to_devlist(devid, minor_name,
1158226031Sstas	    &retndevs, &ddi_devs) == DDI_SUCCESS) {
1159226031Sstas		if (retndevs > 0) {
1160226031Sstas			/* devid is valid to use */
1161226031Sstas			devid_flag = 1;
1162226031Sstas			/* does dev_t in list match dev */
1163226031Sstas			cnt = 0;
1164226031Sstas			while (cnt < retndevs) {
1165226031Sstas				if (*dev == md_expldev(ddi_devs[cnt]))
1166226031Sstas					break;
1167226031Sstas				cnt++;
1168226031Sstas			}
1169226031Sstas			/*
1170226031Sstas			 * If a different dev_t, then setup
1171226031Sstas			 * new dev and new major name
1172226031Sstas			 */
1173226031Sstas			if (cnt == retndevs) {
1174226031Sstas				*dev = md_expldev(ddi_devs[0]);
1175226031Sstas			}
1176226031Sstas			ddi_lyr_free_devlist(ddi_devs, retndevs);
1177226031Sstas		}
1178226031Sstas	}
1179226031Sstas	if (devid_flag)
1180226031Sstas		return (0);
1181226031Sstas	else
1182226031Sstas		return (1);
1183226031Sstas}
1184226031Sstas
1185226031Sstas
1186226031Sstas/*
1187226031Sstas * Free the devid incore data areas
1188226031Sstas */
1189226031Sstasstatic void
1190226031Sstasmddb_devid_icp_free(mddb_did_ic_t **did_icp, mddb_lb_t *lbp)
1191226031Sstas{
1192226031Sstas	mddb_did_free_t	*did_freep1, *did_freep2;
1193226031Sstas	mddb_did_db_t	*did_dbp1, *did_dbp2;
1194226031Sstas	mddb_did_ic_t	*icp = *did_icp;
1195226031Sstas
1196226031Sstas	if (icp) {
1197226031Sstas		if (icp->did_ic_blkp) {
1198226031Sstas			kmem_free((caddr_t)icp->did_ic_blkp,
1199226031Sstas			    dbtob(lbp->lb_didblkcnt));
1200226031Sstas			icp->did_ic_blkp = (mddb_did_blk_t *)NULL;
1201226031Sstas		}
1202226031Sstas
1203226031Sstas		if (icp->did_ic_dbp) {
1204226031Sstas			did_dbp1 = icp->did_ic_dbp;
1205226031Sstas			while (did_dbp1) {
1206226031Sstas				did_dbp2 = did_dbp1->db_next;
1207226031Sstas				kmem_free((caddr_t)did_dbp1->db_ptr,
1208226031Sstas				    dbtob(did_dbp1->db_blkcnt));
1209226031Sstas				kmem_free((caddr_t)did_dbp1,
1210226031Sstas				    sizeof (mddb_did_db_t));
1211226031Sstas				did_dbp1 = did_dbp2;
1212226031Sstas			}
1213226031Sstas		}
1214226031Sstas
1215226031Sstas		if (icp->did_ic_freep) {
1216226031Sstas			did_freep1 = icp->did_ic_freep;
1217226031Sstas			while (did_freep1) {
1218226031Sstas				did_freep2 = did_freep1->free_next;
1219226031Sstas				kmem_free((caddr_t)did_freep1,
1220226031Sstas				    sizeof (mddb_did_free_t));
1221226031Sstas				did_freep1 = did_freep2;
1222226031Sstas			}
1223226031Sstas		}
1224226031Sstas
1225226031Sstas		kmem_free((caddr_t)icp, sizeof (mddb_did_ic_t));
1226226031Sstas		*did_icp = (mddb_did_ic_t *)NULL;
1227226031Sstas	}
1228226031Sstas
1229226031Sstas}
1230226031Sstas
1231226031Sstasstatic daddr_t
1232226031Sstasgetphysblk(
1233226031Sstas	mddb_block_t		blk,
1234226031Sstas	mddb_mb_ic_t		*mbip
1235226031Sstas)
1236226031Sstas{
1237226031Sstas	mddb_mb_t	*mbp = &(mbip->mbi_mddb_mb);
1238226031Sstas
1239226031Sstas	while (blk >= mbp->mb_blkcnt) {
1240226031Sstas		if (! mbip->mbi_next)
1241226031Sstas			return ((daddr_t)-1);	/* no such block */
1242226031Sstas		blk -= mbp->mb_blkcnt;
1243226031Sstas		mbip = mbip->mbi_next;
1244226031Sstas		mbp = &(mbip->mbi_mddb_mb);
1245226031Sstas	}
1246226031Sstas
1247226031Sstas	if (blk >= mbp->mb_blkmap.m_consecutive)
1248226031Sstas		return ((daddr_t)-1);	/* no such block */
1249226031Sstas
1250226031Sstas	return ((daddr_t)(mbp->mb_blkmap.m_firstblk + blk));
1251226031Sstas}
1252226031Sstas
1253226031Sstas/*
1254226031Sstas * when a buf header is passed in the new buffer must be
1255226031Sstas * put on the front of the chain. writerec counts on it
1256226031Sstas */
1257226031Sstasstatic int
1258226031Sstasputblks(
1259226031Sstas	mddb_set_t	*s,		/* incore db set structure */
1260226031Sstas	caddr_t		buffer,		/* adr of buffer to be written */
1261226031Sstas	daddr_t		blk,		/* block number for first block */
1262226031Sstas	int		cnt,		/* number of blocks to be written */
1263226031Sstas	md_dev64_t	device,		/* device to be written to */
1264226031Sstas	mddb_bf_t	**bufhead	/* if non-zero then ASYNC I/O */
1265226031Sstas					/*    and put buf address here */
1266226031Sstas)
1267226031Sstas{
1268226031Sstas	buf_t		*bp;
1269226031Sstas	mddb_bf_t	*bfp;
1270226031Sstas	int		err = 0;
1271226031Sstas
1272226031Sstas	bfp = allocbuffer(s, MDDB_SLEEPOK);
1273226031Sstas	bp = &bfp->bf_buf;
1274226031Sstas	bp->b_bcount = MDDB_BSIZE * cnt;
1275226031Sstas	bp->b_un.b_addr = buffer;
1276226031Sstas	bp->b_blkno = blk;
1277226031Sstas	bp->b_edev = md_dev64_to_dev(device);
1278226031Sstas	/*
1279226031Sstas	 * if a header for a buf chain is passed in this is async io.
1280226031Sstas	 * currently only done for optimize  records
1281226031Sstas	 */
1282226031Sstas	if (bufhead) {
1283226031Sstas		bfp->bf_next = *bufhead;
1284226031Sstas		*bufhead = bfp;
1285226031Sstas		(void) mddb_rwdata(s, B_WRITE|B_ASYNC, bp);
1286226031Sstas		return (0);
1287226031Sstas	}
1288226031Sstas	err = mddb_rwdata(s, B_WRITE, bp);
1289226031Sstas	freebuffer(s, bfp);
1290226031Sstas	if (err) {
1291226031Sstas		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
1292226031Sstas		    s->s_setno, device);
1293226031Sstas		return (MDDB_F_EWRITE);
1294226031Sstas	}
1295226031Sstas	return (0);
1296226031Sstas}
1297226031Sstas
1298226031Sstas/*
1299226031Sstas * wrtblklst - takes an array of logical block numbers
1300226031Sstas *		and writes the buffer to those blocks (scatter).
1301226031Sstas * If called during upgrade, this routine expects a
1302226031Sstas * non-translated (aka target) dev.
1303226031Sstas */
1304226031Sstasstatic int
1305226031Sstaswrtblklst(
1306226031Sstas	mddb_set_t	*s,		/* incore set structure */
1307226031Sstas	caddr_t		buffer,		/* buffer to be written (record blk) */
1308226031Sstas	mddb_block_t	blka[],		/* list of logical blks for record */
1309226031Sstas	daddr_t		cnt,		/* number of logical blks */
1310226031Sstas	const int	li,		/* locator index */
1311226031Sstas	mddb_bf_t	**bufhead,	/* if non-zero then ASYNC I/O */
1312226031Sstas					/*    and put buf address here */
1313226031Sstas	int		master_only	/* allow only master node to write */
1314226031Sstas)
1315226031Sstas{
1316226031Sstas	daddr_t		blk;
1317226031Sstas	daddr_t		blk1;
1318226031Sstas	int		err = 0;
1319226031Sstas	int		cons;
1320226031Sstas	mddb_lb_t	*lbp = s->s_lbp;
1321226031Sstas	mddb_locator_t	*lp = &lbp->lb_locators[li];
1322226031Sstas	md_dev64_t	dev;
1323226031Sstas	mddb_mb_ic_t	*mbip = s->s_mbiarray[li];
1324226031Sstas
1325226031Sstas	/*
1326226031Sstas	 * If a MN diskset and only the master can write,
1327226031Sstas	 * then a non-master node will just return success.
1328226031Sstas	 */
1329226031Sstas	if (lbp->lb_flags & MDDB_MNSET) {
1330226031Sstas		if (master_only == MDDB_WR_ONLY_MASTER) {
1331226031Sstas			/* return successfully if we aren't the master */
1332226031Sstas			if (!(md_set[s->s_setno].s_am_i_master)) {
1333226031Sstas				return (0);
1334226031Sstas			}
1335226031Sstas		}
1336226031Sstas		if (mbip == NULL)
1337226031Sstas			return (MDDB_F_EWRITE);
1338226031Sstas	}
1339226031Sstas
1340226031Sstas	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
1341226031Sstas	if (dev == NODEV64) {
1342226031Sstas		return (1);
1343226031Sstas	}
1344226031Sstas
1345226031Sstas	blk = getphysblk(blka[0], mbip);
1346226031Sstas	ASSERT(blk >= 0);
1347226031Sstas
1348226031Sstas	cons = 1;
1349226031Sstas	while (cnt) {
1350226031Sstas		if (cons != cnt) {
1351226031Sstas			blk1 = getphysblk(blka[cons], mbip);
1352226031Sstas			ASSERT(blk1 >= 0);
1353226031Sstas			if ((blk + cons) == blk1) {
1354226031Sstas				cons++;
1355226031Sstas				continue;
1356226031Sstas			}
1357226031Sstas		}
1358226031Sstas		if (err = putblks(s, buffer, blk, cons, dev, bufhead)) {
1359226031Sstas			/*
1360226031Sstas			 * If an MN diskset and any_node_can_write
1361226031Sstas			 * then this request is coming from writeoptrecord
1362226031Sstas			 * and l_flags field should not be updated.
1363226031Sstas			 * l_flags will be updated as a result of sending
1364226031Sstas			 * a class1 message to the master.  Setting l_flags
1365226031Sstas			 * here will cause slave to be out of sync with
1366226031Sstas			 * master.
1367226031Sstas			 *
1368226031Sstas			 * Otherwise, set the error in l_flags
1369226031Sstas			 * (this occurs if this is not a MN diskset or
1370226031Sstas			 * only_master_can_write is set).
1371226031Sstas			 */
1372226031Sstas			if ((!(lbp->lb_flags & MDDB_MNSET)) ||
1373226031Sstas			    (master_only == MDDB_WR_ONLY_MASTER)) {
1374226031Sstas				lp->l_flags |= MDDB_F_EWRITE;
1375226031Sstas			}
1376226031Sstas			return (err);
1377226031Sstas		}
1378226031Sstas		if (bufhead)
1379226031Sstas			(*bufhead)->bf_locator = lp;
1380226031Sstas
1381226031Sstas		buffer += MDDB_BSIZE * cons;
1382226031Sstas		cnt -= cons;
1383226031Sstas		blka += cons;
1384226031Sstas		if (cnt) {
1385226031Sstas			blk = getphysblk(blka[0], mbip);
1386226031Sstas			ASSERT(blk >= 0);
1387226031Sstas		}
1388226031Sstas		cons = 1;
1389226031Sstas	}
1390226031Sstas
1391226031Sstas	return (0);
1392226031Sstas}
1393226031Sstas
1394226031Sstas/*
1395226031Sstas * writeblks - takes a logical block number/block count pair
1396226031Sstas * 		and writes the buffer to those contiguous logical blocks.
1397226031Sstas * If called during upgrade, this routine expects a non-translated
1398226031Sstas * (aka target) dev.
1399226031Sstas */
1400226031Sstasstatic int
1401226031Sstaswriteblks(
1402226031Sstas	mddb_set_t	*s,		/* incore set structure */
1403226031Sstas	caddr_t		buffer,		/* buffer to be written */
1404226031Sstas	mddb_block_t	blk,		/* starting logical block number */
1405226031Sstas	int		cnt,		/* number of log blocks to be written */
1406226031Sstas	const int	li,		/* locator index */
1407226031Sstas	int		master_only	/* allow only master node to write */
1408226031Sstas)
1409226031Sstas{
1410226031Sstas	daddr_t		physblk;
1411226031Sstas	int		err = 0;
1412226031Sstas	int		i;
1413226031Sstas	mddb_lb_t	*lbp = s->s_lbp;
1414226031Sstas	mddb_locator_t	*lp = &lbp->lb_locators[li];
1415226031Sstas	md_dev64_t	dev;
1416226031Sstas	mddb_block_t	*blkarray;
1417226031Sstas	int		size;
1418226031Sstas	int		ret;
1419226031Sstas
1420226031Sstas	/*
1421226031Sstas	 * If a MN diskset and only the master can write,
1422226031Sstas	 * then a non-master node will just return success.
1423226031Sstas	 */
1424226031Sstas	if ((lbp->lb_flags & MDDB_MNSET) &&
1425226031Sstas	    (master_only == MDDB_WR_ONLY_MASTER)) {
1426226031Sstas		/* return successfully if we aren't the master */
1427226031Sstas		if (!(md_set[s->s_setno].s_am_i_master)) {
1428226031Sstas			return (0);
1429226031Sstas		}
1430226031Sstas	}
1431226031Sstas
1432226031Sstas	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
1433226031Sstas	if (dev == NODEV64) {
1434226031Sstas		return (1);
1435226031Sstas	}
1436226031Sstas
1437226031Sstas	if (cnt > 1) {
1438226031Sstas		size = sizeof (mddb_block_t) * cnt;
1439226031Sstas		blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
1440226031Sstas		for (i = 0; i < cnt; i++)
1441226031Sstas			blkarray[i] = blk + i;
1442226031Sstas		ret = wrtblklst(s, buffer, blkarray, cnt,
1443226031Sstas		    li, 0, MDDB_WR_ONLY_MASTER);
1444226031Sstas		kmem_free(blkarray, size);
1445226031Sstas		return (ret);
1446226031Sstas	}
1447226031Sstas	physblk = getphysblk(blk, s->s_mbiarray[li]);
1448226031Sstas	ASSERT(physblk > 0);
1449226031Sstas	if (err = putblks(s, buffer, physblk, 1, dev, (mddb_bf_t **)0)) {
1450226031Sstas		lp->l_flags |= MDDB_F_EWRITE;
1451226031Sstas		return (err);
1452226031Sstas	}
1453226031Sstas	return (0);
1454226031Sstas}
1455226031Sstas
1456226031Sstas/*
1457226031Sstas * writeall - will write the buffer to all ACTIVE/NON-ERRORED replicas.
1458226031Sstas */
1459226031Sstasstatic int
1460226031Sstaswriteall(
1461226031Sstas	mddb_set_t	*s,		/* incore set structure */
1462226031Sstas	caddr_t		buffer,		/* buffer to be written */
1463226031Sstas	mddb_block_t	block,		/* starting logical block number */
1464226031Sstas	int		cnt,		/* number of log blocks to be written */
1465226031Sstas	int		master_only	/* allow only master node to write */
1466226031Sstas)
1467226031Sstas{
1468226031Sstas	int		li;
1469226031Sstas	int		err = 0;
1470226031Sstas	mddb_lb_t	*lbp = s->s_lbp;
1471226031Sstas
1472226031Sstas	for (li = 0; li < lbp->lb_loccnt; li++) {
1473226031Sstas		mddb_locator_t	*lp = &lbp->lb_locators[li];
1474226031Sstas
1475226031Sstas		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
1476226031Sstas		    (lp->l_flags & MDDB_F_EWRITE))
1477226031Sstas			continue;
1478226031Sstas
1479226031Sstas		err |= writeblks(s, buffer, block, cnt, li, master_only);
1480226031Sstas	}
1481226031Sstas
1482226031Sstas	return (err);
1483226031Sstas}
1484226031Sstas
1485226031Sstas/*
1486226031Sstas * writelocall - write the locator block and device id information (if
1487226031Sstas * replica is in device id format) to all ACTIVE/NON-ERRORER replicas.
1488226031Sstas *
1489226031Sstas * Increments the locator block's commitcnt.  Updates the device id area's
1490226031Sstas * commitcnt if the replica is in device id format.  Regenerates the
1491226031Sstas * checksums after updating the commitcnt(s).
1492226031Sstas */
1493226031Sstasstatic int
1494226031Sstaswritelocall(
1495226031Sstas	mddb_set_t	*s	/* incore set structure */
1496226031Sstas)
1497226031Sstas{
1498226031Sstas	int		li;
1499226031Sstas	int		err = 0;
1500226031Sstas	mddb_lb_t	*lbp = s->s_lbp;
1501226031Sstas	mddb_did_blk_t	*did_blk;
1502226031Sstas	mddb_did_db_t	*did_dbp;
1503226031Sstas
1504226031Sstas	s->s_lbp->lb_commitcnt++;
1505226031Sstas	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
1506226031Sstas		did_blk = s->s_did_icp->did_ic_blkp;
1507226031Sstas		did_blk->blk_commitcnt = s->s_lbp->lb_commitcnt;
1508226031Sstas		crcgen(did_blk, &did_blk->blk_checksum,
1509226031Sstas		    dbtob(lbp->lb_didblkcnt), NULL);
1510226031Sstas	}
1511226031Sstas	crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
1512226031Sstas
1513226031Sstas	for (li = 0; li < lbp->lb_loccnt; li++) {
1514226031Sstas		mddb_locator_t	*lp = &lbp->lb_locators[li];
1515226031Sstas
1516226031Sstas		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
1517226031Sstas		    (lp->l_flags & MDDB_F_EWRITE))
1518226031Sstas			continue;
1519226031Sstas
1520226031Sstas		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
1521226031Sstas			/* write out blocks containing actual device ids */
1522226031Sstas			did_dbp = s->s_did_icp->did_ic_dbp;
1523226031Sstas			while (did_dbp) {
1524226031Sstas				err |= writeblks(s, (caddr_t)did_dbp->db_ptr,
1525226031Sstas				    did_dbp->db_firstblk,
1526226031Sstas				    did_dbp->db_blkcnt, li,
1527226031Sstas				    MDDB_WR_ONLY_MASTER);
1528226031Sstas				did_dbp = did_dbp->db_next;
1529226031Sstas			}
1530226031Sstas
1531226031Sstas			/* write out device id area block */
1532226031Sstas			err |= writeblks(s, (caddr_t)did_blk,
1533226031Sstas			    lbp->lb_didfirstblk, lbp->lb_didblkcnt, li,
1534226031Sstas			    MDDB_WR_ONLY_MASTER);
1535226031Sstas		}
1536226031Sstas		/* write out locator block */
1537226031Sstas		err |= writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
1538226031Sstas		    MDDB_WR_ONLY_MASTER);
1539226031Sstas	}
1540226031Sstas
1541226031Sstas	/*
1542226031Sstas	 * If a MN diskset and this is the master, set the PARSE_LOCBLK flag
1543226031Sstas	 * in the mddb_set structure to show that the locator block has
1544226031Sstas	 * been changed.
1545226031Sstas	 */
1546226031Sstas
1547226031Sstas	if ((lbp->lb_flags & MDDB_MNSET) &&
1548226031Sstas	    (md_set[s->s_setno].s_am_i_master)) {
1549226031Sstas		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
1550226031Sstas	}
1551226031Sstas	return (err);
1552226031Sstas}
1553226031Sstas
1554226031Sstas/*
1555226031Sstas * If called during upgrade, this routine expects a translated
1556226031Sstas * (aka miniroot) dev.
1557226031Sstas */
1558226031Sstasstatic int
1559226031Sstasgetblks(
1560226031Sstas	mddb_set_t	*s,	/* incore db set structure */
1561226031Sstas	caddr_t		buffer,	/* buffer to read data into */
1562226031Sstas	md_dev64_t	device,	/* device to read from */
1563226031Sstas	daddr_t		blk,	/* physical block number to read */
1564226031Sstas	int		cnt,	/* number of blocks to read */
1565226031Sstas	int		flag	/* flags for I/O */
1566226031Sstas)
1567226031Sstas{
1568226031Sstas	buf_t		*bp;
1569226031Sstas	mddb_bf_t	*bfp;
1570226031Sstas	int		err = 0;
1571226031Sstas
1572226031Sstas	bfp = allocbuffer(s, MDDB_SLEEPOK);	/* this will never sleep */
1573226031Sstas	bp = &bfp->bf_buf;
1574226031Sstas	bp->b_bcount = MDDB_BSIZE * cnt;
1575226031Sstas	bp->b_un.b_addr = buffer;
1576226031Sstas	bp->b_blkno = blk;
1577226031Sstas	bp->b_edev = md_dev64_to_dev(device);
1578226031Sstas	err = mddb_rwdata(s, (B_READ | flag), bp);
1579226031Sstas	freebuffer(s, bfp);
1580226031Sstas	if (err) {
1581226031Sstas		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
1582226031Sstas		    s->s_setno, device);
1583226031Sstas		return (MDDB_F_EREAD);
1584226031Sstas	}
1585226031Sstas	return (0);
1586226031Sstas}
1587226031Sstas
1588226031Sstas/*
1589226031Sstas * readblklst - takes an array of logical block numbers
1590226031Sstas * 		and reads those blocks (gather) into the buffer.
1591226031Sstas * If called during upgrade, this routine expects a non-translated
1592226031Sstas * (aka target) dev.
1593226031Sstas */
1594226031Sstasstatic int
1595226031Sstasreadblklst(
1596226031Sstas	mddb_set_t	*s,	/* incore set structure */
1597226031Sstas	caddr_t		buffer,	/* buffer to be read (record block) */
1598226031Sstas	mddb_block_t	blka[],	/* list of logical blocks to be read */
1599226031Sstas	daddr_t		cnt,	/* number of logical blocks */
1600226031Sstas	int		li,	/* locator index */
1601226031Sstas	int		flag	/* flags for I/O */
1602226031Sstas)
1603226031Sstas{
1604226031Sstas	daddr_t		blk;
1605226031Sstas	daddr_t		blk1;
1606226031Sstas	int		err = 0;
1607226031Sstas	int		cons;
1608226031Sstas	md_dev64_t	dev;
1609226031Sstas	mddb_mb_ic_t	*mbip;
1610226031Sstas
1611226031Sstas	mbip = s->s_mbiarray[li];
1612226031Sstas	dev = md_expldev(s->s_lbp->lb_locators[li].l_dev);
1613226031Sstas	dev = md_xlate_targ_2_mini(dev);
1614226031Sstas	if (dev == NODEV64) {
1615226031Sstas		return (1);
1616226031Sstas	}
1617226031Sstas
1618226031Sstas	blk = getphysblk(blka[0], mbip);
1619226031Sstas	ASSERT(blk >= 0);
1620226031Sstas
1621226031Sstas	cons = 1;
1622226031Sstas	while (cnt) {
1623226031Sstas		if (cons != cnt) {
1624226031Sstas			blk1 = getphysblk(blka[cons], mbip);
1625226031Sstas			ASSERT(blk1 >= 0);
1626226031Sstas			if ((blk + cons) == blk1) {
1627226031Sstas				cons++;
1628226031Sstas				continue;
1629226031Sstas			}
1630226031Sstas		}
1631226031Sstas		if (err = getblks(s, buffer, dev, blk, cons, flag))
1632226031Sstas			return (err);
1633226031Sstas		buffer += MDDB_BSIZE * cons;
1634226031Sstas		cnt -= cons;
1635226031Sstas		blka += cons;
1636226031Sstas		if (cnt) {
1637226031Sstas			blk = getphysblk(blka[0], mbip);
1638226031Sstas			ASSERT(blk >= 0);
1639226031Sstas		}
1640226031Sstas		cons = 1;
1641226031Sstas	}
1642226031Sstas	return (0);
1643226031Sstas}
1644226031Sstas
1645226031Sstas/*
1646226031Sstas * readblks - takes a logical block number/block count pair
1647226031Sstas * 		and reads those contiguous logical blocks into the buffer.
1648226031Sstas * If called during upgrade, this routine expects a non-translated
1649226031Sstas * (aka target) dev.
1650226031Sstas */
1651226031Sstasstatic int
1652226031Sstasreadblks(
1653226031Sstas	mddb_set_t	*s,	/* incore set structure */
1654226031Sstas	caddr_t		buffer,	/* buffer to be read into */
1655226031Sstas	mddb_block_t	blk,	/* logical block number to be read */
1656226031Sstas	int		cnt,	/* number of logical blocks to be read */
1657226031Sstas	int		li	/* locator index */
1658226031Sstas)
1659226031Sstas{
1660226031Sstas	daddr_t		physblk;
1661226031Sstas	md_dev64_t	device;
1662226031Sstas	int		i;
1663226031Sstas	mddb_block_t	*blkarray;
1664226031Sstas	int		size;
1665226031Sstas	int		ret;
1666226031Sstas
1667226031Sstas	if (cnt > 1) {
1668226031Sstas		size = sizeof (mddb_block_t) * cnt;
1669226031Sstas		blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
1670226031Sstas		for (i = 0; i < cnt; i++)
1671226031Sstas			blkarray[i] = blk + i;
1672226031Sstas		ret = readblklst(s, buffer, blkarray, cnt, li, 0);
1673226031Sstas		kmem_free(blkarray, size);
1674226031Sstas		return (ret);
1675226031Sstas	}
1676226031Sstas	physblk = getphysblk(blk, s->s_mbiarray[li]);
1677226031Sstas	ASSERT(physblk > 0);
1678226031Sstas	device = md_expldev(s->s_lbp->lb_locators[li].l_dev);
1679226031Sstas	device = md_xlate_targ_2_mini(device);
1680226031Sstas	if (device == NODEV64) {
1681226031Sstas		return (1);
1682226031Sstas	}
1683226031Sstas	return (getblks(s, buffer, device, physblk, 1, 0));
1684226031Sstas}
1685226031Sstas
1686226031Sstasstatic void
1687226031Sstassingle_thread_start(
1688226031Sstas	mddb_set_t	*s
1689226031Sstas)
1690226031Sstas{
1691226031Sstas	while (s->s_singlelockgotten) {
1692226031Sstas		s->s_singlelockwanted++;
1693226031Sstas		cv_wait(&s->s_single_thread_cv, SETMUTEX(s->s_setno));
1694226031Sstas	}
1695226031Sstas	s->s_singlelockgotten++;
1696226031Sstas}
1697226031Sstas
1698226031Sstasstatic void
1699226031Sstassingle_thread_end(
1700226031Sstas	mddb_set_t	*s
1701226031Sstas)
1702226031Sstas{
1703226031Sstas	ASSERT(s->s_singlelockgotten);
1704226031Sstas	s->s_singlelockgotten = 0;
1705226031Sstas	if (s->s_singlelockwanted) {
1706226031Sstas		s->s_singlelockwanted = 0;
1707226031Sstas		cv_broadcast(&s->s_single_thread_cv);
1708226031Sstas	}
1709226031Sstas}
1710226031Sstas
1711226031Sstasstatic size_t
1712226031Sstassizeofde(
1713226031Sstas	mddb_de_ic_t	*dep
1714226031Sstas)
1715226031Sstas{
1716226031Sstas	size_t		size;
1717226031Sstas
1718226031Sstas	size = sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) +
1719226031Sstas	    sizeof (mddb_block_t) * dep->de_blkcount;
1720226031Sstas	return (size);
1721226031Sstas}
1722226031Sstas
1723226031Sstasstatic size_t
1724226031Sstassizeofde32(
1725226031Sstas	mddb_de32_t	*dep
1726226031Sstas)
1727226031Sstas{
1728226031Sstas	size_t		size;
1729226031Sstas
1730226031Sstas	size = sizeof (*dep) - sizeof (dep->de32_blks) +
1731226031Sstas	    sizeof (mddb_block_t) * dep->de32_blkcount;
1732226031Sstas	return (size);
1733226031Sstas}
1734226031Sstas
1735226031Sstasstatic mddb_de32_t *
1736226031Sstasnextentry(
1737226031Sstas	mddb_de32_t	*dep
1738226031Sstas)
1739226031Sstas{
1740226031Sstas	mddb_de32_t	*ret;
1741226031Sstas
1742226031Sstas	ret = (mddb_de32_t *)((void *)((caddr_t)dep + sizeofde32(dep)));
1743226031Sstas	return (ret);
1744226031Sstas}
1745226031Sstas
1746226031Sstasstatic void
1747226031Sstascreate_db32rec(
1748226031Sstas	mddb_db32_t *db32p,
1749226031Sstas	mddb_db_t *dbp
1750226031Sstas)
1751226031Sstas{
1752226031Sstas	mddb_de_ic_t *dep;
1753226031Sstas	mddb_de32_t *de32p;
1754226031Sstas
1755226031Sstas#if defined(_ILP32) && !defined(lint)
1756226031Sstas	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
1757226031Sstas	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
1758226031Sstas#endif
1759226031Sstas
1760226031Sstas	dbtodb32(dbp, db32p);
1761226031Sstas	if ((dbp->db_firstentry != NULL) && (db32p->db32_firstentry == 0))
1762226031Sstas		db32p->db32_firstentry = 0x4;
1763226031Sstas	de32p = (mddb_de32_t *)((void *) ((caddr_t)(&db32p->db32_firstentry)
1764226031Sstas	    + sizeof (db32p->db32_firstentry)));
1765226031Sstas	for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
1766226031Sstas		detode32(dep, de32p);
1767226031Sstas		if ((dep->de_next != NULL) && (de32p->de32_next == 0))
1768226031Sstas			de32p->de32_next = 0x4;
1769226031Sstas		de32p = nextentry(de32p);
1770226031Sstas	}
1771226031Sstas	ASSERT((uintptr_t)de32p <= (uintptr_t)de32p + MDDB_BSIZE);
1772226031Sstas}
1773226031Sstas
1774226031Sstas/*
1775226031Sstas * If called during upgrade, this routine expects a translated
1776226031Sstas * (aka miniroot) dev.
1777226031Sstas * If master blocks are found, set the mn_set parameter to 1 if the
1778226031Sstas * the master block revision number is MDDB_REV_MNMB; otherwise,
1779226031Sstas * set it to 0.
1780226031Sstas * If master blocks are not found, do not change the mnset parameter.
1781226031Sstas */
1782226031Sstasstatic mddb_mb_ic_t *
1783226031Sstasgetmasters(
1784226031Sstas	mddb_set_t	*s,
1785226031Sstas	md_dev64_t	dev,
1786226031Sstas	daddr_t		blkno,
1787226031Sstas	uint_t		*flag,
1788226031Sstas	int		*mn_set
1789226031Sstas)
1790226031Sstas{
1791226031Sstas	mddb_mb_ic_t	*mbi = NULL;
1792226031Sstas	mddb_mb_t	*mb;
1793226031Sstas	int		error = 0;
1794226031Sstas	ddi_devid_t	devid;
1795226031Sstas
1796226031Sstas
1797226031Sstas	if (mddb_devopen(dev)) {
1798226031Sstas		if (flag)
1799226031Sstas			*flag |= MDDB_F_EMASTER;
1800226031Sstas		return ((mddb_mb_ic_t *)NULL);
1801226031Sstas	}
1802226031Sstas
1803226031Sstas
1804226031Sstas	mbi = (mddb_mb_ic_t *)kmem_zalloc(MDDB_IC_BSIZE, KM_SLEEP);
1805226031Sstas	mb = &(mbi->mbi_mddb_mb);
1806226031Sstas	if (error = getblks(s, (caddr_t)mb, dev, blkno,
1807226031Sstas	    btodb(MDDB_BSIZE), 0)) {
1808226031Sstas		error |= MDDB_F_EMASTER;
1809226031Sstas	}
1810226031Sstas	if (mb->mb_magic != MDDB_MAGIC_MB) {
1811226031Sstas		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1812226031Sstas	}
1813226031Sstas	/* Check for MDDB_REV_MNMB and lower */
1814226031Sstas	if (revchk(MDDB_REV_MNMB, mb->mb_revision)) {
1815226031Sstas		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1816226031Sstas	}
1817226031Sstas	if (crcchk(mb, &mb->mb_checksum, MDDB_BSIZE, NULL)) {
1818226031Sstas		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1819226031Sstas	}
1820226031Sstas
1821226031Sstas	if (!(md_get_setstatus(s->s_setno) &
1822226031Sstas	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
1823226031Sstas	    (mb->mb_setno != s->s_setno)) {
1824226031Sstas		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1825226031Sstas	}
1826226031Sstas	if (mb->mb_blkno != blkno) {
1827226031Sstas		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1828226031Sstas	}
1829226031Sstas	mb->mb_next = NULL;
1830226031Sstas	mbi->mbi_next = NULL;
1831226031Sstas
1832226031Sstas	if (error)
1833226031Sstas		goto out;
1834226031Sstas
1835226031Sstas	/*
1836226031Sstas	 * Check the md_devid_destroy and md_keep_repl_state flags
1837226031Sstas	 * to see if we need to regen the devid or not.
1838226031Sstas	 *
1839226031Sstas	 * Don't care about devid in local set since it is not used
1840226031Sstas	 * and this should not be part of set importing
1841226031Sstas	 */
1842226031Sstas	if ((s->s_setno != MD_LOCAL_SET) &&
1843226031Sstas	    !(md_get_setstatus(s->s_setno) &
1844226031Sstas	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT))) {
1845226031Sstas		/*
1846226031Sstas		 * Now check the destroy flag. We also need to handle
1847226031Sstas		 * the case where the destroy flag is reset after the
1848226031Sstas		 * destroy
1849226031Sstas		 */
1850226031Sstas		if (md_devid_destroy || (mb->mb_devid_len == 0)) {
1851226031Sstas
1852226031Sstas			if (md_devid_destroy) {
1853226031Sstas				bzero(mb->mb_devid, mb->mb_devid_len);
1854226031Sstas				mb->mb_devid_len = 0;
1855226031Sstas			}
1856226031Sstas
1857226031Sstas			/*
1858226031Sstas			 * Try to regenerate it if the 'keep' flag is not set
1859226031Sstas			 */
1860226031Sstas			if (!md_keep_repl_state) {
1861226031Sstas				if (ddi_lyr_get_devid(md_dev64_to_dev(dev),
1862226031Sstas				    &devid) == DDI_SUCCESS) {
1863226031Sstas					mb->mb_devid_len =
1864226031Sstas					    ddi_devid_sizeof(devid);
1865226031Sstas					bcopy(devid, mb->mb_devid,
1866226031Sstas					    mb->mb_devid_len);
1867226031Sstas					ddi_devid_free(devid);
1868226031Sstas				} else {
1869226031Sstas					error = MDDB_F_EFMT | MDDB_F_EMASTER;
1870226031Sstas				}
1871226031Sstas			}
1872226031Sstas
1873226031Sstas			crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL);
1874226031Sstas
1875226031Sstas			/*
1876226031Sstas			 * Push
1877226031Sstas			 */
1878226031Sstas			if (putblks(s, (caddr_t)mb, blkno, 1, dev, 0) != 0) {
1879226031Sstas				error = MDDB_F_EFMT | MDDB_F_EMASTER;
1880226031Sstas			}
1881226031Sstas		}
1882226031Sstas	}
1883226031Sstas
1884226031Sstas	if (! error) {
1885226031Sstas		/* Set mn_set parameter to 1 if a MN set */
1886226031Sstas		if (mb->mb_revision == MDDB_REV_MNMB)
1887226031Sstas			*mn_set = 1;
1888226031Sstas		else
1889226031Sstas			*mn_set = 0;
1890226031Sstas		return (mbi);
1891226031Sstas	}
1892226031Sstas
1893226031Sstasout:
1894226031Sstas	/* Error Out */
1895226031Sstas	if (flag)
1896226031Sstas		*flag |= error;
1897226031Sstas
1898226031Sstas	kmem_free((caddr_t)mbi, MDDB_IC_BSIZE);
1899226031Sstas	mddb_devclose(dev);
1900226031Sstas	return ((mddb_mb_ic_t *)NULL);
1901226031Sstas}
1902226031Sstas
1903226031Sstasstatic int
1904226031Sstasgetrecord(
1905226031Sstas	mddb_set_t	*s,
1906226031Sstas	mddb_de_ic_t	*dep,
1907226031Sstas	int		li
1908226031Sstas)
1909226031Sstas{
1910226031Sstas	int		err = 0;
1911226031Sstas	mddb_rb32_t	*rbp;
1912226031Sstas
1913226031Sstas#if defined(_ILP32) && !defined(lint)
1914226031Sstas	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
1915226031Sstas#endif
1916226031Sstas
1917226031Sstas
1918226031Sstas	dep->de_rb = (mddb_rb32_t *)kmem_zalloc(dep->de_recsize, KM_SLEEP);
1919226031Sstas	rbp = dep->de_rb;
1920226031Sstas
1921226031Sstas	err = readblklst(s, (caddr_t)rbp, dep->de_blks,
1922226031Sstas	    dep->de_blkcount, li, 0);
1923226031Sstas	if (err) {
1924226031Sstas		return (MDDB_F_EDATA | err);
1925226031Sstas	}
1926226031Sstas	if (rbp->rb_magic != MDDB_MAGIC_RB) {
1927226031Sstas		return (MDDB_F_EFMT | MDDB_F_EDATA);
1928226031Sstas	}
1929226031Sstas	if ((revchk(MDDB_REV_RB, rbp->rb_revision) != 0) &&
1930226031Sstas	    (revchk(MDDB_REV_RB64, rbp->rb_revision) != 0) &&
1931226031Sstas	    (revchk(MDDB_REV_RBFN, rbp->rb_revision) != 0) &&
1932226031Sstas	    (revchk(MDDB_REV_RB64FN, rbp->rb_revision) != 0)) {
1933226031Sstas		return (MDDB_F_EFMT | MDDB_F_EDATA);
1934226031Sstas	}
1935226031Sstas	/* Check crc for this record */
1936226031Sstas	if (rec_crcchk(s, dep, rbp)) {
1937226031Sstas		return (MDDB_F_EFMT | MDDB_F_EDATA);
1938226031Sstas	}
1939226031Sstas	return (0);
1940226031Sstas}
1941226031Sstas
1942226031Sstas/*
1943226031Sstas * Code to read in the locator name information
1944226031Sstas */
1945226031Sstasstatic int
1946226031Sstasreadlocnames(
1947226031Sstas	mddb_set_t	*s,
1948226031Sstas	int		li
1949226031Sstas)
1950226031Sstas{
1951226031Sstas	mddb_ln_t	*lnp;
1952226031Sstas	int		err = 0;
1953226031Sstas	mddb_block_t	ln_blkcnt, ln_blkno;
1954226031Sstas
1955226031Sstas	/*
1956226031Sstas	 * read in the locator name blocks
1957226031Sstas	 */
1958226031Sstas	s->s_lnp = NULL;
1959226031Sstas
1960226031Sstas	ln_blkno = s->s_lbp->lb_lnfirstblk;
1961226031Sstas	ln_blkcnt = s->s_lbp->lb_lnblkcnt;
1962226031Sstas	lnp = (mddb_ln_t *)kmem_zalloc(dbtob(ln_blkcnt), KM_SLEEP);
1963226031Sstas
1964226031Sstas	err = readblks(s, (caddr_t)lnp, ln_blkno, ln_blkcnt, li);
1965226031Sstas	if (err) {
1966226031Sstas		err |= MDDB_F_EDATA;
1967226031Sstas		goto out;
1968226031Sstas	}
1969226031Sstas	if (lnp->ln_magic != MDDB_MAGIC_LN) {
1970226031Sstas		err = MDDB_F_EDATA | MDDB_F_EFMT;
1971226031Sstas		goto out;
1972226031Sstas	}
1973226031Sstas	if (s->s_lbp->lb_flags & MDDB_MNSET) {
1974226031Sstas		if (revchk(MDDB_REV_MNLN, lnp->ln_revision)) {
1975226031Sstas			err = MDDB_F_EDATA | MDDB_F_EFMT;
1976226031Sstas			goto out;
1977226031Sstas		}
1978226031Sstas	} else {
1979226031Sstas		if (revchk(MDDB_REV_LN, lnp->ln_revision)) {
1980226031Sstas			err = MDDB_F_EDATA | MDDB_F_EFMT;
1981226031Sstas			goto out;
1982226031Sstas		}
1983226031Sstas	}
1984226031Sstas	if (crcchk(lnp, &lnp->ln_checksum, dbtob(ln_blkcnt), NULL)) {
1985226031Sstas		err = MDDB_F_EDATA | MDDB_F_EFMT;
1986226031Sstas		goto out;
1987226031Sstas	}
1988226031Sstasout:
1989226031Sstas	/*
1990226031Sstas	 *	if error occurred in locator name blocks free them
1991226031Sstas	 *	and return
1992226031Sstas	 */
1993226031Sstas	if (err) {
1994226031Sstas		kmem_free((caddr_t)lnp, dbtob(ln_blkcnt));
1995226031Sstas		return (err);
1996226031Sstas	}
1997226031Sstas	s->s_lnp = lnp;
1998226031Sstas	return (0);
1999226031Sstas}
2000226031Sstas
2001226031Sstas/*
2002226031Sstas * code to read in a copy of the database.
2003226031Sstas */
2004226031Sstas
2005226031Sstasstatic int
2006226031Sstasreadcopy(
2007226031Sstas	mddb_set_t	*s,
2008226031Sstas	int		li
2009226031Sstas)
2010226031Sstas{
2011226031Sstas	uint_t		blk;
2012226031Sstas	mddb_db_t	*dbp, *dbp1, *dbhp;
2013226031Sstas	mddb_db32_t	*db32p;
2014226031Sstas	mddb_de_ic_t	*dep, *dep2;
2015226031Sstas	mddb_de32_t	*de32p, *de32p2;
2016226031Sstas	int		err = 0;
2017226031Sstas	uint_t		checksum;
2018226031Sstas
2019226031Sstas
2020226031Sstas#if defined(_ILP32) && !defined(lint)
2021226031Sstas	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
2022226031Sstas	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2023226031Sstas#endif
2024226031Sstas
2025226031Sstas	dbp = NULL;
2026226031Sstas	dbhp = NULL;
2027226031Sstas	/*
2028226031Sstas	 *	read in all the directory blocks
2029226031Sstas	 */
2030226031Sstas	blk = s->s_lbp->lb_dbfirstblk;
2031226031Sstas	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
2032226031Sstas
2033226031Sstas	for (; blk != 0; blk = dbp->db_nextblk) {
2034226031Sstas		dbp1 = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
2035226031Sstas		if (! dbhp) {
2036226031Sstas			dbhp = dbp1;
2037226031Sstas		} else {
2038226031Sstas			dbp->db_next = dbp1;
2039226031Sstas		}
2040226031Sstas		dbp = dbp1;
2041226031Sstas
2042226031Sstas		err = readblks(s, (caddr_t)db32p, blk, 1, li);
2043226031Sstas		if (err) {
2044226031Sstas			err |= MDDB_F_EDATA;
2045226031Sstas			break;
2046226031Sstas		}
2047226031Sstas		db32todb(db32p, dbp);
2048226031Sstas		if (db32p->db32_magic != MDDB_MAGIC_DB) {
2049226031Sstas			err = MDDB_F_EDATA | MDDB_F_EFMT;
2050226031Sstas			break;
2051226031Sstas		}
2052226031Sstas		if (revchk(MDDB_REV_DB, db32p->db32_revision)) {
2053226031Sstas			err = MDDB_F_EDATA | MDDB_F_EFMT;
2054226031Sstas			break;
2055226031Sstas		}
2056226031Sstas		if (crcchk(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL)) {
2057226031Sstas			err = MDDB_F_EDATA | MDDB_F_EFMT;
2058226031Sstas			break;
2059226031Sstas		}
2060226031Sstas		/*
2061226031Sstas		 * first go through and fix up all de_next pointers
2062226031Sstas		 */
2063226031Sstas		if (dbp->db_firstentry) {
2064226031Sstas
2065226031Sstas			de32p = (mddb_de32_t *)
2066226031Sstas			    ((void *) ((caddr_t)(&db32p->db32_firstentry)
2067226031Sstas			    + sizeof (db32p->db32_firstentry)));
2068226031Sstas
2069226031Sstas			dep = (mddb_de_ic_t *)
2070226031Sstas			    kmem_zalloc(sizeof (mddb_de_ic_t) -
2071226031Sstas			    sizeof (mddb_block_t) +
2072226031Sstas			    sizeof (mddb_block_t) * de32p->de32_blkcount,
2073226031Sstas			    KM_SLEEP);
2074226031Sstas			de32tode(de32p, dep);
2075226031Sstas
2076226031Sstas			dbp->db_firstentry = dep;
2077226031Sstas			while (de32p && de32p->de32_next) {
2078226031Sstas
2079226031Sstas				de32p2 = nextentry(de32p);
2080226031Sstas
2081226031Sstas				dep2 = (mddb_de_ic_t *)kmem_zalloc(
2082226031Sstas				    sizeof (mddb_de_ic_t) -
2083226031Sstas				    sizeof (mddb_block_t) +
2084226031Sstas				    sizeof (mddb_block_t) *
2085226031Sstas				    de32p2->de32_blkcount, KM_SLEEP);
2086226031Sstas
2087226031Sstas				de32tode(de32p2, dep2);
2088226031Sstas
2089226031Sstas				dep->de_next = dep2;
2090226031Sstas				dep = dep2;
2091226031Sstas				de32p = de32p2;
2092226031Sstas			}
2093226031Sstas		}
2094226031Sstas		/*
2095226031Sstas		 * go through and make all of the pointer to record blocks
2096226031Sstas		 * are null;
2097226031Sstas		 */
2098226031Sstas		for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
2099226031Sstas			dep->de_rb = NULL;
2100226031Sstas	}
2101226031Sstas	kmem_free((caddr_t)db32p, MDDB_BSIZE);
2102226031Sstas	dbp->db_next = NULL;
2103226031Sstas	/*
2104226031Sstas	 *	if error occurred in directory blocks free them
2105226031Sstas	 *	and return
2106226031Sstas	 */
2107226031Sstas	if (err) {
2108226031Sstas		dbp = dbhp;
2109226031Sstas		while (dbp) {
2110226031Sstas			dep = dbp->db_firstentry;
2111226031Sstas			while (dep) {
2112226031Sstas				/* No mddb_rb32_t structures yet */
2113226031Sstas				dep2 = dep->de_next;
2114226031Sstas				kmem_free((caddr_t)dep, sizeofde(dep));
2115226031Sstas				dep = dep2;
2116226031Sstas			}
2117226031Sstas			dbp1 = dbp->db_next;
2118226031Sstas			kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
2119226031Sstas			dbp = dbp1;
2120226031Sstas		}
2121226031Sstas		s->s_dbp = NULL;
2122226031Sstas		return (err);
2123226031Sstas
2124226031Sstas	}
2125226031Sstas	/*
2126226031Sstas	 */
2127226031Sstas	err = 0;
2128226031Sstas	checksum = MDDB_GLOBAL_XOR;
2129226031Sstas	for (dbp = dbhp; dbp != NULL; dbp = dbp->db_next) {
2130226031Sstas		checksum ^= dbp->db_recsum;
2131226031Sstas		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2132226031Sstas			if (dep->de_flags & MDDB_F_OPT)
2133226031Sstas				continue;
2134226031Sstas			err = getrecord(s, dep, li);
2135226031Sstas			if (err)
2136226031Sstas				break;
2137226031Sstas			/* Don't include CHANGELOG in big XOR */
2138226031Sstas			if (dep->de_flags & MDDB_F_CHANGELOG)
2139226031Sstas				continue;
2140226031Sstas			checksum ^= dep->de_rb->rb_checksum;
2141226031Sstas			checksum ^= dep->de_rb->rb_checksum_fiddle;
2142226031Sstas		}
2143226031Sstas		if (err)
2144226031Sstas			break;
2145226031Sstas	}
2146226031Sstas	if (checksum) {
2147226031Sstas		if (! err)
2148226031Sstas			err = MDDB_F_EDATA | MDDB_F_EFMT;
2149226031Sstas	}
2150226031Sstas	if (err) {
2151226031Sstas		dbp = dbhp;
2152226031Sstas		dbhp = NULL;
2153226031Sstas		while (dbp) {
2154226031Sstas			dep = dbp->db_firstentry;
2155226031Sstas			while (dep) {
2156226031Sstas				if (dep->de_rb)
2157226031Sstas					kmem_free((caddr_t)dep->de_rb,
2158226031Sstas					    dep->de_recsize);
2159226031Sstas				dep2 = dep->de_next;
2160226031Sstas				kmem_free((caddr_t)dep, sizeofde(dep));
2161226031Sstas				dep = dep2;
2162226031Sstas			}
2163226031Sstas			dbp1 = dbp->db_next;
2164226031Sstas			kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
2165226031Sstas			dbp = dbp1;
2166226031Sstas		}
2167226031Sstas	}
2168226031Sstas	s->s_dbp = dbhp;
2169226031Sstas	return (err);
2170226031Sstas}
2171226031Sstas
2172226031Sstasstatic int
2173226031Sstasgetoptcnt(
2174226031Sstas	mddb_set_t	*s,
2175226031Sstas	int		li)
2176226031Sstas{
2177226031Sstas	int		result;
2178226031Sstas	mddb_de_ic_t	*dep;
2179226031Sstas	mddb_db_t	*dbp;
2180226031Sstas
2181226031Sstas#if defined(_ILP32) && !defined(lint)
2182226031Sstas	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
2183226031Sstas	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2184226031Sstas#endif
2185226031Sstas
2186226031Sstas	result = 0;
2187226031Sstas	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2188226031Sstas		dep = dbp->db_firstentry;
2189226031Sstas		for (; dep != NULL; dep = dep->de_next) {
2190226031Sstas			if (! (dep->de_flags & MDDB_F_OPT))
2191226031Sstas				continue;
2192226031Sstas			if (((dep->de_optinfo[0].o_flags & MDDB_F_ACTIVE) &&
2193226031Sstas			    (li == dep->de_optinfo[0].o_li)) ||
2194226031Sstas			    ((dep->de_optinfo[1].o_flags & MDDB_F_ACTIVE) &&
2195226031Sstas			    (li == dep->de_optinfo[1].o_li)))
2196226031Sstas			result++;
2197226031Sstas		}
2198226031Sstas	}
2199226031Sstas	return (result);
2200226031Sstas}
2201226031Sstas
2202226031Sstasstatic void
2203226031Sstasgetoptdev(
2204226031Sstas	mddb_set_t	*s,
2205226031Sstas	mddb_de_ic_t	*rdep,
2206226031Sstas	int		opti
2207226031Sstas)
2208226031Sstas{
2209226031Sstas	mddb_lb_t	*lbp;
2210226031Sstas	mddb_locator_t	*lp;
2211226031Sstas	mddb_optinfo_t	*otherop;
2212226031Sstas	mddb_optinfo_t	*resultop;
2213226031Sstas	int		li;
2214226031Sstas	dev_t		otherdev;
2215226031Sstas	int		blkonly = 0;
2216226031Sstas	int		mincnt;
2217226031Sstas	int		thiscnt;
2218226031Sstas
2219226031Sstas	lbp = s->s_lbp;
2220226031Sstas
2221226031Sstas	resultop = &rdep->de_optinfo[opti];
2222226031Sstas	otherop = &rdep->de_optinfo[1-opti];
2223226031Sstas
2224226031Sstas	resultop->o_flags = 0;
2225226031Sstas
2226226031Sstas	/*
2227226031Sstas	 * scan through and see if data bases have to vary by only device
2228226031Sstas	 */
2229226031Sstas
2230226031Sstas	if (otherop->o_flags & MDDB_F_ACTIVE) {
2231226031Sstas		blkonly = 1;
2232226031Sstas		otherdev = expldev(lbp->lb_locators[otherop->o_li].l_dev);
2233226031Sstas		for (li = 0; li < lbp->lb_loccnt; li++) {
2234226031Sstas			lp = &lbp->lb_locators[li];
2235226031Sstas			if (! (lp->l_flags & MDDB_F_ACTIVE))
2236226031Sstas				continue;
2237226031Sstas			if (expldev(lp->l_dev) != otherdev) {
2238226031Sstas				blkonly = 0;
2239226031Sstas				break;
2240226031Sstas			}
2241226031Sstas		}
2242226031Sstas	}
2243226031Sstas
2244226031Sstas	mincnt = 999999;
2245226031Sstas	for (li = 0; li < lbp->lb_loccnt; li++) {
2246226031Sstas		dev_info_t	*devi;
2247226031Sstas		int		removable = 0;
2248226031Sstas
2249226031Sstas		lp = &lbp->lb_locators[li];
2250226031Sstas		if (! (lp->l_flags & MDDB_F_ACTIVE))
2251226031Sstas			continue;
2252226031Sstas		if (otherop->o_flags & MDDB_F_ACTIVE) {
2253226031Sstas			if (blkonly) {
2254226031Sstas				if (otherop->o_li == li)
2255226031Sstas					continue;
2256226031Sstas			} else {
2257226031Sstas				if (otherdev == expldev(lp->l_dev))
2258226031Sstas					continue;
2259226031Sstas			}
2260226031Sstas		}
2261226031Sstas
2262226031Sstas		/*
2263226031Sstas		 * Check if this is a removable device.  If it is we
2264226031Sstas		 * assume it is something like a USB flash disk, a zip disk
2265226031Sstas		 * or even a floppy that is being used to help maintain
2266226031Sstas		 * mddb quorum.  We don't want to put any optimized resync
2267226031Sstas		 * records on these kinds of disks since they are usually
2268226031Sstas		 * slower or don't have the same read/write lifetimes as
2269226031Sstas		 * a regular fixed disk.
2270226031Sstas		 */
2271226031Sstas		if ((devi = e_ddi_hold_devi_by_dev(lp->l_dev, 0)) != NULL) {
2272226031Sstas			int		error;
2273226031Sstas			struct cb_ops	*cb;
2274226031Sstas			ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
2275226031Sstas			int		propvalue = 0;
2276226031Sstas			int		proplength = sizeof (int);
2277226031Sstas
2278226031Sstas			if ((cb = devopsp[getmajor(lp->l_dev)]->devo_cb_ops)
2279226031Sstas			    != NULL) {
2280226031Sstas				error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi,
2281226031Sstas				    prop_op, DDI_PROP_NOTPROM |
2282226031Sstas				    DDI_PROP_DONTPASS, "removable-media",
2283226031Sstas				    (caddr_t)&propvalue, &proplength);
2284226031Sstas
2285226031Sstas				if (error == DDI_PROP_SUCCESS)
2286226031Sstas					removable = 1;
2287226031Sstas			}
2288226031Sstas
2289226031Sstas			ddi_release_devi(devi);
2290226031Sstas		}
2291226031Sstas
2292226031Sstas		if (removable)
2293226031Sstas			continue;
2294226031Sstas
2295226031Sstas		thiscnt = getoptcnt(s, li);
2296226031Sstas		if (thiscnt < mincnt) {
2297226031Sstas			resultop->o_li  = li;
2298226031Sstas			mincnt = thiscnt;
2299226031Sstas			resultop->o_flags = MDDB_F_ACTIVE;
2300226031Sstas		}
2301226031Sstas	}
2302226031Sstas}
2303226031Sstas
2304226031Sstasstatic void
2305226031Sstasallocuserdata(
2306226031Sstas	mddb_de_ic_t	*dep
2307226031Sstas)
2308226031Sstas{
2309226031Sstas	mddb_rb32_t	*rbp;
2310226031Sstas
2311226031Sstas#if defined(_ILP32) && !defined(lint)
2312226031Sstas	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2313226031Sstas#endif
2314226031Sstas
2315226031Sstas	rbp = dep->de_rb;
2316226031Sstas	rbp->rb_private = 0;
2317226031Sstas	dep->de_rb_userdata = kmem_zalloc(dep->de_reqsize, KM_SLEEP);
2318226031Sstas	rbp->rb_userdata = 0x4;	/* Make sure this is non-zero */
2319226031Sstas	bcopy((caddr_t)rbp->rb_data, dep->de_rb_userdata, dep->de_reqsize);
2320226031Sstas}
2321226031Sstas
2322226031Sstas
2323226031Sstasstatic void
2324226031Sstasgetuserdata(
2325226031Sstas	set_t		setno,
2326226031Sstas	mddb_de_ic_t	*dep
2327226031Sstas)
2328226031Sstas{
2329226031Sstas	mddb_rb32_t	 *rbp;
2330226031Sstas
2331226031Sstas
2332226031Sstas	mddb_type_t	type = dep->de_type1;
2333226031Sstas	caddr_t		data, udata;
2334226031Sstas
2335226031Sstas#if defined(_ILP32) && !defined(lint)
2336226031Sstas	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2337226031Sstas#endif
2338226031Sstas	rbp = dep->de_rb;
2339226031Sstas	data = (caddr_t)rbp->rb_data;
2340226031Sstas	udata = (caddr_t)dep->de_rb_userdata;
2341226031Sstas
2342226031Sstas	/*
2343226031Sstas	 * If it's a driver record, and an old style record, and not a DRL
2344226031Sstas	 * record, we must convert it because it was incore as a 64 bit
2345226031Sstas	 * structure but its on disk layout has only 32 bit for block sizes
2346226031Sstas	 */
2347226031Sstas	if (!(md_get_setstatus(setno) &
2348226031Sstas	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
2349226031Sstas	    (type >= MDDB_FIRST_MODID) &&
2350226031Sstas	    ((rbp->rb_revision == MDDB_REV_RB) ||
2351226031Sstas	    (rbp->rb_revision == MDDB_REV_RBFN))) {
2352226031Sstas
2353226031Sstas		switch (dep->de_flags) {
2354226031Sstas
2355226031Sstas			case MDDB_F_STRIPE:
2356226031Sstas				stripe_convert(data, udata, BIG_2_SMALL);
2357226031Sstas				break;
2358226031Sstas
2359226031Sstas			case MDDB_F_MIRROR:
2360226031Sstas				mirror_convert(data, udata, BIG_2_SMALL);
2361226031Sstas				break;
2362226031Sstas
2363226031Sstas			case MDDB_F_RAID:
2364226031Sstas				raid_convert(data, udata, BIG_2_SMALL);
2365226031Sstas				break;
2366226031Sstas
2367226031Sstas			case MDDB_F_SOFTPART:
2368226031Sstas				softpart_convert(data, udata, BIG_2_SMALL);
2369226031Sstas				break;
2370226031Sstas
2371226031Sstas			case MDDB_F_TRANS_MASTER:
2372226031Sstas				trans_master_convert(data, udata, BIG_2_SMALL);
2373226031Sstas				break;
2374226031Sstas
2375226031Sstas			case MDDB_F_TRANS_LOG:
2376226031Sstas				trans_log_convert(data, udata, BIG_2_SMALL);
2377226031Sstas				break;
2378226031Sstas
2379226031Sstas			case MDDB_F_HOTSPARE:
2380226031Sstas				hs_convert(data, udata, BIG_2_SMALL);
2381226031Sstas				break;
2382226031Sstas
2383226031Sstas			case MDDB_F_OPT:
2384226031Sstas			default:
2385226031Sstas				bcopy(udata, data, dep->de_reqsize);
2386226031Sstas		}
2387226031Sstas	} else {
2388226031Sstas		bcopy(udata, data, dep->de_reqsize);
2389226031Sstas	}
2390226031Sstas}
2391226031Sstas
2392226031Sstasstatic void
2393226031Sstasgetoptrecord(
2394226031Sstas	mddb_set_t	*s,
2395226031Sstas	mddb_de_ic_t	*dep
2396226031Sstas)
2397226031Sstas{
2398226031Sstas	mddb_lb_t	*lbp;
2399226031Sstas	mddb_locator_t	*lp;
2400226031Sstas	mddb_rb32_t	*rbp, *crbp;
2401226031Sstas	int		li;
2402226031Sstas	int		i;
2403226031Sstas	int		err = 0;
2404226031Sstas	size_t		recsize;
2405226031Sstas
2406226031Sstas#if defined(_ILP32) && !defined(lint)
2407226031Sstas	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2408226031Sstas#endif
2409226031Sstas
2410226031Sstas	lbp = s->s_lbp;
2411226031Sstas
2412226031Sstas	recsize = dep->de_recsize;
2413226031Sstas	dep->de_rb = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
2414226031Sstas	rbp = dep->de_rb;
2415226031Sstas	crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
2416226031Sstas
2417226031Sstas	dep->de_optinfo[0].o_flags |= MDDB_F_EDATA;
2418226031Sstas	dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
2419226031Sstas
2420226031Sstas	for (i = 0; i < 2; i++) {
2421226031Sstas		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
2422226031Sstas			continue;
2423226031Sstas		li = dep->de_optinfo[i].o_li;
2424226031Sstas		lp = &lbp->lb_locators[li];
2425226031Sstas
2426226031Sstas		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
2427226031Sstas		    (lp->l_flags & MDDB_F_EMASTER))
2428226031Sstas			continue;
2429226031Sstas
2430226031Sstas		err = readblklst(s, (caddr_t)rbp, dep->de_blks,
2431226031Sstas		    dep->de_blkcount, li, 0);
2432226031Sstas
2433226031Sstas		if (err)
2434226031Sstas			continue;
2435226031Sstas
2436226031Sstas		if (rbp->rb_magic != MDDB_MAGIC_RB)
2437226031Sstas			continue;
2438226031Sstas
2439226031Sstas		if (revchk(MDDB_REV_RB, rbp->rb_revision))
2440226031Sstas			continue;
2441226031Sstas
2442226031Sstas		/* Check the crc for this record */
2443226031Sstas		if (rec_crcchk(s, dep, rbp)) {
2444226031Sstas			continue;
2445226031Sstas		}
2446226031Sstas
2447226031Sstas		dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE;
2448226031Sstas
2449226031Sstas		if (rbp == crbp) {
2450226031Sstas			if (rbp->rb_checksum != crbp->rb_checksum)
2451226031Sstas				dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
2452226031Sstas			break;
2453226031Sstas		}
2454226031Sstas		rbp = crbp;
2455226031Sstas	}
2456226031Sstas
2457226031Sstas	if (rbp == crbp) {
2458226031Sstas		rbp->rb_private = 0;
2459226031Sstas		kmem_free((caddr_t)crbp, recsize);
2460226031Sstas		return;
2461226031Sstas	}
2462226031Sstas	bzero((caddr_t)rbp, recsize);
2463226031Sstas	rbp->rb_magic = MDDB_MAGIC_RB;
2464226031Sstas	rbp->rb_revision = MDDB_REV_RB;
2465226031Sstas	uniqtime32(&rbp->rb_timestamp);
2466226031Sstas	/* Generate the crc for this record */
2467226031Sstas	rec_crcgen(s, dep, rbp);
2468226031Sstas	kmem_free((caddr_t)crbp, recsize);
2469226031Sstas}
2470226031Sstas
2471226031Sstas/*
2472226031Sstas * writeoptrecord writes out an optimized record.
2473226031Sstas */
2474226031Sstasstatic int
2475226031Sstaswriteoptrecord(
2476226031Sstas	mddb_set_t	*s,
2477226031Sstas	mddb_de_ic_t	*dep
2478226031Sstas)
2479226031Sstas{
2480226031Sstas	mddb_rb32_t	*rbp;
2481226031Sstas	int		li;
2482226031Sstas	int		err = 0, wrt_err = 0;
2483226031Sstas	mddb_bf_t	*bufhead, *bfp;
2484226031Sstas	mddb_lb_t	*lbp = s->s_lbp;
2485226031Sstas	mddb_locator_t	*lp;
2486226031Sstas	int		i;
2487226031Sstas
2488226031Sstas#if defined(_ILP32) && !defined(lint)
2489226031Sstas	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2490226031Sstas#endif
2491226031Sstas
2492226031Sstas	bufhead = NULL;
2493226031Sstas	err = 0;
2494226031Sstas
2495226031Sstas	while (s->s_opthavequeuinglck) {
2496226031Sstas		s->s_optwantqueuinglck++;
2497226031Sstas		cv_wait(&s->s_optqueuing_cv, SETMUTEX(s->s_setno));
2498226031Sstas	}
2499226031Sstas	s->s_opthavequeuinglck++;
2500226031Sstas	rbp = dep->de_rb;
2501226031Sstas	for (i = 0; i < 2; i++) {
2502226031Sstas		/*
2503226031Sstas		 * only possible error is xlate. This can
2504226031Sstas		 * occur if a replica was off line and came
2505226031Sstas		 * back. During the mean time the database grew
2506226031Sstas		 * large than the now on line replica can store
2507226031Sstas		 */
2508226031Sstas		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
2509226031Sstas			continue;
2510226031Sstas		li = dep->de_optinfo[i].o_li;
2511226031Sstas		/*
2512226031Sstas		 * In a MN diskset, any node can write optimized record(s).
2513226031Sstas		 */
2514226031Sstas		wrt_err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
2515226031Sstas		    dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE);
2516226031Sstas		/*
2517226031Sstas		 * For MN diskset, set error in optinfo structure so
2518226031Sstas		 * that mddb_commitrec knows which replica failed.
2519226031Sstas		 */
2520226031Sstas		if ((MD_MNSET_SETNO(s->s_setno)) &&
2521226031Sstas		    (wrt_err & MDDB_F_EWRITE)) {
2522226031Sstas			dep->de_optinfo[i].o_flags |= MDDB_F_EWRITE;
2523226031Sstas		}
2524226031Sstas		err |= wrt_err;
2525226031Sstas	}
2526226031Sstas	s->s_opthavequeuinglck = 0;
2527226031Sstas	if (s->s_optwantqueuinglck) {
2528226031Sstas		s->s_optwantqueuinglck = 0;
2529226031Sstas		cv_broadcast(&s->s_optqueuing_cv);
2530226031Sstas	}
2531226031Sstas	for (bfp = bufhead; bfp; bfp = bufhead) {
2532226031Sstas		mutex_exit(SETMUTEX(s->s_setno));
2533226031Sstas		(void) biowait(&bfp->bf_buf);
2534226031Sstas		mutex_enter(SETMUTEX(s->s_setno));
2535226031Sstas		if (bfp->bf_buf.b_flags & B_ERROR) {
2536226031Sstas			/*
2537226031Sstas			 * If an MN diskset, don't set replica
2538226031Sstas			 * in error since this hasn't been set in master.
2539226031Sstas			 * Setting replica in error before master could
2540226031Sstas			 * leave the nodes with different views of the
2541226031Sstas			 * world since a class 1 configuration change
2542226031Sstas			 * could occur in mddb_commitrec as soon as
2543226031Sstas			 * all locks are dropped.  Must keep this
2544226031Sstas			 * node the same as master and can't afford a
2545226031Sstas			 * failure from the class 1 config change
2546226031Sstas			 * if master succeeded.
2547226031Sstas			 */
2548226031Sstas			if (!(MD_MNSET_SETNO(s->s_setno))) {
2549226031Sstas				bfp->bf_locator->l_flags |= MDDB_F_EWRITE;
2550226031Sstas			} else {
2551226031Sstas				/*
2552226031Sstas				 * Find which de_optinfo (which replica)
2553226031Sstas				 * had a failure and set the failure in
2554226031Sstas				 * the o_flags field.
2555226031Sstas				 */
2556226031Sstas				lp = &lbp->lb_locators[dep->de_optinfo[0].o_li];
2557226031Sstas				if (lp == bfp->bf_locator) {
2558226031Sstas					dep->de_optinfo[0].o_flags |=
2559226031Sstas					    MDDB_F_EWRITE;
2560226031Sstas				} else {
2561226031Sstas					dep->de_optinfo[1].o_flags |=
2562226031Sstas					    MDDB_F_EWRITE;
2563226031Sstas				}
2564226031Sstas			}
2565226031Sstas			err |= MDDB_F_EWRITE;
2566226031Sstas		}
2567226031Sstas		bufhead = bfp->bf_next;
2568226031Sstas		freebuffer(s, bfp);
2569226031Sstas	}
2570226031Sstas	return (err);
2571226031Sstas}
2572226031Sstas
2573226031Sstas/*
2574226031Sstas * Fix up the optimized resync record.  Used in the traditional and local
2575226031Sstas * disksets to move an optimized record from a failed or deleted mddb
2576226031Sstas * to an active one.
2577226031Sstas *
2578226031Sstas * In a MN diskset, the fixing of the optimized record is split between
2579226031Sstas * the master and slave nodes.  If the master node moves the optimized
2580226031Sstas * resync record, then the master node will send a MDDB_PARSE_OPTRECS
2581226031Sstas * message to the slave nodes causing the slave nodes to reget the
2582226031Sstas * directory entry containing the location of the optimized resync record.
2583226031Sstas * After the record is reread from disk, then writeoptrecord is called
2584226031Sstas * if the location of the optimized resync record or flags have changed.
2585226031Sstas * When writeoptrecord is called, the node that is the owner of this record
2586226031Sstas * will write the optimized record to the location specified in the directory
2587226031Sstas * entry.  Since the master node uses the highest class message (PARSE)
2588226031Sstas * the record owner node is guaranteed to already have an updated
2589226031Sstas * directory entry incore.
2590226031Sstas *
2591226031Sstas * The other difference between the traditional/local set and MN diskset
2592226031Sstas * is that the directory entry can be written to disk before the optimized
2593226031Sstas * record in a MN diskset if the record is owned by a slave node.  So,
2594226031Sstas * the users of an optimized record must handle the failure case when no
2595226031Sstas * data is available from an optimized record since the master node could
2596226031Sstas * have failed during the relocation of the optimized record to another mddb.
2597226031Sstas */
2598226031Sstasstatic int
2599226031Sstasfixoptrecord(
2600226031Sstas	mddb_set_t	*s,
2601226031Sstas	mddb_de_ic_t	*dep,
2602226031Sstas	mddb_db_t	*dbp
2603226031Sstas)
2604226031Sstas{
2605226031Sstas	int		changed;
2606226031Sstas	int		writedata;
2607226031Sstas	int		err = 0;
2608226031Sstas	int		i;
2609226031Sstas	mddb_lb_t	*lbp;
2610226031Sstas	mddb_optinfo_t	*op;
2611226031Sstas	mddb_db32_t	*db32p;
2612226031Sstas	int		rec_owner;	/* Is node owner of record? */
2613226031Sstas
2614#if defined(_ILP32) && !defined(lint)
2615	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2616#endif
2617
2618	lbp = s->s_lbp;
2619	changed = 0;
2620	writedata = 0;
2621	for (i = 0; i < 2; i++) {
2622		op = &dep->de_optinfo[i];
2623
2624		if (! (lbp->lb_locators[op->o_li].l_flags & MDDB_F_ACTIVE))
2625			op->o_flags = 0;
2626
2627		/*
2628		 * If optimized record has seen a replica failure,
2629		 * assign new replica to record and re-write data
2630		 * to new record.
2631		 */
2632		if (! (op->o_flags & MDDB_F_ACTIVE)) {
2633			getoptdev(s, dep, i);
2634			writedata++;
2635			changed++;
2636			/* Set flag for slaves to reread dep and write rec */
2637			if (lbp->lb_flags & MDDB_MNSET) {
2638				s->s_mn_parseflags |= MDDB_PARSE_OPTRECS;
2639			}
2640		}
2641
2642		/*
2643		 * If just an error in the data was seen, set
2644		 * the optimized record's replica flag to active (ok)
2645		 * and try again.
2646		 */
2647		if (op->o_flags & MDDB_F_EDATA) {
2648			dep->de_optinfo[0].o_flags = MDDB_F_ACTIVE;
2649			writedata++;
2650		}
2651	}
2652
2653	rec_owner = 0;
2654	if (lbp->lb_flags & MDDB_MNSET) {
2655		/*
2656		 * If a MN diskset then check the owner of optimized record.
2657		 * If the master node owns the record or if there is
2658		 * no owner of the record, then the master can write the
2659		 * optimized record to disk.
2660		 * Master node can write the optimized record now, but
2661		 * slave nodes write their records during handling of
2662		 * the MDDB_PARSE_OPTRECS message.
2663		 */
2664		if ((dep->de_owner_nodeid == MD_MN_INVALID_NID) ||
2665		    (dep->de_owner_nodeid == md_set[s->s_setno].s_nodeid)) {
2666			rec_owner = 1;
2667		}
2668	} else {
2669		/*
2670		 * In traditional diskset and local set, this node
2671		 * is always the record owner and always the master.
2672		 */
2673		rec_owner = 1;
2674	}
2675
2676	/*
2677	 * If this node is the record owner, write out record.
2678	 */
2679	if ((writedata) && (rec_owner)) {
2680		if (err = writeoptrecord(s, dep)) {
2681			return (err);
2682		}
2683	}
2684	if (! changed)
2685		return (0);
2686	uniqtime32(&dbp->db_timestamp);
2687	dbp->db_revision = MDDB_REV_DB;
2688	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
2689	create_db32rec(db32p, dbp);
2690	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
2691	err = writeall(s, (caddr_t)db32p, db32p->db32_blknum,
2692	    1, MDDB_WR_ONLY_MASTER);
2693	kmem_free((caddr_t)db32p, MDDB_BSIZE);
2694	return (err);
2695}
2696
2697static int
2698fixoptrecords(
2699	mddb_set_t		*s
2700)
2701{
2702	mddb_de_ic_t	*dep;
2703	mddb_db_t	*dbp;
2704	int		err = 0;
2705	set_t		setno;
2706
2707	/*
2708	 * In a MN diskset, the master node is the only node that runs
2709	 * fixoptrecords.  If the master node changes anything, then the
2710	 * master node sends PARSE message to the slave nodes.  The slave
2711	 * nodes will then re-read in the locator block or re-read in the
2712	 * directory blocks and re-write the optimized resync records.
2713	 */
2714	setno = s->s_setno;
2715	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
2716	    (md_set[setno].s_am_i_master == 0)) {
2717		return (0);
2718	}
2719
2720	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2721		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2722			if (! (dep->de_flags & MDDB_F_OPT))
2723				continue;
2724			err = fixoptrecord(s, dep, dbp);
2725			if (err != 0)
2726				return (err);
2727		}
2728	}
2729	return (0);
2730}
2731
2732/*
2733 * Checks incore version of mddb data to mddb data ondisk.
2734 *
2735 * Returns:
2736 *	- 0 if the data was successfully read and is good.
2737 *	- MDDB_F_EREAD if a read error occurred.
2738 *	- 1 if the data read is bad (checksum failed, etc)
2739 */
2740static int
2741checkcopy
2742(
2743	mddb_set_t	*s,
2744	int		li
2745)
2746{
2747	mddb_db_t	*dbp;
2748	mddb_db32_t	*cdb32p;
2749	mddb_de_ic_t	*dep;
2750	mddb_de32_t	*cde32p;
2751	mddb_rb32_t	*rbp, *crbp;
2752	size_t		size;
2753	int		i;
2754	int		retval = 1;
2755
2756#if defined(_ILP32) && !defined(lint)
2757	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
2758	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2759	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2760#endif
2761
2762	if (s->s_databuffer_size == 0) {
2763		size_t maxrecsize = MDDB_BSIZE;
2764
2765		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
2766			for (dep = dbp->db_firstentry; dep; dep = dep->de_next)
2767				if (! (dep->de_flags & MDDB_F_OPT) &&
2768				    dep->de_recsize > maxrecsize)
2769					maxrecsize = dep->de_recsize;
2770
2771		s->s_databuffer = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
2772		s->s_databuffer_size = maxrecsize;
2773	}
2774
2775	cdb32p = (mddb_db32_t *)s->s_databuffer;
2776
2777	/*
2778	 * first go through and make sure all directory stuff
2779	 * is the same
2780	 */
2781	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2782		if (readblks(s, (caddr_t)cdb32p, dbp->db_blknum, 1, li)) {
2783			retval = MDDB_F_EREAD;
2784			goto err;
2785		}
2786		if (cdb32p->db32_magic != MDDB_MAGIC_DB)
2787			goto err;
2788		if (revchk(MDDB_REV_DB, cdb32p->db32_revision))
2789			goto err;
2790		if (crcchk(cdb32p, &cdb32p->db32_checksum, MDDB_BSIZE, NULL))
2791			goto err;
2792		if (cdb32p->db32_nextblk != dbp->db_nextblk)
2793			goto err;
2794		if (cdb32p->db32_recsum != dbp->db_recsum)
2795			goto err;
2796		if (cdb32p->db32_firstentry) {
2797			cde32p = (mddb_de32_t *)
2798			    ((void *)((caddr_t)(&cdb32p->db32_firstentry)
2799			    + sizeof (cdb32p->db32_firstentry)));
2800		} else
2801			cde32p = NULL;
2802
2803		dep = dbp->db_firstentry;
2804		/*
2805		 * check if all directory entries are identical
2806		 */
2807		while (dep && cde32p) {
2808			if (dep->de_recid != cde32p->de32_recid)
2809				goto err;
2810			if (dep->de_type1 != cde32p->de32_type1)
2811				goto err;
2812			if (dep->de_type2 != cde32p->de32_type2)
2813				goto err;
2814			if (dep->de_reqsize != cde32p->de32_reqsize)
2815				goto err;
2816			if (dep->de_flags != cde32p->de32_flags)
2817				goto err;
2818
2819			for (i = 0; i < 2; i++) {
2820				if (dep->de_optinfo[i].o_li !=
2821				    cde32p->de32_optinfo[i].o_li)
2822					break;
2823			}
2824			if (i != 2)
2825				goto err;
2826			size = sizeof (mddb_block_t) * dep->de_blkcount;
2827			if (bcmp((caddr_t)dep->de_blks,
2828			    (caddr_t)cde32p->de32_blks, size))
2829				goto err;
2830			dep = dep->de_next;
2831			if (cde32p->de32_next)
2832				cde32p = nextentry(cde32p);
2833			else
2834				cde32p = NULL;
2835		}
2836		if (dep || cde32p)
2837			goto err;
2838	}
2839	/*
2840	 * If here, all directories are functionally identical
2841	 * check to make sure all records are identical
2842	 * the reason the records are not just bcmped is that the
2843	 * lock flag does not want to be compared.
2844	 */
2845	crbp = (mddb_rb32_t *)cdb32p;
2846	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2847		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2848			if ((dep->de_flags & MDDB_F_OPT) ||
2849			    (dep->de_flags & MDDB_F_CHANGELOG))
2850				continue;
2851			rbp = (mddb_rb32_t *)dep->de_rb;
2852			if (readblklst(s, (caddr_t)crbp, dep->de_blks,
2853			    dep->de_blkcount, li, 0)) {
2854				retval = MDDB_F_EREAD;
2855				goto err;
2856			}
2857			/* Check the crc for this record */
2858			if (rec_crcchk(s, dep, crbp))
2859				goto err;
2860
2861			if (rbp->rb_checksum != crbp->rb_checksum ||
2862			    rbp->rb_checksum_fiddle != crbp->rb_checksum_fiddle)
2863				goto err;
2864		}
2865	}
2866	return (0);
2867err:
2868	return (retval);
2869}
2870
2871/*
2872 * Determine if the location information for two mddbs is the same.
2873 * The device slice and block offset should match.  If both have devids then
2874 * use that for the comparison, otherwise we compare the dev_ts.
2875 * Comparing with the devid allows us to handle the case where a mddb was
2876 * relocated to a dead mddbs dev_t.  The live mddb will have the dev_t of
2877 * the dead mddb but the devid comparison will catch this and not match.
2878 *
2879 * Return 1 if the location of the two mddbs match, 0 if not.
2880 */
2881static int
2882match_mddb(mddb_ri_t *rip, ddi_devid_t devid, char *minor, md_dev64_t dev,
2883	daddr32_t blkno)
2884{
2885	if (rip->ri_flags & MDDB_F_EMASTER) {
2886		/*
2887		 * If this element is errored then we don't try to match on it.
2888		 * If we try to match we could erroneously match on the dev_t
2889		 * of a relocated disk.
2890		 */
2891		return (0);
2892	}
2893
2894	if (rip->ri_devid && devid && minor) {
2895		/*
2896		 * If old devid exists, then this is a replicated diskset
2897		 * and both old and new devids must be checked.
2898		 */
2899		if (rip->ri_old_devid) {
2900			if (((ddi_devid_compare(rip->ri_devid, devid) != 0) &&
2901			    (ddi_devid_compare(rip->ri_old_devid,
2902			    devid) != 0)) ||
2903			    (strcmp(rip->ri_minor_name, minor) != 0))
2904				return (0);
2905		} else {
2906			if (ddi_devid_compare(rip->ri_devid, devid) != 0 ||
2907			    strcmp(rip->ri_minor_name, minor) != 0)
2908				return (0);
2909		}
2910	} else {
2911		if (rip->ri_dev != dev)
2912			return (0);
2913	}
2914
2915	if (rip->ri_blkno != blkno)
2916		return (0);
2917
2918	return (1);
2919}
2920
2921static int
2922ridev(
2923	mddb_ri_t	**rip,
2924	mddb_cfg_loc_t	*clp,
2925	dev32_t		*dev_2b_fixed,
2926	int		flag)
2927{
2928	mddb_ri_t	*r, *r1;
2929	md_dev64_t	ldev, ndev;
2930	major_t		majordev;
2931	int		sz;
2932
2933	if (MD_UPGRADE) {
2934		ldev = md_makedevice(md_targ_name_to_major(clp->l_driver),
2935		    clp->l_mnum);
2936	} else {
2937		if (ddi_name_to_major(clp->l_driver) == (major_t)-1)
2938			return (EINVAL);
2939
2940		ldev = md_makedevice(ddi_name_to_major(clp->l_driver),
2941		    clp->l_mnum);
2942	}
2943
2944	if (clp->l_devid != 0) {
2945		/*
2946		 * Get dev associated with device id and minor name.
2947		 * Setup correct driver name if dev is now different.
2948		 * Don't change driver name if during upgrade.
2949		 */
2950		ndev = ldev;
2951		if (!mddb_devid_validate((ddi_devid_t)(uintptr_t)clp->l_devid,
2952		    &ndev, clp->l_minor_name)) {
2953			if ((ndev != ldev) && (!(MD_UPGRADE))) {
2954				majordev = md_getmajor(ndev);
2955				(void) strcpy(clp->l_driver,
2956				    ddi_major_to_name(majordev));
2957				clp->l_mnum = md_getminor(ndev);
2958				clp->l_devid_flags |= MDDB_DEVID_VALID;
2959				ldev = ndev;
2960			}
2961		} else {
2962			/* Mark as invalid */
2963			clp->l_devid_flags &= ~MDDB_DEVID_VALID;
2964		}
2965	}
2966
2967	clp->l_dev = md_cmpldev(ldev);
2968	if (dev_2b_fixed)
2969		*dev_2b_fixed = clp->l_dev;
2970	r = *rip;
2971
2972	while (r) {
2973		if (match_mddb(r, (ddi_devid_t)(uintptr_t)clp->l_devid,
2974		    clp->l_minor_name, ldev, clp->l_blkno)) {
2975			if ((clp->l_devid != 0) &&
2976			    !(clp->l_devid_flags & MDDB_DEVID_VALID)) {
2977				r->ri_flags |= MDDB_F_EMASTER;
2978			} else {
2979				r->ri_flags |= flag;
2980			}
2981			return (0);	/* already entered return success */
2982		}
2983		r = r->ri_next;
2984	}
2985
2986	/*
2987	 * This replica not represented in the current rip list,
2988	 * so add it to the list.
2989	 */
2990	r = (mddb_ri_t *)kmem_zalloc(sizeof (**rip), KM_SLEEP);
2991	r->ri_dev = ldev;
2992	r->ri_blkno = clp->l_blkno;
2993	(void) strncpy(r->ri_driver, clp->l_driver, MD_MAXDRVNM);
2994	if (strlen(clp->l_driver) >= MD_MAXDRVNM) {
2995		r->ri_driver[(MD_MAXDRVNM -1)] = '\0';
2996	}
2997	if (clp->l_devname != NULL) {
2998		(void) strcpy(r->ri_devname, clp->l_devname);
2999	}
3000	r->ri_flags |= flag;
3001	if (clp->l_devid != 0) {
3002		sz = clp->l_devid_sz;
3003		r->ri_devid = (ddi_devid_t)kmem_zalloc(sz, KM_SLEEP);
3004		bcopy((void *)(uintptr_t)clp->l_devid, (char *)r->ri_devid, sz);
3005
3006		if (clp->l_old_devid != NULL) {
3007			sz = clp->l_old_devid_sz;
3008			r->ri_old_devid = (ddi_devid_t)kmem_zalloc(sz,
3009			    KM_SLEEP);
3010			bcopy((char *)(uintptr_t)clp->l_old_devid,
3011			    (char *)r->ri_old_devid, sz);
3012		} else {
3013			r->ri_old_devid = 0;
3014		}
3015		if (strlen(clp->l_minor_name) < MDDB_MINOR_NAME_MAX)
3016			(void) strcpy(r->ri_minor_name, clp->l_minor_name);
3017
3018		if (!(clp->l_devid_flags & MDDB_DEVID_VALID)) {
3019			/*
3020			 * Devid is present, but not valid.  This could
3021			 * happen if device has been powered off or if
3022			 * the device has been removed.  Mark the device in
3023			 * error.  Don't allow any writes to this device
3024			 * based on the dev_t since another device could
3025			 * have been placed in its spot and be responding to
3026			 * the dev_t accesses.
3027			 */
3028			r->ri_flags |= MDDB_F_EMASTER;
3029		}
3030	} else {
3031		r->ri_devid = 0;
3032		r->ri_old_devid = 0;
3033	}
3034
3035	/*
3036	 * If the rip list is empty then this entry
3037	 * is the list.
3038	 */
3039	if (*rip == NULL) {
3040		*rip = r;
3041		return (0);
3042	}
3043
3044	/*
3045	 * Add this entry to the end of the rip list
3046	 */
3047	r1 = *rip;
3048	while (r1->ri_next)
3049		r1 = r1->ri_next;
3050	r1->ri_next = r;
3051	return (0);
3052}
3053
3054/*
3055 * writecopy writes the incore data blocks out to all of the replicas.
3056 * This is called from writestart
3057 *	- when a diskset is started or
3058 *	- when an error has been enountered during the write to a mddb.
3059 * and from newdev when a new mddb is being added.
3060 *
3061 * flag can be 2 values:
3062 *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
3063 *		always used for traditional and local disksets.
3064 *		For MN diskset:
3065 *			All nodes can call writecopy, but only the
3066 *			master node actually writes data to the disk
3067 *			except for optimized resync records.
3068 *			An optimized resync record can only be written to
3069 *			by the record owner.
3070 *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
3071 *		master has been chosen, the new master may need to
3072 * 		write its incore mddb to disk (this is the case where the
3073 *		old master had executed a message but hadn't relayed it
3074 *		to this slave yet).  New master should not write the
3075 *		change log records since new master would be overwriting
3076 *		valuable data.  Only used during a reconfig cycle.
3077 */
3078static int
3079writecopy(
3080	mddb_set_t	*s,
3081	int		li,
3082	int		flag
3083)
3084{
3085	mddb_db_t	*dbp;
3086	mddb_db32_t	*db32p;
3087	mddb_de_ic_t	*dep;
3088	mddb_rb32_t	*rbp;
3089	uint_t		checksum;
3090	int		err = 0;
3091
3092#if defined(_ILP32) && !defined(lint)
3093	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
3094	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
3095#endif
3096
3097	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
3098		db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
3099		create_db32rec(db32p, dbp);
3100		crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
3101		err = writeblks(s, (caddr_t)db32p, dbp->db_blknum, 1, li,
3102		    MDDB_WR_ONLY_MASTER);
3103		kmem_free((caddr_t)db32p, MDDB_BSIZE);
3104		if (err)
3105			return (err);
3106		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
3107			/*
3108			 * In a multinode diskset, when a new master is
3109			 * chosen the new master may need to write its
3110			 * incore copy of the mddb to disk.  In this case,
3111			 * don't want to overwrite the change log records
3112			 * so new master sets flag to MDDB_WRITECOPY_SYNC.
3113			 */
3114			if (flag == MDDB_WRITECOPY_SYNC) {
3115				if (dep->de_flags & MDDB_F_CHANGELOG)
3116					continue;
3117			}
3118			/*
3119			 * In a multinode diskset, don't write out optimized
3120			 * resync resyncs since only the mirror owner node
3121			 * will have the correct data.  If writecopy is
3122			 * being called from writestart as a result of
3123			 * an mddb failure, then writestart will handle
3124			 * the optimized records when it calls fixoptrecords.
3125			 */
3126			if ((MD_MNSET_SETNO(s->s_setno)) &&
3127			    (dep->de_flags & MDDB_F_OPT)) {
3128				continue;
3129			}
3130
3131			rbp = dep->de_rb;
3132			checksum = rbp->rb_checksum_fiddle;
3133			checksum ^= rbp->rb_checksum;
3134			/* Generate the crc for this record */
3135			rec_crcgen(s, dep, rbp);
3136			checksum ^= rbp->rb_checksum;
3137			rbp->rb_checksum_fiddle = checksum;
3138			if (err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
3139			    dep->de_blkcount, li, (mddb_bf_t **)0,
3140			    MDDB_WR_ONLY_MASTER))
3141				return (err);
3142		}
3143	}
3144	return (0);
3145}
3146
3147static int
3148upd_med(
3149	mddb_set_t	*s,
3150	char		*tag
3151)
3152{
3153	med_data_t	meddb;
3154	int		medok;
3155	mddb_lb_t	*lbp = s->s_lbp;
3156	set_t		setno = s->s_setno;
3157	int		li;
3158	int		alc;
3159	int		lc;
3160
3161
3162	/* If no mediator hosts, nothing to do */
3163	if (s->s_med.n_cnt == 0)
3164		return (0);
3165
3166	/*
3167	 * If this is a MN set and we are not the master, then don't
3168	 * update mediator hosts or mark mediator as golden since
3169	 * only master node should do that.
3170	 */
3171	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
3172	    (md_set[setno].s_am_i_master == 0)) {
3173		return (0);
3174	}
3175
3176	bzero((char *)&meddb, sizeof (med_data_t));
3177	meddb.med_dat_mag = MED_DATA_MAGIC;
3178	meddb.med_dat_rev = MED_DATA_REV;
3179	meddb.med_dat_fl = 0;
3180	meddb.med_dat_sn = setno;
3181	meddb.med_dat_cc = lbp->lb_commitcnt;
3182	TIMEVAL32_TO_TIMEVAL(&meddb.med_dat_id, &lbp->lb_ident.createtime);
3183	crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);
3184
3185	/* count accessible mediators */
3186	medok = upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);
3187
3188	/* count accessible and existing replicas */
3189	for (li = 0, alc = 0, lc = 0; li < lbp->lb_loccnt; li++) {
3190		mddb_locator_t	*lp = &lbp->lb_locators[li];
3191
3192		if (lp->l_flags & MDDB_F_DELETED)
3193			continue;
3194
3195		lc++;
3196
3197		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
3198		    (lp->l_flags & MDDB_F_EMASTER) ||
3199		    (lp->l_flags & MDDB_F_EWRITE))
3200			continue;
3201
3202		alc++;
3203	}
3204
3205	/*
3206	 * Mediator update quorum is >= 50%: check for less than
3207	 * "mediator update" quorum.
3208	 */
3209	if ((medok * 2) < s->s_med.n_cnt) {
3210		/* panic if <= 50% of all replicas are accessible */
3211		if ((lc > 0) && ((alc * 2) <= lc)) {
3212			cmn_err(CE_PANIC,
3213			    "md: Update of 50%% of the mediator hosts failed");
3214			/* NOTREACHED */
3215		}
3216
3217		cmn_err(CE_WARN,
3218		    "md: Update of 50%% of the mediator hosts failed");
3219	}
3220
3221	/*
3222	 * If we have mediator update quorum and exactly 50% of the replicas
3223	 * are accessible then mark the mediator as golden.
3224	 */
3225	if (((medok * 2) >= (s->s_med.n_cnt + 1)) && (lc > 0) &&
3226	    ((alc * 2) == lc)) {
3227		meddb.med_dat_fl = MED_DFL_GOLDEN;
3228		crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);
3229		(void) upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);
3230	}
3231
3232	return (0);
3233}
3234
3235static int
3236push_lb(mddb_set_t *s)
3237{
3238	mddb_lb_t	*lbp = s->s_lbp;
3239
3240	/* push the change to all the replicas */
3241	uniqtime32(&lbp->lb_timestamp);
3242	if (MD_MNSET_SETNO(s->s_setno)) {
3243		lbp->lb_revision = MDDB_REV_MNLB;
3244	} else {
3245		lbp->lb_revision = MDDB_REV_LB;
3246	}
3247	/*
3248	 * The updates to the mediator hosts are done
3249	 * by the callers of this function.
3250	 */
3251	return (writelocall(s));
3252}
3253
3254/* Should not call for MN diskset since data tags are not supported */
3255static int
3256dtl_cmp(const mddb_dtag_t *odtp, const mddb_dtag_t *ndtp)
3257{
3258	int 		diff = 0;
3259
3260	diff = (int)(odtp->dt_setno - ndtp->dt_setno);
3261	if (diff)
3262		return (diff);
3263
3264	diff = strncmp(odtp->dt_sn, ndtp->dt_sn, MDDB_SN_LEN);
3265	if (diff)
3266		return (diff);
3267
3268	diff = strncmp(odtp->dt_hn, ndtp->dt_hn, MD_MAX_NODENAME_PLUS_1);
3269	if (diff)
3270		return (diff);
3271
3272	/*CSTYLED*/
3273	return (timercmp(&odtp->dt_tv, &ndtp->dt_tv, !=));
3274}
3275
3276/* Should not call for MN diskset since data tags are not supported */
3277static int
3278dtl_addl(mddb_set_t *s, const mddb_dtag_t *ndtp)
3279{
3280	int		nextid = 0;
3281	mddb_dtag_lst_t **dtlpp = &s->s_dtlp;
3282
3283	/* Run to the end of the list */
3284	for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) {
3285		if (dtl_cmp(&(*dtlpp)->dtl_dt, ndtp) == 0)
3286			return (0);
3287		nextid++;
3288	}
3289
3290	/* Add the new member */
3291	*dtlpp = kmem_zalloc(sizeof (**dtlpp), KM_SLEEP);
3292
3293	/* Update the dtag portion of the list */
3294	bcopy((caddr_t)ndtp, (caddr_t)&((*dtlpp)->dtl_dt),
3295	    sizeof (mddb_dtag_t));
3296
3297	/* Fix up the id value */
3298	(*dtlpp)->dtl_dt.dt_id = ++nextid;
3299
3300	return (0);
3301}
3302
3303/*
3304 * Even though data tags are not supported in MN disksets, dt_cntl may
3305 * be called for a MN diskset since this routine is called even before
3306 * it is known the kind of diskset being read in from disk.
3307 * For a MNdiskset, s_dtlp is 0 so a count of 0 is returned.
3308 */
3309static int
3310dtl_cntl(mddb_set_t *s)
3311{
3312	mddb_dtag_lst_t	*dtlp = s->s_dtlp;
3313	int		ndt = 0;
3314
3315	while (dtlp != NULL) {
3316		ndt++;
3317		dtlp = dtlp->dtl_nx;
3318	}
3319
3320	return (ndt);
3321}
3322
3323/*
3324 * Even though data tags are not supported in MN disksets, dt_cntl may
3325 * be called for a MN diskset since this routine is called even before
3326 * it is known the kind of diskset being read in from disk.
3327 * For a MNdiskset, s_dtlp is 0 so a 0 is returned.
3328 */
3329static mddb_dtag_t *
3330dtl_findl(mddb_set_t *s, int id)
3331{
3332	mddb_dtag_lst_t	*dtlp = s->s_dtlp;
3333
3334	while (dtlp != NULL) {
3335		if (dtlp->dtl_dt.dt_id == id)
3336			return (&dtlp->dtl_dt);
3337		dtlp = dtlp->dtl_nx;
3338	}
3339	return ((mddb_dtag_t *)NULL);
3340}
3341
3342/* Should not call for MN diskset since data tags are not supported */
3343static void
3344dtl_freel(mddb_dtag_lst_t **dtlpp)
3345{
3346	mddb_dtag_lst_t	*dtlp;
3347	mddb_dtag_lst_t	*tdtlp;
3348
3349
3350	for (tdtlp = *dtlpp; tdtlp != NULL; tdtlp = dtlp) {
3351		dtlp = tdtlp->dtl_nx;
3352		kmem_free(tdtlp, sizeof (mddb_dtag_lst_t));
3353	}
3354	*dtlpp = (mddb_dtag_lst_t *)NULL;
3355}
3356
3357/*
3358 * Even though data tags are not supported in MN disksets, dt_setup will
3359 * be called for a MN diskset since this routine is called even before
3360 * it is known the kind of diskset being read in from disk.
3361 * Once this set is known as a MN diskset, the dtp area will be freed.
3362 */
3363static void
3364dt_setup(mddb_set_t *s, const mddb_dtag_t *dtagp)
3365{
3366	mddb_dt_t	*dtp;
3367	set_t		setno = s->s_setno;
3368
3369
3370	if (md_set[setno].s_dtp == (mddb_dt_t *)NULL)
3371		md_set[setno].s_dtp = kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);
3372	else if (dtagp == (mddb_dtag_t *)NULL)
3373		bzero((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
3374
3375	/* shorthand */
3376	dtp = (mddb_dt_t *)md_set[setno].s_dtp;
3377
3378	dtp->dt_mag = MDDB_MAGIC_DT;
3379	dtp->dt_rev = MDDB_REV_DT;
3380
3381	if (dtagp != NULL)
3382		dtp->dt_dtag = *dtagp;		/* structure assignment */
3383
3384	/* Initialize the setno */
3385	dtp->dt_dtag.dt_setno = setno;
3386
3387	/* Clear the id and flags, this is only used in user land */
3388	dtp->dt_dtag.dt_id = 0;
3389
3390	/* Checksum it */
3391	crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);
3392}
3393
3394/* Should not call for MN diskset since data tags are not supported */
3395static int
3396set_dtag(mddb_set_t *s, md_error_t *ep)
3397{
3398	mddb_lb_t	*lbp = s->s_lbp;
3399	mddb_dtag_t	tag;
3400
3401	if (lbp->lb_dtblkcnt == 0) {
3402		/* Data tags not used in a MN set - so no failure returned */
3403		if (lbp->lb_flags & MDDB_MNSET)
3404			return (0);
3405
3406		cmn_err(CE_WARN,
3407		    "No tag record allocated, unable to tag data");
3408		(void) mdmddberror(ep, MDE_DB_NOTAGREC, NODEV32, s->s_setno);
3409		return (1);
3410	}
3411
3412	/* Clear the stack variable */
3413	bzero((caddr_t)&tag, sizeof (mddb_dtag_t));
3414
3415	/* Get the HW serial number for this host */
3416	(void) strncpy(tag.dt_sn, hw_serial, MDDB_SN_LEN);
3417	tag.dt_sn[MDDB_SN_LEN - 1] = '\0';
3418
3419	/* Get the nodename that this host goes by */
3420	(void) strncpy(tag.dt_hn, utsname.nodename, MD_MAX_NODENAME);
3421	tag.dt_hn[MD_MAX_NODENAME] = '\0';
3422
3423	/* Get a time stamp for NOW */
3424	uniqtime32(&tag.dt_tv);
3425
3426	/* Setup the data tag record */
3427	dt_setup(s, &tag);
3428
3429	/* Free any list of tags if they exist */
3430	dtl_freel(&s->s_dtlp);
3431
3432	/* Put the new tag onto the tag list */
3433	(void) dtl_addl(s, &tag);
3434
3435	return (0);
3436}
3437
3438/*
3439 * If called during upgrade, this routine expects a non-translated
3440 * (aka target) dev.
3441 * Should not call for MN diskset since data tags are not supported.
3442 */
3443static int
3444dt_read(mddb_set_t *s, mddb_lb_t *lbp, mddb_ri_t *rip)
3445{
3446	int		err = 0;
3447	md_dev64_t	dev;
3448	caddr_t		tbuf;
3449	daddr_t		physblk;
3450	mddb_block_t	blk;
3451	mddb_dt_t	*dtp;
3452	mddb_dtag_t	*dtagp;
3453	set_t		setno = s->s_setno;
3454
3455	/* If have not allocated a data tag record, there is nothing to do */
3456	if (lbp->lb_dtblkcnt == 0)
3457		return (1);
3458
3459	dtp = rip->ri_dtp = (mddb_dt_t *)kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);
3460
3461	if (dtp == (mddb_dt_t *)NULL)
3462		return (1);
3463
3464	/* shorthand */
3465	dev = md_xlate_targ_2_mini(rip->ri_dev);
3466	if (dev == NODEV64) {
3467		return (1);
3468	}
3469
3470	tbuf = (caddr_t)rip->ri_dtp;
3471
3472	for (blk = 0; blk < lbp->lb_dtblkcnt; blk++) {
3473		physblk = getphysblk((blk + lbp->lb_dtfirstblk), rip->ri_mbip);
3474		err = getblks(s, tbuf, dev, physblk, btodb(MDDB_BSIZE), 0);
3475		/* error reading the tag */
3476		if (err) {
3477			err = 1;
3478			goto out;
3479		}
3480		tbuf += MDDB_BSIZE;
3481	}
3482
3483	/* magic is valid? */
3484	if (dtp->dt_mag != MDDB_MAGIC_DT) {
3485		err = 1;
3486		goto out;
3487	}
3488
3489	/* revision is valid? */
3490	if (revchk(MDDB_REV_DT, dtp->dt_rev)) {
3491		err = 1;
3492		goto out;
3493	}
3494
3495	/* crc is valid? */
3496	if (crcchk(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL)) {
3497		err = 1;
3498		goto out;
3499	}
3500
3501	/* shorthand */
3502	dtagp = &dtp->dt_dtag;
3503
3504	/* set number match? */
3505	if (dtagp->dt_setno != setno) {
3506		err = 1;
3507		goto out;
3508	}
3509
3510	/* tag is not empty? */
3511	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
3512	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
3513	    dtagp->dt_id == 0) {
3514		err = 2;
3515		goto out;
3516	}
3517
3518	/* Mark the locator as having tagged data */
3519	rip->ri_flags |= MDDB_F_TAGDATA;
3520
3521out:
3522	if (err) {
3523		if (err == 1) {
3524			md_set_setstatus(setno, MD_SET_BADTAG);
3525			rip->ri_flags |= MDDB_F_BADTAG;
3526		}
3527		if (dtp != NULL) {
3528			kmem_free(dtp, MDDB_DT_BYTES);
3529			rip->ri_dtp = (mddb_dt_t *)NULL;
3530		}
3531	}
3532
3533	return (err);
3534}
3535
3536/* Should not call for MN diskset since data tags are not supported */
3537static int
3538dt_write(mddb_set_t *s)
3539{
3540	int		li;
3541	int		err = 0;
3542	int		werr;
3543	int		empty_tag = 0;
3544	mddb_dtag_t	*dtagp;
3545	mddb_dt_t	*dtp;
3546	mddb_lb_t	*lbp = s->s_lbp;
3547	set_t		setno = s->s_setno;
3548	uint_t		set_status = md_get_setstatus(setno);
3549
3550
3551	ASSERT(md_set[setno].s_dtp != NULL);
3552
3553	/* Nowhere to write to */
3554	if (lbp->lb_dtblkcnt == 0)
3555		return (err);
3556
3557	if (set_status & MD_SET_BADTAG)
3558		return (err);
3559
3560	/* shorthand */
3561	dtp = (mddb_dt_t *)md_set[setno].s_dtp;
3562	dtagp = &dtp->dt_dtag;
3563
3564	/* See if the tag is empty. */
3565	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
3566	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
3567	    dtagp->dt_id == 0)
3568		empty_tag = 1;
3569
3570	/* Write the tag to the locators and reset appropriate flags. */
3571	for (li = 0; li < lbp->lb_loccnt; li++) {
3572		mddb_locator_t	*lp = &lbp->lb_locators[li];
3573
3574		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3575		    (lp->l_flags & MDDB_F_DELETED) ||
3576		    (lp->l_flags & MDDB_F_EWRITE))
3577			continue;
3578
3579		werr = writeblks(s, (caddr_t)dtp, lbp->lb_dtfirstblk,
3580		    MDDB_DT_BLOCKS, li, MDDB_WR_ONLY_MASTER);
3581
3582		if (werr) {
3583			err |= werr;
3584			continue;
3585		}
3586
3587		if (empty_tag)
3588			lp->l_flags &= ~(MDDB_F_BADTAG | MDDB_F_TAGDATA);
3589		else {
3590			lp->l_flags |= MDDB_F_TAGDATA;
3591			lp->l_flags &= ~MDDB_F_BADTAG;
3592		}
3593	}
3594
3595	if (err)
3596		return (err);
3597
3598
3599	/* If the tags were written, check to see if any tags remain. */
3600	for (li = 0; li < lbp->lb_loccnt; li++) {
3601		mddb_locator_t	*lp = &lbp->lb_locators[li];
3602
3603		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3604		    (lp->l_flags & MDDB_F_DELETED) ||
3605		    (lp->l_flags & MDDB_F_EWRITE))
3606			continue;
3607
3608		if (lp->l_flags & MDDB_F_TAGDATA)
3609			break;
3610	}
3611
3612	/* If there are no tags, then clear CLRTAG and TAGDATA */
3613	if (li == lbp->lb_loccnt) {
3614		md_clr_setstatus(setno, MD_SET_CLRTAG);
3615		md_clr_setstatus(setno, MD_SET_TAGDATA);
3616	}
3617
3618	return (err);
3619}
3620
3621/* Should not call for MN diskset since data tags are not supported */
3622static int
3623dt_alloc_if_needed(mddb_set_t *s)
3624{
3625	int		i;
3626	int		li;
3627	int		moveit = 0;
3628	mddb_lb_t	*lbp = s->s_lbp;
3629	mddb_block_t	blkcnt = lbp->lb_dtblkcnt;
3630	set_t		setno = s->s_setno;
3631	uint_t		set_status = md_get_setstatus(setno);
3632
3633	/*
3634	 * If the data tag record is allocated (blkcnt != 0) and a bad tag was
3635	 * not detected, there is nothing to do.
3636	 */
3637	if (blkcnt != 0 && ! (set_status & MD_SET_BADTAG))
3638		return (0);
3639
3640	/* Bitmap not setup, checks can't be done */
3641	if (s->s_totalblkcnt == 0)
3642		return (0);
3643
3644	/* While reading the tag(s) an invalid tag data record was seen */
3645	if (set_status & MD_SET_BADTAG)
3646		/* See if the invalid tag needs to be moved */
3647		for (i = 0; i < MDDB_DT_BLOCKS; i++)
3648			if (blkcheck(s, (i + lbp->lb_dtfirstblk))) {
3649				moveit = 1;
3650				break;
3651			}
3652
3653	/* Need to move or allocate the tag data record */
3654	if (moveit || blkcnt == 0) {
3655		lbp->lb_dtfirstblk = getfreeblks(s, MDDB_DT_BLOCKS);
3656		if (lbp->lb_dtfirstblk == 0) {
3657			cmn_err(CE_WARN,
3658			    "Unable to allocate data tag record");
3659			return (0);
3660		}
3661		lbp->lb_dtblkcnt = MDDB_DT_BLOCKS;
3662
3663		/* Mark the locators so that they get written to disk. */
3664		for (li = 0; li < lbp->lb_loccnt; li++) {
3665			mddb_locator_t	*lp = &lbp->lb_locators[li];
3666
3667			if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3668			    (lp->l_flags & MDDB_F_DELETED) ||
3669			    (lp->l_flags & MDDB_F_EWRITE))
3670				continue;
3671
3672			lp->l_flags |= MDDB_F_BADTAG;
3673		}
3674		return (1);
3675	}
3676
3677	/*
3678	 * Make sure the blocks are owned, since the calculation in
3679	 * computefreeblks() is bypassed when MD_SET_BADTAG is set.
3680	 */
3681	for (i = 0; i < MDDB_DT_BLOCKS; i++)
3682		blkbusy(s, (i + lbp->lb_dtfirstblk));
3683
3684	return (1);
3685}
3686
3687/*
3688 * Writestart writes the incore mddb out to all of the replicas.
3689 * This is called when a diskset is started and when an error has
3690 * been enountered during the write to a mddb.
3691 *
3692 * flag can be 2 values:
3693 *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
3694 *		always used for traditional and local disksets.
3695 *		This is the normal path for MN disksets since the slave
3696 *		nodes aren't actually allowed to write to disk.
3697 *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
3698 *		master has been chosen, the new master may need to
3699 * 		write its incore mddb to disk (this is the case where the
3700 *		old master had executed a message but hadn't relayed it
3701 *		to this slave yet).  New master should not write the
3702 *		change log records since new master would be overwriting
3703 *		valuable data.  Only used during a reconfig cycle.
3704 */
3705static int
3706writestart(
3707	mddb_set_t	*s,
3708	int		flag
3709)
3710{
3711	int		li;
3712	mddb_locator_t	*lp;
3713	mddb_lb_t	*lbp;
3714	mddb_ln_t	*lnp;
3715	int		err = 0;
3716	uint_t		set_status;
3717
3718	lbp = s->s_lbp;
3719
3720	for (li = 0; li < lbp->lb_loccnt; li++) {
3721		lp = &lbp->lb_locators[li];
3722		if (! (lp->l_flags & MDDB_F_ACTIVE))
3723			continue;
3724		if (! (lp->l_flags & MDDB_F_SUSPECT))
3725			continue;
3726		if (writecopy(s, li, flag))
3727			return (1);
3728		lp->l_flags |= MDDB_F_UP2DATE;
3729	}
3730
3731	for (li = 0; li < lbp->lb_loccnt; li++) {
3732		lp = &lbp->lb_locators[li];
3733		if (! (lp->l_flags & MDDB_F_ACTIVE))
3734			continue;
3735		if ((lp->l_flags & MDDB_F_UP2DATE))
3736			continue;
3737		if (checkcopy(s, li))
3738			if (err = writecopy(s, li, flag))
3739				return (1);
3740		lp->l_flags |= MDDB_F_UP2DATE;
3741	}
3742
3743	/*
3744	 * Call fixoptrecord even during a reconfig cycle since a replica
3745	 * failure may force the master to re-assign the optimized
3746	 * resync record to another replica.
3747	 */
3748	if (fixoptrecords(s))
3749		return (1);
3750
3751	set_status = md_get_setstatus(s->s_setno);
3752
3753	/* See if any (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT) */
3754	for (li = 0; li < lbp->lb_loccnt; li++) {
3755		lp = &lbp->lb_locators[li];
3756
3757		if (lp->l_flags & MDDB_F_DELETED)
3758			continue;
3759
3760		if (((lp->l_flags & MDDB_F_ACTIVE) != 0 &&
3761		    (lp->l_flags & MDDB_F_OLDACT) == 0) ||
3762		    ((lp->l_flags & MDDB_F_ACTIVE) == 0 &&
3763		    (lp->l_flags & MDDB_F_OLDACT) != 0))
3764			break;
3765
3766		if ((set_status & MD_SET_TAGDATA) ||
3767		    (set_status & MD_SET_CLRTAG))
3768			if ((lp->l_flags & MDDB_F_TAGDATA) ||
3769			    (lp->l_flags & MDDB_F_BADTAG))
3770				break;
3771	}
3772
3773	/*
3774	 * If we found (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT)
3775	 * the lbp identifier and the set identifier doesn't match.
3776	 */
3777	if (li != lbp->lb_loccnt || cmpidentifier(s, &lbp->lb_ident)) {
3778
3779		/* Only call for traditional and local sets */
3780		if (!(lbp->lb_flags & MDDB_MNSET))
3781			(void) dt_write(s);
3782
3783		setidentifier(s, &lbp->lb_ident);
3784
3785		if (err = push_lb(s)) {
3786			(void) upd_med(s, "writestart(0)");
3787			return (err);
3788		}
3789
3790		(void) upd_med(s, "writestart(0)");
3791
3792		if (err = push_lb(s)) {
3793			(void) upd_med(s, "writestart(1)");
3794			return (err);
3795		}
3796
3797		(void) upd_med(s, "writestart(1)");
3798
3799		lnp = s->s_lnp;
3800		uniqtime32(&lnp->ln_timestamp);
3801		if (lbp->lb_flags & MDDB_MNSET)
3802			lnp->ln_revision = MDDB_REV_MNLN;
3803		else
3804			lnp->ln_revision = MDDB_REV_LN;
3805		crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
3806		err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
3807		    lbp->lb_lnblkcnt, 0);
3808		/*
3809		 * If a MN diskset and this is the master, set the PARSE_LOCNM
3810		 * flag in the mddb_set structure to show that the locator
3811		 * names have changed.
3812		 * Don't set parseflags as a result of a new master sync
3813		 * during reconfig cycle since slaves nodes are already
3814		 * in-sync with the new master.
3815		 */
3816
3817		if ((lbp->lb_flags & MDDB_MNSET) &&
3818		    (md_set[s->s_setno].s_am_i_master) &&
3819		    (flag != MDDB_WRITECOPY_SYNC)) {
3820			s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
3821		}
3822
3823		if (err)
3824			return (err);
3825	}
3826
3827	for (li = 0; li < lbp->lb_loccnt; li++) {
3828		lp = &lbp->lb_locators[li];
3829		if (lp->l_flags & MDDB_F_DELETED)
3830			continue;
3831		if (lp->l_flags & MDDB_F_ACTIVE) {
3832			lp->l_flags |= MDDB_F_OLDACT;
3833		} else {
3834			lp->l_flags &= ~MDDB_F_OLDACT;
3835		}
3836	}
3837
3838	md_clr_setstatus(s->s_setno, MD_SET_STALE);
3839
3840	return (0);
3841}
3842
3843/*
3844 * selectreplicas selects the working replicas and may write the incore
3845 * version of the mddb out to the replicas ondisk.
3846 *
3847 * flag can be 3 values:
3848 *	MDDB_RETRYSCAN - quick scan to see if there is an error.
3849 *			If no new error, returns without writing mddb
3850 *			to disks.  If a new error is seen, writes out
3851 *			mddb to disks.
3852 *	MDDB_SCANALL  - lengthy scan to check out mddbs and always writes
3853 *			out mddb to the replica ondisk.  Calls writecopy
3854 *			with MDDB_WRITECOPY_ALL flag which writes out
3855 *			all records to the replicas ondisk.
3856 *	MDDB_SCANALLSYNC - called during reconfig cycle to sync up incore
3857 *			and ondisk mddbs by writing incore values to disk.
3858 *			Calls writecopy with MDDB_WRITECOPY_SYNC flag so
3859 *			that change log records are not written out.
3860 *			Only used by MN disksets.
3861 *
3862 * Returns:
3863 *	0 - Successful
3864 *	1 - Unable to write incore mddb data to disk since < 50% replicas.
3865 */
3866int
3867selectreplicas(
3868	mddb_set_t	*s,
3869	int		flag
3870)
3871{
3872	int		li;
3873	int		alc;
3874	int		lc;
3875	mddb_locator_t	*lp;
3876	mddb_lb_t	*lbp = s->s_lbp;
3877	set_t		setno = s->s_setno;
3878	int		wc_flag;
3879
3880	/*
3881	 * can never transition from stale to not stale
3882	 */
3883	if (md_get_setstatus(setno) & MD_SET_STALE) {
3884		for (li = 0; li < lbp->lb_loccnt; li++) {
3885			lp = &lbp->lb_locators[li];
3886			if (lp->l_flags & MDDB_F_DELETED)
3887				continue;
3888			if (! (lp->l_flags & MDDB_F_EMASTER)) {
3889				lp->l_flags |= MDDB_F_ACTIVE;
3890			} else {
3891				lp->l_flags &= ~MDDB_F_ACTIVE;
3892			}
3893		}
3894		return (1);
3895	}
3896
3897	if ((flag == MDDB_SCANALL) || (flag == MDDB_SCANALLSYNC)) {
3898		for (li = 0; li < lbp->lb_loccnt; li++) {
3899			lp = &lbp->lb_locators[li];
3900			if (lp->l_flags & MDDB_F_DELETED)
3901				continue;
3902			if (lp->l_flags & MDDB_F_ACTIVE) {
3903				lp->l_flags |= MDDB_F_OLDACT;
3904				lp->l_flags &= ~MDDB_F_SUSPECT;
3905			} else {
3906				lp->l_flags |= MDDB_F_SUSPECT;
3907				lp->l_flags &= ~MDDB_F_OLDACT;
3908			}
3909
3910			if (! (lp->l_flags & MDDB_F_EMASTER)) {
3911				lp->l_flags |= MDDB_F_ACTIVE;
3912				lp->l_flags &= ~MDDB_F_EWRITE;
3913				lp->l_flags &= ~MDDB_F_TOOSMALL;
3914			} else {
3915				lp->l_flags &= ~MDDB_F_ACTIVE;
3916			}
3917		}
3918		computefreeblks(s); /* set up free block bits */
3919	} else {
3920		for (li = 0; li < lbp->lb_loccnt; li++) {
3921			lp = &lbp->lb_locators[li];
3922			if (! (lp->l_flags & MDDB_F_ACTIVE))
3923				continue;
3924			if (lp->l_flags & MDDB_F_EWRITE)
3925				break;
3926		}
3927
3928		/*
3929		 * if there are no errors this is error has already
3930		 * been processed return current state
3931		 */
3932		if (li == lbp->lb_loccnt)
3933			return (md_get_setstatus(setno) & MD_SET_TOOFEW);
3934
3935		lp->l_flags &= ~MDDB_F_ACTIVE;
3936		do {
3937			lp = &lbp->lb_locators[li];
3938			lp->l_flags &= ~MDDB_F_UP2DATE;
3939		} while (++li < lbp->lb_loccnt);
3940	}
3941
3942	alc = 0;
3943	lc = 0;
3944	for (li = 0; li < lbp->lb_loccnt; li++) {
3945		lp = &lbp->lb_locators[li];
3946		if (lp->l_flags & MDDB_F_DELETED)
3947			continue;
3948		lc++;
3949		if (! (lp->l_flags & MDDB_F_ACTIVE))
3950			continue;
3951		alc++;
3952	}
3953
3954	if (alc < ((lc + 1) / 2)) {
3955		md_set_setstatus(setno, MD_SET_TOOFEW);
3956		return (1);
3957	}
3958
3959	/* Set wc_flag based on flag passed in. */
3960	if (flag == MDDB_SCANALLSYNC)
3961		wc_flag = MDDB_WRITECOPY_SYNC;
3962	else
3963		wc_flag = MDDB_WRITECOPY_ALL;
3964
3965	do {
3966		if (! writestart(s, wc_flag)) {
3967			md_clr_setstatus(setno, MD_SET_TOOFEW);
3968			return (0);
3969		}
3970		alc  = 0;
3971		for (li = 0; li < lbp->lb_loccnt; li++) {
3972			lp = &lbp->lb_locators[li];
3973			if ((lp->l_flags & MDDB_F_DELETED) ||
3974			    (lp->l_flags & MDDB_F_EMASTER))
3975				continue;
3976
3977			if (lp->l_flags & MDDB_F_EWRITE) {
3978				lp->l_flags &= ~MDDB_F_ACTIVE;
3979				lp->l_flags &= ~MDDB_F_UP2DATE;
3980				continue;
3981			}
3982			alc++;
3983		}
3984	} while (alc >= ((lc + 1) / 2));
3985	md_set_setstatus(setno, MD_SET_TOOFEW);
3986	return (1);
3987}
3988
3989static int
3990checkstate(
3991	mddb_set_t	*s,
3992	int		probe
3993)
3994{
3995	int		error;
3996	uint_t		set_status = md_get_setstatus(s->s_setno);
3997
3998	ASSERT(s != NULL);
3999
4000	if (! (set_status & MD_SET_STALE) && ! (set_status & MD_SET_TOOFEW))
4001		return (0);
4002
4003	if (probe == MDDB_NOPROBE)
4004		return (1);
4005
4006	single_thread_start(s);
4007	error = selectreplicas(s, MDDB_SCANALL);
4008	single_thread_end(s);
4009
4010	if (error == 0 && s->s_zombie != 0) {
4011		mutex_exit(SETMUTEX(s->s_setno));
4012		error = mddb_deleterec(s->s_zombie);
4013		mutex_enter(SETMUTEX(s->s_setno));
4014		if (error == 0)
4015			s->s_zombie = 0;
4016	}
4017	return (error);
4018}
4019
4020static int
4021writeretry(
4022	mddb_set_t	*s
4023)
4024{
4025	if (selectreplicas(s, MDDB_RETRYSCAN))
4026		if (selectreplicas(s, MDDB_SCANALL))
4027			return (1);
4028	return (0);
4029}
4030
4031static void
4032free_mbipp(mddb_mb_ic_t **mbipp)
4033{
4034	mddb_mb_ic_t	*mbip1, *mbip2;
4035
4036	for (mbip1 = *mbipp; mbip1 != NULL; mbip1 = mbip2) {
4037		mbip2 = mbip1->mbi_next;
4038		kmem_free((caddr_t)mbip1, MDDB_IC_BSIZE);
4039	}
4040	*mbipp = (mddb_mb_ic_t *)NULL;
4041}
4042
4043static mddb_ri_t *
4044save_rip(mddb_set_t *s)
4045{
4046	mddb_ri_t	*trip = s->s_rip;
4047	mddb_ri_t	*nrip = NULL;
4048	mddb_ri_t	**nripp = &nrip;
4049	mddb_ri_t	*rip;
4050
4051	while (trip) {
4052		/* Run to the end of the list */
4053		for (/* void */; (*nripp != NULL); nripp = &(*nripp)->ri_next)
4054			/* void */;
4055
4056		/* Add the new member */
4057		*nripp = kmem_zalloc(sizeof (**nripp), KM_SLEEP);
4058
4059		ASSERT(*nripp != NULL);
4060
4061		/* shorthand */
4062		rip = *nripp;
4063
4064		*rip = *trip;			/* structure assignment */
4065
4066		/* Clear the stuff that is not needed for hints */
4067		rip->ri_flags = 0;
4068		rip->ri_commitcnt = 0;
4069		rip->ri_transplant = 0;
4070		rip->ri_mbip = (mddb_mb_ic_t *)NULL;
4071		rip->ri_dtp = (mddb_dt_t *)NULL;
4072		rip->ri_lbp = (mddb_lb_t *)NULL;
4073		rip->ri_did_icp = (mddb_did_ic_t *)NULL;
4074		rip->ri_devid = (ddi_devid_t)NULL;
4075		rip->ri_old_devid = (ddi_devid_t)NULL;
4076		rip->ri_next = (mddb_ri_t *)NULL;
4077
4078		trip = trip->ri_next;
4079	}
4080	return (nrip);
4081}
4082
4083static void
4084free_rip(mddb_ri_t **ripp)
4085{
4086	mddb_ri_t	*rip;
4087	mddb_ri_t	*arip;
4088
4089	for (rip = *ripp; rip != (mddb_ri_t *)NULL; rip = arip) {
4090		arip = rip->ri_next;
4091		if (rip->ri_devid != (ddi_devid_t)NULL) {
4092			ddi_devid_free(rip->ri_devid);
4093			rip->ri_devid = (ddi_devid_t)NULL;
4094		}
4095		if (rip->ri_old_devid != (ddi_devid_t)NULL) {
4096			ddi_devid_free(rip->ri_old_devid);
4097			rip->ri_old_devid = (ddi_devid_t)NULL;
4098		}
4099		kmem_free((caddr_t)rip, sizeof (*rip));
4100	}
4101	*ripp = (mddb_ri_t *)NULL;
4102}
4103
4104/*
4105 * this routine selects the correct replica to use
4106 * the rules are as follows
4107 *	1.	if all replica has same init time select highest commit count
4108 *	2.	if some but not all replicas are from another hostid discard
4109 *		them.
4110 *	3.	find which init time is present is most replicas
4111 *	4.	discard all replicas which do not match most init times
4112 *	5.	select replica with highest commit count
4113 */
4114
4115static mddb_lb_t *
4116selectlocator(
4117	mddb_set_t	*s
4118)
4119{
4120	mddb_ri_t	*rip = s->s_rip;
4121	mddb_ri_t	*r, *r1;
4122	mddb_lb_t	*lbp;
4123	struct timeval32 *tp = (struct timeval32 *)NULL;
4124	int		different;
4125	int		same;
4126	int		count;
4127	int		maxcount;
4128	set_t		setno = s->s_setno;
4129	size_t		sz;
4130	int		mn_set = 0;
4131
4132	/* Clear the ri_transplant flag on all the rip entries. */
4133	/* Set ri_commitcnt to locator's commitcnt - if available */
4134	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4135		r->ri_transplant = 0;
4136		if (r->ri_lbp != (mddb_lb_t *)NULL) {
4137			r->ri_commitcnt = r->ri_lbp->lb_commitcnt;
4138			/* If any locators have MN bit set, set flag */
4139			if (r->ri_lbp->lb_flags & MDDB_MNSET)
4140				mn_set = 1;
4141		}
4142	}
4143
4144	/*
4145	 * A data tag is being used, so use it to limit the selection first.
4146	 * Data tags not used in MN diskset.
4147	 */
4148	if ((mn_set == 0) && (md_get_setstatus(setno) & MD_SET_USETAG)) {
4149		mddb_dt_t	*dtp = (mddb_dt_t *)md_set[setno].s_dtp;
4150
4151		/*
4152		 * now toss any locators that have a different data tag
4153		 */
4154		for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4155			if (r->ri_lbp == (mddb_lb_t *)NULL)
4156				continue;
4157
4158			if (r->ri_dtp != (mddb_dt_t *)NULL) {
4159				/* If same tag, keep it */
4160				if (dtl_cmp(&dtp->dt_dtag,
4161				    &r->ri_dtp->dt_dtag) == 0)
4162					continue;
4163			}
4164
4165			if (r->ri_dtp != (mddb_dt_t *)NULL) {
4166				kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4167				r->ri_dtp = (mddb_dt_t *)NULL;
4168			}
4169
4170			mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4171			if (!(md_get_setstatus(setno) &
4172			    MD_SET_REPLICATED_IMPORT)) {
4173				if (r->ri_old_devid != (ddi_devid_t)NULL) {
4174					sz = ddi_devid_sizeof(r->ri_old_devid);
4175					kmem_free((caddr_t)r->ri_old_devid, sz);
4176					r->ri_old_devid = (ddi_devid_t)NULL;
4177				}
4178			}
4179
4180			kmem_free((caddr_t)r->ri_lbp,
4181			    dbtob(r->ri_lbp->lb_blkcnt));
4182			r->ri_lbp = (mddb_lb_t *)NULL;
4183
4184			r->ri_transplant = 1;
4185		}
4186
4187		/* Tag used, clear the bit */
4188		md_clr_setstatus(s->s_setno, MD_SET_USETAG);
4189
4190		if (md_get_setstatus(s->s_setno) & MD_SET_TAGDATA) {
4191			/*
4192			 * Get rid of the list of tags.
4193			 */
4194			dtl_freel(&s->s_dtlp);
4195
4196			/*
4197			 * Re-create the list with the tag used.
4198			 */
4199			(void) dtl_addl(s, &dtp->dt_dtag);
4200		}
4201	}
4202
4203	/*
4204	 * scan to see if all replicas have same time
4205	 */
4206	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4207		if (r->ri_lbp == (mddb_lb_t *)NULL)
4208			continue;
4209		if (tp == NULL) {
4210			tp = &r->ri_lbp->lb_inittime;
4211			continue;
4212		}
4213		/* CSTYLED */
4214		if (timercmp(tp, &r->ri_lbp->lb_inittime, !=))
4215			break;
4216	}
4217
4218	/*
4219	 * if r == NULL then they were all them same. Choose highest
4220	 * commit count
4221	 */
4222	if (r == (mddb_ri_t *)NULL)
4223		goto out;
4224
4225	/*
4226	 * If here, a bogus replica is present and at least 1 lb_inittime
4227	 * did not match.
4228	 */
4229
4230	/*
4231	 * look and see if any but not all are from different id
4232	 */
4233
4234	different = 0;
4235	same = 0;
4236	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4237		if (r->ri_lbp == (mddb_lb_t *)NULL)
4238			continue;
4239		if (cmpidentifier(s, &r->ri_lbp->lb_ident))
4240			different = 1;
4241		else
4242			same = 1;
4243	}
4244
4245	/*
4246	 * now go through and throw out different if there are some
4247	 * that are the same
4248	 */
4249	if (different != 0 && same != 0) {
4250		for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4251			if (r->ri_lbp == (mddb_lb_t *)NULL)
4252				continue;
4253
4254			if (!cmpidentifier(s, &r->ri_lbp->lb_ident))
4255				continue;
4256
4257			if (r->ri_dtp != (mddb_dt_t *)NULL) {
4258				kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4259				r->ri_dtp = (mddb_dt_t *)NULL;
4260			}
4261
4262			mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4263			if (!(md_get_setstatus(setno) &
4264			    MD_SET_REPLICATED_IMPORT)) {
4265				if (r->ri_old_devid != (ddi_devid_t)NULL) {
4266					sz = ddi_devid_sizeof(r->ri_old_devid);
4267					kmem_free((caddr_t)r->ri_old_devid, sz);
4268					r->ri_old_devid = (ddi_devid_t)NULL;
4269				}
4270			}
4271
4272			kmem_free((caddr_t)r->ri_lbp,
4273			    dbtob(r->ri_lbp->lb_blkcnt));
4274			r->ri_lbp = (mddb_lb_t *)NULL;
4275
4276			r->ri_transplant = 1;
4277		}
4278	}
4279
4280	/*
4281	 * go through and pick highest. Use n square because it is
4282	 * simple and 40 some is max possible
4283	 */
4284	maxcount = 0;
4285	lbp = (mddb_lb_t *)NULL;
4286	for (r1 = rip; r1 != (mddb_ri_t *)NULL; r1 = r1->ri_next) {
4287		if (r1->ri_lbp == (mddb_lb_t *)NULL)
4288			continue;
4289		count = 0;
4290		for (r = r1; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4291			if (r->ri_lbp == (mddb_lb_t *)NULL)
4292				continue;
4293			if (timercmp(&r1->ri_lbp->lb_inittime, /* CSTYLED */
4294			    &r->ri_lbp->lb_inittime, ==))
4295				count++;
4296		}
4297		if (count > maxcount) {
4298			maxcount = count;
4299			lbp = r1->ri_lbp;
4300		}
4301	}
4302
4303	/*
4304	 * now go though and toss any that are of a different time stamp
4305	 */
4306	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4307		if (r->ri_lbp == (mddb_lb_t *)NULL)
4308			continue;
4309		if (timercmp(&lbp->lb_inittime, /* CSTYLED */
4310		    &r->ri_lbp->lb_inittime, ==))
4311			continue;
4312
4313		if (r->ri_dtp != (mddb_dt_t *)NULL) {
4314			kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4315			r->ri_dtp = (mddb_dt_t *)NULL;
4316		}
4317
4318		mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4319		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
4320			if (r->ri_old_devid != (ddi_devid_t)NULL) {
4321				sz = ddi_devid_sizeof(r->ri_old_devid);
4322				kmem_free((caddr_t)r->ri_old_devid, sz);
4323				r->ri_old_devid = (ddi_devid_t)NULL;
4324			}
4325		}
4326
4327		kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
4328		r->ri_lbp = (mddb_lb_t *)NULL;
4329
4330		r->ri_transplant = 1;
4331	}
4332
4333out:
4334	/*
4335	 * Find the locator with the highest commit count, and make it the
4336	 * "chosen" one.
4337	 */
4338	lbp = (mddb_lb_t *)NULL;
4339	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4340		if (r->ri_lbp == (mddb_lb_t *)NULL)
4341			continue;
4342
4343		if (lbp == NULL) {
4344			lbp = r->ri_lbp;
4345			continue;
4346		}
4347
4348		if (r->ri_lbp->lb_commitcnt > lbp->lb_commitcnt)
4349			lbp = r->ri_lbp;
4350	}
4351
4352	/* Toss all locator blocks, except the "chosen" one. */
4353	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4354		if (r->ri_lbp == (mddb_lb_t *)NULL)
4355			continue;
4356
4357		/* Get rid of all dtp's */
4358		if (r->ri_dtp != (mddb_dt_t *)NULL) {
4359			kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4360			r->ri_dtp = (mddb_dt_t *)NULL;
4361		}
4362
4363		if (r->ri_lbp == lbp)
4364			continue;
4365
4366		/* Get rid of extra locator devid block info */
4367		mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4368		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
4369			if (r->ri_old_devid != (ddi_devid_t)NULL) {
4370				sz = ddi_devid_sizeof(r->ri_old_devid);
4371				kmem_free((caddr_t)r->ri_old_devid, sz);
4372				r->ri_old_devid = (ddi_devid_t)NULL;
4373			}
4374		}
4375
4376		/* Get rid of extra locators */
4377		kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
4378		r->ri_lbp = (mddb_lb_t *)NULL;
4379	}
4380	return (lbp);
4381}
4382
4383static void
4384locator2cfgloc(
4385	mddb_lb_t		*lbp,
4386	mddb_cfg_loc_t		*clp,
4387	int			li,
4388	side_t			sideno,
4389	mddb_did_ic_t		*did_icp
4390)
4391{
4392	mddb_drvnm_t		*dn;
4393	mddb_locator_t		*lp = &lbp->lb_locators[li];
4394	mddb_sidelocator_t	*slp;
4395	mddb_mnsidelocator_t	*mnslp;
4396	mddb_did_info_t		*did_info;
4397	int 			i, sz, szalloc;
4398	int			mn_set = 0;
4399	mddb_mnlb_t		*mnlbp;
4400
4401	if (lbp->lb_flags & MDDB_MNSET) {
4402		mn_set = 1;
4403		mnlbp = (mddb_mnlb_t *)lbp;
4404		for (i = 0; i < MD_MNMAXSIDES; i++) {
4405			mnslp = &mnlbp->lb_mnsidelocators[i][li];
4406			if (mnslp->mnl_sideno == sideno)
4407				break;
4408		}
4409		if (i == MD_MNMAXSIDES)
4410			return;
4411	} else {
4412		slp = &lbp->lb_sidelocators[sideno][li];
4413	}
4414
4415	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4416		did_info = &(did_icp->did_ic_blkp->blk_info[li]);
4417		if (did_info->info_flags & MDDB_DID_EXISTS) {
4418			sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]);
4419			if (clp->l_devid_flags & MDDB_DEVID_SPACE) {
4420				/*
4421				 * copy device id from mddb to
4422				 * cfg_loc structure
4423				 */
4424				szalloc = clp->l_devid_sz;
4425				if (sz <= szalloc) {
4426					for (i = 0; i < sz; i++) {
4427						((char *)(uintptr_t)
4428						    clp->l_devid)[i] =
4429						    ((char *)did_icp->
4430						    did_ic_devid[li])[i];
4431					}
4432					clp->l_devid_flags |= MDDB_DEVID_VALID;
4433					(void) strcpy(clp->l_minor_name,
4434					    did_info->info_minor_name);
4435				} else {
4436					clp->l_devid_flags |=
4437					    MDDB_DEVID_NOSPACE;
4438				}
4439			} else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) {
4440				clp->l_devid_flags = MDDB_DEVID_SZ;
4441				clp->l_devid_sz = sz;
4442			}
4443		}
4444	}
4445
4446	/*
4447	 * Even if a devid exists, use the dev, drvnm and mnum in the locators
4448	 * and sidelocators.  During startup, the dev, drvnm and mnum in
4449	 * these structures may not match the devid (the locators and
4450	 * sidelocators will be updated to match the devid by the routine
4451	 * load_old_replicas).  Using out-of-sync values won't cause any
4452	 * problems since ridev will re-derive these from the devid and mnum.
4453	 * After startup, the dev, drvnm and mnum in these structures have
4454	 * been updated and can be used.
4455	 */
4456
4457	clp->l_blkno = lp->l_blkno;
4458	clp->l_flags = lp->l_flags;
4459	clp->l_dev = lp->l_dev;
4460
4461	if (mn_set) {
4462		dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
4463		clp->l_mnum = mnslp->mnl_mnum;
4464	} else {
4465		dn = &lbp->lb_drvnm[slp->l_drvnm_index];
4466		clp->l_mnum = slp->l_mnum;
4467	}
4468	(void) strncpy(clp->l_driver, dn->dn_data, MD_MAXDRVNM);
4469}
4470
4471/*
4472 * Find the index into the mnsidelocator where entry will go.
4473 * Then index can be fed into both splitname2locatorblocks and
4474 * cfgloc2locator so that those entries can be kept in sync.
4475 *
4476 * Returns:
4477 *	-1 if failed to find unused slot or if a traditional diskset
4478 *	index, if successful  (0 <= index <= MD_MNMAXSIDES)
4479 */
4480static int
4481checklocator(
4482	mddb_lb_t		*lbp,
4483	int			li,
4484	side_t			sideno
4485)
4486{
4487	uchar_t			i;
4488	mddb_mnsidelocator_t	*mnslp;
4489	mddb_mnlb_t		*mnlbp;
4490	int			index = -1;
4491
4492	if (lbp->lb_flags & MDDB_MNSET) {
4493		/*
4494		 * Checking side locator structure.  First, check if
4495		 * there is already an entry for this side.  If so,
4496		 * then use that entry.  Otherwise, find an entry
4497		 * that has a sideno of 0.
4498		 */
4499		mnlbp = (mddb_mnlb_t *)lbp;
4500		for (i = 0; i < MD_MNMAXSIDES; i++) {
4501			mnslp = &mnlbp->lb_mnsidelocators[i][li];
4502			if (mnslp->mnl_sideno == sideno) {
4503				/* Found a match - stop looking */
4504				index = i;
4505				break;
4506			} else if ((mnslp->mnl_sideno == 0) && (index == -1)) {
4507				/* Set first empty slot, but keep looking */
4508				index = i;
4509			}
4510		}
4511		/* Didn't find empty slot or previously used slot */
4512		if ((i == MD_MNMAXSIDES) && (index == -1)) {
4513			return (-1);
4514		}
4515		return (index);
4516	} else
4517		return (0);
4518}
4519
4520/*
4521 * Takes locator information (driver name, minor number, sideno) and
4522 * stores it in the locator block.
4523 * For traditional diskset, the sideno is the index into the sidelocator
4524 * array in the locator block.
4525 * For the MN diskset, the sideno is the nodeid which can be any number,
4526 * so the index passed in is the index into the mnsidelocator array
4527 * in the locator block.
4528 */
4529static int
4530cfgloc2locator(
4531	mddb_lb_t		*lbp,
4532	mddb_cfg_loc_t		*clp,
4533	int			li,
4534	side_t			sideno,
4535	int			index	/* Only useful in MNsets when > 1 */
4536)
4537{
4538	uchar_t			i;
4539	mddb_sidelocator_t	*slp;
4540	mddb_mnsidelocator_t	*mnslp;
4541	mddb_set_t		*s;
4542	int			mn_set = 0;
4543	mddb_mnlb_t		*mnlbp;
4544
4545	if (lbp->lb_flags & MDDB_MNSET) {
4546		mnlbp = (mddb_mnlb_t *)lbp;
4547		mn_set = 1;
4548		/*
4549		 * Index will be the slot that has the given sideno or
4550		 * the first empty slot if no match is found.
4551		 * This was pre-checked out in check locator.
4552		 */
4553		mnslp = &mnlbp->lb_mnsidelocators[index][li];
4554	} else {
4555		slp = &lbp->lb_sidelocators[sideno][li];
4556	}
4557
4558	/*
4559	 * Look for the driver name
4560	 */
4561	for (i = 0; i < MDDB_DRVNMCNT; i++) {
4562		if (lbp->lb_drvnm[i].dn_len == 0)
4563			continue;
4564		if (strncmp(lbp->lb_drvnm[i].dn_data, clp->l_driver,
4565		    MD_MAXDRVNM) == 0)
4566			break;
4567	}
4568
4569	/*
4570	 * Didn't find one, add a new one
4571	 */
4572	if (i == MDDB_DRVNMCNT) {
4573		for (i = 0; i < MDDB_DRVNMCNT; i++) {
4574			if (lbp->lb_drvnm[i].dn_len == 0)
4575				break;
4576		}
4577		if (i == MDDB_DRVNMCNT)
4578			return (1);
4579		(void) strncpy(lbp->lb_drvnm[i].dn_data, clp->l_driver,
4580		    MD_MAXDRVNM);
4581		lbp->lb_drvnm[i].dn_len = (uchar_t)strlen(clp->l_driver);
4582	}
4583
4584	/* Fill in the drvnm index */
4585	if (mn_set) {
4586		mnslp->mnl_drvnm_index = i;
4587		mnslp->mnl_mnum = clp->l_mnum;
4588		mnslp->mnl_sideno = sideno;
4589	} else {
4590		slp->l_drvnm_index = i;
4591		slp->l_mnum = clp->l_mnum;
4592	}
4593
4594	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4595		/*
4596		 * This device id could already be associated with this index
4597		 * if this is not the first side added to the set.
4598		 * If device id is 0, there is no device id for this device.
4599		 */
4600		if ((ddi_devid_t)(uintptr_t)clp->l_devid == 0)
4601			return (0);
4602		s = (mddb_set_t *)md_set[lbp->lb_setno].s_db;
4603		if (mddb_devid_add(s, li, (ddi_devid_t)(uintptr_t)clp->l_devid,
4604		    clp->l_minor_name)) {
4605			return (1);
4606		}
4607	}
4608
4609	return (0);
4610}
4611
4612/*
4613 * See if there are mediator hosts and try to use the data.
4614 */
4615static int
4616mediate(
4617	mddb_set_t	*s
4618)
4619{
4620	mddb_lb_t	*lbp = s->s_lbp;
4621	med_data_lst_t	*meddlp = NULL;
4622	med_data_lst_t	*tmeddlp = NULL;
4623	med_data_t	*meddp;
4624	int		medok = 0;
4625	int		medacc = 0;
4626	uint_t		maxcc;
4627	int		golden = 0;
4628	int		err = 1;
4629	set_t		setno = s->s_setno;
4630
4631	/* Do not have a mediator, then the state is stale */
4632	if (s->s_med.n_cnt == 0)
4633		return (err);
4634
4635	/* Contact the mediator hosts for the data */
4636	meddlp = get_med_host_data(&s->s_med, s->s_setname, setno);
4637
4638	/* No mediator data, stale */
4639	if (meddlp == NULL)
4640		return (err);
4641
4642	/* Mark all the mediator data that is not for this set as errored */
4643	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4644		struct timeval32 tmptime;
4645		meddp = tmeddlp->mdl_med;
4646
4647		/* Count the number of mediators contacted */
4648		medacc++;
4649
4650		/* Paranoid check */
4651		if (meddp->med_dat_sn != setno)
4652			meddp->med_dat_fl |= MED_DFL_ERROR;
4653
4654		TIMEVAL_TO_TIMEVAL32(&tmptime, &meddp->med_dat_id);
4655
4656		/*CSTYLED*/
4657		if (timercmp(&tmptime, &lbp->lb_ident.createtime, !=))
4658			meddp->med_dat_fl |= MED_DFL_ERROR;
4659	}
4660
4661	/* Get the max commitcount */
4662	maxcc = 0;
4663	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4664		meddp = tmeddlp->mdl_med;
4665		if (meddp->med_dat_fl & MED_DFL_ERROR)
4666			continue;
4667		if (meddp->med_dat_cc > maxcc)
4668			maxcc = meddp->med_dat_cc;
4669	}
4670
4671	/* Now mark the records that don't have the highest cc as errored */
4672	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4673		meddp = tmeddlp->mdl_med;
4674		if (meddp->med_dat_fl & MED_DFL_ERROR)
4675			continue;
4676		if (meddp->med_dat_cc != maxcc)
4677			meddp->med_dat_fl |= MED_DFL_ERROR;
4678	}
4679
4680	/* Now mark the records that don't match the lb commitcnt as errored */
4681	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4682		meddp = tmeddlp->mdl_med;
4683		if (meddp->med_dat_fl & MED_DFL_ERROR)
4684			continue;
4685		if (meddp->med_dat_cc != lbp->lb_commitcnt)
4686			meddp->med_dat_fl |= MED_DFL_ERROR;
4687	}
4688
4689	/* Is there a "golden" copy and how many valid mediators */
4690	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4691		meddp = tmeddlp->mdl_med;
4692		if (meddp->med_dat_fl & MED_DFL_ERROR)
4693			continue;
4694
4695		if (meddp->med_dat_fl & MED_DFL_GOLDEN)
4696			golden++;
4697
4698		medok++;
4699	}
4700
4701	/* No survivors, stale */
4702	if (medok == 0)
4703		goto out;
4704
4705	/* No mediator quorum and no golden copies, stale */
4706	if (medacc < ((s->s_med.n_cnt / 2) + 1) && ! golden) {
4707		/* Skip odd numbers, no exact 50% */
4708		if (s->s_med.n_cnt & 1)
4709			goto out;
4710		/* Have 50%, allow an accept */
4711		if (medacc == (s->s_med.n_cnt / 2))
4712			md_set_setstatus(setno, MD_SET_ACCOK);
4713		goto out;
4714	}
4715
4716	/* We either have a quorum or a golden copy, or both */
4717	err = 0;
4718
4719out:
4720	if (meddlp) {
4721		for (/* void */; meddlp != NULL; meddlp = tmeddlp) {
4722			tmeddlp = meddlp->mdl_nx;
4723			kmem_free(meddlp->mdl_med, sizeof (med_data_t));
4724			kmem_free(meddlp, sizeof (med_data_lst_t));
4725		}
4726	}
4727
4728	return (err);
4729}
4730
4731/*
4732 *	1. read masterblks and locator blocks for all know database locations
4733 *		a. keep track of which have good master blks
4734 *		b. keep track of which have good locators
4735 *
4736 */
4737static int
4738get_mbs_n_lbs(
4739	mddb_set_t	*s,
4740	int		*write_lb
4741)
4742{
4743	mddb_lb_t	*lbp = NULL;		/* pointer to locator block */
4744						/* May be cast to mddb_mnlb_t */
4745						/* if accessing sidenames in */
4746						/* MN set */
4747	mddb_did_ic_t	*did_icp = NULL;	/* ptr to Device ID incore */
4748	mddb_did_blk_t	*did_blkp = 0;
4749	int		did_blkp_sz = 0;
4750	mddb_did_db_t	*did_dbp;
4751	mddb_did_info_t	*did_info;
4752	caddr_t		did_block;
4753	mddb_ri_t	*rip;
4754	mddb_dtag_lst_t	*dtlp;
4755	mddb_locator_t	*lp;
4756	daddr_t		physblk;
4757	int		li;
4758	uint_t		blk;
4759	md_dev64_t	dev;
4760	caddr_t		buffer;
4761	uint_t		lb_blkcnt;
4762	int		retval = 0;
4763	int		err = 0;
4764	int		lb_ok = 0;
4765	int		lb_total = 0;
4766	int		lb_tagged = 0;
4767	int		lb_tags;
4768	set_t		setno = s->s_setno;
4769	int		cont_flag, i;
4770	mddb_did_db_t	*did_dbp1, *did_dbp2;
4771	int		mn_set = 0;
4772	mddb_cfg_loc_t	*cl;
4773
4774	/*
4775	 * read in master blocks and locator block for all known locators.
4776	 * lb_blkcnt will be set correctly for MN set later once getmasters
4777	 * has determined that the set is a MN set.
4778	 */
4779	lb_blkcnt = ((setno == MD_LOCAL_SET) ? MDDB_LOCAL_LBCNT : MDDB_LBCNT);
4780
4781	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
4782		rip->ri_flags &= (MDDB_F_PTCHED | MDDB_F_IOCTL |
4783		    MDDB_F_EMASTER);
4784		rip->ri_lbp = (mddb_lb_t *)NULL;
4785		rip->ri_did_icp = (mddb_did_ic_t *)NULL;
4786
4787		/*
4788		 * Translated dev is only used in calls to getmasters and
4789		 * getblks which expect a translated (aka miniroot) dev.
4790		 */
4791		dev = md_xlate_targ_2_mini(rip->ri_dev);
4792		if (dev == NODEV64) {
4793			/* Set error flag that getmasters would have set */
4794			/* if getmasters had been allowed to fail */
4795			rip->ri_flags |= MDDB_F_EMASTER;
4796		}
4797
4798		/*
4799		 * Invalid device id on system (due to failed or
4800		 * removed device) or invalid devt during upgrade
4801		 * (due to powered off device) will cause this
4802		 * replica to be marked in error and not used.
4803		 */
4804		if (rip->ri_flags & MDDB_F_EMASTER)
4805			continue;
4806
4807		/* get all master blocks, does mddb_devopen() */
4808		rip->ri_mbip = getmasters(s, dev, rip->ri_blkno,
4809		    &rip->ri_flags, &mn_set);
4810
4811		/* if invalid master block - try next replica */
4812		if (! rip->ri_mbip)
4813			continue;
4814
4815		/*
4816		 * If lbp alloc'd to wrong size - reset it.
4817		 * If MN set, lb_blkcnt must be MDDB_MNLBCNT.
4818		 * If a traditional set, lb_blkcnt must NOT be MDDB_MNLBCNT.
4819		 */
4820		if (lbp) {
4821			if (((mn_set) && (lb_blkcnt != MDDB_MNLBCNT)) ||
4822			    ((!mn_set) && (lb_blkcnt == MDDB_MNLBCNT))) {
4823				kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
4824				lbp = (mddb_lb_t *)NULL;
4825			}
4826		}
4827
4828		if (lbp == (mddb_lb_t *)NULL) {
4829			/* If a MN set, set lb_blkcnt for MN loc blk size */
4830			if (mn_set)
4831				lb_blkcnt = MDDB_MNLBCNT;
4832			lbp = (mddb_lb_t *)kmem_zalloc(dbtob(lb_blkcnt),
4833			    KM_SLEEP);
4834		}
4835
4836		/*
4837		 * Read in all the sectors for the locator block
4838		 * NOTE: Need to use getblks, rather than readblklst.
4839		 *	because it is too early and things are
4840		 *	NOT set up yet for read*()'s
4841		 */
4842		buffer = (caddr_t)lbp;
4843		for (blk = 0; blk < lb_blkcnt; blk++) {
4844			physblk = getphysblk(blk, rip->ri_mbip);
4845			err = getblks(s, buffer, dev, physblk,
4846			    btodb(MDDB_BSIZE), 0);
4847			if (err) {
4848				rip->ri_flags |= err;
4849				break;
4850			}
4851			buffer += MDDB_BSIZE;
4852		}
4853
4854		if (err)
4855			continue;
4856
4857		/* Verify the locator block */
4858		if (blk != lb_blkcnt)
4859			continue;
4860		if (lbp->lb_magic != MDDB_MAGIC_LB)
4861			continue;
4862		if (lbp->lb_blkcnt != lb_blkcnt)
4863			continue;
4864		if (mn_set) {
4865			/* If a MN set, check for MNLB revision in lb. */
4866			if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
4867				continue;
4868		} else {
4869			/* If not a MN set, check for LB revision in lb. */
4870			if (revchk(MDDB_REV_LB, lbp->lb_revision))
4871				continue;
4872		}
4873		if (crcchk(lbp, &lbp->lb_checksum, dbtob(lb_blkcnt), NULL))
4874			continue;
4875
4876		/*
4877		 * With the addition of MultiNode Disksets, we must make sure
4878		 * to verify that this is the correct set.  A node could
4879		 * have been out of the config for awhile and this disk could
4880		 * have been moved to a different diskset and we don't want
4881		 * to accidentally start the wrong set.
4882		 *
4883		 * We don't do this check if we're in the middle of
4884		 * importing a set.
4885		 */
4886		if (!(md_get_setstatus(s->s_setno) &
4887		    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
4888		    (lbp->lb_setno != s->s_setno))
4889			continue;
4890
4891		rip->ri_flags |= MDDB_F_LOCACC;
4892
4893		/*
4894		 * a commit count of zero means this locator has been deleted
4895		 */
4896		if (lbp->lb_commitcnt == 0)
4897			continue;
4898
4899		/*
4900		 * If replica is in the device ID style and md_devid_destroy
4901		 * flag is set, turn off device id style.  This is only to be
4902		 * used in a catastrophic failure case.  Examples would be
4903		 * where the device id of all drives in the system
4904		 * (especially the mirror'd root drives) had been changed
4905		 * by firmware upgrade or by a patch to an existing disk
4906		 * driver.  Another example would be in the case of non-unique
4907		 * device ids due to a bug.  The device id would be valid on
4908		 * the system, but would return the wrong dev_t.
4909		 */
4910		if ((lbp->lb_flags & MDDB_DEVID_STYLE) && md_devid_destroy) {
4911			lbp->lb_flags &= ~MDDB_DEVID_STYLE;
4912			lbp->lb_didfirstblk = 0;
4913			lbp->lb_didblkcnt = 0;
4914			*write_lb = 1;
4915		}
4916
4917
4918		/*
4919		 * If replica is in device ID style, read in device ID
4920		 * block and verify device ID block information.
4921		 */
4922		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4923
4924			/* Read in device ID block */
4925			if (did_icp == NULL) {
4926				did_icp = (mddb_did_ic_t *)
4927				    kmem_zalloc(sizeof (mddb_did_ic_t),
4928				    KM_SLEEP);
4929			} else {
4930				/* Reuse did_icp, but clear out data */
4931				if (did_icp->did_ic_blkp !=
4932				    (mddb_did_blk_t *)NULL) {
4933					kmem_free((caddr_t)did_icp->did_ic_blkp,
4934					    did_blkp_sz);
4935					did_blkp = (mddb_did_blk_t *)NULL;
4936					did_icp->did_ic_blkp =
4937					    (mddb_did_blk_t *)NULL;
4938				}
4939				if (did_icp->did_ic_dbp !=
4940				    (mddb_did_db_t *)NULL) {
4941					did_dbp1 = did_icp->did_ic_dbp;
4942					while (did_dbp1) {
4943						did_dbp2 = did_dbp1->db_next;
4944						kmem_free((caddr_t)
4945						    did_dbp1->db_ptr,
4946						    dbtob(did_dbp1->db_blkcnt));
4947						kmem_free((caddr_t)did_dbp1,
4948						    sizeof (mddb_did_db_t));
4949						did_dbp1 = did_dbp2;
4950					}
4951					did_icp->did_ic_dbp =
4952					    (mddb_did_db_t *)NULL;
4953				}
4954				for (i = 0; i < MDDB_NLB; i++) {
4955					did_icp->did_ic_devid[i] =
4956					    (ddi_devid_t)NULL;
4957				}
4958			}
4959
4960			/* Can't reuse blkp since size could be different */
4961			if (did_blkp != (mddb_did_blk_t *)NULL) {
4962				kmem_free(did_blkp, did_blkp_sz);
4963			}
4964			did_blkp_sz = (int)dbtob(lbp->lb_didblkcnt);
4965			did_blkp = (mddb_did_blk_t *)kmem_zalloc(did_blkp_sz,
4966			    KM_SLEEP);
4967			did_icp->did_ic_blkp = did_blkp;
4968			buffer = (caddr_t)did_blkp;
4969			for (blk = lbp->lb_didfirstblk;
4970			    blk < (lbp->lb_didblkcnt + lbp->lb_didfirstblk);
4971			    blk++) {
4972				physblk = getphysblk(blk, rip->ri_mbip);
4973				err = getblks(s, buffer, dev, physblk,
4974				    btodb(MDDB_BSIZE), 0);
4975				if (err) {
4976					rip->ri_flags |= err;
4977					break;
4978				}
4979				buffer += MDDB_BSIZE;
4980			}
4981			if (err)
4982				continue;
4983
4984			/* Verify the Device ID block */
4985			if (blk != (lbp->lb_didblkcnt + lbp->lb_didfirstblk))
4986				continue;
4987			if (did_blkp->blk_magic != MDDB_MAGIC_DI)
4988				continue;
4989			if (lbp->lb_didblkcnt != MDDB_DID_BLOCKS)
4990				continue;
4991			if (revchk(MDDB_REV_DI, did_blkp->blk_revision))
4992				continue;
4993			if (crcchk(did_blkp, &did_blkp->blk_checksum,
4994			    dbtob(lbp->lb_didblkcnt), NULL))
4995				continue;
4996
4997			/*
4998			 * Check if device ID block is out of sync with the
4999			 * Locator Block by checking if the locator block
5000			 * commitcnt does not match the device id block
5001			 * commitcnt.  If an 'out of sync' condition
5002			 * exists, discard this replica since it has
5003			 * inconsistent data and can't be used in
5004			 * determining the best replica.
5005			 *
5006			 * An 'out of sync' condition could happen if old
5007			 * SDS code was running with new devid style replicas
5008			 * or if a failure occurred between the writing of
5009			 * the locator block's commitcnt and the device
5010			 * id block's commitcnt.
5011			 *
5012			 * If old SDS code had been running, the upgrade
5013			 * process should detect this situation and
5014			 * have removed all of the device id information
5015			 * via the md_devid_destroy flag in md.conf.
5016			 */
5017			if (did_blkp->blk_commitcnt !=
5018			    lbp->lb_commitcnt) {
5019				continue;
5020			}
5021		}
5022
5023
5024		/*
5025		 * If replica is still in device ID style, read in all
5026		 * of the device IDs, verify the checksum of the device IDs.
5027		 */
5028		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5029			/*
5030			 * Reset valid bit in device id info block flags. This
5031			 * flag is stored on disk, but the valid bit is reset
5032			 * when reading in the replica.  If the corresponding
5033			 * device id is valid (aka meaning that the system
5034			 * knows about this device id), the valid bit will
5035			 * be set at a later time.  The valid bit for this
5036			 * replica's device ID will be set in this routine.
5037			 * The valid bits for the rest of the device id's
5038			 * will be set after the 'best' replica has
5039			 * been selected in routine load_old_replicas.
5040			 * Reset updated bit in device id info block flags.
5041			 * This flag is also stored on disk, reset when read
5042			 * in and set when the locators and side locators
5043			 * have been updated to match this valid device
5044			 * id information.
5045			 */
5046			for (li = 0; li < lbp->lb_loccnt; li++) {
5047				did_info = &did_blkp->blk_info[li];
5048				if (did_info->info_flags & MDDB_DID_EXISTS)
5049					did_info->info_flags &=
5050					    ~(MDDB_DID_VALID |
5051					    MDDB_DID_UPDATED);
5052			}
5053
5054			cont_flag = 0;
5055			for (li = 0; li < lbp->lb_loccnt; li++) {
5056				did_info = &did_blkp->blk_info[li];
5057				did_block = (caddr_t)NULL;
5058				if (did_info->info_flags & MDDB_DID_EXISTS) {
5059					/*
5060					 * Check if block has
5061					 * already been read in
5062					 */
5063					did_dbp = did_icp->did_ic_dbp;
5064					while (did_dbp != 0) {
5065						if (did_dbp->db_firstblk ==
5066						    did_info->info_firstblk)
5067							break;
5068						else
5069							did_dbp =
5070							    did_dbp->db_next;
5071					}
5072					/* if block not found, read it in */
5073					if (did_dbp == NULL) {
5074						did_block = (caddr_t)
5075						    (kmem_zalloc(dbtob(
5076						    did_info->info_blkcnt),
5077						    KM_SLEEP));
5078						buffer = (caddr_t)did_block;
5079						for (blk =
5080						    did_info->info_firstblk;
5081						    blk < (did_info->
5082						    info_firstblk +
5083						    did_info->info_blkcnt);
5084						    blk++) {
5085							physblk =
5086							    getphysblk(blk,
5087							    rip->ri_mbip);
5088							err = getblks(s,
5089							    buffer, dev,
5090							    physblk, btodb(
5091							    MDDB_BSIZE), 0);
5092							if (err) {
5093								rip->ri_flags |=
5094								    err;
5095								break;
5096							}
5097							buffer += MDDB_BSIZE;
5098						}
5099						if (err) {
5100							kmem_free(did_block,
5101							    dbtob(did_info->
5102							    info_blkcnt));
5103							did_block =
5104							    (caddr_t)NULL;
5105							cont_flag = 1;
5106							break;
5107						}
5108
5109						/*
5110						 * Block read in -
5111						 * alloc Disk Block area
5112						 */
5113						did_dbp = (mddb_did_db_t *)
5114						    kmem_zalloc(
5115						    sizeof (mddb_did_db_t),
5116						    KM_SLEEP);
5117						did_dbp->db_ptr = did_block;
5118						did_dbp->db_firstblk =
5119						    did_info->info_firstblk;
5120						did_dbp->db_blkcnt =
5121						    did_info->info_blkcnt;
5122
5123						/* Add to front of dbp list */
5124						did_dbp->db_next =
5125						    did_icp->did_ic_dbp;
5126						did_icp->did_ic_dbp = did_dbp;
5127					}
5128					/* Check validity of devid in block */
5129					if (crcchk(((char *)did_dbp->db_ptr +
5130					    did_info->info_offset),
5131					    &did_info->info_checksum,
5132					    did_info->info_length, NULL)) {
5133						cont_flag = 1;
5134						break;
5135					}
5136
5137					/* Block now pointed to by did_dbp */
5138					did_icp->did_ic_devid[li] =
5139					    (ddi_devid_t)((char *)
5140					    did_dbp->db_ptr +
5141					    did_info->info_offset);
5142				}
5143			}
5144			if (cont_flag)
5145				continue;
5146		}
5147
5148		/*
5149		 * All blocks containing devids are now in core.
5150		 */
5151
5152		/*
5153		 * If we're doing a replicated import (also known as
5154		 * remote copy import), the device id in the locator
5155		 * block is incorrect and we need to fix it up here
5156		 * alongwith the l_dev otherwise we run into lots of
5157		 * trouble later on.
5158		 */
5159		if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
5160			mddb_ri_t	*trip;
5161			for (li = 0; li < lbp->lb_loccnt; li++) {
5162				did_info = &did_blkp->blk_info[li];
5163				lp = &lbp->lb_locators[li];
5164
5165				if (lp->l_flags & MDDB_F_DELETED)
5166					continue;
5167
5168				if (!(did_info->info_flags & MDDB_DID_EXISTS))
5169					continue;
5170
5171				if (did_icp->did_ic_devid[li] == NULL)
5172					continue;
5173
5174				for (trip = s->s_rip; trip != NULL;
5175				    trip = trip->ri_next) {
5176					if (trip->ri_old_devid == NULL)
5177						continue;
5178					if (ddi_devid_compare(
5179					    trip->ri_old_devid,
5180					    did_icp->did_ic_devid[li]) != 0) {
5181						continue;
5182					}
5183
5184					/* update l_dev and side mnum */
5185					lp->l_dev = md_cmpldev(trip->ri_dev);
5186					lbp->lb_sidelocators[0][li].l_mnum =
5187					    md_getminor(trip->ri_dev);
5188				}
5189			}
5190		}
5191
5192		/*
5193		 * If there is a valid devid, verify that this locator
5194		 * block has information about itself by checking the
5195		 * device ID, minor_name and block
5196		 * number from this replica's incore data structure
5197		 * against the locator block information that has just
5198		 * been read in from disk.
5199		 *
5200		 * If not a valid devid, verify that this locator block
5201		 * has information about itself by checking the minor
5202		 * number, block number and driver name from this
5203		 * replica's incore data structure against the locator
5204		 * block information that has just been read in from disk.
5205		 */
5206		if ((rip->ri_devid != NULL) &&
5207		    (lbp->lb_flags & MDDB_DEVID_STYLE)) {
5208			/*
5209			 * This locator block MUST have locator (replica)
5210			 * information about itself.  Check against devid,
5211			 * slice part of minor number, and block number.
5212			 */
5213			for (li = 0; li < lbp->lb_loccnt; li++) {
5214				did_info = &did_blkp->blk_info[li];
5215				lp = &lbp->lb_locators[li];
5216				if (lp->l_flags & MDDB_F_DELETED)
5217					continue;
5218
5219				if (!(did_info->info_flags & MDDB_DID_EXISTS))
5220					continue;
5221
5222				if (((md_get_setstatus(setno) &
5223				    MD_SET_REPLICATED_IMPORT)) &&
5224				    (rip->ri_old_devid != (ddi_devid_t)NULL)) {
5225					if (ddi_devid_compare(rip->ri_old_devid,
5226					    did_icp->did_ic_devid[li]) != 0)
5227						continue;
5228				} else {
5229					if (ddi_devid_compare(rip->ri_devid,
5230					    did_icp->did_ic_devid[li]) != 0)
5231						continue;
5232				}
5233
5234				if (strcmp(rip->ri_minor_name,
5235				    did_info->info_minor_name) != 0)
5236					continue;
5237
5238				if (lp->l_blkno == rip->ri_blkno)
5239					break;
5240			}
5241		} else {
5242			/*
5243			 * This locator block MUST have locator (replica)
5244			 * information about itself.
5245			 */
5246			if (!mn_set) {
5247				for (li = 0; li < lbp->lb_loccnt; li++) {
5248					mddb_drvnm_t		*dn;
5249					mddb_sidelocator_t	*slp;
5250
5251					lp = &lbp->lb_locators[li];
5252					slp = &lbp->
5253					    lb_sidelocators[s->s_sideno][li];
5254					if (lp->l_flags & MDDB_F_DELETED)
5255						continue;
5256					if (slp->l_mnum != md_getminor(
5257					    rip->ri_dev))
5258						continue;
5259					if (lp->l_blkno != rip->ri_blkno)
5260						continue;
5261					dn = &lbp->lb_drvnm[slp->l_drvnm_index];
5262					if (strncmp(dn->dn_data,
5263					    rip->ri_driver, MD_MAXDRVNM) == 0)
5264						break;
5265				}
5266			} else {
5267				for (li = 0; li < lbp->lb_loccnt; li++) {
5268					mddb_drvnm_t		*dn;
5269					mddb_mnsidelocator_t	*mnslp;
5270					mddb_mnlb_t		*mnlbp;
5271					int			i;
5272
5273					/*
5274					 * Check all possible locators locking
5275					 * for match to the currently read-in
5276					 * locator, must match on:
5277					 *	- blkno
5278					 *	- side locator for this
5279					 *	  node's side
5280					 *	- side locator minor number
5281					 *	- side locator driver name
5282					 */
5283
5284					/*
5285					 * Looking at sidelocs:
5286					 * cast lbp -> mnlbp
5287					 */
5288					mnlbp = (mddb_mnlb_t *)lbp;
5289					lp = &mnlbp->lb_locators[li];
5290					if (lp->l_flags & MDDB_F_DELETED)
5291						continue;
5292					if (lp->l_blkno != rip->ri_blkno)
5293						continue;
5294
5295					for (i = 0; i < MD_MNMAXSIDES; i++) {
5296						mnslp = &mnlbp->
5297						    lb_mnsidelocators[i][li];
5298						if (mnslp->mnl_sideno ==
5299						    s->s_sideno) {
5300							break;
5301						}
5302					}
5303					/* No matching side found */
5304					if (i == MD_MNMAXSIDES)
5305						continue;
5306					if (mnslp->mnl_mnum !=
5307					    md_getminor(rip->ri_dev))
5308						continue;
5309					dn = &lbp->
5310					    lb_drvnm[mnslp->mnl_drvnm_index];
5311					if (strncmp(dn->dn_data,
5312					    rip->ri_driver, MD_MAXDRVNM) == 0)
5313						break;
5314				}
5315			}
5316		}
5317
5318		/*
5319		 * Didn't find ourself in this locator block it means
5320		 * the locator block is a stale transplant. Probably from
5321		 * a user doing a dd.
5322		 */
5323		if (li == lbp->lb_loccnt)
5324			continue;
5325
5326		/*
5327		 * Keep track of the number of accessed and valid
5328		 * locator blocks.
5329		 */
5330		lb_ok++;
5331
5332		/*
5333		 * Read the tag in, skips invalid or blank tags.
5334		 * Only valid tags allocate storage
5335		 * Data tags are not used in MN disksets.
5336		 */
5337		if ((!mn_set) && (! dt_read(s, lbp, rip))) {
5338			/*
5339			 * Keep track of the number of tagged
5340			 * locator blocks.
5341			 */
5342			lb_tagged++;
5343
5344			/* Keep a list of unique tags. */
5345			(void) dtl_addl(s, &rip->ri_dtp->dt_dtag);
5346		}
5347
5348		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
5349			/*
5350			 * go through locator block and add any other
5351			 * locations of the data base.
5352			 * For the replicated import case, this was done earlier
5353			 * and we really don't need or want to do so again
5354			 */
5355			cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
5356			for (li = 0; li < lbp->lb_loccnt; li++) {
5357				lp = &lbp->lb_locators[li];
5358				if (lp->l_flags & MDDB_F_DELETED)
5359					continue;
5360
5361				cl->l_devid_flags = MDDB_DEVID_GETSZ;
5362				cl->l_devid = (uint64_t)0;
5363				cl->l_devid_sz = 0;
5364				cl->l_old_devid = (uint64_t)0;
5365				cl->l_old_devid_sz = 0;
5366				cl->l_minor_name[0] = '\0';
5367				locator2cfgloc(lbp, cl, li, s->s_sideno,
5368				    did_icp);
5369
5370				if (cl->l_devid_flags & MDDB_DEVID_SZ) {
5371					if ((cl->l_devid = (uintptr_t)kmem_alloc
5372					    (cl->l_devid_sz, KM_SLEEP))
5373					    == NULL) {
5374						continue;
5375					} else {
5376						cl->l_devid_flags =
5377						    MDDB_DEVID_SPACE;
5378					}
5379				}
5380				locator2cfgloc(lbp, cl, li, s->s_sideno,
5381				    did_icp);
5382
5383				(void) ridev(&s->s_rip, cl, &lp->l_dev, 0);
5384
5385				if (cl->l_devid_flags & MDDB_DEVID_SPACE)
5386					kmem_free((caddr_t)(uintptr_t)
5387					    cl->l_devid, cl->l_devid_sz);
5388			}
5389			kmem_free(cl, sizeof (mddb_cfg_loc_t));
5390		}
5391
5392		/* Save LB for later */
5393		rip->ri_lbp = lbp;
5394		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5395			rip->ri_did_icp = did_icp;
5396			did_icp = (mddb_did_ic_t *)NULL;
5397			did_blkp = (mddb_did_blk_t *)NULL;
5398		} else
5399			rip->ri_did_icp = NULL;
5400		lbp = (mddb_lb_t *)NULL;
5401	}
5402
5403	if (lbp != (mddb_lb_t *)NULL)
5404		kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
5405
5406	if (did_icp != (mddb_did_ic_t *)NULL) {
5407		if (did_icp->did_ic_blkp != (mddb_did_blk_t *)NULL) {
5408			kmem_free((caddr_t)did_icp->did_ic_blkp, did_blkp_sz);
5409			did_blkp = (mddb_did_blk_t *)NULL;
5410		}
5411		if (did_icp->did_ic_dbp != (mddb_did_db_t *)NULL) {
5412			mddb_did_db_t	*did_dbp1, *did_dbp2;
5413
5414			did_dbp1 = did_icp->did_ic_dbp;
5415			while (did_dbp1) {
5416				did_dbp2 = did_dbp1->db_next;
5417				kmem_free((caddr_t)did_dbp1->db_ptr,
5418				    dbtob(did_dbp1->db_blkcnt));
5419				kmem_free((caddr_t)did_dbp1,
5420				    sizeof (mddb_did_db_t));
5421				did_dbp1 = did_dbp2;
5422			}
5423		}
5424		kmem_free((caddr_t)did_icp, sizeof (mddb_did_ic_t));
5425	}
5426
5427	if (did_blkp != (mddb_did_blk_t *)NULL) {
5428		kmem_free((caddr_t)did_blkp, did_blkp_sz);
5429	}
5430
5431	/* No locator blocks were ok */
5432	if (lb_ok == 0)
5433		goto out;
5434
5435	/* No tagged data was found - will be 0 for MN diskset */
5436	if (lb_tagged == 0)
5437		goto out;
5438
5439	/* Find the highest non-deleted replica count */
5440	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5441		int		lb_tot = 0;
5442
5443		if (rip->ri_mbip == (mddb_mb_ic_t *)NULL)
5444			continue;
5445
5446		if (rip->ri_lbp == (mddb_lb_t *)NULL)
5447			continue;
5448
5449		for (li = 0; li < rip->ri_lbp->lb_loccnt; li++) {
5450			lp = &rip->ri_lbp->lb_locators[li];
5451			if (lp->l_flags & MDDB_F_DELETED)
5452				continue;
5453			lb_tot++;
5454		}
5455
5456		if (lb_tot > lb_total)
5457			lb_total = lb_tot;
5458	}
5459
5460	/* Count the number of unique tags */
5461	for (lb_tags = 0, dtlp = s->s_dtlp; dtlp != NULL; dtlp = dtlp->dtl_nx)
5462		lb_tags++;
5463
5464	/* Should have at least one tag at this point */
5465	ASSERT(lb_tags > 0);
5466
5467
5468	/*
5469	 * If the number of tagged locators is not the same as the number of
5470	 * OK locators OR more than one tag exists, then make sure the
5471	 * selected tag will be written out later.
5472	 */
5473	if ((lb_tagged - lb_ok) != 0 || lb_tags > 1)
5474		md_set_setstatus(setno, MD_SET_TAGDATA);
5475
5476	/* Only a single tag, take the tagged data */
5477	if (lb_tags == 1) {
5478		dt_setup(s, &s->s_dtlp->dtl_dt);
5479		md_set_setstatus(setno, MD_SET_USETAG);
5480		goto out;
5481	}
5482
5483	/* Multiple tags, not selecting a tag, tag mode is on */
5484	if (! (md_get_setstatus(setno) & MD_SET_USETAG))
5485		retval = MDDB_E_TAGDATA;
5486
5487out:
5488
5489	return (retval);
5490}
5491
5492/*
5493 *	1. Select a locator.
5494 *	2. check if enough locators now have current copies
5495 *	3. read in database from one of latest
5496 *	4. if known to have latest make all database the same
5497 *	5. if configuration has changed rewrite locators
5498 *
5499 * Parameters:
5500 * 	s - pointer to mddb_set structure
5501 *	flag - used in MN disksets to tell if this node is being joined to
5502 *		a diskset that is in the STALE state.  If the flag is
5503 *		MDDB_MN_STALE, then this node should be marked in the STALE
5504 *		state even if > 50% mddbs are available.  (The diskset can
5505 *		only change from STALE->OK if all nodes withdraw from the
5506 *		MN diskset and then rejoin).
5507 */
5508static int
5509load_old_replicas(
5510	mddb_set_t	*s,
5511	int		flag
5512)
5513{
5514	mddb_lb_t	*lbp = NULL;
5515	mddb_mnlb_t	*mnlbp = NULL;
5516	mddb_ri_t	*rip;
5517	mddb_locator_t	*lp;
5518	mddb_db_t	*dbp;
5519	mddb_de_ic_t	*dep;
5520	int		li;
5521	int		alc;
5522	int		lc;
5523	int		tlc;
5524	int		retval = 0;
5525	caddr_t		p;
5526	size_t		maxrecsize;
5527	set_t		setno = s->s_setno;
5528	mddb_did_db_t	*did_dbp1;
5529	mddb_did_info_t	*did_info;
5530	mddb_did_ic_t	*did_icp = NULL;
5531	md_dev64_t	*newdev;
5532	mddb_sidelocator_t	*slp = 0;
5533	mddb_mnsidelocator_t	*mnslp = 0;
5534	uchar_t		i;
5535	char		*name;
5536	ddi_devid_t	ret_devid;
5537	md_dev64_t	dev;
5538	uint_t		len, sz;
5539	char		*minor_name;
5540	int		write_lb = 0;
5541	int		rval;
5542	int		stale_rtn = 0;
5543
5544	/* The only error path out of get_mbs_n_lbs() is MDDB_E_TAGDATA */
5545	if (retval = get_mbs_n_lbs(s, &write_lb))
5546		goto errout;
5547
5548	if ((lbp = s->s_lbp = selectlocator(s)) == NULL) {
5549		retval = MDDB_E_NOLOCBLK;
5550		goto errout;
5551	}
5552
5553	/* If a multi-node set, then set md_set.s_status flag */
5554	if (lbp->lb_flags & MDDB_MNSET) {
5555		md_set_setstatus(setno, MD_SET_MNSET);
5556		/*
5557		 * If data tag area had been allocated before set type was
5558		 * known - free it now.
5559		 */
5560		if (md_set[setno].s_dtp) {
5561			kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
5562			md_set[setno].s_dtp = NULL;
5563		}
5564	}
5565
5566	/*
5567	 * If the replica is in devid format, setup the devid incore ptr.
5568	 */
5569	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5570		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5571			if (rip->ri_lbp == s->s_lbp) {
5572				did_icp = s->s_did_icp = rip->ri_did_icp;
5573				break;
5574			}
5575		}
5576		/*
5577		 * If no devid incore info found - something has gone
5578		 * wrong so errout.
5579		 */
5580		if (rip == NULL) {
5581			retval = MDDB_E_NODEVID;
5582			goto errout;
5583		}
5584
5585		/*
5586		 * Add all blocks containing devids to free list.
5587		 * Then remove addresses that actually contain devids.
5588		 */
5589		did_dbp1 = did_icp->did_ic_dbp;
5590		while (did_dbp1) {
5591			if (mddb_devid_free_add(s, did_dbp1->db_firstblk,
5592			    0, dbtob(did_dbp1->db_blkcnt))) {
5593				retval = MDDB_E_NOSPACE;
5594				goto errout;
5595			}
5596
5597			did_dbp1 = did_dbp1->db_next;
5598		}
5599		for (li = 0; li < lbp->lb_loccnt; li++) {
5600			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5601			if (!(did_info->info_flags & MDDB_DID_EXISTS))
5602				continue;
5603
5604			if (mddb_devid_free_delete(s, did_info->info_firstblk,
5605			    did_info->info_offset, did_info->info_length)) {
5606				/* unable to find disk block */
5607				retval = MDDB_E_NODEVID;
5608				goto errout;
5609			}
5610		}
5611	}
5612
5613	/*
5614	 * create mddb_mbaray, count all locators and active locators.
5615	 */
5616	alc = 0;
5617	lc = 0;
5618	for (li = 0; li < lbp->lb_loccnt; li++) {
5619		ddi_devid_t	li_devid;
5620
5621		lp = &lbp->lb_locators[li];
5622
5623		if (lp->l_flags & MDDB_F_DELETED)
5624			continue;
5625
5626		/* Count non-deleted replicas */
5627		lc++;
5628
5629		/*
5630		 * Use the devid of this locator to compare with the rip
5631		 * list.  The scenario to watch out for here is that this
5632		 * locator could be on a disk that is dead and there could
5633		 * be a valid entry in the rip list for a different disk
5634		 * that has been moved to the dead disks dev_t.  We don't
5635		 * want to match with the moved disk.
5636		 */
5637		li_devid = NULL;
5638		(void) mddb_devid_get(s, li, &li_devid, &minor_name);
5639
5640		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5641			if (match_mddb(rip, li_devid, minor_name,
5642			    md_expldev(lp->l_dev), lp->l_blkno)) {
5643				break;
5644			}
5645		}
5646		if (rip == NULL) {
5647			/*
5648			 * If rip not found, then mark error in master block
5649			 * so that no writes are later attempted to this
5650			 * replica.  rip may not be setup if ridev
5651			 * failed due to un-found driver name.
5652			 */
5653			lp->l_flags |= MDDB_F_EMASTER;
5654			continue;
5655		}
5656
5657		s->s_mbiarray[li] = rip->ri_mbip;
5658
5659		lp->l_flags &= MDDB_F_ACTIVE;
5660		lp->l_flags |= (int)rip->ri_flags;
5661
5662		if (rip->ri_transplant)
5663			lp->l_flags &= ~MDDB_F_ACTIVE;
5664
5665		if (lp->l_flags & MDDB_F_LOCACC)
5666			alc++;
5667	}
5668
5669	/* Save on a divide - calculate 50% + 1 up front */
5670	tlc = ((lc + 1) / 2);
5671
5672	if (alc > tlc) {		/* alc > tlc		- OK */
5673		md_clr_setstatus(setno, MD_SET_STALE);
5674	} else if (alc < tlc) {		/* alc < tlc		- stale */
5675		md_set_setstatus(setno, MD_SET_STALE);
5676	} else if (lc & 1) {		/* alc == tlc && odd	- OK */
5677		md_clr_setstatus(setno, MD_SET_STALE);
5678	} else {			/* alc == tlc && even	- ? */
5679		/* Can do an accept, and are */
5680		if (md_get_setstatus(setno) & (MD_SET_ACCOK | MD_SET_ACCEPT)) {
5681			md_clr_setstatus(setno, MD_SET_STALE);
5682		} else {		/* possibly has a mediator */
5683			if (mediate(s)) {
5684				md_set_setstatus(setno, MD_SET_STALE);
5685			} else {
5686				md_clr_setstatus(setno, MD_SET_STALE);
5687			}
5688		}
5689
5690		/*
5691		 * The mirrored_root_flag allows the sysadmin to decide to
5692		 * start the local set in a read/write (non-stale) mode
5693		 * when there are only 50% available mddbs on the system and
5694		 * when the root file system is on a mirror.  This is useful
5695		 * in a 2 disk system where 1 disk failure would cause an mddb
5696		 * quorum failure and subsequent boot failures since the root
5697		 * filesystem would be in a read-only state.
5698		 */
5699		if (mirrored_root_flag == 1 && setno == 0 &&
5700		    svm_bootpath[0] != 0) {
5701			md_clr_setstatus(setno, MD_SET_STALE);
5702		} else {
5703			if (md_get_setstatus(setno) & MD_SET_STALE) {
5704				/* Allow half mode - CAREFUL! */
5705				if (mddb_allow_half)
5706					md_clr_setstatus(setno, MD_SET_STALE);
5707			}
5708		}
5709
5710		/*
5711		 * In a MN diskset,
5712		 *	- if 50% mddbs are unavailable and this
5713		 *		has been marked STALE above
5714		 * 	- master node isn't in the STALE state
5715		 *	- this node isn't the master node (this node
5716		 *		isn't the first node to join the set)
5717		 * then clear the STALE state and set TOOFEW.
5718		 *
5719		 * If this node is the master node and set was marked STALE,
5720		 * then the set stays STALE.
5721		 *
5722		 * If this node is not the master and this node's state is
5723		 * STALE and the master node is not marked STALE,
5724		 * then master node must be in the TOOFEW state or the
5725		 * master is panic'ing.  A MN diskset can only be placed into
5726		 * the STALE state by having the first node join the set
5727		 * with <= 50% mddbs.  There's no way for a MN diskset to
5728		 * transition between STALE and not-STALE states unless all
5729		 * nodes are withdrawn from the diskset or all nodes in the
5730		 * diskset are rebooted at the same time.
5731		 *
5732		 * So, mark this node's state as TOOFEW instead of STALE.
5733		 */
5734		if (((md_get_setstatus(setno) & (MD_SET_MNSET | MD_SET_STALE))
5735		    == (MD_SET_MNSET | MD_SET_STALE)) &&
5736		    ((flag & MDDB_MN_STALE) == 0) &&
5737		    (!(md_set[setno].s_am_i_master))) {
5738			md_clr_setstatus(setno, MD_SET_STALE);
5739			md_set_setstatus(setno, MD_SET_TOOFEW);
5740		}
5741	}
5742
5743	/*
5744	 * If a MN set is marked STALE on the other nodes,
5745	 * mark it stale here.  Override all other considerations
5746	 * such as a mediator or > 50% mddbs available.
5747	 */
5748	if (md_get_setstatus(setno) & MD_SET_MNSET) {
5749		if (flag & MDDB_MN_STALE)
5750			md_set_setstatus(setno, MD_SET_STALE);
5751	}
5752
5753	/*
5754	 * read a good copy of the locator names
5755	 * if an error occurs reading what is suppose
5756	 * to be a good copy continue looking for another
5757	 * good copy
5758	 */
5759	s->s_lnp = NULL;
5760	for (li = 0; li < lbp->lb_loccnt; li++) {
5761		lp = &lbp->lb_locators[li];
5762		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
5763		    (lp->l_flags & MDDB_F_EMASTER))
5764			continue;
5765
5766		/* Find rip entry for this locator if one exists */
5767		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5768			if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
5769			    lp->l_blkno))
5770				break;
5771		}
5772
5773		if (rip == NULL) {
5774			continue;
5775		}
5776
5777		/*
5778		 * Use the rip commitcnt since the commitcnt in lbp could
5779		 * been cleared by selectlocator.  Looking for a replica with
5780		 * the same commitcnt as the 'golden' copy in order to
5781		 * get the same data.
5782		 */
5783		if (rip->ri_commitcnt != lbp->lb_commitcnt) {
5784			continue;
5785		}
5786
5787		/*
5788		 * Now have a copy of the database that is equivalent
5789		 * to the chosen locator block with respect to
5790		 * inittime, identifier and commitcnt.   Trying the
5791		 * equivalent databases in the order that they were
5792		 * written will provide the most up to date data.
5793		 */
5794		lp->l_flags |= readlocnames(s, li);
5795		if (s->s_lnp)
5796			break;
5797	}
5798
5799	if (s->s_lnp == NULL) {
5800		retval = MDDB_E_NOLOCNMS;
5801		goto errout;
5802	}
5803
5804	/*
5805	 * read a good copy of the data base
5806	 * if an error occurs reading what is suppose
5807	 * to be a good copy continue looking for another
5808	 * good copy
5809	 */
5810
5811	s->s_dbp = NULL;
5812	for (li = 0; li < lbp->lb_loccnt; li++) {
5813		lp = &lbp->lb_locators[li];
5814		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
5815		    (lp->l_flags & MDDB_F_EMASTER))
5816			continue;
5817
5818		/* Find rip entry for this locator if one exists */
5819		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5820			if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
5821			    lp->l_blkno))
5822				break;
5823		}
5824
5825		if (rip == NULL) {
5826			continue;
5827		}
5828
5829		/*
5830		 * Use the rip commitcnt since the commitcnt in lbp could
5831		 * been cleared by selectlocator.  Looking for a replica with
5832		 * the same commitcnt as the 'golden' copy in order to
5833		 * get the same data.
5834		 */
5835		if (rip->ri_commitcnt != lbp->lb_commitcnt) {
5836			continue;
5837		}
5838
5839		/*
5840		 * Now have a copy of the database that is equivalent
5841		 * to the chosen locator block with respect to
5842		 * inittime, identifier and commitcnt.   Trying the
5843		 * equivalent databases in the order that they were
5844		 * written will provide the most up to date data.
5845		 */
5846		lp->l_flags |= readcopy(s, li);
5847
5848		if (s->s_dbp)
5849			break;
5850	}
5851
5852	if (s->s_dbp == NULL) {
5853		retval = MDDB_E_NODIRBLK;
5854		goto errout;
5855	}
5856
5857	lp->l_flags |= MDDB_F_MASTER;
5858	lp->l_flags |= MDDB_F_UP2DATE;
5859
5860	/*
5861	 * go through and find largest record;
5862	 * Also fixup the user data area's
5863	 */
5864	maxrecsize = MAX(MDDB_BSIZE, s->s_databuffer_size);
5865
5866	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
5867		for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
5868			if (dep->de_flags & MDDB_F_OPT)
5869				getoptrecord(s, dep);
5870			else {
5871				allocuserdata(dep);
5872				maxrecsize = MAX(dep->de_recsize, maxrecsize);
5873			}
5874
5875	if (maxrecsize > s->s_databuffer_size) {
5876		p = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
5877		if (s->s_databuffer_size)
5878			kmem_free(s->s_databuffer, s->s_databuffer_size);
5879		s->s_databuffer = p;
5880		s->s_databuffer_size = maxrecsize;
5881	}
5882
5883	/* If we can clear the tag data record, do it now. */
5884	/* Data tags not supported on MN sets */
5885	if ((md_get_setstatus(setno) & MD_SET_CLRTAG) &&
5886	    (!(md_get_setstatus(setno) & MD_SET_MNSET)))
5887		dt_setup(s, NULL);
5888
5889	/* This will return non-zero if STALE or TOOFEW */
5890	/* This will write out chosen replica image to all replicas */
5891	stale_rtn = selectreplicas(s, MDDB_SCANALL);
5892
5893	if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
5894		ddi_devid_t	devidptr;
5895
5896		/*
5897		 * ignore the return value from selectreplicas because we
5898		 * may have a STALE or TOOFEW set in the case of a partial
5899		 * replicated diskset. We will fix that up later.
5900		 */
5901
5902		lbp = s->s_lbp;
5903		for (li = 0; li < lbp->lb_loccnt; li++) {
5904			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5905
5906			if (did_info->info_flags & MDDB_DID_EXISTS) {
5907				devidptr = s->s_did_icp->did_ic_devid[li];
5908				lp = &lbp->lb_locators[li];
5909				for (rip = s->s_rip; rip != NULL;
5910				    rip = rip->ri_next) {
5911					if (rip->ri_old_devid == 0)
5912						continue;
5913					if (ddi_devid_compare(rip->ri_old_devid,
5914					    devidptr) != 0) {
5915						continue;
5916					}
5917					if (update_locatorblock(s,
5918					    md_expldev(lp->l_dev),
5919					    rip->ri_devid, rip->ri_old_devid)) {
5920						goto errout;
5921					}
5922				}
5923			}
5924		}
5925	} else {
5926		if (stale_rtn)
5927			goto errout;
5928	}
5929
5930	/*
5931	 * If the replica is in device id style - validate the device id's,
5932	 * if present, in the locator block devid area.
5933	 */
5934	newdev = kmem_zalloc(sizeof (md_dev64_t) * MDDB_NLB, KM_SLEEP);
5935	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5936		for (li = 0; li < lbp->lb_loccnt; li++) {
5937			newdev[li] = 0;
5938			lp = &lbp->lb_locators[li];
5939			if (lp->l_flags & MDDB_F_DELETED)
5940				continue;
5941			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5942			dev = md_expldev(lp->l_dev);
5943			if (did_info->info_flags & MDDB_DID_EXISTS) {
5944				/* Validate device id on current system */
5945				newdev[li] = dev;
5946				if (mddb_devid_validate(
5947				    did_icp->did_ic_devid[li],
5948				    &(newdev[li]),
5949				    did_info->info_minor_name) == 0) {
5950					/* Set valid flag */
5951					did_info->info_flags |= MDDB_DID_VALID;
5952				} else {
5953					lp->l_flags |= MDDB_F_EMASTER;
5954				}
5955			} else if (!(MD_UPGRADE)) {
5956				/*
5957				 * If a device doesn't have a device id,
5958				 * check if there is now a device ID
5959				 * associated with device.  If one exists,
5960				 * add it to the locator block devid area.
5961				 * If there's not enough space to add it,
5962				 * print a warning.
5963				 * Don't do this during upgrade.
5964				 */
5965				dev_t ddi_dev = md_dev64_to_dev(dev);
5966				if (ddi_lyr_get_devid(ddi_dev, &ret_devid) ==
5967				    DDI_SUCCESS) {
5968					if (ddi_lyr_get_minor_name(ddi_dev,
5969					    S_IFBLK, &minor_name)
5970					    == DDI_SUCCESS) {
5971						if (mddb_devid_add(s, li,
5972						    ret_devid, minor_name)) {
5973							cmn_err(CE_WARN,
5974							    "Not enough space"
5975							    " in metadevice"
5976							    " state"
5977							    " database\n");
5978							cmn_err(CE_WARN,
5979							    "to add relocation"
5980							    " information for"
5981							    " device:\n");
5982							cmn_err(CE_WARN,
5983							    " major = %d, "
5984							    " minor = %d\n",
5985							    getmajor(ddi_dev),
5986							    getminor(ddi_dev));
5987						} else {
5988							write_lb = 1;
5989						}
5990						kmem_free(minor_name,
5991						    strlen(minor_name) + 1);
5992					}
5993					ddi_devid_free(ret_devid);
5994				}
5995			}
5996		}
5997
5998		/*
5999		 * If a device has a valid device id and if the dev_t
6000		 * associated with the device id has changed, update the
6001		 * driver name, minor num and dev_t in the local and side
6002		 * locators to match the dev_t that the system currently
6003		 * associates with the device id.
6004		 *
6005		 * Don't do this during upgrade.
6006		 */
6007		if (!(MD_UPGRADE)) {
6008		    for (li = 0; li < lbp->lb_loccnt; li++) {
6009			lp = &lbp->lb_locators[li];
6010			if (lp->l_flags & MDDB_F_DELETED)
6011				continue;
6012			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
6013			if ((did_info->info_flags & MDDB_DID_VALID) &&
6014			    !(did_info->info_flags & MDDB_DID_UPDATED)) {
6015				if (lbp->lb_flags & MDDB_MNSET) {
6016					int 	j;
6017					int	index = -1;
6018					mnlbp = (mddb_mnlb_t *)lbp;
6019					for (j = 0; j < MD_MNMAXSIDES; j++) {
6020					    mnslp = &mnlbp->
6021						lb_mnsidelocators[j][li];
6022					    if (mnslp->mnl_sideno ==
6023						s->s_sideno)
6024						break;
6025					    if (mnslp->mnl_sideno == 0)
6026						index = j;
6027					}
6028					if (j == MD_MNMAXSIDES) {
6029					    /* No match found; take empty */
6030					    mnslp = &mnlbp->
6031						lb_mnsidelocators[index][li];
6032					    write_lb = 1;
6033					    mnslp->mnl_mnum =
6034						md_getminor(newdev[li]);
6035					} else if (mnslp->mnl_mnum !=
6036					    md_getminor(newdev[li])) {
6037						write_lb = 1;
6038						mnslp->mnl_mnum =
6039						    md_getminor(newdev[li]);
6040					}
6041				} else {
6042					slp = &lbp->
6043					    lb_sidelocators[s->s_sideno][li];
6044					if (slp->l_mnum !=
6045					    md_getminor(newdev[li])) {
6046						write_lb = 1;
6047						slp->l_mnum =
6048						    md_getminor(newdev[li]);
6049					}
6050				}
6051				name = ddi_major_to_name(
6052						md_getmajor(newdev[li]));
6053				if (lbp->lb_flags & MDDB_MNSET) {
6054					i = mnslp->mnl_drvnm_index;
6055				} else {
6056					i = slp->l_drvnm_index;
6057				}
6058				if (strncmp(lbp->lb_drvnm[i].dn_data, name,
6059					lbp->lb_drvnm[i].dn_len) != 0) {
6060					/* Driver name has changed */
6061					len = strlen(name);
6062					/* Look for the driver name */
6063					for (i = 0; i < MDDB_DRVNMCNT; i++) {
6064						if (lbp->lb_drvnm[i].dn_len
6065						    != len)
6066							continue;
6067						if (strncmp(
6068						    lbp->lb_drvnm[i].dn_data,
6069						    name, len) == 0)
6070							break;
6071					}
6072					/* Didn't find one, add it */
6073					if (i == MDDB_DRVNMCNT) {
6074					    for (i = 0; i < MDDB_DRVNMCNT;
6075						i++) {
6076						if (lbp->lb_drvnm[i].dn_len
6077						    == 0)
6078							break;
6079					    }
6080					    if (i == MDDB_DRVNMCNT) {
6081						cmn_err(CE_WARN,
6082						    "Unable to update driver"
6083						    " name for dev:  "
6084						    "major = %d, "
6085						    "minor = %d\n",
6086						    md_getmajor(newdev[li]),
6087						    md_getminor(newdev[li]));
6088						continue;
6089					    }
6090					    (void) strncpy(
6091						lbp->lb_drvnm[i].dn_data,
6092						name, MD_MAXDRVNM);
6093					    lbp->lb_drvnm[i].dn_len =
6094						(uchar_t)strlen(name);
6095					}
6096					/* Fill in the drvnm index */
6097					if (lbp->lb_flags & MDDB_MNSET) {
6098						mnslp->mnl_drvnm_index = i;
6099					} else {
6100						slp->l_drvnm_index = i;
6101					}
6102					write_lb = 1;
6103				}
6104				did_info->info_flags |= MDDB_DID_UPDATED;
6105			}
6106		}
6107	    }
6108	}
6109	kmem_free(newdev, sizeof (md_dev64_t) * MDDB_NLB);
6110
6111	/*
6112	 * If locator block has been changed by get_mbs_n_lbs,
6113	 * by addition of new device id, by updated minor name or
6114	 * by updated driver name - write out locator block.
6115	 */
6116	if (write_lb) {
6117		rval = push_lb(s);
6118		(void) upd_med(s, "load_old_replicas(0)");
6119		if (rval)
6120			goto errout;
6121	}
6122
6123	/*
6124	 * If the tag was moved, allocated, or a BADTAG was seen for some other
6125	 * reason, then make sure tags are written to all the replicas.
6126	 * Data tags not supported on MN sets.
6127	 */
6128	if (!(md_get_setstatus(setno) & MD_SET_MNSET)) {
6129		if (! (lc = dt_alloc_if_needed(s))) {
6130			for (li = 0; li < lbp->lb_loccnt; li++) {
6131				lp = &lbp->lb_locators[li];
6132
6133				if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
6134				    (lp->l_flags & MDDB_F_EMASTER))
6135					continue;
6136
6137				if (lp->l_flags & MDDB_F_BADTAG) {
6138					lc = 1;
6139					break;
6140				}
6141			}
6142		}
6143
6144		if (lc) {
6145			md_set_setstatus(setno, MD_SET_TAGDATA);
6146			md_clr_setstatus(setno, MD_SET_BADTAG);
6147			(void) selectreplicas(s, MDDB_SCANALL);
6148		}
6149	}
6150
6151errout:
6152
6153	/* Free extraneous rip components. */
6154	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
6155		/* Get rid of lbp's and dtp's */
6156
6157		if (rip->ri_lbp != lbp) {
6158			if (rip->ri_dtp != (mddb_dt_t *)NULL) {
6159				kmem_free((caddr_t)rip->ri_dtp, MDDB_DT_BYTES);
6160				rip->ri_dtp = (mddb_dt_t *)NULL;
6161			}
6162
6163			if (rip->ri_devid != (ddi_devid_t)NULL) {
6164				sz = (int)ddi_devid_sizeof(rip->ri_devid);
6165				kmem_free((caddr_t)rip->ri_devid, sz);
6166				rip->ri_devid = (ddi_devid_t)NULL;
6167			}
6168			if (rip->ri_old_devid != (ddi_devid_t)NULL) {
6169				sz = (int)ddi_devid_sizeof(rip->ri_old_devid);
6170				kmem_free((caddr_t)rip->ri_old_devid, sz);
6171				rip->ri_old_devid = (ddi_devid_t)NULL;
6172			}
6173
6174			if (rip->ri_lbp != (mddb_lb_t *)NULL) {
6175				mddb_devid_icp_free(&rip->ri_did_icp,
6176				    rip->ri_lbp);
6177
6178				kmem_free((caddr_t)rip->ri_lbp,
6179				    dbtob(rip->ri_lbp->lb_blkcnt));
6180				rip->ri_lbp = (mddb_lb_t *)NULL;
6181			}
6182		}
6183
6184		if (lbp != NULL) {
6185			for (li = 0; li < lbp->lb_loccnt; li++) {
6186				lp = &lbp->lb_locators[li];
6187				if (lp->l_flags & MDDB_F_DELETED)
6188					continue;
6189				if (rip->ri_dev == md_expldev(lp->l_dev) &&
6190				    rip->ri_blkno == lp->l_blkno)
6191					break;
6192			}
6193			if (li < lbp->lb_loccnt)
6194				continue;
6195		}
6196
6197		/*
6198		 * Get rid of mbp's:
6199		 *	if lbp, those out of lb_loccnt bounds
6200		 *	if !lbp,  all of them.
6201		 */
6202		if (rip->ri_mbip) {
6203			md_dev64_t dev64 = md_xlate_targ_2_mini(rip->ri_dev);
6204			if (dev64 != NODEV64)
6205				mddb_devclose(dev64);
6206
6207			free_mbipp(&rip->ri_mbip);
6208		}
6209		/*
6210		 * Turn off MDDB_F_EMASTER flag in a diskset since diskset
6211		 * code always ends up calling ridev for all replicas
6212		 * before calling load_old_replicas.  ridev will reset
6213		 * MDDB_F_EMASTER flag if flag was due to unresolved devid.
6214		 */
6215		if (setno != MD_LOCAL_SET)
6216			rip->ri_flags &= ~MDDB_F_EMASTER;
6217	}
6218	return (retval);
6219}
6220
6221/*
6222 * Given the devt from the md.conf info, get the devid for the device.
6223 */
6224static void
6225lookup_db_devid(mddb_cfg_loc_t *cl)
6226{
6227	dev_t		ldev;
6228	ddi_devid_t	devid;
6229	char		*minor;
6230
6231	if (ddi_name_to_major(cl->l_driver) == (major_t)-1) {
6232		cmn_err(CE_NOTE, "mddb: unknown major name '%s'", cl->l_driver);
6233		return;
6234	}
6235
6236	ldev = makedevice(ddi_name_to_major(cl->l_driver), cl->l_mnum);
6237	if (ddi_lyr_get_devid(ldev, &devid) != DDI_SUCCESS) {
6238		cmn_err(CE_NOTE, "mddb: unable to get devid for '%s', 0x%x",
6239		    cl->l_driver, cl->l_mnum);
6240		return;
6241	}
6242
6243	if (ddi_lyr_get_minor_name(ldev, S_IFBLK, &minor) != DDI_SUCCESS) {
6244		cmn_err(CE_NOTE, "mddb: unable to get minor name 0x%x",
6245		    cl->l_mnum);
6246		return;
6247	}
6248
6249	cl->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | MDDB_DEVID_SZ;
6250	cl->l_devid_sz = (int)ddi_devid_sizeof(devid);
6251	cl->l_devid = (uint64_t)(uintptr_t)devid;
6252	(void) strlcpy(cl->l_minor_name, minor, MDDB_MINOR_NAME_MAX);
6253
6254	kmem_free(minor, strlen(minor) + 1);
6255}
6256
6257/*
6258 * grab driver name, minor, block and devid out of
6259 * strings like "driver:minor:block:devid"
6260 */
6261static int
6262parse_db_loc(
6263	char		*str,
6264	mddb_cfg_loc_t	*clp
6265)
6266{
6267	char		*p, *e;
6268	char		*minor_name;
6269	ddi_devid_t	ret_devid;
6270
6271	clp->l_dev = 0;
6272	p = clp->l_driver;
6273	e = p + sizeof (clp->l_driver) - 1;
6274	while ((*str != ':') && (*str != '\0') && (p < e))
6275		*p++ = *str++;
6276	*p = '\0';
6277	if (*str++ != ':')
6278		return (-1);
6279	clp->l_mnum = 0;
6280	while (ISNUM(*str)) {
6281		clp->l_mnum *= 10;
6282		clp->l_mnum += *str++ - '0';
6283	}
6284	if (*str++ != ':')
6285		return (-1);
6286	clp->l_blkno = 0;
6287	while (ISNUM(*str)) {
6288		clp->l_blkno *= 10;
6289		clp->l_blkno += *str++ - '0';
6290	}
6291	if (*str++ != ':')
6292		return (-1);
6293
6294	/*
6295	 * If the md_devid_destroy flag is set, ignore the device ids.
6296	 * This is only to used in a catastrophic failure case.  Examples
6297	 * would be where the device id of all drives in the system
6298	 * (especially the mirror'd root drives) had been changed
6299	 * by firmware upgrade or by a patch to an existing disk
6300	 * driver.  Another example would be in the case of non-unique
6301	 * device ids due to a bug.  The device id would be valid on
6302	 * the system, but would return the wrong dev_t.
6303	 */
6304	if (md_devid_destroy) {
6305		clp->l_devid_flags = 0;
6306		clp->l_devid = (uint64_t)NULL;
6307		clp->l_devid_sz = 0;
6308		clp->l_old_devid = (uint64_t)NULL;
6309		clp->l_old_devid_sz = 0;
6310		clp->l_minor_name[0] = '\0';
6311		return (0);
6312	}
6313
6314	if (ddi_devid_str_decode(str,
6315	    (ddi_devid_t *)&ret_devid, &minor_name) == DDI_FAILURE)
6316		return (-1);
6317
6318	clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
6319	clp->l_devid_flags = 0;
6320	clp->l_old_devid = (uint64_t)NULL;
6321	clp->l_old_devid_sz = 0;
6322
6323	/* If no device id associated with device, just return */
6324	if ((ddi_devid_t)(uintptr_t)clp->l_devid == (ddi_devid_t)NULL) {
6325		clp->l_devid_sz = 0;
6326		clp->l_minor_name[0] = '\0';
6327		if (strcmp(str, "id0") == 0 && md_devid_destroy == 0 &&
6328		    md_keep_repl_state == 0) {
6329			/*
6330			 * No devid in md.conf; we're in recovery mode so
6331			 * lookup the devid for the device as specified by
6332			 * the devt in md.conf.
6333			 */
6334			lookup_db_devid(clp);
6335		}
6336		return (0);
6337	}
6338
6339	clp->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID |
6340	    MDDB_DEVID_SZ;
6341	clp->l_devid_sz = (int)ddi_devid_sizeof(
6342	    (ddi_devid_t)(uintptr_t)clp->l_devid);
6343	(void) strcpy(clp->l_minor_name, minor_name);
6344	kmem_free(minor_name, strlen(minor_name) + 1);
6345
6346	return (0);
6347}
6348
6349/*
6350 * grab driver name, minor, and block out of
6351 * strings like "driver:minor:block:devid driver:minor:block:devid ..."
6352 */
6353static void
6354parse_db_string(
6355	char		*str
6356)
6357{
6358	char		*p, *e;
6359	mddb_cfg_loc_t	*cl;
6360	char		restore_space;
6361
6362	/* CSTYLED */
6363	cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
6364	for (p = str; (*p != '\0'); ) {
6365		for (; ((*p != '\0') && (ISWHITE(*p))); ++p)
6366			;
6367		if (*p == '\0')
6368			break;
6369		for (e = p; ((*e != '\0') && (! ISWHITE(*e))); ++e)
6370			;
6371		/*
6372		 * Only give parse_db_loc 1 entry, so stuff a null into
6373		 * the string if we're not at the end.  We need to save this
6374		 * char and restore it after call.
6375		 */
6376		restore_space = '\0';
6377		if (*e != '\0') {
6378			restore_space = *e;
6379			*e = '\0';
6380		}
6381		if (parse_db_loc(p, cl) != 0) {
6382			cmn_err(CE_NOTE, "mddb: parsing error on '%s'", p);
6383		} else {
6384			(void) ridev(
6385			    &((mddb_set_t *)md_set[MD_LOCAL_SET].s_db)->s_rip,
6386			    cl, NULL, MDDB_F_PTCHED);
6387			if (cl->l_devid_flags & MDDB_DEVID_SPACE) {
6388				kmem_free((caddr_t)(uintptr_t)cl->l_devid,
6389				    cl->l_devid_sz);
6390			}
6391		}
6392		if (restore_space != '\0') {
6393			*e = restore_space;
6394		}
6395		p = e;
6396	}
6397	kmem_free(cl, sizeof (mddb_cfg_loc_t));
6398}
6399
6400/*
6401 * grab database locations supplied by md.conf as properties
6402 */
6403static void
6404parse_db_strings(void)
6405{
6406	int		bootlist_id;
6407	int		proplen;
6408	/*
6409	 * size of _bootlist_name should match uses of line and entry in
6410	 * libmeta meta_systemfile_append_mddb routine (meta_systemfile.c)
6411	 */
6412	char 		_bootlist_name[MDDB_BOOTLIST_MAX_LEN];
6413	char		*bootlist_name;
6414	caddr_t		prop;
6415
6416/*
6417 * Step through the bootlist properties one at a time by forming the
6418 * correct name, fetching the property, parsing the property and
6419 * then freeing the memory.  If a property does not exist or returns
6420 * some form of error just ignore it.  There is no guarantee that
6421 * the properties will always exist in sequence, for example
6422 * mddb_bootlist1 may exist and mddb_bootlist2 may not exist with
6423 * mddb_bootlist3 existing.
6424 */
6425	bootlist_name = &_bootlist_name[0];
6426	for (bootlist_id = 0; bootlist_id < md_maxbootlist; bootlist_id++) {
6427
6428		proplen = 0;
6429		(void) sprintf(bootlist_name, "mddb_bootlist%d", bootlist_id);
6430
6431		if (ddi_getlongprop(DDI_DEV_T_ANY, md_devinfo,
6432		    DDI_PROP_CANSLEEP, bootlist_name, (caddr_t)&prop,
6433		    &proplen) != DDI_PROP_SUCCESS)
6434			continue;
6435
6436		if (proplen <= 0)
6437			continue;
6438
6439		if (md_init_debug)
6440			cmn_err(CE_NOTE, "%s is %s", bootlist_name, prop);
6441
6442		parse_db_string(prop);
6443		kmem_free(prop, proplen);
6444	}
6445}
6446
6447static int
6448initit(
6449	set_t		setno,
6450	int		flag
6451)
6452{
6453	int		i;
6454	mddb_set_t	*s;
6455	mddb_lb_t	*lbp;		/* pointer to locator block */
6456	mddb_ln_t	*lnp;		/* pointer to locator names */
6457	mddb_db_t	*dbp;		/* pointer to directory block */
6458	mddb_did_blk_t	*did_blkp;	/* pointer to Device ID block */
6459	mddb_did_ic_t	*did_icp;	/* pointer to Device ID incore area */
6460	mddb_bf_t	*bfp;
6461	side_t		sideno;
6462	side_t		maxsides;
6463	mddb_block_t	lb_blkcnt;
6464	int		retval = 0;
6465	md_dev64_t	dev;
6466	mddb_mnlb_t	*mnlbp;
6467	int		devid_flag;
6468
6469	/* single thread's all loads/unloads of set's */
6470	mutex_enter(&mddb_lock);
6471	mutex_enter(SETMUTEX(setno));
6472
6473	if (((mddb_set_t *)md_set[setno].s_db) == NULL) {
6474		mutex_exit(SETMUTEX(setno));
6475		mutex_exit(&mddb_lock);
6476		return (MDDB_E_NOTNOW);
6477	}
6478
6479	s = (mddb_set_t *)md_set[setno].s_db;
6480
6481	single_thread_start(s);
6482
6483	/*
6484	 * init is already underway, block. Return success.
6485	 */
6486	if (s->s_lbp) {
6487		single_thread_end(s);
6488		mutex_exit(SETMUTEX(setno));
6489		mutex_exit(&mddb_lock);
6490		return (0);
6491	}
6492
6493	uniqtime32(&s->s_inittime);
6494
6495	/* grab database locations patched by /etc/system */
6496	if (setno == MD_LOCAL_SET)
6497		parse_db_strings();
6498
6499	s->s_mbiarray = (mddb_mb_ic_t **)kmem_zalloc(
6500	    sizeof (mddb_mb_ic_t *) * mddb_maxcopies, KM_SLEEP);
6501
6502	s->s_zombie = 0;
6503	s->s_staledeletes = 0;
6504	s->s_optcmtcnt = 0;
6505	s->s_opthavelck = 0;
6506	s->s_optwantlck = 0;
6507	s->s_optwaiterr = 0;
6508	s->s_opthungerr = 0;
6509
6510	/*
6511	 * KEEPTAG can never be set for a MN diskset since no tags are
6512	 * allowed to be stored in a MN diskset.  No way to check
6513	 * if this is a MN diskset or not at this point since the mddb
6514	 * hasn't been read in from disk yet.  (flag will only have
6515	 * MUTLINODE bit set if a new set is being created.)
6516	 */
6517	if (! (md_get_setstatus(s->s_setno) & MD_SET_KEEPTAG))
6518		dt_setup(s, NULL);
6519
6520	md_clr_setstatus(s->s_setno, MD_SET_TOOFEW);
6521
6522	for (i = 0; i <	mddb_maxbufheaders; i++) {
6523		bfp = (mddb_bf_t *)kmem_zalloc(sizeof (*bfp), KM_SLEEP);
6524		sema_init(&bfp->bf_buf.b_io, 0, NULL,
6525		    SEMA_DEFAULT, NULL);
6526		sema_init(&bfp->bf_buf.b_sem, 0, NULL,
6527		    SEMA_DEFAULT, NULL);
6528		bfp->bf_buf.b_offset = -1;
6529		freebuffer(s, bfp);
6530	}
6531
6532	retval = load_old_replicas(s, flag);
6533	/* If 0 return value - success */
6534	if (! retval) {
6535		single_thread_end(s);
6536		mutex_exit(SETMUTEX(setno));
6537		mutex_exit(&mddb_lock);
6538		return (0);
6539	}
6540
6541	/*
6542	 * If here, then the load_old_replicas() failed
6543	 */
6544
6545
6546	/* If the database was supposed to exist. */
6547	if (flag & MDDB_MUSTEXIST) {
6548		if (s->s_mbiarray != (mddb_mb_ic_t **)NULL) {
6549			for (i = 0; i < mddb_maxcopies;	 i++) {
6550				if (! s->s_mbiarray[i])
6551					continue;
6552				dev = md_expldev(
6553				    s->s_lbp->lb_locators[i].l_dev);
6554				dev = md_xlate_targ_2_mini(dev);
6555				if (dev != NODEV64)
6556					mddb_devclose(dev);
6557
6558				free_mbipp(&s->s_mbiarray[i]);
6559			}
6560
6561			kmem_free((caddr_t)s->s_mbiarray,
6562			    sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
6563			s->s_mbiarray = NULL;
6564		}
6565
6566		if (s->s_lnp != (mddb_ln_t *)NULL) {
6567			kmem_free((caddr_t)s->s_lnp,
6568			    dbtob(s->s_lbp->lb_lnblkcnt));
6569			s->s_lnp = (mddb_ln_t *)NULL;
6570		}
6571
6572		mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);
6573
6574		if (s->s_lbp != (mddb_lb_t *)NULL) {
6575			kmem_free((caddr_t)s->s_lbp,
6576			    dbtob(s->s_lbp->lb_blkcnt));
6577			s->s_lbp = (mddb_lb_t *)NULL;
6578		}
6579
6580		while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
6581			kmem_free((caddr_t)bfp, sizeof (*bfp));
6582
6583		single_thread_end(s);
6584		mutex_exit(SETMUTEX(setno));
6585		mutex_exit(&mddb_lock);
6586
6587		if (retval == MDDB_E_TAGDATA)
6588			return (retval);
6589
6590		/* Want a bit more detailed error messages */
6591		if (mddb_db_err_detail)
6592			return (retval);
6593
6594		return (MDDB_E_NODB);
6595	}
6596
6597
6598	/*
6599	 * MDDB_NOOLDOK set - Creating a new database, so do
6600	 * more initialization.
6601	 */
6602
6603	lb_blkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
6604	    MDDB_LOCAL_LBCNT : MDDB_LBCNT);
6605	if (flag & MDDB_MULTINODE) {
6606		lb_blkcnt = MDDB_MNLBCNT;
6607	}
6608
6609	if (s->s_lbp == NULL)
6610		s->s_lbp = (mddb_lb_t *)kmem_alloc(dbtob(lb_blkcnt), KM_SLEEP);
6611	lbp = s->s_lbp;
6612
6613	bzero((caddr_t)lbp, dbtob(lb_blkcnt));
6614	lbp->lb_setno = setno;
6615	lbp->lb_magic = MDDB_MAGIC_LB;
6616	if (flag & MDDB_MULTINODE) {
6617		lbp->lb_revision = MDDB_REV_MNLB;
6618	} else {
6619		lbp->lb_revision = MDDB_REV_LB;
6620	}
6621	lbp->lb_inittime = s->s_inittime;
6622	if (flag & MDDB_MULTINODE) {
6623		mnlbp = (mddb_mnlb_t *)lbp;
6624		for (i = 0; i < MDDB_NLB; i++) {
6625			for (sideno = 0; sideno < MD_MNMAXSIDES; sideno++) {
6626				mddb_mnsidelocator_t	*mnslp;
6627				mnslp = &mnlbp->lb_mnsidelocators[sideno][i];
6628				mnslp->mnl_mnum = NODEV32;
6629				mnslp->mnl_sideno = 0;
6630				mnslp->mnl_drvnm_index = 0;
6631			}
6632		}
6633	} else {
6634		maxsides = ((setno == MD_LOCAL_SET) ? 1 : MD_MAXSIDES);
6635		for (i = 0; i < MDDB_NLB; i++) {
6636			for (sideno = 0; sideno < maxsides; sideno++) {
6637				mddb_sidelocator_t	*slp;
6638				slp = &lbp->lb_sidelocators[sideno][i];
6639				slp->l_mnum = NODEV32;
6640			}
6641		}
6642	}
6643	lbp->lb_blkcnt = lb_blkcnt;
6644
6645	/* lb starts on block 0 */
6646	/* locator names starts after locator block */
6647	lbp->lb_lnfirstblk = lb_blkcnt;
6648	if (flag & MDDB_MULTINODE) {
6649		lbp->lb_lnblkcnt = (mddb_block_t)MDDB_MNLNCNT;
6650	} else {
6651		lbp->lb_lnblkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
6652		    MDDB_LOCAL_LNCNT : MDDB_LNCNT);
6653	}
6654
6655	if (flag & MDDB_MULTINODE) {
6656		/* Creating a multinode diskset */
6657		md_set_setstatus(setno, MD_SET_MNSET);
6658		lbp->lb_flags |= MDDB_MNSET;
6659	}
6660
6661	/* Data portion of mddb located after locator names */
6662	lbp->lb_dbfirstblk = lbp->lb_lnfirstblk + lbp->lb_lnblkcnt;
6663
6664	/* the btodb that follows is converting the directory block size */
6665	/* Data tag part of mddb located after first block of mddb data */
6666	lbp->lb_dtfirstblk = (mddb_block_t)(lbp->lb_dbfirstblk +
6667	    btodb(MDDB_BSIZE));
6668	/* Data tags are not used in MN diskset - so set count to 0 */
6669	if (flag & MDDB_MULTINODE)
6670		lbp->lb_dtblkcnt = (mddb_block_t)0;
6671	else
6672		lbp->lb_dtblkcnt = (mddb_block_t)MDDB_DT_BLOCKS;
6673
6674
6675	lnp = (mddb_ln_t *)kmem_zalloc(dbtob(lbp->lb_lnblkcnt), KM_SLEEP);
6676	lnp->ln_magic = MDDB_MAGIC_LN;
6677	if (flag & MDDB_MULTINODE) {
6678		lnp->ln_revision = MDDB_REV_MNLN;
6679	} else {
6680		lnp->ln_revision = MDDB_REV_LN;
6681	}
6682	s->s_lnp = lnp;
6683
6684	/*
6685	 * Set up Device ID portion of Locator Block.
6686	 * Do not set locator to device id style if
6687	 * md_devid_destroy is 1 and md_keep_repl_state is 1
6688	 * (destroy all device id data and keep replica in
6689	 * non device id mode).
6690	 *
6691	 * This is logically equivalent to set locator to
6692	 * device id style if md_devid_destroy is 0 or
6693	 * md_keep_repl_state is 0.
6694	 *
6695	 * In SunCluster environment, device id mode is disabled
6696	 * which means diskset will be run in non-devid mode.  For
6697	 * localset, the behavior will remain intact and run in
6698	 * device id mode.
6699	 *
6700	 * In multinode diskset devids are turned off.
6701	 */
6702	devid_flag = 1;
6703	if (cluster_bootflags & CLUSTER_CONFIGURED)
6704		if (setno != MD_LOCAL_SET)
6705			devid_flag = 0;
6706	if (flag & MDDB_MULTINODE)
6707		devid_flag = 0;
6708	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
6709		devid_flag = 0;
6710	/*
6711	 * if we weren't devid style before and md_keep_repl_state=1
6712	 * we need to stay non-devid
6713	 */
6714	if (((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) &&
6715	    (md_keep_repl_state == 1))
6716		devid_flag = 0;
6717	if (devid_flag) {
6718		lbp->lb_didfirstblk = lbp->lb_dtfirstblk +
6719		    lbp->lb_dtblkcnt;
6720		lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
6721		lbp->lb_flags |= MDDB_DEVID_STYLE;
6722
6723		did_icp = (mddb_did_ic_t *)kmem_zalloc
6724		    (sizeof (mddb_did_ic_t), KM_SLEEP);
6725		did_blkp = (mddb_did_blk_t *)
6726		    kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP);
6727		did_blkp->blk_magic = MDDB_MAGIC_DI;
6728		did_blkp->blk_revision = MDDB_REV_DI;
6729		did_icp->did_ic_blkp = did_blkp;
6730		s->s_did_icp = did_icp;
6731	}
6732
6733	setidentifier(s, &lbp->lb_ident);
6734	uniqtime32(&lbp->lb_timestamp);
6735	dbp = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
6736	dbp->db_magic = MDDB_MAGIC_DB;
6737	dbp->db_revision = MDDB_REV_DB;
6738	uniqtime32(&dbp->db_timestamp);
6739	dbp->db_nextblk = 0;
6740	dbp->db_firstentry = NULL;
6741	dbp->db_blknum = lbp->lb_dbfirstblk;
6742	dbp->db_recsum = MDDB_GLOBAL_XOR;
6743	s->s_dbp = dbp;
6744	single_thread_end(s);
6745	mutex_exit(SETMUTEX(setno));
6746	mutex_exit(&mddb_lock);
6747	return (0);
6748}
6749
6750mddb_set_t *
6751mddb_setenter(
6752	set_t		setno,
6753	int		flag,
6754	int		*errorcodep
6755)
6756{
6757	mddb_set_t	*s;
6758	int		err = 0;
6759	size_t		sz = sizeof (void *) * MD_MAXUNITS;
6760
6761	mutex_enter(SETMUTEX(setno));
6762	if (! md_set[setno].s_db) {
6763		mutex_exit(SETMUTEX(setno));
6764		if (errorcodep != NULL)
6765			*errorcodep = MDDB_E_NOTOWNER;
6766		return (NULL);
6767	}
6768
6769	/* Allocate s_un and s_ui arrays if not already present. */
6770	if (md_set[setno].s_un == NULL) {
6771		md_set[setno].s_un = kmem_zalloc(sz, KM_NOSLEEP);
6772		if (md_set[setno].s_un == NULL) {
6773			mutex_exit(SETMUTEX(setno));
6774			if (errorcodep != NULL)
6775				*errorcodep = MDDB_E_NOTOWNER;
6776			return (NULL);
6777		}
6778	}
6779	if (md_set[setno].s_ui == NULL) {
6780		md_set[setno].s_ui = kmem_zalloc(sz, KM_NOSLEEP);
6781		if (md_set[setno].s_ui == NULL) {
6782			mutex_exit(&md_set[setno].s_dbmx);
6783			kmem_free(md_set[setno].s_un, sz);
6784			md_set[setno].s_un = NULL;
6785			if (errorcodep != NULL)
6786				*errorcodep = MDDB_E_NOTOWNER;
6787			return (NULL);
6788		}
6789	}
6790	s = (mddb_set_t *)md_set[setno].s_db;
6791	if (s->s_lbp)
6792		return (s);
6793
6794	if (flag & MDDB_NOINIT)
6795		return (s);
6796
6797	/*
6798	 * Release the set mutex - it will be acquired and released in
6799	 * initit after acquiring the mddb_lock.  This is done to assure
6800	 * that mutexes are always acquired in the same order to prevent
6801	 * possible deadlock
6802	 */
6803	mutex_exit(SETMUTEX(setno));
6804
6805	if ((err = initit(setno, flag)) != 0) {
6806		if (errorcodep != NULL)
6807			*errorcodep = err;
6808		return (NULL);
6809	}
6810
6811	mutex_enter(SETMUTEX(setno));
6812	return ((mddb_set_t *)md_set[setno].s_db);
6813}
6814
6815/*
6816 * Release the set lock for a given set.
6817 *
6818 * In a MN diskset, this routine may send messages to the rpc.mdcommd
6819 * in order to have the slave nodes re-parse parts of the mddb.
6820 * Messages are only sent if the global ioctl lock is not held.
6821 *
6822 * With the introduction of multi-threaded ioctls, there is no way
6823 * to determine which thread(s) are holding the ioctl lock.  So, if
6824 * the ioctl lock is held (by process X) process X will send the
6825 * messages to the slave nodes when process X releases the ioctl lock.
6826 */
6827void
6828mddb_setexit(
6829	mddb_set_t	*s
6830)
6831{
6832	md_mn_msg_mddb_parse_t		*mddb_parse_msg;
6833	md_mn_kresult_t			*kresult;
6834	mddb_lb_t			*lbp = s->s_lbp;
6835	int				i;
6836	int				rval = 1;
6837
6838	/*
6839	 * If not a MN diskset OR
6840	 * a MN diskset but this node isn't master,
6841	 * then release the mutex.
6842	 */
6843	if (!(MD_MNSET_SETNO(s->s_setno)) ||
6844	    ((MD_MNSET_SETNO(s->s_setno)) &&
6845	    (!md_set[s->s_setno].s_am_i_master))) {
6846		mutex_exit(SETMUTEX(s->s_setno));
6847		return;
6848	}
6849
6850	/*
6851	 * If global ioctl lock is held, then send no messages,
6852	 * just release mutex and return.
6853	 *
6854	 */
6855	if (md_status & MD_GBL_IOCTL_LOCK) {
6856		mutex_exit(SETMUTEX(s->s_setno));
6857		return;
6858	}
6859
6860	/*
6861	 * This thread is not holding the ioctl lock, so drop the set
6862	 * lock, send messages to slave nodes to reparse portions
6863	 * of the mddb and return.
6864	 *
6865	 * If the block parse flag is set, do not send parse messages.
6866	 * This flag is set when master is adding a new mddb that would
6867	 * cause parse messages to be sent to the slaves, but the slaves
6868	 * don't have knowledge of the new mddb yet since the mddb add
6869	 * operation hasn't been run on the slave nodes yet.  When the
6870	 * master unblocks the parse flag, the parse messages will be
6871	 * generated.
6872	 *
6873	 * If s_mn_parseflags_sending is non-zero, then another thread
6874	 * is already currently sending a parse message, so just release
6875	 * the mutex and return.  If an mddb change occurred that results
6876	 * in a parse message to be generated, the thread that is currently
6877	 * sending a parse message would generate the additional parse message.
6878	 *
6879	 * If s_mn_parseflags_sending is zero and parsing is not blocked,
6880	 * then loop until s_mn_parseflags is 0 (until there are no more
6881	 * messages to send).
6882	 * While s_mn_parseflags is non-zero,
6883	 * 	put snapshot of parse_flags in s_mn_parseflags_sending
6884	 * 	set s_mn_parseflags to zero
6885	 *	release mutex
6886	 *	send message
6887	 *	re-grab mutex
6888	 *	set s_mn_parseflags_sending to zero
6889	 */
6890	mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), KM_SLEEP);
6891	while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
6892	    (s->s_mn_parseflags & MDDB_PARSE_MASK) &&
6893	    (!(md_get_setstatus(s->s_setno) & MD_SET_MNPARSE_BLK))) {
6894		/* Grab snapshot of parse flags */
6895		s->s_mn_parseflags_sending = s->s_mn_parseflags;
6896		s->s_mn_parseflags = 0;
6897
6898		mutex_exit(SETMUTEX(s->s_setno));
6899
6900		/*
6901		 * Send the message to the slaves to re-parse
6902		 * the indicated portions of the mddb. Send the status
6903		 * of the 50 mddbs in this set so that slaves know which
6904		 * mddbs that the master node thinks are 'good'.
6905		 * Otherwise, slave may reparse, but from wrong replica.
6906		 */
6907		mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending;
6908		for (i = 0; i < MDDB_NLB; i++) {
6909			mddb_parse_msg->msg_lb_flags[i] =
6910			    lbp->lb_locators[i].l_flags;
6911		}
6912		kresult = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
6913		while (rval != 0) {
6914			rval = mdmn_ksend_message(s->s_setno,
6915			    MD_MN_MSG_MDDB_PARSE, 0, 0,
6916			    (char *)mddb_parse_msg,
6917			    sizeof (md_mn_msg_mddb_parse_t), kresult);
6918			if (rval != 0)
6919				cmn_err(CE_WARN, "mddb_setexit: Unable to send "
6920				    "mddb update message to other nodes in "
6921				    "diskset %s\n", s->s_setname);
6922		}
6923		kmem_free(kresult, sizeof (md_mn_kresult_t));
6924
6925		/*
6926		 * Re-grab mutex to clear sending field and to
6927		 * see if another parse message needs to be generated.
6928		 */
6929		mutex_enter(SETMUTEX(s->s_setno));
6930		s->s_mn_parseflags_sending = 0;
6931	}
6932	kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
6933	mutex_exit(SETMUTEX(s->s_setno));
6934}
6935
6936static void
6937mddb_setexit_no_parse(
6938	mddb_set_t	*s
6939)
6940{
6941	mutex_exit(SETMUTEX(s->s_setno));
6942}
6943
6944uint_t
6945mddb_lb_did_convert(mddb_set_t *s, uint_t doit, uint_t *blk_cnt)
6946{
6947	uint_t			li;
6948	mddb_lb_t		*lbp = s->s_lbp;
6949	mddb_locator_t		*lp;
6950	ddi_devid_t		ret_devid;
6951	uint_t			devid_len;
6952	dev_t			ddi_dev;
6953	mddb_did_ic_t		*did_icp;
6954	mddb_did_blk_t		*did_blkp;
6955	char			*minor_name;
6956	size_t			sz;
6957	int			retval;
6958	int			err;
6959	md_dev64_t		dev64; /* tmp var to make code look better */
6960
6961
6962	/* Need disk block(s) to hold mddb_did_blk_t */
6963	*blk_cnt = MDDB_DID_BLOCKS;
6964
6965	if (doit) {
6966		/*
6967		 * Alloc mddb_did_blk_t disk block and fill in header area.
6968		 * Don't fill in did magic number until end of routine so
6969		 * if machine panics in the middle of conversion, the
6970		 * device id information will be thrown away at the
6971		 * next snarfing of this set.
6972		 * Need to set DEVID_STYLE so that mddb_devid_add will
6973		 * function properly.
6974		 */
6975		/* grab the mutex */
6976		if ((mddb_setenter(s->s_setno, MDDB_NOINIT, &err)) == NULL) {
6977			return (1);
6978		}
6979		single_thread_start(s);
6980		lbp->lb_didfirstblk = getfreeblks(s, MDDB_DID_BLOCKS);
6981		if (lbp->lb_didfirstblk == 0) {
6982			single_thread_end(s);
6983			mddb_setexit(s);
6984			return (1);
6985		}
6986		lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
6987		did_icp = (mddb_did_ic_t *)kmem_zalloc(sizeof (mddb_did_ic_t),
6988		    KM_SLEEP);
6989		did_blkp = (mddb_did_blk_t *)kmem_zalloc(MDDB_DID_BYTES,
6990		    KM_SLEEP);
6991
6992		did_blkp->blk_revision = MDDB_REV_DI;
6993		did_icp->did_ic_blkp = did_blkp;
6994		s->s_did_icp = did_icp;
6995		lbp->lb_flags |= MDDB_DEVID_STYLE;
6996	}
6997
6998	/* Fill in information in mddb_did_info_t array */
6999	for (li = 0; li < lbp->lb_loccnt; li++) {
7000		lp = &lbp->lb_locators[li];
7001		if (lp->l_flags & MDDB_F_DELETED)
7002			continue;
7003
7004		dev64 = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
7005		ddi_dev = md_dev64_to_dev(dev64);
7006		if (ddi_dev == NODEV) {
7007			/*
7008			 * No translation available for replica.
7009			 * Could fail conversion to device id replica,
7010			 * but instead will just continue with next
7011			 * replica in list.
7012			 */
7013			continue;
7014		}
7015		if (ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) {
7016			/*
7017			 * Just count each devid as at least 1 block.  This
7018			 * is conservative since several device id's may fit
7019			 * into 1 disk block, but it's better to overestimate
7020			 * the number of blocks needed than to underestimate.
7021			 */
7022			devid_len = (int)ddi_devid_sizeof(ret_devid);
7023			*blk_cnt += btodb(devid_len + (MDDB_BSIZE - 1));
7024			if (doit) {
7025				if (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK,
7026				    &minor_name) == DDI_SUCCESS) {
7027					if (mddb_devid_add(s, li, ret_devid,
7028					    minor_name)) {
7029						cmn_err(CE_WARN,
7030						    "Not enough space in metadb"
7031						    " to add device id for"
7032						    "  dev: major = %d, "
7033						    "minor = %d\n",
7034						    getmajor(ddi_dev),
7035						    getminor(ddi_dev));
7036					}
7037					sz = strlen(minor_name) + 1;
7038					kmem_free(minor_name, sz);
7039				}
7040			}
7041			ddi_devid_free(ret_devid);
7042		}
7043	}
7044
7045	if (doit) {
7046		did_blkp->blk_magic = MDDB_MAGIC_DI;
7047		retval = push_lb(s);
7048		(void) upd_med(s, "mddb_lb_did_convert(0)");
7049		single_thread_end(s);
7050		mddb_setexit(s);
7051		if (retval != 0)
7052			return (1);
7053	}
7054
7055	return (0);
7056}
7057
7058static mddb_set_t *
7059init_set(
7060	mddb_config_t	*cp,
7061	int		flag,
7062	int		*errp
7063)
7064{
7065	mddb_set_t	*s;
7066	char		*setname = NULL;
7067	set_t		setno = MD_LOCAL_SET;
7068	side_t		sideno = 0;
7069	struct timeval32 *created = NULL;
7070
7071	if (cp != NULL) {
7072		setname = cp->c_setname;
7073		setno = cp->c_setno;
7074		sideno = cp->c_sideno;
7075		created = &cp->c_timestamp;
7076	}
7077
7078	if (setno >= MD_MAXSETS)
7079		return ((mddb_set_t *)NULL);
7080
7081	if (md_set[setno].s_db)
7082		return (mddb_setenter(setno, flag, errp));
7083
7084	s = (mddb_set_t *)kmem_zalloc(sizeof (*s), KM_SLEEP);
7085
7086	cv_init(&s->s_buf_cv, NULL, CV_DEFAULT, NULL);
7087	cv_init(&s->s_single_thread_cv, NULL, CV_DEFAULT, NULL);
7088	cv_init(&s->s_optqueuing_cv, NULL, CV_DEFAULT, NULL);
7089	cv_init(&s->s_opthungerr_cv, NULL, CV_DEFAULT, NULL);
7090	cv_init(&s->s_optwantlck_cv, NULL, CV_DEFAULT, NULL);
7091
7092	s->s_setno = setno;
7093	s->s_sideno = sideno;
7094	if (setno == MD_LOCAL_SET) {
7095		(void) strcpy(s->s_ident.serial, hw_serial);
7096	} else {
7097		s->s_ident.createtime = *created;
7098		s->s_setname = (char *)kmem_alloc(strlen(setname) + 1,
7099		    KM_SLEEP);
7100		(void) strcpy(s->s_setname, setname);
7101	}
7102
7103	/* have a config struct,  copy mediator information */
7104	if (cp != NULL)
7105		s->s_med = cp->c_med;		/* structure assignment */
7106
7107	md_set[setno].s_db = (void *) s;
7108
7109	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_TAKEOVER, SVM_TAG_SET, setno, NODEV64);
7110
7111	return (mddb_setenter(setno, flag, errp));
7112}
7113
7114void
7115mddb_unload_set(
7116	set_t		setno
7117)
7118{
7119
7120	mddb_set_t	*s;
7121	mddb_db_t	*dbp, *adbp = NULL;
7122	mddb_de_ic_t	*dep, *dep2;
7123	mddb_bf_t	*bfp;
7124	int		i;
7125	md_dev64_t	dev;
7126
7127	if ((s = mddb_setenter(setno, MDDB_NOINIT, NULL)) == NULL)
7128		return;
7129
7130	single_thread_start(s);
7131
7132	s->s_opthavequeuinglck = 0;
7133	s->s_optwantqueuinglck = 0;
7134
7135	for (dbp = s->s_dbp; dbp != 0; dbp = adbp) {
7136		for (dep = dbp->db_firstentry; dep != NULL; dep = dep2) {
7137			if (dep->de_rb_userdata != NULL) {
7138				if (dep->de_icreqsize)
7139					kmem_free(dep->de_rb_userdata_ic,
7140					    dep->de_icreqsize);
7141				else
7142					kmem_free(dep->de_rb_userdata,
7143					    dep->de_reqsize);
7144			}
7145			kmem_free((caddr_t)dep->de_rb, dep->de_recsize);
7146			dep2 = dep->de_next;
7147			kmem_free((caddr_t)dep, sizeofde(dep));
7148		}
7149		adbp = dbp->db_next;
7150		kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
7151	}
7152	s->s_dbp = (mddb_db_t *)NULL;
7153
7154	free_rip(&s->s_rip);
7155
7156	for (i = 0; i < mddb_maxcopies;	 i++) {
7157		if (! s->s_mbiarray)
7158			break;
7159
7160		if (! s->s_mbiarray[i])
7161			continue;
7162
7163		dev = md_expldev(s->s_lbp->lb_locators[i].l_dev);
7164		dev = md_xlate_targ_2_mini(dev);
7165		if (dev != NODEV64)
7166			mddb_devclose(dev);
7167
7168		free_mbipp(&s->s_mbiarray[i]);
7169	}
7170
7171	if (s->s_mbiarray) {
7172		kmem_free((caddr_t)s->s_mbiarray,
7173		    sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
7174		s->s_mbiarray = (mddb_mb_ic_t **)NULL;
7175	}
7176
7177	if (s->s_lnp) {
7178		kmem_free((caddr_t)s->s_lnp, dbtob(s->s_lbp->lb_lnblkcnt));
7179		s->s_lnp = (mddb_ln_t *)NULL;
7180	}
7181
7182	if (s->s_lbp) {
7183		mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);
7184		kmem_free((caddr_t)s->s_lbp, dbtob(s->s_lbp->lb_blkcnt));
7185		s->s_lbp = (mddb_lb_t *)NULL;
7186	}
7187
7188	if (s->s_freebitmap) {
7189		kmem_free((caddr_t)s->s_freebitmap, s->s_freebitmapsize);
7190		s->s_freebitmap = NULL;
7191		s->s_freebitmapsize = 0;
7192	}
7193
7194	while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
7195		kmem_free((caddr_t)bfp, sizeof (*bfp));
7196
7197	if (s->s_databuffer_size) {
7198		kmem_free(s->s_databuffer, s->s_databuffer_size);
7199		s->s_databuffer_size = 0;
7200	}
7201
7202	if (s->s_setname != NULL)
7203		kmem_free((caddr_t)s->s_setname, strlen(s->s_setname)+1);
7204
7205	/* Data tags not supported on MN sets. */
7206	if (!(md_get_setstatus(setno) & MD_SET_MNSET))
7207		dtl_freel(&s->s_dtlp);
7208
7209	md_set[setno].s_db = NULL;
7210	ASSERT(s->s_singlelockwanted == 0);
7211	kmem_free(s, sizeof (mddb_set_t));
7212
7213	/* Take care of things setup in the md_set array */
7214	if (! (md_get_setstatus(setno) & MD_SET_KEEPTAG)) {
7215		if (md_set[setno].s_dtp) {
7216			kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
7217			md_set[setno].s_dtp = NULL;
7218		}
7219	}
7220
7221	md_clr_setstatus(setno, MD_SET_ACCOK | MD_SET_ACCEPT |
7222	    MD_SET_TAGDATA | MD_SET_USETAG | MD_SET_TOOFEW | MD_SET_STALE |
7223	    MD_SET_OWNERSHIP | MD_SET_BADTAG | MD_SET_CLRTAG | MD_SET_MNSET |
7224	    MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK | MD_SET_MN_MIR_STATE_RC |
7225	    MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);
7226
7227	mutex_exit(SETMUTEX(setno));
7228}
7229
7230/*
7231 * returns 0 if name can be put into locator block
7232 * returns 1 if locator block prefixes are all used
7233 *
7234 * Takes splitname (suffix, prefix, sideno) and
7235 * stores it in the locator name structure.
7236 * For traditional diskset, the sideno is the index into the suffixes
7237 * array in the locator name structure.
7238 * For the MN diskset, the sideno is the nodeid which can be any number,
7239 * so the index passed in is the index into the mnsuffixes array
7240 * in the locator structure.  This index was computed by the
7241 * routine checklocator which basically checked the locator block
7242 * mnside locator structure.
7243 */
7244static int
7245splitname2locatorblock(
7246	md_splitname	*spn,
7247	mddb_ln_t	*lnp,
7248	int		li,
7249	side_t		sideno,
7250	int		index
7251)
7252{
7253	uchar_t			i;
7254	md_name_suffix		*sn;
7255	md_mnname_suffix_t	*mnsn;
7256	mddb_mnln_t		*mnlnp;
7257
7258	for (i = 0; i < MDDB_PREFIXCNT; i++) {
7259		if (lnp->ln_prefixes[i].pre_len != SPN_PREFIX(spn).pre_len)
7260			continue;
7261		if (bcmp(lnp->ln_prefixes[i].pre_data, SPN_PREFIX(spn).pre_data,
7262		    SPN_PREFIX(spn).pre_len) == 0)
7263			break;
7264	}
7265	if (i == MDDB_PREFIXCNT) {
7266		for (i = 0; i < MDDB_PREFIXCNT; i++) {
7267			if (lnp->ln_prefixes[i].pre_len == 0)
7268				break;
7269		}
7270		if (i == MDDB_PREFIXCNT)
7271			return (1);
7272		bcopy(SPN_PREFIX(spn).pre_data, lnp->ln_prefixes[i].pre_data,
7273		    SPN_PREFIX(spn).pre_len);
7274		lnp->ln_prefixes[i].pre_len = SPN_PREFIX(spn).pre_len;
7275	}
7276
7277	if (lnp->ln_revision == MDDB_REV_MNLN) {
7278		/* If a MN diskset, use index */
7279		mnlnp = (mddb_mnln_t *)lnp;
7280		mnsn = &mnlnp->ln_mnsuffixes[index][li];
7281		mnsn->mn_ln_sideno = sideno;
7282		mnsn->mn_ln_suffix.suf_len = SPN_SUFFIX(spn).suf_len;
7283		mnsn->mn_ln_suffix.suf_prefix = i;
7284		bcopy(SPN_SUFFIX(spn).suf_data,
7285		    mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_len);
7286	} else {
7287		sn = &lnp->ln_suffixes[sideno][li];
7288		sn->suf_len = SPN_SUFFIX(spn).suf_len;
7289		sn->suf_prefix = i;
7290		bcopy(SPN_SUFFIX(spn).suf_data, sn->suf_data,
7291		    SPN_SUFFIX(spn).suf_len);
7292	}
7293	return (0);
7294}
7295
7296/*
7297 * Find the locator name for the given sideno and convert the locator name
7298 * information into a splitname structure.
7299 */
7300void
7301mddb_locatorblock2splitname(
7302	mddb_ln_t	*lnp,
7303	int		li,
7304	side_t		sideno,
7305	md_splitname	*spn
7306)
7307{
7308	int			iprefix;
7309	md_name_suffix		*sn;
7310	md_mnname_suffix_t	*mnsn;
7311	int			i;
7312	mddb_mnln_t		*mnlnp;
7313
7314	if (lnp->ln_revision == MDDB_REV_MNLN) {
7315		mnlnp = (mddb_mnln_t *)lnp;
7316		for (i = 0; i < MD_MNMAXSIDES; i++) {
7317			mnsn = &mnlnp->ln_mnsuffixes[i][li];
7318			if (mnsn->mn_ln_sideno == sideno)
7319				break;
7320		}
7321		if (i == MD_MNMAXSIDES)
7322			return;
7323
7324		SPN_SUFFIX(spn).suf_len = mnsn->mn_ln_suffix.suf_len;
7325		bcopy(mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_data,
7326		    SPN_SUFFIX(spn).suf_len);
7327		iprefix = mnsn->mn_ln_suffix.suf_prefix;
7328	} else {
7329		sn = &lnp->ln_suffixes[sideno][li];
7330		SPN_SUFFIX(spn).suf_len = sn->suf_len;
7331		bcopy(sn->suf_data, SPN_SUFFIX(spn).suf_data,
7332		    SPN_SUFFIX(spn).suf_len);
7333		iprefix = sn->suf_prefix;
7334	}
7335	SPN_PREFIX(spn).pre_len = lnp->ln_prefixes[iprefix].pre_len;
7336	bcopy(lnp->ln_prefixes[iprefix].pre_data, SPN_PREFIX(spn).pre_data,
7337	    SPN_PREFIX(spn).pre_len);
7338}
7339
7340static int
7341getdeldev(
7342	mddb_config_t	*cp,
7343	int		command,
7344	md_error_t	*ep
7345)
7346{
7347	mddb_set_t	*s;
7348	mddb_lb_t	*lbp;
7349	mddb_locator_t	*locators;
7350	uint_t		loccnt;
7351	mddb_mb_ic_t	*mbip;
7352	mddb_block_t	blk;
7353	int		err = 0;
7354	int		i, j;
7355	int		li;
7356	uint_t		commitcnt;
7357	set_t		setno = cp->c_setno;
7358	uint_t		set_status;
7359	md_dev64_t	dev;
7360	int		flags = MDDB_MUSTEXIST;
7361
7362	cp->c_dbmax = MDDB_NLB;
7363
7364	/*
7365	 * Data checking
7366	 */
7367	if (setno >= md_nsets || cp->c_id < 0 ||
7368	    cp->c_id > cp->c_dbmax) {
7369		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
7370	}
7371
7372	if (cp->c_flags & MDDB_C_STALE)
7373		flags |= MDDB_MN_STALE;
7374
7375	if ((s = mddb_setenter(setno, flags, &err)) == NULL)
7376		return (mddbstatus2error(ep, err, NODEV32, setno));
7377
7378	cp->c_flags = 0;
7379
7380	lbp = s->s_lbp;
7381	loccnt = lbp->lb_loccnt;
7382	locators = lbp->lb_locators;
7383
7384	/* shorthand */
7385	set_status = md_get_setstatus(setno);
7386
7387	if (set_status & MD_SET_STALE)
7388		cp->c_flags |= MDDB_C_STALE;
7389
7390	if (set_status & MD_SET_TOOFEW)
7391		cp->c_flags |= MDDB_C_TOOFEW;
7392
7393	cp->c_sideno = s->s_sideno;
7394
7395	cp->c_dbcnt = 0;
7396	/*
7397	 * go through and count active entries
7398	 */
7399	for (i = 0; i < loccnt;	 i++) {
7400		if (locators[i].l_flags & MDDB_F_DELETED)
7401			continue;
7402		cp->c_dbcnt++;
7403	}
7404
7405	/*
7406	 * add the ability to accept a locator block index
7407	 * which is not relative to previously deleted replicas.  This
7408	 * is for support of MD_DEBUG=STAT in metastat since it asks for
7409	 * replica information specifically for each of the mirror resync
7410	 * records.  MDDB_CONFIG_SUBCMD uses one of the pad spares in
7411	 * the mddb_config_t type.
7412	 */
7413	if (cp->c_subcmd == MDDB_CONFIG_ABS) {
7414		if (cp->c_id < 0 || cp->c_id > cp->c_dbmax) {
7415			mddb_setexit(s);
7416			return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
7417			    setno));
7418		}
7419		li = cp->c_id;
7420	} else {
7421		if (cp->c_id >= cp->c_dbcnt) {
7422			mddb_setexit(s);
7423			return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
7424			    setno));
7425		}
7426
7427		/* CSTYLED */
7428		for (li = 0, j = 0; /* void */; li++) {
7429			if (locators[li].l_flags & MDDB_F_DELETED)
7430				continue;
7431			j++;
7432			if (j > cp->c_id)
7433				break;
7434		}
7435	}
7436
7437	if (command == MDDB_ENDDEV) {
7438		daddr_t ib = 0, jb;
7439
7440		blk = 0;
7441		if ((s != NULL) && s->s_mbiarray[li]) {
7442			mbip = s->s_mbiarray[li];
7443			while ((jb = getphysblk(blk++, mbip)) > 0) {
7444				if (jb > ib)
7445					ib = jb;
7446			}
7447			cp->c_dbend = (int)ib;
7448		} else {
7449			cp->c_dbend = 0;
7450		}
7451	}
7452
7453	locator2cfgloc(lbp, &cp->c_locator, li, s->s_sideno, s->s_did_icp);
7454	mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno, &cp->c_devname);
7455
7456	if (command != MDDB_DELDEV) {
7457		mddb_setexit(s);
7458		return (0);
7459	}
7460
7461	/* Currently don't allow addition/deletion of sides during upgrade */
7462	if (MD_UPGRADE) {
7463		cmn_err(CE_WARN,
7464		    "Deletion of replica not allowed during upgrade.\n");
7465		mddb_setexit(s);
7466		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
7467	}
7468
7469	/*
7470	 * If here, replica delete in progress.
7471	 */
7472	single_thread_start(s);
7473
7474	if ((! (locators[li].l_flags & MDDB_F_EMASTER)) &&
7475	    (locators[li].l_flags & MDDB_F_ACTIVE)) {
7476		commitcnt = lbp->lb_commitcnt;
7477		lbp->lb_commitcnt = 0;
7478		setidentifier(s, &lbp->lb_ident);
7479		crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
7480		/*
7481		 * Don't need to write out device id area, since locator
7482		 * block on this replica is being deleted by setting the
7483		 * commitcnt to 0.
7484		 */
7485		(void) writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
7486		    MDDB_WR_ONLY_MASTER);
7487		lbp->lb_commitcnt = commitcnt;
7488	}
7489
7490	if (s->s_mbiarray[li])
7491		free_mbipp(&s->s_mbiarray[li]);
7492
7493	if (! (locators[li].l_flags & MDDB_F_EMASTER)) {
7494		dev = md_expldev(locators[li].l_dev);
7495		dev = md_xlate_targ_2_mini(dev);
7496		if (dev != NODEV64)
7497			mddb_devclose(dev);
7498	}
7499
7500	s->s_mbiarray[li] = 0;
7501	lbp->lb_locators[li].l_flags = MDDB_F_DELETED;
7502
7503	/* Only support data tags for traditional and local sets */
7504	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
7505	    (!(lbp->lb_flags & MDDB_MNSET)) &&
7506	    setno != MD_LOCAL_SET)
7507		if (set_dtag(s, ep))
7508			mdclrerror(ep);
7509
7510	/* Write data tags to all accessible devices */
7511	/* Only support data tags for traditional and local sets */
7512	if (!(lbp->lb_flags & MDDB_MNSET)) {
7513		(void) dt_write(s);
7514	}
7515
7516	/* Delete device id of deleted replica */
7517	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
7518		(void) mddb_devid_delete(s, li);
7519	}
7520	/* write new locator to all devices */
7521	err = writelocall(s);
7522
7523	(void) upd_med(s, "getdeldev(0)");
7524
7525	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_REPLICA, setno,
7526	    md_expldev(locators[li].l_dev));
7527
7528	computefreeblks(s); /* recompute always it may be larger */
7529	cp->c_dbcnt--;
7530	err |= fixoptrecords(s);
7531	if (err) {
7532		if (writeretry(s)) {
7533			single_thread_end(s);
7534			mddb_setexit(s);
7535			return (mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno));
7536		}
7537	}
7538
7539	single_thread_end(s);
7540	mddb_setexit(s);
7541	return (0);
7542}
7543
7544static int
7545getdriver(
7546	mddb_cfg_loc_t	*clp
7547)
7548{
7549	major_t		majordev;
7550
7551	/*
7552	 * Data checking
7553	 */
7554	if (clp->l_dev <= 0)
7555		return (EINVAL);
7556
7557	majordev = getmajor(expldev(clp->l_dev));
7558
7559	if (ddi_major_to_name(majordev) == (char *)NULL)
7560		return (EINVAL);
7561
7562	if (MD_UPGRADE)
7563		(void) strcpy(clp->l_driver, md_targ_major_to_name(majordev));
7564	else
7565		(void) strcpy(clp->l_driver, ddi_major_to_name(majordev));
7566	return (0);
7567}
7568
7569/*
7570 * update_valid_replica - updates the locator block namespace (prefix
7571 * 	and/or suffix) with new pathname and devname.
7572 *	RETURN
7573 *		1	Error
7574 *		0	Success
7575 */
7576static int
7577update_valid_replica(
7578	side_t		side,
7579	mddb_locator_t	*lp,
7580	mddb_set_t	*s,
7581	int		li,
7582	char		*devname,
7583	char		*pathname,
7584	md_dev64_t	devt
7585)
7586{
7587	uchar_t		pre_len, suf_len;
7588	md_name_suffix	*sn;
7589	mddb_ln_t	*lnp;
7590	uchar_t		pre_index;
7591	uchar_t		i;
7592
7593	if (md_expldev(lp->l_dev) != devt) {
7594		return (0);
7595	}
7596
7597	if (pathname[strlen(pathname) - 1] == '/')
7598		pathname[strlen(pathname) - 1] = '\0';
7599
7600	pre_len = (uchar_t)strlen(pathname);
7601	suf_len = (uchar_t)strlen(devname);
7602
7603	if ((pre_len > MD_MAXPREFIX) || (suf_len > MD_MAXSUFFIX))
7604		return (1);
7605
7606	lnp = s->s_lnp;
7607
7608	/*
7609	 * Future note:  Need to do something here for the MN diskset case
7610	 * when device ids are supported in disksets.
7611	 * Can't add until merging devids_in_diskset code into code base
7612	 * Currently only called with side of 0.
7613	 */
7614
7615	sn = &lnp->ln_suffixes[side][li];
7616
7617	/*
7618	 * Check if prefix (Ex: /dev/dsk) needs to be changed.
7619	 * If new prefix is the same as the previous prefix - no change.
7620	 *
7621	 * If new prefix is not the same, check if new prefix
7622	 * matches an existing one.  If so, use that one.
7623	 *
7624	 * If new prefix doesn't exist, add a new prefix.  If not enough
7625	 * space, return failure.
7626	 */
7627	pre_index = sn->suf_prefix;
7628	/* Check if new prefix is the same as the old prefix. */
7629	if ((lnp->ln_prefixes[pre_index].pre_len != pre_len) ||
7630	    (bcmp(lnp->ln_prefixes[pre_index].pre_data, pathname,
7631	    pre_len) != 0)) {
7632		/* Check if new prefix is an already known prefix. */
7633		for (i = 0; i < MDDB_PREFIXCNT; i++) {
7634			if (lnp->ln_prefixes[i].pre_len != pre_len) {
7635				continue;
7636			}
7637			if (bcmp(lnp->ln_prefixes[i].pre_data, pathname,
7638			    pre_len) == 0) {
7639				break;
7640			}
7641		}
7642		/* If no match found for new prefix - add the new prefix */
7643		if (i == MDDB_PREFIXCNT) {
7644			for (i = 0; i < MDDB_PREFIXCNT; i++) {
7645				if (lnp->ln_prefixes[i].pre_len == 0)
7646					break;
7647			}
7648			/* No space to add new prefix - return failure */
7649			if (i == MDDB_PREFIXCNT) {
7650				return (1);
7651			}
7652			bcopy(pathname, lnp->ln_prefixes[i].pre_data, pre_len);
7653			lnp->ln_prefixes[i].pre_len = pre_len;
7654		}
7655		sn->suf_prefix = i;
7656	}
7657
7658	/* Now, update the suffix (Ex: c0t0d0s0) if needed */
7659	if ((sn->suf_len != suf_len) ||
7660	    (bcmp(sn->suf_data, devname, suf_len) != 0)) {
7661		bcopy(devname, sn->suf_data, suf_len);
7662		sn->suf_len = suf_len;
7663	}
7664	return (0);
7665}
7666
7667
7668/*
7669 * md_update_locator_namespace - If in devid style and active and the devid's
7670 *		exist and are valid update the locator namespace pathname
7671 *		and devname.
7672 *	RETURN
7673 *		1	Error
7674 *		0	Success
7675 */
7676int
7677md_update_locator_namespace(
7678	set_t		setno,		/* which set to get name from */
7679	side_t		side,
7680	char		*dname,
7681	char		*pname,
7682	md_dev64_t	devt
7683)
7684{
7685	mddb_set_t	*s;
7686	mddb_lb_t	*lbp;
7687	int		li;
7688	uint_t		flg;
7689	int		err = 0;
7690	mddb_ln_t	*lnp;
7691
7692	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
7693		return (1);
7694	single_thread_start(s);
7695	lbp = s->s_lbp;
7696	/* must be DEVID_STYLE */
7697	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
7698		for (li = 0; li < lbp->lb_loccnt; li++) {
7699			mddb_locator_t *lp = &lbp->lb_locators[li];
7700
7701			if (lp->l_flags & MDDB_F_DELETED) {
7702				continue;
7703			}
7704
7705			/* replica also must be active */
7706			if (lp->l_flags & MDDB_F_ACTIVE) {
7707				flg = s->s_did_icp->did_ic_blkp->
7708				    blk_info[li].info_flags;
7709				/* only update if did exists and is valid */
7710				if ((flg & MDDB_DID_EXISTS) &&
7711				    (flg & MDDB_DID_VALID)) {
7712					if (update_valid_replica(side, lp, s,
7713					    li, dname, pname, devt)) {
7714						err = 1;
7715						goto out;
7716					}
7717				}
7718			}
7719		}
7720	}
7721	lnp = s->s_lnp;
7722	uniqtime32(&lnp->ln_timestamp);
7723	if (lbp->lb_flags & MDDB_MNSET)
7724		lnp->ln_revision = MDDB_REV_MNLN;
7725	else
7726		lnp->ln_revision = MDDB_REV_LN;
7727	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
7728	err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
7729	    lbp->lb_lnblkcnt, 0);
7730	/*
7731	 * If a MN diskset and this is the master, set the PARSE_LOCNM
7732	 * flag in the mddb_set structure to show that the locator
7733	 * names have changed.
7734	 */
7735
7736	if ((lbp->lb_flags & MDDB_MNSET) &&
7737	    (md_set[s->s_setno].s_am_i_master)) {
7738		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
7739	}
7740out:
7741	single_thread_end(s);
7742	mddb_setexit(s);
7743	if (err)
7744		return (1);
7745	return (0);
7746}
7747
7748/*
7749 * update_locatorblock - for active entries in the locator block, check
7750 *		the devt to see if it matches the given devt. If so, and
7751 *		there is an associated device id which is not the same
7752 *		as the passed in devid, delete old devid and add a new one.
7753 *
7754 *		During import of replicated disksets, old_didptr contains
7755 *		the original disk's device id.  Use this device id in
7756 *		addition to the devt to determine if an entry is a match
7757 *		and should be updated with the new device id of the
7758 *		replicated disk.  Specifically, this is the case being handled:
7759 *
7760 *		Original_disk	Replicated_disk	Disk_Available_During_Import
7761 *		c1t1d0		c1t3d0		no - so old name c1t1d0 shown
7762 *		c1t2d0		c1t1d0		yes - name is c1t1d0
7763 *		c1t3d0		c1t2d0		yes - name is c1t2d0
7764 *
7765 *		Can't just match on devt since devt for the first and third
7766 *		disks will be the same, but the original disk's device id
7767 *		is known and can be used to distinguish which disk's
7768 *		replicated device id should be updated.
7769 *	RETURN
7770 *		MDDB_E_NODEVID
7771 *		MDDB_E_NOLOCBLK
7772 *		1	Error
7773 *		0	Success
7774 */
7775static int
7776update_locatorblock(
7777	mddb_set_t	*s,
7778	md_dev64_t	dev,
7779	ddi_devid_t	didptr,
7780	ddi_devid_t	old_didptr
7781)
7782{
7783	mddb_lb_t	*lbp = NULL;
7784	mddb_locator_t	*lp;
7785	int		li;
7786	uint_t		flg;
7787	ddi_devid_t	devid_ptr;
7788	int		retval = 0;
7789	char		*minor_name;
7790	int		repl_import_flag;
7791
7792	/* Set replicated flag if this is a replicated import */
7793	repl_import_flag = md_get_setstatus(s->s_setno) &
7794	    MD_SET_REPLICATED_IMPORT;
7795
7796	lbp = s->s_lbp;
7797	/* find replicas that haven't been deleted */
7798	for (li = 0; li < lbp->lb_loccnt; li++) {
7799		lp = &lbp->lb_locators[li];
7800
7801		if ((lp->l_flags & MDDB_F_DELETED)) {
7802			continue;
7803		}
7804		/*
7805		 * check to see if locator devt matches given dev
7806		 * and if there is a device ID associated with it
7807		 */
7808		flg = s->s_did_icp->did_ic_blkp-> blk_info[li].info_flags;
7809		if ((md_expldev(lp->l_dev) == dev) &&
7810		    (flg & MDDB_DID_EXISTS)) {
7811			if (flg & MDDB_DID_VALID) {
7812				continue; /* cont to nxt active entry */
7813			}
7814			devid_ptr = s->s_did_icp->did_ic_devid[li];
7815			if (devid_ptr == NULL) {
7816				return (MDDB_E_NODEVID);
7817			}
7818
7819			/*
7820			 * During a replicated import the old_didptr
7821			 * must match the current devid before the
7822			 * devid can be updated.
7823			 */
7824			if (repl_import_flag) {
7825				if (ddi_devid_compare(devid_ptr,
7826				    old_didptr) != 0)
7827					continue;
7828			}
7829
7830			if (ddi_devid_compare(devid_ptr, didptr) != 0) {
7831				/*
7832				 * devid's not equal so
7833				 * delete and add
7834				 */
7835				if (ddi_lyr_get_minor_name(
7836				    md_dev64_to_dev(dev),
7837				    S_IFBLK, &minor_name) == DDI_SUCCESS) {
7838					(void) mddb_devid_delete(s, li);
7839					(void) mddb_devid_add(s, li, didptr,
7840					    minor_name);
7841					kmem_free(minor_name,
7842					    strlen(minor_name)+1);
7843					break;
7844				} else {
7845					retval = 1;
7846					goto err_out;
7847				}
7848			}
7849		}
7850	} /* end for */
7851	retval = push_lb(s);
7852	(void) upd_med(s, "update_locatorblock(0)");
7853err_out:
7854	return (retval);
7855}
7856
7857static int
7858update_mb_devid(
7859	mddb_set_t	*s,
7860	mddb_ri_t	*rip,
7861	ddi_devid_t	devidptr
7862)
7863{
7864	mddb_mb_ic_t	*mbip;
7865	mddb_mb_t	*mb = NULL;
7866	daddr_t		blkno;
7867	md_dev64_t	device;
7868	uint_t		sz;
7869	int		mb2free = 0;
7870	int		err = 0;
7871
7872
7873	/*
7874	 * There is case where a disk may not have mddb,
7875	 * and only has dummy mddb which contains
7876	 * a valid devid we like to update and in this
7877	 * case, the rip_lbp will be NULL but we still
7878	 * like to update the devid embedded in the
7879	 * dummy mb block.
7880	 *
7881	 */
7882	if (rip->ri_mbip != (mddb_mb_ic_t *)NULL) {
7883		mbip = rip->ri_mbip;
7884		mb = &mbip->mbi_mddb_mb;
7885	} else {
7886		/*
7887		 * Done if it is non-replicated set
7888		 */
7889		if (devidptr != (ddi_devid_t)NULL) {
7890			mb = (mddb_mb_t *)kmem_zalloc(MDDB_BSIZE,
7891			    KM_SLEEP);
7892			mb->mb_magic = MDDB_MAGIC_DU;
7893			mb->mb_revision = MDDB_REV_MB;
7894			mb2free = 1;
7895		} else {
7896			goto out;
7897		}
7898	}
7899
7900	blkno = rip->ri_blkno;
7901	device = rip->ri_dev;
7902	/*
7903	 * Replace the mb_devid with the new/valid one
7904	 */
7905	if (devidptr != (ddi_devid_t)NULL) {
7906		/*
7907		 * Zero out what we have previously
7908		 */
7909		if (mb->mb_devid_len)
7910			bzero(mb->mb_devid, mb->mb_devid_len);
7911		sz = ddi_devid_sizeof(devidptr);
7912		bcopy((char *)devidptr, (char *)mb->mb_devid, sz);
7913		mb->mb_devid_len = sz;
7914	}
7915
7916	mb->mb_setno = s->s_setno;
7917	uniqtime32(&mb->mb_timestamp);
7918	crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL);
7919	/*
7920	 * putblks will
7921	 *
7922	 *	- drop the s_dbmx lock
7923	 *	- biowait
7924	 *	- regain the s_dbmx lock
7925	 *
7926	 * Need to update this if we wants to handle
7927	 * mb_next != NULL which it is unlikely will happen
7928	 */
7929	err = putblks(s, (caddr_t)mb, blkno, 1, device, 0);
7930
7931	if (mb2free) {
7932		kmem_free(mb, MDDB_BSIZE);
7933	}
7934out:
7935	return (err);
7936}
7937
7938static int
7939setdid(
7940	mddb_config_t		*cp
7941)
7942{
7943	ddi_devid_t		devidp;
7944	dev_t			ddi_dev;
7945	mddb_set_t		*s;
7946	int			err = 0;
7947	mddb_ri_t		*rip;
7948
7949	/*
7950	 * Data integrity check
7951	 */
7952	if (cp->c_setno >= md_nsets || cp->c_devt <= 0)
7953		return (EINVAL);
7954
7955	if ((md_get_setstatus(cp->c_setno) & MD_SET_STALE))
7956		return (0);
7957
7958	ddi_dev = md_dev64_to_dev(cp->c_devt);
7959	if (ddi_lyr_get_devid(ddi_dev, &devidp) != DDI_SUCCESS) {
7960		return (-1);
7961	}
7962	if (devidp == NULL) {
7963		return (-1);
7964	}
7965
7966	if ((s = mddb_setenter(cp->c_setno, MDDB_MUSTEXIST, &err)) == NULL)
7967		return (-1);
7968	single_thread_start(s);
7969
7970	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
7971		if (rip->ri_lbp == (mddb_lb_t *)NULL)
7972			continue;
7973		/*
7974		 * We only update what is asked
7975		 */
7976		if (rip->ri_dev == cp->c_devt) {
7977			if (update_mb_devid(s, rip, devidp) != 0) {
7978				err = -1;
7979				goto out;
7980			}
7981		}
7982	}
7983
7984	if (update_locatorblock(s, cp->c_devt, devidp, NULL)) {
7985		err = -1;
7986		goto out;
7987	}
7988
7989out:
7990	single_thread_end(s);
7991	mddb_setexit(s);
7992	ddi_devid_free(devidp);
7993	return (err);
7994}
7995
7996static int
7997delnewside(
7998	mddb_config_t		*cp,
7999	int			command,
8000	md_error_t		*ep
8001)
8002{
8003	mddb_set_t		*s;
8004	int			li;
8005	mddb_lb_t		*lbp;		/* pointer to locator block */
8006	mddb_ln_t		*lnp;		/* pointer to locator names */
8007	mddb_mnln_t		*mnlnp;		/* pointer to locator names */
8008	mddb_locator_t		*lp;
8009	mddb_sidelocator_t	*slp;
8010	mddb_cfg_loc_t		*clp;
8011	int			err = 0;
8012	set_t			setno = cp->c_setno;
8013	ddi_devid_t		devid;
8014	ddi_devid_t		ret_devid = NULL;
8015	char			*minor_name;
8016	uint_t			use_devid = 0;
8017	dev_t			ddi_dev;
8018	md_mnname_suffix_t	*mnsn;
8019	mddb_mnlb_t		*mnlbp;
8020	mddb_mnsidelocator_t	*mnslp;
8021
8022	/* Currently don't allow addition/deletion of sides during upgrade */
8023	if (MD_UPGRADE) {
8024		cmn_err(CE_WARN,
8025		    "Addition and deletion of sides not allowed"
8026		    " during upgrade. \n");
8027		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8028	}
8029
8030	/*
8031	 * Data integrity check
8032	 */
8033	if (setno >= md_nsets || cp->c_locator.l_dev <= 0)
8034		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
8035
8036	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
8037		return (mddbstatus2error(ep, err, NODEV32, setno));
8038
8039	single_thread_start(s);
8040	clp = &cp->c_locator;
8041
8042	lbp = s->s_lbp;
8043
8044	if (lbp->lb_setno != setno) {
8045		single_thread_end(s);
8046		mddb_setexit(s);
8047		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
8048	}
8049
8050	/*
8051	 * Find this device/blkno pair
8052	 */
8053	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
8054		ddi_dev = md_dev64_to_dev(clp->l_dev);
8055		if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) &&
8056		    (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, &minor_name)
8057		    == DDI_SUCCESS)) {
8058			if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) {
8059				clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
8060				use_devid = 1;
8061				(void) strcpy(clp->l_minor_name, minor_name);
8062			}
8063			kmem_free(minor_name, strlen(minor_name)+1);
8064		}
8065		if (use_devid != 1 && ret_devid != NULL)
8066			ddi_devid_free(ret_devid);
8067	}
8068	for (li = 0; li < lbp->lb_loccnt; li++) {
8069		lp = &lbp->lb_locators[li];
8070		if (lp->l_flags & MDDB_F_DELETED)
8071			continue;
8072		if (use_devid) {
8073			if ((mddb_devid_get(s, li, &devid, &minor_name)) == 0)
8074				continue;
8075			if ((ddi_devid_compare(devid,
8076			    (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) &&
8077			    (strcmp(clp->l_minor_name, minor_name) == 0) &&
8078			    ((daddr_t)lp->l_blkno == clp->l_blkno)) {
8079				break;
8080			}
8081		} else {
8082			if (lp->l_dev == clp->l_dev &&
8083			    (daddr_t)lp->l_blkno == clp->l_blkno) {
8084				break;
8085			}
8086		}
8087	}
8088
8089	if (li == lbp->lb_loccnt) {
8090		if (use_devid)
8091			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8092		single_thread_end(s);
8093		mddb_setexit(s);
8094		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
8095	}
8096
8097	lnp = s->s_lnp;
8098	if (command == MDDB_NEWSIDE) {
8099		int 	index = 0;
8100		/*
8101		 * If a MN diskset, need to find the index where the new
8102		 * locator information is to be stored in the mnsidelocator
8103		 * field of the locator block so that the locator name can
8104		 * be stored at the same array index in the mnsuffixes
8105		 * field of the locator names structure.
8106		 */
8107		if (lbp->lb_flags & MDDB_MNSET) {
8108			if ((index = checklocator(lbp, li,
8109			    cp->c_sideno)) == -1) {
8110				if (use_devid) {
8111					ddi_devid_free((ddi_devid_t)
8112					    (uintptr_t)clp->l_devid);
8113				}
8114				single_thread_end(s);
8115				mddb_setexit(s);
8116				return (mdmddberror(ep, MDE_DB_TOOSMALL,
8117				    NODEV32, setno));
8118			}
8119		}
8120
8121		/*
8122		 * Store the locator name before the sidelocator information
8123		 * in case a panic occurs between these 2 steps.  Must have
8124		 * the locator name information in order to print reasonable
8125		 * error information.
8126		 */
8127		if (splitname2locatorblock(&cp->c_devname, lnp, li,
8128		    cp->c_sideno, index)) {
8129			if (use_devid)
8130				ddi_devid_free(
8131				    (ddi_devid_t)(uintptr_t)clp->l_devid);
8132			single_thread_end(s);
8133			mddb_setexit(s);
8134			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
8135			    setno));
8136		}
8137
8138		if (cfgloc2locator(lbp, clp, li, cp->c_sideno, index)) {
8139			if (use_devid)
8140				ddi_devid_free(
8141				    (ddi_devid_t)(uintptr_t)clp->l_devid);
8142			single_thread_end(s);
8143			mddb_setexit(s);
8144			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
8145			    setno));
8146		}
8147	}
8148
8149	if (use_devid)
8150		ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8151
8152	if (command == MDDB_DELSIDE) {
8153		int i;
8154		for (i = 0; i < lbp->lb_loccnt; i++) {
8155			if (lbp->lb_flags & MDDB_MNSET) {
8156				int	j;
8157				mnlbp = (mddb_mnlb_t *)lbp;
8158				for (j = 0; j < MD_MNMAXSIDES; j++) {
8159					mnslp = &mnlbp->lb_mnsidelocators[j][i];
8160					if (mnslp->mnl_sideno == cp->c_sideno)
8161						break;
8162				}
8163				if (j < MD_MNMAXSIDES) {
8164					mnslp->mnl_mnum = NODEV32;
8165					mnslp->mnl_sideno = 0;
8166					mnlnp = (mddb_mnln_t *)lnp;
8167					mnsn = &(mnlnp->ln_mnsuffixes[j][i]);
8168					bzero((caddr_t)mnsn,
8169					    sizeof (md_mnname_suffix_t));
8170				}
8171			} else {
8172				slp = &lbp->lb_sidelocators[cp->c_sideno][i];
8173				bzero((caddr_t)&lnp->ln_suffixes
8174				    [cp->c_sideno][i], sizeof (md_name_suffix));
8175				slp->l_mnum = NODEV32;
8176			}
8177		}
8178	}
8179
8180	/* write new locator names to all devices */
8181	uniqtime32(&lnp->ln_timestamp);
8182	if (lbp->lb_flags & MDDB_MNSET)
8183		lnp->ln_revision = MDDB_REV_MNLN;
8184	else
8185		lnp->ln_revision = MDDB_REV_LN;
8186	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
8187	err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
8188	    lbp->lb_lnblkcnt, 0);
8189	/*
8190	 * If a MN diskset and this is the master, set the PARSE_LOCNM
8191	 * flag in the mddb_set structure to show that the locator
8192	 * names have changed.
8193	 */
8194
8195	if ((lbp->lb_flags & MDDB_MNSET) &&
8196	    (md_set[s->s_setno].s_am_i_master)) {
8197		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
8198	}
8199	if (err) {
8200		if (writeretry(s)) {
8201			single_thread_end(s);
8202			mddb_setexit(s);
8203			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8204		}
8205	}
8206
8207	uniqtime32(&lbp->lb_timestamp);
8208	/* write new locator to all devices */
8209	err = writelocall(s);
8210
8211	(void) upd_med(s, "delnewside(0)");
8212
8213	computefreeblks(s); /* recompute always it may be larger */
8214	if (err) {
8215		if (writeretry(s)) {
8216			single_thread_end(s);
8217			mddb_setexit(s);
8218			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8219		}
8220	}
8221
8222	single_thread_end(s);
8223	mddb_setexit(s);
8224
8225	return (0);
8226}
8227
8228static int
8229newdev(
8230	mddb_config_t	*cp,
8231	int		command,
8232	md_error_t	*ep
8233)
8234{
8235	mddb_set_t	*s;
8236	mddb_mb_ic_t	*mbip, *mbip1;
8237	int		i, j;
8238	int		li;
8239	mddb_lb_t	*lbp;		/* pointer to locator block */
8240	mddb_ln_t	*lnp;		/* pointer to locator names */
8241	mddb_locator_t	*lp;
8242	mddb_cfg_loc_t	*clp;
8243	int		err = 0;
8244	set_t		setno = cp->c_setno;
8245	ddi_devid_t	devid2;
8246	ddi_devid_t	ret_devid = NULL;
8247	char		*minor_name;
8248	uint_t		use_devid = 0;
8249	dev_t		ddi_dev;
8250	int		old_flags;
8251	int		flags;
8252	int		mn_set = 0;
8253	int		index;
8254
8255
8256	/* Currently don't allow addition of new replica during upgrade */
8257	if (MD_UPGRADE) {
8258		cmn_err(CE_WARN,
8259		    "Addition of new replica not allowed during upgrade.\n");
8260		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8261	}
8262
8263	/*
8264	 * Data integrity check
8265	 */
8266	if (setno >= md_nsets || cp->c_locator.l_dev <= 0)
8267		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
8268
8269	/* Determine the flag settings for multinode sets */
8270	flags = MDDB_NOOLDOK;
8271	if (cp->c_multi_node)
8272		flags |= MDDB_MULTINODE;
8273
8274	if ((s = mddb_setenter(setno, flags, &err)) == NULL) {
8275		if (err != MDDB_E_NOTOWNER)
8276			return (mddbstatus2error(ep, err, NODEV32, setno));
8277		s = init_set(cp, flags, &err);
8278		if (s == NULL)
8279			return (mddbstatus2error(ep, err, NODEV32, setno));
8280	}
8281
8282	single_thread_start(s);
8283
8284	/* shorthand */
8285	clp = &cp->c_locator;
8286
8287	/* shorthand */
8288	lbp = s->s_lbp;
8289
8290	if (lbp->lb_setno != setno) {
8291		single_thread_end(s);
8292		mddb_setexit(s);
8293		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
8294	}
8295
8296	/*
8297	 * See if this device/blkno pair is already a replica
8298	 */
8299	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
8300		ddi_dev = expldev(clp->l_dev);
8301		if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) &&
8302		    (ddi_lyr_get_minor_name(ddi_dev,
8303		    S_IFBLK, &minor_name) == DDI_SUCCESS)) {
8304			if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) {
8305				clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
8306				use_devid = 1;
8307				(void) strcpy(clp->l_minor_name, minor_name);
8308			}
8309			kmem_free(minor_name, strlen(minor_name)+1);
8310		}
8311		if (use_devid != 1 && ret_devid != NULL)
8312			ddi_devid_free(ret_devid);
8313	}
8314
8315	for (i = 0; i < lbp->lb_loccnt;	 i++) {
8316		lp = &lbp->lb_locators[i];
8317		if (lp->l_flags & MDDB_F_DELETED)
8318			continue;
8319		if (use_devid) {
8320			if ((mddb_devid_get(s, i, &devid2, &minor_name)) == 0)
8321				continue;
8322			if ((ddi_devid_compare(devid2,
8323			    (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) &&
8324			    (strcmp(clp->l_minor_name, minor_name) == 0) &&
8325			    ((daddr_t)lp->l_blkno == clp->l_blkno)) {
8326				if (command == MDDB_NEWDEV) {
8327					ddi_devid_free((ddi_devid_t)(uintptr_t)
8328					    clp->l_devid);
8329					single_thread_end(s);
8330					mddb_setexit(s);
8331					return (mdmddberror(ep,
8332					    MDE_DB_EXISTS, NODEV32, setno));
8333				}
8334			}
8335		} else {
8336			if (lp->l_dev == clp->l_dev &&
8337			    (daddr_t)lp->l_blkno == clp->l_blkno) {
8338				if (command == MDDB_NEWDEV) {
8339					single_thread_end(s);
8340					mddb_setexit(s);
8341					return (mdmddberror(ep,
8342					    MDE_DB_EXISTS, NODEV32, setno));
8343				}
8344			}
8345		}
8346	}
8347
8348	/*
8349	 * Really is a new replica, go get the master blocks
8350	 */
8351	mbip = getmasters(s, md_expldev(clp->l_dev), clp->l_blkno,
8352	    (uint_t *)0, &mn_set);
8353	if (! mbip) {
8354		if (use_devid)
8355			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8356		single_thread_end(s);
8357		mddb_setexit(s);
8358		return (mdmddberror(ep, MDE_DB_MASTER, NODEV32, setno));
8359	}
8360
8361	/*
8362	 * Compute free blocks in replica.
8363	 */
8364	computefreeblks(s);
8365
8366	/*
8367	 * Check if this is large enough
8368	 */
8369	for (mbip1 = mbip, i = 0; mbip1 != NULL; mbip1 = mbip1->mbi_next)
8370		i += mbip1->mbi_mddb_mb.mb_blkcnt;
8371	for (j = i; j < s->s_totalblkcnt; j++) {
8372		if (blkcheck(s, j)) {
8373			while (mbip) {
8374				mbip1 = mbip->mbi_next;
8375				kmem_free((caddr_t)mbip, MDDB_IC_BSIZE);
8376				mbip = mbip1;
8377			}
8378			if (use_devid)
8379				ddi_devid_free(
8380				    (ddi_devid_t)(uintptr_t)clp->l_devid);
8381			mddb_devclose(md_expldev(clp->l_dev));
8382			single_thread_end(s);
8383			mddb_setexit(s);
8384			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
8385			    setno));
8386		}
8387	}
8388
8389	/* Look for a deleted slot */
8390	for (li = 0; li < lbp->lb_loccnt; li++) {
8391		lp = &lbp->lb_locators[li];
8392		if (lp->l_flags & MDDB_F_DELETED)
8393			break;
8394	}
8395
8396	/* If no deleted slots, add a new one */
8397	if (li == lbp->lb_loccnt) {
8398		/* Already have the max replicas, bail */
8399		if (lbp->lb_loccnt == MDDB_NLB) {
8400			if (use_devid)
8401				ddi_devid_free((ddi_devid_t)(uintptr_t)
8402				    clp->l_devid);
8403			mddb_devclose(md_expldev(clp->l_dev));
8404			single_thread_end(s);
8405			mddb_setexit(s);
8406			return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32,
8407			    setno));
8408		}
8409		lbp->lb_loccnt++;
8410		lp = &lbp->lb_locators[li];
8411	}
8412
8413	/* Initialize the new or deleted slot */
8414	old_flags = lp->l_flags;
8415	lp->l_dev = clp->l_dev;
8416	lp->l_blkno = (daddr32_t)clp->l_blkno;
8417	lp->l_flags = clp->l_flags;
8418
8419	/* shorthand */
8420	lnp = s->s_lnp;
8421
8422	index = 0;
8423	if ((lbp->lb_flags & MDDB_MNSET) || (flags & MDDB_MULTINODE)) {
8424		/*
8425		 * If a MN diskset, need to find the index where the new
8426		 * locator information is to be stored in the mnsidelocator
8427		 * field of the locator block so that the locator name can
8428		 * be stored at the same array index in the mnsuffixes
8429		 * field of the locator names structure.
8430		 */
8431		lbp->lb_flags |= MDDB_MNSET;
8432		if ((index = checklocator(lbp, li, s->s_sideno)) == -1) {
8433			if (use_devid)
8434				ddi_devid_free((ddi_devid_t)(uintptr_t)clp->
8435				    l_devid);
8436			lp->l_flags = old_flags;
8437			lbp->lb_loccnt--;
8438			mddb_devclose(md_expldev(clp->l_dev));
8439			single_thread_end(s);
8440			mddb_setexit(s);
8441			return (mdmddberror(ep, MDE_DB_TOOSMALL,
8442			    NODEV32, setno));
8443		}
8444	}
8445	/*
8446	 * Store the locator name before the sidelocator information
8447	 * in case a panic occurs between these 2 steps.  Must have
8448	 * the locator name information in order to print reasonable
8449	 * error information.
8450	 */
8451	if (splitname2locatorblock(&cp->c_devname, lnp, li,
8452	    s->s_sideno, index)) {
8453		if (use_devid)
8454			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8455		lp->l_flags = old_flags;
8456		lbp->lb_loccnt--;
8457		mddb_devclose(md_expldev(clp->l_dev));
8458		single_thread_end(s);
8459		mddb_setexit(s);
8460		return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno));
8461	}
8462
8463	/*
8464	 * Compute free blocks in replica before calling cfgloc2locator
8465	 * since cfgloc2locator may attempt to alloc an unused block
8466	 * to store the device id.
8467	 * mbiarray needs to be setup before calling computefreeblks.
8468	 */
8469	s->s_mbiarray[li] = mbip;
8470	computefreeblks(s);
8471
8472	if (cfgloc2locator(lbp, clp, li, s->s_sideno, index)) {
8473		if (use_devid)
8474			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8475		lp->l_flags = old_flags;
8476		lbp->lb_loccnt--;
8477		s->s_mbiarray[li] = 0;
8478		mddb_devclose(md_expldev(clp->l_dev));
8479		single_thread_end(s);
8480		mddb_setexit(s);
8481		return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno));
8482	}
8483
8484	if (use_devid)
8485		ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8486
8487	uniqtime32(&lbp->lb_timestamp);
8488	lp->l_flags = MDDB_F_ACTIVE;
8489
8490	/* write db copy to new device */
8491	err = writecopy(s, li, MDDB_WRITECOPY_ALL);
8492	lp->l_flags |= MDDB_F_UP2DATE;
8493
8494	/* write new locator names to all devices */
8495	uniqtime32(&lnp->ln_timestamp);
8496	if (lbp->lb_flags & MDDB_MNSET)
8497		lnp->ln_revision = MDDB_REV_MNLN;
8498	else
8499		lnp->ln_revision = MDDB_REV_LN;
8500	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
8501	err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
8502	    lbp->lb_lnblkcnt, 0);
8503	/*
8504	 * If a MN diskset and this is the master, set the PARSE_LOCNM
8505	 * flag in the mddb_set structure to show that the locator
8506	 * names have changed.
8507	 */
8508
8509	if ((lbp->lb_flags & MDDB_MNSET) &&
8510	    (md_set[s->s_setno].s_am_i_master)) {
8511		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
8512	}
8513	if (err) {
8514		if (writeretry(s)) {
8515			single_thread_end(s);
8516			mddb_setexit(s);
8517			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8518		}
8519	}
8520
8521	/* Data tags not supported on MN sets */
8522	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
8523	    (!(lbp->lb_flags & MDDB_MNSET)) &&
8524	    setno != MD_LOCAL_SET)
8525		if (set_dtag(s, ep))
8526			mdclrerror(ep);
8527
8528	/* Write data tags to all accessible devices */
8529	/* Data tags not supported on MN sets */
8530	if (!(lbp->lb_flags & MDDB_MNSET)) {
8531		(void) dt_write(s);
8532	}
8533
8534	/* write new locator to all devices */
8535	err = writelocall(s);
8536
8537	(void) upd_med(s, "newdev(0)");
8538
8539	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_REPLICA, setno,
8540	    md_expldev(clp->l_dev));
8541
8542	computefreeblks(s); /* recompute always it may be smaller */
8543	if (err) {
8544		if (writeretry(s)) {
8545			single_thread_end(s);
8546			mddb_setexit(s);
8547			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8548		}
8549	}
8550
8551	single_thread_end(s);
8552	mddb_setexit(s);
8553
8554	return (0);
8555}
8556
8557#ifdef DEBUG
8558static void
8559mddb_check_set(
8560	set_t	setno
8561)
8562{
8563	mddb_set_t	*s;
8564	mddb_db_t	*dbp;
8565	mddb_de_ic_t	*dep;
8566	mddb_rb32_t	*rbp;
8567
8568	if (! md_set[setno].s_db)
8569		return;
8570
8571	s = (mddb_set_t *)md_set[setno].s_db;
8572
8573	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8574		for (dep = dbp->db_firstentry;
8575		    dep != NULL; dep = dep->de_next) {
8576			rbp = dep->de_rb;
8577			ASSERT(rbp->rb_magic == MDDB_MAGIC_RB);
8578			if (dep->de_rb_userdata)
8579				ASSERT((uintptr_t)dep->de_rb_userdata > 2000);
8580		}
8581	}
8582}
8583#endif /* DEBUG */
8584
8585/*
8586 * Exported Entry Points
8587 */
8588#ifdef DEBUG
8589void
8590mddb_check(void)
8591{
8592	int	i;
8593
8594	for (i = 0; i < md_nsets; i++) {
8595		if (! md_set[i].s_db)
8596			return;
8597
8598		mddb_check_set(i);
8599	}
8600
8601}
8602#endif /* DEBUG */
8603
8604int
8605mddb_configure(
8606	mddb_cfgcmd_t	command,
8607	mddb_config_t	*cp
8608)
8609{
8610	mddb_set_t	*s;
8611	md_error_t	*ep = &cp->c_mde;
8612	int		flag = 0;
8613	int		err = 0;
8614	set_t		setno = cp->c_setno;
8615
8616	mdclrerror(ep);
8617
8618	switch (command) {
8619		case MDDB_NEWDEV:
8620			err = newdev(cp, command, ep);
8621			break;
8622
8623		case MDDB_NEWSIDE:
8624		case MDDB_DELSIDE:
8625			err = delnewside(cp, command, ep);
8626			break;
8627
8628		case MDDB_GETDEV:
8629		case MDDB_DELDEV:
8630		case MDDB_ENDDEV:
8631			err = getdeldev(cp, command, ep);
8632			break;
8633
8634		case MDDB_GETDRVRNAME:
8635			err = getdriver(&cp->c_locator);
8636			break;
8637
8638		case MDDB_USEDEV:
8639			/*
8640			 * Note: must allow USEDEV ioctl during upgrade to
8641			 * support auto-take disksets.
8642			 *
8643			 * Also during the set import if the md_devid_destroy
8644			 * flag is set then error out
8645			 */
8646
8647			if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy)
8648				return (mdmderror(ep, MDE_INVAL_UNIT,
8649				    MD_ADM_MINOR));
8650
8651			if (setno >= md_nsets)
8652				return (mdmderror(ep, MDE_INVAL_UNIT,
8653				    MD_ADM_MINOR));
8654
8655			if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) ==
8656			    NULL) {
8657				if ((s = init_set(cp, MDDB_NOINIT, &err)) ==
8658				    NULL) {
8659					err = mddbstatus2error(ep, err,
8660					    NODEV32, setno);
8661					break;
8662				}
8663			}
8664			if (setno == MD_LOCAL_SET)
8665				flag = MDDB_F_IOCTL;
8666			if (cp->c_locator.l_old_devid) {
8667				md_set_setstatus(setno,
8668				    MD_SET_REPLICATED_IMPORT);
8669			}
8670			err = ridev(&s->s_rip, &cp->c_locator, NULL, flag);
8671			mddb_setexit(s);
8672			break;
8673
8674		case MDDB_RELEASESET:
8675			mutex_enter(&mddb_lock);
8676			mddb_unload_set(cp->c_setno);
8677			mutex_exit(&mddb_lock);
8678			break;
8679
8680		case MDDB_SETDID:
8681			err = setdid(cp);
8682			break;
8683
8684		default:
8685			err = mdmddberror(ep, MDE_DB_INVALID, NODEV32,
8686			    cp->c_setno);
8687	}
8688
8689	return (err);
8690}
8691
8692int
8693mddb_getoptloc(
8694	mddb_optloc_t		*ol
8695)
8696{
8697	mddb_set_t		*s;
8698	mddb_db_t		*dbp;
8699	mddb_de_ic_t		*dep;
8700	mddb_recid_t		id;
8701	set_t			setno;
8702
8703	ol->li[0] = -1;
8704	ol->li[1] = -1;
8705
8706	id = ol->recid;
8707	setno = DBSET(id);
8708	if (setno >= md_nsets)
8709		return (EINVAL);
8710
8711	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL)) == NULL)
8712		return (0);
8713
8714	id = DBID(id);
8715	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8716		for (dep = dbp->db_firstentry;
8717		    dep != NULL; dep = dep->de_next) {
8718			if (dep->de_recid != id)
8719				continue;
8720			ol->li[0] = dep->de_optinfo[0].o_li;
8721			ol->li[1] = dep->de_optinfo[1].o_li;
8722			mddb_setexit(s);
8723			return (0);
8724		}
8725	}
8726	mddb_setexit(s);
8727	return (0);
8728}
8729
8730void
8731mddb_init(void)
8732{
8733	mddb_set_t	*s;
8734
8735	mutex_init(&mddb_lock, NULL, MUTEX_DEFAULT, NULL);
8736	if ((s = init_set(NULL, MDDB_NOINIT, NULL)) != NULL)
8737		mddb_setexit(s);
8738}
8739
8740
8741void
8742mddb_unload(void)
8743{
8744	int	i;
8745
8746	mutex_enter(&mddb_lock);
8747
8748	for (i = 0; i < md_nsets; i++) {
8749		md_clr_setstatus(i, MD_SET_KEEPTAG);
8750		mddb_unload_set(i);
8751	}
8752
8753	crcfreetab();
8754
8755	mutex_exit(&mddb_lock);
8756}
8757
8758mddb_recid_t
8759mddb_createrec(
8760	size_t		usersize,	 /* size of db record */
8761	mddb_type_t	type,		 /* type1 of db record */
8762	uint_t		type2,		 /* type2 of db record */
8763	md_create_rec_option_t	options, /* options for this creation  */
8764	set_t		setno		 /* set number to create record in */
8765)
8766{
8767	mddb_set_t	*s;
8768	mddb_db_t	*dbp, *prevdbp, *newdbp;
8769	mddb_db32_t	*db32p;
8770	mddb_de_ic_t	*dep;
8771	/* LINTED variable unused - used for sizeof calculations */
8772	mddb_de32_t	*de32p;
8773	mddb_rb32_t	*rbp;
8774	size_t		recsize;
8775	ulong_t		blkcnt;
8776	ulong_t		maxblocks;
8777	size_t		desize, desize_ic;
8778	size_t		used;
8779	mddb_recid_t	newid;
8780	caddr_t		tmppnt;
8781	int		i, err = 0;
8782	void		*userdata;
8783	uint_t		flag_type;
8784
8785#if defined(_ILP32) && !defined(lint)
8786	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
8787	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
8788	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
8789#endif
8790
8791	/*
8792	 * everyone is supposed to sepcify if it's a
8793	 * 32 bit or a 64 bit record
8794	 */
8795	if ((options &(MD_CRO_32BIT|MD_CRO_64BIT)) == 0) {
8796		return (MDDB_E_INVALID);
8797	}
8798
8799	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
8800		return (err);
8801
8802	if (checkstate(s, MDDB_PROBE)) {
8803		mddb_setexit(s);
8804		return (MDDB_E_NOTNOW);
8805	}
8806
8807	recsize = roundup((sizeof (*rbp) - sizeof (rbp->rb_data)) +
8808	    usersize, MDDB_BSIZE);
8809	blkcnt = btodb(recsize);
8810
8811	if (mddb_maxblocks)
8812		maxblocks = mddb_maxblocks;
8813	else
8814		maxblocks = (MDDB_BSIZE - (sizeof (*db32p) + sizeof (*de32p) -
8815		    sizeof (de32p->de32_blks))) / sizeof (mddb_block_t);
8816
8817	if (blkcnt > maxblocks) {
8818		mddb_setexit(s);
8819		return (MDDB_E_INVALID);
8820	}
8821	/*
8822	 * allocate record block
8823	 * and new directory block so to avoid sleeping
8824	 * after starting single_thread
8825	 */
8826	rbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
8827	if ((options & MD_CRO_OPTIMIZE) == 0)
8828		userdata = kmem_zalloc(usersize, KM_SLEEP);
8829	newdbp = (mddb_db_t *)kmem_zalloc(sizeof (*newdbp), KM_SLEEP);
8830
8831	/*
8832	 * if this is the largest record allocate new buffer for
8833	 * checkcopy();
8834	 */
8835	if (recsize > s->s_databuffer_size) {
8836		tmppnt = (caddr_t)kmem_zalloc(recsize, KM_SLEEP);
8837		/*
8838		 * this test is incase when to sleep during kmem_alloc
8839		 * and some other task bumped max record size
8840		 */
8841		if (recsize > s->s_databuffer_size) {
8842			if (s->s_databuffer_size)
8843				kmem_free(s->s_databuffer,
8844				    s->s_databuffer_size);
8845			s->s_databuffer = tmppnt;
8846			s->s_databuffer_size = recsize;
8847		} else {
8848			kmem_free(tmppnt, recsize);
8849		}
8850	}
8851
8852	single_thread_start(s);
8853
8854	newid = 0;
8855	do {
8856		newid++;
8857		if (DBID(newid) == 0) {
8858			kmem_free((caddr_t)newdbp, sizeof (*newdbp));
8859			kmem_free((caddr_t)rbp, ((size_t)recsize));
8860			if ((options & MD_CRO_OPTIMIZE) == 0)
8861				kmem_free(userdata, usersize);
8862			single_thread_end(s);
8863			mddb_setexit(s);
8864			return (MDDB_E_NOTNOW);
8865		}
8866
8867		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8868			for (dep = dbp->db_firstentry; dep;
8869			    dep = dep->de_next) {
8870				if (dep->de_recid == newid)
8871					break;
8872			}
8873			if (dep != NULL)
8874				break;
8875		}
8876	} while (dbp);
8877
8878	desize = (sizeof (*de32p) - sizeof (de32p->de32_blks)) +
8879	    (sizeof (mddb_block_t) * blkcnt);
8880
8881	/*
8882	 * see if a directory block exists which will hold this entry
8883	 */
8884	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8885		used = sizeof (*db32p);
8886		for (dep = dbp->db_firstentry;
8887		    dep != NULL; dep = dep->de_next) {
8888			used += sizeof (*de32p) - sizeof (de32p->de32_blks);
8889			used += sizeof (mddb_block_t) * dep->de_blkcount;
8890		}
8891		if ((used + desize) < MDDB_BSIZE)
8892			break;
8893	}
8894	if (dbp) {
8895		kmem_free((caddr_t)newdbp, sizeof (*newdbp));
8896		if (blkcnt > s->s_freeblkcnt) {
8897			kmem_free((caddr_t)rbp, ((size_t)recsize));
8898			if ((options & MD_CRO_OPTIMIZE) == 0)
8899				kmem_free(userdata, usersize);
8900			single_thread_end(s);
8901			mddb_setexit(s);
8902			return (MDDB_E_NOSPACE);
8903		}
8904		prevdbp = NULL;
8905	} else {
8906		/*
8907		 * need to add directory block
8908		 */
8909		if ((blkcnt + 1) > s->s_freeblkcnt) {
8910			kmem_free((caddr_t)newdbp, sizeof (*newdbp));
8911			kmem_free((caddr_t)rbp, ((size_t)recsize));
8912			if ((options & MD_CRO_OPTIMIZE) == 0)
8913				kmem_free(userdata, usersize);
8914			single_thread_end(s);
8915			mddb_setexit(s);
8916			return (MDDB_E_NOSPACE);
8917		}
8918		for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next)
8919			;
8920		dbp->db_next = newdbp;
8921		bzero((caddr_t)dbp->db_next, sizeof (*newdbp));
8922		dbp->db_nextblk = getfreeblks(s, 1);
8923		dbp->db_next->db_blknum = dbp->db_nextblk;
8924		prevdbp = dbp;
8925		dbp = dbp->db_next;
8926		dbp->db_nextblk = 0;
8927		dbp->db_firstentry = NULL;
8928		dbp->db_recsum = 0;
8929		dbp->db_magic = MDDB_MAGIC_DB;
8930	}
8931	/*
8932	 * ready to add record
8933	 */
8934	desize_ic = (sizeof (*dep) - sizeof (dep->de_blks)) +
8935	    (sizeof (mddb_block_t) * blkcnt);
8936	if (dbp->db_firstentry) {
8937		for (dep = dbp->db_firstentry; dep->de_next; dep = dep->de_next)
8938			;
8939		dep->de_next = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
8940		dep = dep->de_next;
8941	} else {
8942		dep = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
8943		dbp->db_firstentry = dep;
8944	}
8945	bzero((caddr_t)dep, desize_ic);
8946	dep->de_recid = newid;
8947	/*
8948	 * Optimized records have an owner node associated with them in
8949	 * a MN diskset.  The owner is only set on a node that is actively
8950	 * writing to that record.  The other nodes will show that record
8951	 * as having an invalid owner.  The owner for an optimized record
8952	 * is used during fixoptrecord to determine which node should
8953	 * write out the record when the replicas associated with that
8954	 * optimized record have been changed.
8955	 */
8956	if (MD_MNSET_SETNO(s->s_setno)) {
8957		dep->de_owner_nodeid = MD_MN_INVALID_NID;
8958	}
8959	dep->de_type1 =	type;
8960	dep->de_type2 = type2;
8961	dep->de_reqsize = usersize;
8962	dep->de_recsize = recsize;
8963	dep->de_blkcount = blkcnt;
8964	flag_type = options &
8965	    (MD_CRO_OPTIMIZE | MD_CRO_STRIPE | MD_CRO_MIRROR | MD_CRO_RAID |
8966	    MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG |
8967	    MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG);
8968	switch (flag_type) {
8969	case MD_CRO_OPTIMIZE:
8970		dep->de_flags = MDDB_F_OPT;
8971		getoptdev(s, dep, 0);
8972		getoptdev(s, dep, 1);
8973		break;
8974	case MD_CRO_STRIPE:
8975		dep->de_flags = MDDB_F_STRIPE;
8976		break;
8977	case MD_CRO_MIRROR:
8978		dep->de_flags = MDDB_F_MIRROR;
8979		break;
8980	case MD_CRO_RAID:
8981		dep->de_flags = MDDB_F_RAID;
8982		break;
8983	case MD_CRO_SOFTPART:
8984		dep->de_flags = MDDB_F_SOFTPART;
8985		break;
8986	case MD_CRO_TRANS_MASTER:
8987		dep->de_flags = MDDB_F_TRANS_MASTER;
8988		break;
8989	case MD_CRO_TRANS_LOG:
8990		dep->de_flags = MDDB_F_TRANS_LOG;
8991		break;
8992	case MD_CRO_HOTSPARE:
8993		dep->de_flags = MDDB_F_HOTSPARE;
8994		break;
8995	case MD_CRO_HOTSPARE_POOL:
8996		dep->de_flags = MDDB_F_HOTSPARE_POOL;
8997		break;
8998	case MD_CRO_CHANGELOG:
8999		dep->de_flags = MDDB_F_CHANGELOG;
9000		break;
9001	}
9002	/*
9003	 * try to get all blocks consecutive. If not possible
9004	 * just get them one at a time
9005	 */
9006	dep->de_blks[0] = getfreeblks(s, blkcnt);
9007	if (dep->de_blks[0]) {
9008		for (i = 1; i < blkcnt; i++)
9009			dep->de_blks[i] = dep->de_blks[0] + i;
9010	} else {
9011		for (i = 0; i < blkcnt;	 i++)
9012			dep->de_blks[i] = getfreeblks(s, 1);
9013	}
9014	dep->de_rb = rbp;
9015	bzero((caddr_t)rbp, recsize);
9016	rbp->rb_magic = MDDB_MAGIC_RB;
9017
9018	/* Do we have to create an old style (32 bit) record?  */
9019	if (options & MD_CRO_32BIT) {
9020		if (options & MD_CRO_FN)
9021			rbp->rb_revision = MDDB_REV_RBFN;
9022		else
9023			rbp->rb_revision = MDDB_REV_RB;
9024	} else {
9025		if (options & MD_CRO_FN)
9026			rbp->rb_revision = MDDB_REV_RB64FN;
9027		else
9028			rbp->rb_revision = MDDB_REV_RB64;
9029	}
9030
9031	/* set de_rb_userdata for non optimization records */
9032	if ((options & MD_CRO_OPTIMIZE) == 0) {
9033		dep->de_rb_userdata = userdata;
9034	}
9035
9036	uniqtime32(&rbp->rb_timestamp);
9037	/* Generate the crc for this record */
9038	rec_crcgen(s, dep, rbp);
9039	tmppnt = (caddr_t)rbp;
9040	/*
9041	 * the following code writes new records to all instances of
9042	 * the data base. Writing one block at a time to each instance
9043	 * is safe because they are not yet in a directory entry which
9044	 * has been written to the data base
9045	 */
9046	err = 0;
9047	if ((options & MD_CRO_OPTIMIZE) == 0) {
9048		for (i = 0; i < blkcnt;	 i++) {
9049			err |= writeall(s, (caddr_t)tmppnt,
9050			    dep->de_blks[i], 1, 0);
9051			tmppnt += MDDB_BSIZE;
9052		}
9053	} else {
9054		if ((MD_MNSET_SETNO(s->s_setno)) &&
9055		    md_set[s->s_setno].s_am_i_master) {
9056		/*
9057		 * If a MN diskset then only master writes out newly
9058		 * created optimized record.
9059		 */
9060			err |= writeoptrecord(s, dep);
9061		}
9062	}
9063	uniqtime32(&dbp->db_timestamp);
9064	dbp->db_revision = MDDB_REV_DB;
9065	/* Don't include opt resync and change log records in global XOR */
9066	if (!(dep->de_flags & MDDB_F_OPT) &&
9067	    !(dep->de_flags & MDDB_F_CHANGELOG))
9068		dbp->db_recsum ^= rbp->rb_checksum;
9069	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
9070	create_db32rec(db32p, dbp);
9071	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
9072	err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0);
9073	if (prevdbp) {
9074		dbp = prevdbp;
9075		uniqtime32(&dbp->db_timestamp);
9076		dbp->db_revision = MDDB_REV_DB;
9077		create_db32rec(db32p, dbp);
9078		crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
9079		err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0);
9080	}
9081	kmem_free((caddr_t)db32p, MDDB_BSIZE);
9082	if (err) {
9083		if (writeretry(s)) {
9084			s->s_zombie = newid;
9085			single_thread_end(s);
9086			mddb_setexit(s);
9087			return (MDDB_E_NOTNOW);
9088		}
9089	}
9090	single_thread_end(s);
9091	mddb_setexit(s);
9092
9093	ASSERT((newid & MDDB_SETMASK) == 0);
9094	return (MAKERECID(setno, newid));
9095}
9096
9097int
9098mddb_deleterec(
9099	mddb_recid_t	id
9100)
9101{
9102	mddb_set_t	*s;
9103	mddb_db_t	*dbp;
9104	mddb_db32_t	*db32p;
9105	mddb_de_ic_t	*dep, *dep1;
9106	int		i;
9107
9108#if defined(_ILP32) && !defined(lint)
9109	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
9110	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
9111#endif
9112
9113	s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL);
9114	ASSERT(s != NULL);
9115
9116	id = DBID(id);
9117	if (checkstate(s, MDDB_PROBE)) {
9118		mddb_setexit(s);
9119		return (MDDB_E_NOTNOW);
9120	}
9121
9122	ASSERT(s->s_lbp != NULL);
9123	single_thread_start(s);
9124
9125	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9126		dep1 = NULL;
9127		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
9128			if (dep->de_recid == id)
9129				break;
9130			dep1 = dep;
9131		}
9132		if (dep != NULL)
9133			break;
9134	}
9135	/*
9136	 * no such record
9137	 */
9138	if (dep == NULL) {
9139		single_thread_end(s);
9140		ASSERT(s->s_staledeletes != 0);
9141		s->s_staledeletes--;
9142		mddb_setexit(s);
9143		return (0);
9144	}
9145
9146	if (!(dep->de_flags & MDDB_F_OPT) &&
9147	    !(dep->de_flags & MDDB_F_CHANGELOG)) {
9148		dbp->db_recsum ^= dep->de_rb->rb_checksum;
9149		dbp->db_recsum ^= dep->de_rb->rb_checksum_fiddle;
9150	}
9151
9152	if (dep->de_rb_userdata != NULL) {
9153		if (dep->de_icreqsize)
9154			kmem_free(dep->de_rb_userdata_ic, dep->de_icreqsize);
9155		else
9156			kmem_free(dep->de_rb_userdata, dep->de_reqsize);
9157	}
9158
9159	kmem_free((caddr_t)dep->de_rb, dep->de_recsize);
9160
9161	for (i = 0; i < dep->de_blkcount; i++)
9162		blkfree(s, dep->de_blks[i]);
9163	if (dep1)
9164		dep1->de_next = dep->de_next;
9165	else
9166		dbp->db_firstentry = dep->de_next;
9167
9168	kmem_free(dep, sizeofde(dep));
9169
9170	uniqtime32(&dbp->db_timestamp);
9171	dbp->db_revision = MDDB_REV_DB;
9172	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
9173	create_db32rec(db32p, dbp);
9174	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
9175	if (writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0)) {
9176		if (writeretry(s)) {
9177			/*
9178			 * staledelete is used to mark deletes which failed.
9179			 * its only use is to not panic when the user retries
9180			 * the delete once the database is active again
9181			 */
9182			single_thread_end(s);
9183			s->s_staledeletes++;
9184			kmem_free((caddr_t)db32p, MDDB_BSIZE);
9185			mddb_setexit(s);
9186			return (MDDB_E_NOTNOW);
9187		}
9188	}
9189	single_thread_end(s);
9190	kmem_free((caddr_t)db32p, MDDB_BSIZE);
9191	mddb_setexit(s);
9192	return (0);
9193}
9194
9195mddb_recid_t
9196mddb_getnextrec(
9197	mddb_recid_t		id,
9198	mddb_type_t		typ,
9199	uint_t			type2
9200)
9201{
9202	mddb_set_t		*s;
9203	mddb_db_t		*dbp;
9204	mddb_de_ic_t		*dep;
9205	int			searching, err;
9206	set_t			setno;
9207
9208	setno = DBSET(id);
9209	id = DBID(id);
9210	searching = id;
9211
9212	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
9213		return (err);
9214
9215	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9216		for (dep = dbp->db_firstentry;
9217		    dep != NULL; dep = dep->de_next) {
9218			if (searching) {
9219				if (dep->de_recid == id)
9220					searching = 0;
9221			} else {
9222				if ((typ == MDDB_ALL || dep->de_type1 == typ) &&
9223				    (type2 == 0 || dep->de_type2 == type2)) {
9224					id = dep->de_recid;
9225					mddb_setexit(s);
9226					ASSERT((id & MDDB_SETMASK) == 0);
9227					return (MAKERECID(setno, id));
9228				}
9229			}
9230		}
9231	}
9232
9233	mddb_setexit(s);
9234
9235	if (searching)
9236		return (MDDB_E_NORECORD);
9237	return (0);
9238}
9239
9240void *
9241mddb_getrecaddr(
9242	mddb_recid_t		id
9243)
9244{
9245	mddb_set_t		*s;
9246	mddb_db_t		*dbp;
9247	mddb_de_ic_t		*dep;
9248	void			*rval;
9249
9250	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9251		return (NULL);
9252
9253	id = DBID(id);
9254	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9255		for (dep = dbp->db_firstentry;
9256		    dep != NULL; dep = dep->de_next) {
9257			if (dep->de_recid != id)
9258				continue;
9259			if (dep->de_rb_userdata)
9260				rval = (void *)dep->de_rb_userdata;
9261			else
9262				rval = (void *)dep->de_rb->rb_data;
9263			mddb_setexit(s);
9264			return (rval);
9265		}
9266	}
9267
9268	mddb_setexit(s);
9269	return (NULL);
9270}
9271
9272
9273mddb_de_ic_t *
9274mddb_getrecdep(
9275	mddb_recid_t		id
9276)
9277{
9278	mddb_set_t		*s;
9279	mddb_db_t		*dbp;
9280	mddb_de_ic_t		*dep;
9281
9282	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9283		return (NULL);
9284
9285	id = DBID(id);
9286	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9287		for (dep = dbp->db_firstentry;
9288		    dep != NULL; dep = dep->de_next) {
9289			if (dep->de_recid != id)
9290				continue;
9291			mddb_setexit(s);
9292			return (dep);
9293		}
9294	}
9295
9296	mddb_setexit(s);
9297	return (NULL);
9298}
9299
9300void *
9301mddb_getrecaddr_resize(
9302	mddb_recid_t		id,
9303	size_t			icsize,
9304	off_t			off
9305)
9306{
9307	mddb_set_t		*s;
9308	mddb_db_t		*dbp;
9309	mddb_de_ic_t		*dep;
9310	void			*rval = NULL;
9311
9312	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9313		return (NULL);
9314
9315	id = DBID(id);
9316	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9317		for (dep = dbp->db_firstentry;
9318		    dep != NULL; dep = dep->de_next) {
9319			if (dep->de_recid != id)
9320				continue;
9321			if (dep->de_rb_userdata)
9322				rval = (void *)dep->de_rb_userdata;
9323			else
9324				rval = (void *)dep->de_rb->rb_data;
9325			break;
9326		}
9327		if (rval != NULL)
9328			break;
9329	}
9330
9331	if (rval == NULL) {
9332		mddb_setexit(s);
9333		return (NULL);
9334	}
9335
9336	if (dep->de_rb_userdata) {
9337		caddr_t nud;
9338
9339		if (dep->de_icreqsize || (dep->de_reqsize >= icsize)) {
9340			mddb_setexit(s);
9341			return (rval);
9342		}
9343		ASSERT((dep->de_reqsize + off) <= icsize);
9344		nud = kmem_zalloc(icsize, KM_SLEEP);
9345		bcopy(dep->de_rb_userdata, nud + off, dep->de_reqsize);
9346		kmem_free(dep->de_rb_userdata, dep->de_reqsize);
9347		dep->de_rb_userdata = nud + off;
9348		dep->de_rb_userdata_ic = nud;
9349		dep->de_icreqsize = icsize;
9350		rval = nud;
9351	} else {
9352		size_t recsize;
9353		/* LINTED variable unused - used for sizeof calculations */
9354		mddb_rb32_t *nrbp;
9355
9356		recsize = roundup((sizeof (*nrbp) - sizeof (nrbp->rb_data)) +
9357		    icsize, MDDB_BSIZE);
9358		if (dep->de_recsize < recsize)
9359			cmn_err(CE_PANIC, "mddb_getrecaddr_resize: only "
9360			    "nonoptimized records can be resized\n");
9361	}
9362
9363	mddb_setexit(s);
9364	return (rval);
9365}
9366
9367int
9368mddb_getrecprivate(
9369	mddb_recid_t		id
9370)
9371{
9372	mddb_set_t		*s;
9373	mddb_db_t		*dbp;
9374	mddb_de_ic_t		*dep;
9375	int			err = 0;
9376	int			private;
9377
9378	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9379		return (err);
9380
9381	id = DBID(id);
9382	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9383		for (dep = dbp->db_firstentry;
9384		    dep != NULL; dep = dep->de_next) {
9385			if (dep->de_recid != id)
9386				continue;
9387			private = (int)dep->de_rb->rb_private;
9388			mddb_setexit(s);
9389			return (private);
9390		}
9391	}
9392
9393	mddb_setexit(s);
9394	return (MDDB_E_NORECORD);
9395}
9396
9397void
9398mddb_setrecprivate(
9399	mddb_recid_t		id,
9400	uint_t			private
9401)
9402{
9403	mddb_set_t		*s;
9404	mddb_db_t		*dbp;
9405	mddb_de_ic_t		*dep;
9406
9407	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) {
9408		ASSERT(0);
9409		return;
9410	}
9411
9412	id = DBID(id);
9413	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9414		for (dep = dbp->db_firstentry;
9415		    dep != NULL; dep = dep->de_next) {
9416			if (dep->de_recid != id)
9417				continue;
9418			dep->de_rb->rb_private = private;
9419			mddb_setexit(s);
9420			return;
9421		}
9422	}
9423
9424	mddb_setexit(s);
9425	ASSERT(0);
9426}
9427
9428mddb_type_t
9429mddb_getrectype1(
9430	mddb_recid_t		id
9431)
9432{
9433	mddb_set_t		*s;
9434	mddb_db_t		*dbp;
9435	mddb_de_ic_t		*dep;
9436	int			err = 0;
9437	mddb_type_t		rval;
9438
9439	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9440		return (err);
9441
9442	id = DBID(id);
9443	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9444		for (dep = dbp->db_firstentry;
9445		    dep != NULL; dep = dep->de_next) {
9446			if (dep->de_recid != id)
9447				continue;
9448			rval = dep->de_type1;
9449			mddb_setexit(s);
9450			return (rval);
9451		}
9452	}
9453
9454	mddb_setexit(s);
9455	return (MDDB_E_NORECORD);
9456}
9457
9458int
9459mddb_getrectype2(
9460	mddb_recid_t		id
9461)
9462{
9463	mddb_set_t		*s;
9464	mddb_db_t		*dbp;
9465	mddb_de_ic_t		*dep;
9466	int			err = 0;
9467	int			rval;
9468
9469	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9470		return (err);
9471
9472	id = DBID(id);
9473	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9474		for (dep = dbp->db_firstentry;
9475		    dep != NULL; dep = dep->de_next) {
9476			if (dep->de_recid != id)
9477				continue;
9478			rval = (int)dep->de_type2;
9479			mddb_setexit(s);
9480			return (rval);
9481		}
9482	}
9483
9484	mddb_setexit(s);
9485	return (MDDB_E_NORECORD);
9486}
9487
9488int
9489mddb_getrecsize(
9490	mddb_recid_t		id
9491)
9492{
9493	mddb_set_t		*s;
9494	mddb_db_t		*dbp;
9495	mddb_de_ic_t		*dep;
9496	int			err = 0;
9497	int			rval;
9498
9499	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9500		return (err);
9501
9502	id = DBID(id);
9503	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9504		for (dep = dbp->db_firstentry;
9505		    dep != NULL; dep = dep->de_next) {
9506			if (dep->de_recid != id)
9507				continue;
9508			rval = (int)dep->de_reqsize;
9509			mddb_setexit(s);
9510			return (rval);
9511		}
9512	}
9513
9514	mddb_setexit(s);
9515	return (MDDB_E_NORECORD);
9516}
9517
9518
9519mddb_recstatus_t
9520mddb_getrecstatus(
9521	mddb_recid_t		id
9522)
9523{
9524	mddb_set_t		*s;
9525	mddb_db_t		*dbp;
9526	mddb_de_ic_t		*dep;
9527	int			err = 0;
9528	mddb_recstatus_t	e_err;
9529
9530	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9531		return ((mddb_recstatus_t)err);
9532
9533	id = DBID(id);
9534	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9535		for (dep = dbp->db_firstentry;
9536		    dep != NULL; dep = dep->de_next) {
9537			if (dep->de_recid == id)
9538				break;
9539		}
9540		if (dep)
9541			break;
9542	}
9543
9544	e_err = MDDB_OK;
9545
9546	if (! dep)
9547		e_err = MDDB_NORECORD;
9548	else if (! dep->de_rb->rb_commitcnt)
9549		e_err = MDDB_NODATA;
9550	else if (md_get_setstatus(s->s_setno) & MD_SET_STALE)
9551		e_err = MDDB_STALE;
9552
9553	mddb_setexit(s);
9554	return (e_err);
9555}
9556
9557/*
9558 * Commit given record to disk.
9559 * If committing an optimized record, do not call
9560 * with md ioctl lock held.
9561 */
9562int
9563mddb_commitrec(
9564	mddb_recid_t	id
9565)
9566{
9567	mddb_set_t			*s;
9568	mddb_db_t			*dbp;
9569	mddb_de_ic_t			*dep;
9570	mddb_recid_t			ids[2];
9571	mddb_rb32_t			*rbp;
9572	static int			err = 0;
9573	md_mn_msg_mddb_optrecerr_t	*msg_recerr;
9574	md_mn_kresult_t			*kres;
9575	mddb_lb_t			*lbp;
9576	mddb_mnlb_t			*mnlbp;
9577	mddb_locator_t			*lp;
9578	mddb_mnsidelocator_t		*mnslp;
9579	mddb_drvnm_t			*dn;
9580	int				li;
9581	md_replica_recerr_t		*recerr;
9582	int				i, j;
9583	int				rval;
9584	int				hit_err = 0;
9585
9586	s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL);
9587	ASSERT(s != NULL);
9588
9589	if (checkstate(s, MDDB_PROBE)) {
9590		mddb_setexit(s);
9591		return (MDDB_E_NOTNOW);
9592	}
9593
9594	if (DBID(id) == 0) {
9595		mddb_setexit(s);
9596		return (0);
9597	}
9598
9599	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9600		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
9601			if (dep->de_recid == DBID(id))
9602				break;
9603		}
9604		if (dep)
9605			break;
9606	}
9607
9608	if (dep == NULL) {
9609		mddb_setexit(s);
9610		return (MDDB_E_NORECORD);
9611	}
9612
9613	if (! (dep->de_flags & MDDB_F_OPT)) {
9614		ids[0] = id;
9615		ids[1] = 0;
9616		mddb_setexit(s);
9617		return (mddb_commitrecs(ids));
9618	}
9619
9620	/*
9621	 * following code allows multiple processes to be doing
9622	 * optimization commits in parallel.
9623	 * NOTE: if lots of optimization commits then the lock
9624	 * will not get released until it winds down
9625	 */
9626	if (s->s_optwaiterr) {
9627		while (s->s_optwaiterr) {
9628			s->s_opthungerr = 1;
9629			cv_wait(&s->s_opthungerr_cv, SETMUTEX(s->s_setno));
9630		}
9631		if (checkstate(s, MDDB_PROBE)) {
9632			mddb_setexit(s);
9633			return (MDDB_E_NOTNOW);
9634		}
9635	}
9636	if (s->s_optcmtcnt++ == 0) {
9637		single_thread_start(s);
9638		s->s_opthavelck = 1;
9639		if (s->s_optwantlck) {
9640			cv_broadcast(&s->s_optwantlck_cv);
9641			s->s_optwantlck = 0;
9642		}
9643	} else {
9644		while (! s->s_opthavelck) {
9645			s->s_optwantlck = 1;
9646			cv_wait(&s->s_optwantlck_cv, SETMUTEX(s->s_setno));
9647		}
9648	}
9649
9650	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9651		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
9652			if (dep->de_recid == DBID(id))
9653				break;
9654		}
9655		if (dep)
9656			break;
9657	}
9658
9659	if (dep == NULL) {
9660		if (! (--s->s_optcmtcnt)) {
9661			single_thread_end(s);
9662			s->s_opthavelck = 0;
9663		}
9664		mddb_setexit(s);
9665		return (MDDB_E_NORECORD);
9666	}
9667
9668	rbp = dep->de_rb;
9669	rbp->rb_commitcnt++;
9670	uniqtime32(&rbp->rb_timestamp);
9671	/* Generate the crc for this record */
9672	rec_crcgen(s, dep, rbp);
9673
9674	if (writeoptrecord(s, dep)) {
9675		if (MD_MNSET_SETNO(s->s_setno)) {
9676			hit_err = 1;
9677		}
9678		s->s_optwaiterr++;
9679	}
9680	if (MD_MNSET_SETNO(s->s_setno)) {
9681		/* If last thread out, release single_thread_start */
9682		if (! (--s->s_optcmtcnt)) {
9683			single_thread_end(s);
9684			s->s_opthavelck = 0;
9685		}
9686		/*
9687		 * If this thread had a writeoptrecords failure, then
9688		 * need to send message to master.
9689		 * But, multiple threads could all be running on the
9690		 * same single_thread_start, so serialize the threads
9691		 * by making each thread grab single_thread_start.
9692		 *
9693		 * After return from sending message to master message,
9694		 * replicas associated with optimized record will havei
9695		 * been changed (via a callback from the master to all
9696		 * nodes), so retry call to writeoptrecord.
9697		 * This code is replacing the call to writeretry that
9698		 * occurs for the local and traditional disksets.
9699		 */
9700		if (hit_err) {
9701			single_thread_start(s);
9702			/*
9703			 * If > 50% of replicas are alive then continue
9704			 * to send message to master until writeoptrecord
9705			 * succeeds.  For now, assume that minor name,
9706			 * major number on this node is the same as on
9707			 * the master node.  Once devids are turned on
9708			 * for MN disksets, can send devid.
9709			 */
9710			kres = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
9711			msg_recerr = kmem_zalloc(
9712			    sizeof (md_mn_msg_mddb_optrecerr_t), KM_SLEEP);
9713			while (!(md_get_setstatus(s->s_setno) &
9714			    MD_SET_TOOFEW)) {
9715				bzero((caddr_t)msg_recerr,
9716				    sizeof (md_mn_msg_mddb_optrecerr_t));
9717				lbp = s->s_lbp;
9718				mnlbp = (mddb_mnlb_t *)lbp;
9719				for (i = 0; i < 2; i++) {
9720					li = dep->de_optinfo[i].o_li;
9721					lp = &lbp->lb_locators[li];
9722					for (j = 0; j < MD_MNMAXSIDES; j++) {
9723						mnslp =
9724						    &mnlbp->
9725						    lb_mnsidelocators[j][li];
9726						if (mnslp->mnl_sideno ==
9727						    s->s_sideno)
9728							break;
9729					}
9730					if (j == MD_MNMAXSIDES)
9731						continue;
9732
9733					dn = &lbp->
9734					    lb_drvnm[mnslp->mnl_drvnm_index];
9735					recerr = &msg_recerr->msg_recerr[i];
9736					recerr->r_li = li;
9737					recerr->r_flags =
9738					    dep->de_optinfo[i].o_flags;
9739					recerr->r_blkno = lp->l_blkno;
9740					recerr->r_mnum = md_getminor(lp->l_dev);
9741					(void) strncpy(recerr->r_driver_name,
9742					    dn->dn_data, MD_MAXDRVNM);
9743				}
9744
9745				/* Release locks */
9746				single_thread_end(s);
9747				mutex_exit(SETMUTEX(s->s_setno));
9748
9749				/*
9750				 * Send message to master about optimized
9751				 * record failure.  After return, master
9752				 * should have marked failed replicas
9753				 * and sent parse message to slaves causing
9754				 * slaves to have fixed up the optimized
9755				 * record.
9756				 * On return from ksend_message, retry
9757				 * the write since this node should have fixed
9758				 * the optimized resync records it owns.
9759				 */
9760				rval = mdmn_ksend_message(s->s_setno,
9761				    MD_MN_MSG_MDDB_OPTRECERR,
9762				    MD_MSGF_NO_BCAST, 0,
9763				    (char *)msg_recerr,
9764				    sizeof (md_mn_msg_mddb_optrecerr_t),
9765				    kres);
9766				if (!MDMN_KSEND_MSG_OK(rval, kres)) {
9767					cmn_err(CE_WARN, "mddb_commitrec: "
9768					    "Unable to send optimized "
9769					    "resync record failure "
9770					    "message to other nodes in "
9771					    "diskset %s\n", s->s_setname);
9772					mdmn_ksend_show_error(rval, kres,
9773					    "MD_MN_MSG_MDDB_OPTRECERR");
9774				}
9775
9776				/* Regrab locks */
9777				mutex_enter(SETMUTEX(s->s_setno));
9778				single_thread_start(s);
9779
9780				/* Start over in case mddb changed */
9781				for (dbp = s->s_dbp; dbp != NULL;
9782				    dbp = dbp->db_next) {
9783					for (dep = dbp->db_firstentry; dep;
9784					    dep = dep->de_next) {
9785						if (dep->de_recid == DBID(id))
9786							break;
9787					}
9788					if (dep)
9789						break;
9790				}
9791				if (dep) {
9792					rbp = dep->de_rb;
9793					rbp->rb_commitcnt++;
9794					uniqtime32(&rbp->rb_timestamp);
9795					/* Generate the crc for this record */
9796					rec_crcgen(s, dep, rbp);
9797
9798					/*
9799					 * If writeoptrecord succeeds, then
9800					 * break out.
9801					 */
9802					if (!(writeoptrecord(s, dep)))
9803						break;
9804				}
9805			}
9806			kmem_free(kres, sizeof (md_mn_kresult_t));
9807			kmem_free(msg_recerr,
9808			    sizeof (md_mn_msg_mddb_optrecerr_t));
9809
9810			/* Resync record should be fixed - if possible */
9811			s->s_optwaiterr--;
9812			if (s->s_optwaiterr == 0) {
9813				/* All errors have been handled */
9814				if (s->s_opthungerr) {
9815					s->s_opthungerr = 0;
9816					cv_broadcast(&s->s_opthungerr_cv);
9817				}
9818			}
9819			single_thread_end(s);
9820			mddb_setexit(s);
9821			if (md_get_setstatus(s->s_setno) & MD_SET_TOOFEW) {
9822				return (MDDB_E_NOTNOW);
9823			} else {
9824				return (0);
9825			}
9826		}
9827	} else {
9828		/* If set is a traditional or local set */
9829		if (! (--s->s_optcmtcnt)) {
9830			err = 0;
9831			if (s->s_optwaiterr) {
9832				err = writeretry(s);
9833				s->s_optwaiterr = 0;
9834				if (s->s_opthungerr) {
9835					s->s_opthungerr = 0;
9836					cv_broadcast(&s->s_opthungerr_cv);
9837				}
9838			}
9839			single_thread_end(s);
9840			s->s_opthavelck = 0;
9841			mddb_setexit(s);
9842			if (err)
9843				return (MDDB_E_NOTNOW);
9844			return (0);
9845		}
9846		if (s->s_optwaiterr) {
9847			while (s->s_optwaiterr) {
9848				s->s_opthungerr = 1;
9849				cv_wait(&s->s_opthungerr_cv,
9850				    SETMUTEX(s->s_setno));
9851			}
9852			if (checkstate(s, MDDB_NOPROBE)) {
9853				mddb_setexit(s);
9854				return (MDDB_E_NOTNOW);
9855			}
9856		}
9857	}
9858
9859	mddb_setexit(s);
9860	return (0);
9861}
9862
9863int
9864mddb_commitrecs(
9865	mddb_recid_t	ids[]
9866)
9867{
9868	mddb_set_t	*s;
9869	mddb_db_t	*dbp;
9870	mddb_de_ic_t	*dep;
9871	mddb_rb32_t	*rbp;
9872	mddb_rb32_t	*saverbp;
9873	mddb_lb_t	*lbp;
9874	int		li;
9875	uint_t		checksum;
9876	mddb_recid_t	*idp;
9877	int		err = 0;
9878	set_t		setno;
9879
9880	if (panicstr)
9881		cmn_err(CE_PANIC, "md: mddb: commit not allowed");
9882
9883	/*
9884	 * scan through and make sure ids are from the same set
9885	 */
9886	setno = DBSET(ids[0]);
9887	for (idp = ids; *idp != NULL; idp++)
9888		ASSERT(DBSET(*idp) == setno);
9889
9890	s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL);
9891
9892	if (checkstate(s, MDDB_PROBE)) {
9893		mddb_setexit(s);
9894		return (MDDB_E_NOTNOW);
9895	}
9896
9897	ASSERT(s->s_lbp != NULL);
9898	err = 0;
9899
9900	if (! ids[0]) {
9901		mddb_setexit(s);
9902		return (0);
9903	}
9904
9905	single_thread_start(s);
9906	/*
9907	 * scan through and make sure ids all exist
9908	 */
9909	for (idp = ids; *idp != NULL; idp++) {
9910		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9911			for (dep = dbp->db_firstentry; dep;
9912			    dep = dep->de_next) {
9913				if (dep->de_recid == DBID(*idp))
9914					break;
9915			}
9916			if (dep != NULL)
9917				break;
9918		}
9919		if (dep == NULL) {
9920			single_thread_end(s);
9921			mddb_setexit(s);
9922			return (MDDB_E_NORECORD);
9923		}
9924	}
9925
9926	/*
9927	 * scan through records fix commit counts and
9928	 * zero fiddles and update time stamp and rechecksum record
9929	 */
9930	checksum = 0;
9931	idp = ids;
9932	saverbp = NULL;
9933	while (*idp) {
9934		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9935			for (dep = dbp->db_firstentry; dep;
9936			    dep = dep->de_next) {
9937				if (dep->de_recid == DBID(*idp))
9938					break;
9939			}
9940			if (dep != NULL)
9941				break;
9942		}
9943		rbp = dep->de_rb;
9944		ASSERT(! (dep->de_flags & MDDB_F_OPT));
9945
9946		getuserdata(setno, dep);
9947		/* Don't do fiddles for CHANGE LOG records */
9948		if (!(dep->de_flags & MDDB_F_CHANGELOG)) {
9949			checksum ^= rbp->rb_checksum_fiddle;
9950			rbp->rb_checksum_fiddle = 0;
9951			checksum ^= rbp->rb_checksum;
9952			saverbp = rbp;
9953		}
9954		rbp->rb_commitcnt++;
9955		uniqtime32(&rbp->rb_timestamp);
9956		/* Generate the crc for this record */
9957		rec_crcgen(s, dep, rbp);
9958
9959		/* Don't do fiddles for CHANGE LOG records */
9960		if (!(dep->de_flags & MDDB_F_CHANGELOG)) {
9961			checksum ^= rbp->rb_checksum;
9962		}
9963		idp++;
9964	}
9965
9966	if (saverbp)
9967		saverbp->rb_checksum_fiddle = checksum;
9968
9969	/*
9970	 * If this is a MN set but we are not the master, then we are not
9971	 * supposed to update the mddb on disk. So we finish at this point.
9972	 */
9973	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
9974	    (md_set[setno].s_am_i_master == 0)) {
9975		single_thread_end(s);
9976		mddb_setexit(s);
9977		return (0);
9978	}
9979
9980	lbp = s->s_lbp;
9981	for (li = 0; li < lbp->lb_loccnt; li++) {
9982		if (! (lbp->lb_locators[li].l_flags & MDDB_F_ACTIVE))
9983			continue;
9984
9985		idp = ids;
9986		while (*idp) {
9987			for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9988				dep = dbp->db_firstentry;
9989				while (dep && (dep->de_recid != DBID(*idp)))
9990					dep = dep->de_next;
9991				if (dep != NULL)
9992					break;
9993			}
9994			rbp = dep->de_rb;
9995			err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
9996			    dep->de_blkcount, li, (mddb_bf_t **)0,
9997			    MDDB_WR_ONLY_MASTER);
9998			if (err)
9999				break;
10000			idp++;
10001		}
10002		if (err)
10003			break;
10004	}
10005	if (err) {
10006		if (writeretry(s)) {
10007			single_thread_end(s);
10008			mddb_setexit(s);
10009			return (MDDB_E_NOTNOW);
10010		}
10011	}
10012	single_thread_end(s);
10013	mddb_setexit(s);
10014	return (0);
10015}
10016
10017mddb_recid_t
10018mddb_makerecid(
10019	set_t		setno,
10020	mddb_recid_t	id
10021)
10022{
10023	return (MAKERECID(setno, id));
10024}
10025
10026set_t
10027mddb_getsetnum(
10028	mddb_recid_t	id
10029)
10030{
10031	return (DBSET(id));
10032}
10033
10034char *
10035mddb_getsetname(
10036	set_t	setno
10037)
10038{
10039	return (((mddb_set_t *)md_set[setno].s_db)->s_setname);
10040}
10041
10042side_t
10043mddb_getsidenum(
10044	set_t	setno
10045)
10046{
10047	if (md_set[setno].s_db)
10048		return (((mddb_set_t *)md_set[setno].s_db)->s_sideno);
10049	return (0);
10050}
10051
10052int
10053mddb_ownset(
10054	set_t	setno
10055)
10056{
10057	if ((md_get_setstatus(setno) & MD_SET_TAGDATA) && md_set[setno].s_db)
10058		return (1);
10059
10060	if (md_set[setno].s_db && ((mddb_set_t *)md_set[setno].s_db)->s_lbp)
10061		return (1);
10062
10063	return (0);
10064}
10065
10066/*ARGSUSED*/
10067int
10068getmed_ioctl(mddb_med_parm_t *medpp, int mode)
10069{
10070	mddb_set_t	*s;
10071	int		err = 0;
10072	set_t		setno = medpp->med_setno;
10073	md_error_t	*ep = &medpp->med_mde;
10074
10075	mdclrerror(ep);
10076
10077	if (setno >= md_nsets)
10078		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10079
10080	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10081		return (0);
10082
10083	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
10084		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
10085
10086	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10087		return (mddbstatus2error(ep, err, NODEV32, setno));
10088
10089	medpp->med = s->s_med;			/* structure assignment */
10090
10091	mddb_setexit(s);
10092
10093	return (0);
10094}
10095
10096int
10097setmed_ioctl(mddb_med_parm_t *medpp, int mode)
10098{
10099
10100	mddb_set_t	*s;
10101	int		err = 0;
10102	set_t		setno = medpp->med_setno;
10103	md_error_t	*ep = &medpp->med_mde;
10104
10105	mdclrerror(ep);
10106
10107	if ((mode & FWRITE) == 0)
10108		return (mdsyserror(ep, EACCES));
10109
10110	/*
10111	 * This should be the only thing that prevents LOCAL sets from having
10112	 * mediators, at least in the kernel, userland needs to have some code
10113	 * written.
10114	 */
10115	if (setno == MD_LOCAL_SET)
10116		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10117
10118	if (setno >= md_nsets)
10119		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10120
10121	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10122		return (0);
10123
10124	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
10125		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
10126
10127	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10128		return (mddbstatus2error(ep, err, NODEV32, setno));
10129
10130	s->s_med = medpp->med;			/* structure assignment */
10131
10132	mddb_setexit(s);
10133
10134	return (0);
10135}
10136
10137int
10138updmed_ioctl(mddb_med_upd_parm_t *medpp, int mode)
10139{
10140
10141	mddb_set_t	*s;
10142	int		err = 0;
10143	set_t		setno = medpp->med_setno;
10144	md_error_t	*ep = &medpp->med_mde;
10145
10146	mdclrerror(ep);
10147
10148	if ((mode & FWRITE) == 0)
10149		return (mdsyserror(ep, EACCES));
10150
10151	if (setno >= md_nsets)
10152		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10153
10154	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10155		return (0);
10156
10157	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
10158		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
10159
10160	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10161		return (mddbstatus2error(ep, err, NODEV32, setno));
10162
10163	single_thread_start(s);
10164	(void) upd_med(s, "updmed_ioctl()");
10165	single_thread_end(s);
10166
10167	mddb_setexit(s);
10168
10169	return (0);
10170}
10171
10172int
10173take_set(mddb_config_t *cp, int mode)
10174{
10175	int			err = 0;
10176	mddb_med_upd_parm_t	medup;
10177	set_t			setno = cp->c_setno;
10178	md_error_t		*ep = &cp->c_mde;
10179	int			snarf_ok = 0;
10180
10181	if (md_get_setstatus(setno) & MD_SET_SNARFED)
10182		return (0);
10183
10184	err = mddb_configure(MDDB_GETDEV, cp);
10185	if (! err && mdisok(ep)) {
10186		if (md_snarf_db_set(setno, ep) != 0)
10187			goto out;
10188		snarf_ok = 1;
10189	}
10190
10191	/*
10192	 * Clear replicated import flag since this is
10193	 * used during the take of a diskset with
10194	 * previously unresolved replicated disks.
10195	 */
10196	if (md_get_setstatus(setno) &
10197	    MD_SET_REPLICATED_IMPORT) {
10198		md_clr_setstatus(setno, MD_SET_REPLICATED_IMPORT);
10199	}
10200
10201	if (! err && mdisok(ep)) {
10202		if (! cp->c_flags) {
10203			medup.med_setno = setno;
10204			mdclrerror(&medup.med_mde);
10205
10206			err = updmed_ioctl(&medup, mode);
10207			if (! mdisok(&medup.med_mde))
10208				(void) mdstealerror(ep, &medup.med_mde);
10209		}
10210	}
10211
10212out:
10213	/*
10214	 * In the case that the snarf failed, the diskset is
10215	 * left with s_db set, but s_lbp not set.  The node is not
10216	 * an owner of the set and won't be allowed to release the
10217	 * diskset in order to cleanup.  With s_db set, any call to the
10218	 * GETDEV or ENDDEV ioctl (done by libmeta routine metareplicalist)
10219	 * will cause the diskset to be loaded.  So, cleanup the diskset so
10220	 * that an inadvertent start of the diskset doesn't happen later.
10221	 */
10222	if ((snarf_ok == 0) && md_set[setno].s_db &&
10223	    (((mddb_set_t *)md_set[setno].s_db)->s_lbp == 0)) {
10224		mutex_enter(&mddb_lock);
10225		mddb_unload_set(setno);
10226		mutex_exit(&mddb_lock);
10227	}
10228	return (err);
10229}
10230
10231/*ARGSUSED*/
10232int
10233release_set(mddb_config_t *cp, int mode)
10234{
10235	int			err = 0;
10236	set_t			setno = cp->c_setno;
10237	md_error_t		*ep = &cp->c_mde;
10238
10239	/*
10240	 * Data integrity check
10241	 */
10242	if (setno >= md_nsets)
10243		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10244
10245	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
10246	md_haltsnarf_enter(setno);
10247	/*
10248	 * Attempt to mark set as HOLD. If it is marked as HOLD, this means
10249	 * that the mirror code is currently searching all mirrors for a
10250	 * errored component that needs a hotspare. While this search is in
10251	 * progress, we cannot release the set and thgerefore we return EBUSY.
10252	 * Once we have set HOLD, the mirror function (check_4_hotspares) will
10253	 * block before the search until the set is released.
10254	 */
10255	if (md_holdset_testandenter(setno) != 0) {
10256		md_haltsnarf_exit(setno);
10257		rw_exit(&md_unit_array_rw.lock);
10258		return (EBUSY);
10259	}
10260
10261	if ((err = md_halt_set(setno, MD_HALT_ALL)) == 0)
10262		err = mddb_configure(MDDB_RELEASESET, cp);
10263
10264	md_holdset_exit(setno);
10265	md_haltsnarf_exit(setno);
10266	rw_exit(&md_unit_array_rw.lock);
10267
10268	if (! err && mdisok(ep)) {
10269		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RELEASE, SVM_TAG_SET, setno,
10270		    NODEV64);
10271	}
10272
10273	return (err);
10274}
10275
10276int
10277gettag_ioctl(mddb_dtag_get_parm_t *dtgpp, int mode)
10278{
10279	mddb_set_t	*s;
10280	int		err = 0;
10281	mddb_dtag_lst_t	*dtlp;
10282	set_t		setno = dtgpp->dtgp_setno;
10283	md_error_t	*ep = &dtgpp->dtgp_mde;
10284
10285	mdclrerror(ep);
10286
10287	if ((mode & FREAD) == 0)
10288		return (mdsyserror(ep, EACCES));
10289
10290	if (setno >= md_nsets)
10291		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10292
10293	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10294		return (0);
10295
10296	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL)
10297		return (mddbstatus2error(ep, err, NODEV32, setno));
10298
10299	/*
10300	 * Data tags not supported on MN sets so return invalid operation.
10301	 * This ioctl could be called before the mddb has been read in so
10302	 * the set status may not yet be set to MNSET, so code following
10303	 * this check must handle a MN diskset properly.
10304	 */
10305	if (md_get_setstatus(setno) & MD_SET_MNSET) {
10306		mddb_setexit(s);
10307		return (mderror(ep, MDE_INVAL_MNOP));
10308	}
10309
10310	/* s_dtlp is NULL for MN diskset */
10311	dtlp = s->s_dtlp;
10312	while (dtlp != NULL) {
10313		if (dtgpp->dtgp_dt.dt_id == 0 ||
10314		    dtgpp->dtgp_dt.dt_id == dtlp->dtl_dt.dt_id) {
10315			bcopy((caddr_t)&dtlp->dtl_dt, (caddr_t)&dtgpp->dtgp_dt,
10316			    sizeof (mddb_dtag_t));
10317			break;
10318		}
10319		dtlp = dtlp->dtl_nx;
10320	}
10321
10322	/* Walked the whole list and id not found, return error */
10323	if (dtlp == (mddb_dtag_lst_t *)NULL) {
10324		mddb_setexit(s);
10325		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
10326	}
10327
10328	mddb_setexit(s);
10329
10330	return (0);
10331}
10332
/*
 * usetag_ioctl - make a diskset use the data tag selected by the caller.
 *
 *	dtupp	request block; dtup_setno selects the set, dtup_id the tag
 *		and dtup_mde receives any error detail
 *	mode	open mode of the caller; FWRITE is required.  Also handed
 *		on to release_set() and take_set().
 *
 * After validation the selected tag is installed with dt_setup(), the
 * set's identity and hint information are saved, and the set is released
 * and taken again with MD_SET_KEEPTAG/MD_SET_USETAG set around those steps.
 * Both flags are cleared again on the way out.
 *
 * Returns 0 or a non-zero error value; error detail is left in dtup_mde.
 * A failure to snarf the local set is reported only through dtup_mde.
 */
int
usetag_ioctl(mddb_dtag_use_parm_t *dtupp, int mode)
{
	mddb_set_t	*s;
	int		err = 0;
	mddb_config_t	*cp;
	mddb_ri_t	*trip = NULL;
	mddb_dtag_t	*dtagp = NULL;
	set_t		setno = dtupp->dtup_setno;
	md_error_t	*ep = &dtupp->dtup_mde;

	mdclrerror(ep);

	if ((mode & FWRITE) == 0)
		return (mdsyserror(ep, EACCES));

	if (setno >= md_nsets)
		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

	/* A negative id is malformed; an id of zero names no tag at all */
	if (dtupp->dtup_id < 0)
		return (mdsyserror(ep, EINVAL));
	else if (dtupp->dtup_id == 0)
		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));

	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
		return (0);

	/* Only sets flagged as having tagged data can select a tag */
	if ((md_get_setstatus(setno) & MD_SET_TAGDATA) == 0)
		return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));

	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL)
		return (mddbstatus2error(ep, err, NODEV32, setno));

	/*
	 * Data tags not supported on MN sets so return invalid operation.
	 * This ioctl could be called before the mddb has been read in so
	 * the set status may not yet be set to MNSET, so code following
	 * this check must handle a MN diskset properly.
	 */
	if (md_get_setstatus(setno) & MD_SET_MNSET) {
		mddb_setexit(s);
		return (mderror(ep, MDE_INVAL_MNOP));
	}

	/* Validate and find the id requested - nothing found if MN diskset */
	if ((dtagp = dtl_findl(s, dtupp->dtup_id)) == NULL) {
		mddb_setexit(s);
		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
	}

	/* Usetag is only valid when more than one tag exists */
	if (dtl_cntl(s) < 2) {
		mddb_setexit(s);
		return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));
	}

	/* Put the selected tag in place */
	dt_setup(s, dtagp);

	/* Scratch config block used for the release/re-init/take below */
	cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP);

	/* Save the hint information */
	trip = save_rip(s);

	/* Capture the set's identity while it is still entered */
	cp->c_timestamp = s->s_ident.createtime;	/* struct assignment */
	cp->c_setno = setno;
	cp->c_sideno = s->s_sideno;
	(void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME);
	cp->c_setname[MD_MAX_SETNAME] = '\0';
	cp->c_med = s->s_med;				/* struct assignment */

	mddb_setexit(s);

	/* s is no longer entered; NULL keeps the exit path from exiting it */
	s = NULL;

	/* shorthand */
	setno = cp->c_setno;

	/* Let unload know not to free the tag */
	md_set_setstatus(setno, MD_SET_KEEPTAG);

	/* Release the set */
	if (err = release_set(cp, mode))
		goto out;

	/* release_set() returned 0 but left an error in c_mde: pass it up */
	if (! mdisok(&cp->c_mde)) {
		(void) mdstealerror(ep, &cp->c_mde);
		err = 1;
		goto out;
	}

	/* Re-init set using the saved mddb_config_t structure */
	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
		if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
			err = mddbstatus2error(ep, err, NODEV32, setno);
			goto out;
		}
	}

	ASSERT(s->s_rip == (mddb_ri_t *)NULL);

	/* use the saved rip structure */
	s->s_rip = trip;
	/* Ownership moved to the set; prevents the free at out: */
	trip = (mddb_ri_t *)NULL;

	/* Let the take code know a tag is being used */
	md_set_setstatus(setno, MD_SET_USETAG);

	mddb_setexit(s);

	s = NULL;

	/* Take the set */
	if (err = take_set(cp, mode))
		goto out;

	if (! mdisok(&cp->c_mde))
		(void) mdstealerror(ep, &cp->c_mde);

out:
	/* Common exit: drop the transient flags and anything still held */
	md_clr_setstatus(setno, (MD_SET_USETAG | MD_SET_KEEPTAG));

	kmem_free(cp, sizeof (mddb_config_t));

	/* Hint information that was never handed to the set */
	if (trip)
		free_rip(&trip);

	if (s)
		mddb_setexit(s);

	return (err);
}
10465
/*
 * accept_ioctl - accept the current data of a diskset.
 *
 *	accpp	request block; accp_setno selects the set and accp_mde
 *		receives any error detail
 *	mode	open mode of the caller; FWRITE is required.  Also handed
 *		on to release_set() and take_set().
 *
 * The set must be flagged MD_SET_ACCOK.  The data is tagged with
 * set_dtag() and written with dt_write(), then the set is released and
 * taken again with MD_SET_ACCEPT set before the take.  MD_SET_ACCOK and
 * MD_SET_ACCEPT are cleared on the way out, on success and failure alike.
 *
 * Returns 0 or a non-zero error value; error detail is left in accp_mde.
 * A failure to snarf the local set is reported only through accp_mde.
 */
int
accept_ioctl(mddb_accept_parm_t *accpp, int mode)
{
	mddb_set_t	*s;
	int		err = 0;
	mddb_config_t	*cp;
	mddb_ri_t	*trip = NULL;
	set_t		setno = accpp->accp_setno;
	md_error_t	*ep = &accpp->accp_mde;

	mdclrerror(ep);

	if ((mode & FWRITE) == 0)
		return (mdsyserror(ep, EACCES));

	if (setno >= md_nsets)
		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
		return (0);

	/* An accept is only permitted when the set is flagged ACCOK */
	if ((md_get_setstatus(setno) & MD_SET_ACCOK) == 0)
		return (mdmddberror(ep, MDE_DB_ACCNOTOK, NODEV32, setno));

	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
		return (mddbstatus2error(ep, err, NODEV32, setno));

	/*
	 * Data tags not supported on MN sets so return invalid operation.
	 * mddb is guaranteed to be incore at this point, so this
	 * check will catch all MN disksets.
	 */
	if (md_get_setstatus(setno) & MD_SET_MNSET) {
		mddb_setexit(s);
		return (mderror(ep, MDE_INVAL_MNOP));
	}

	/* Scratch config block used for the release/re-init/take below */
	cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP);

	/* Save the hint information */
	trip = save_rip(s);

	/* Capture the set's identity while it is still entered */
	cp->c_timestamp = s->s_ident.createtime;	/* struct assignment */
	cp->c_setno = setno;
	cp->c_sideno = s->s_sideno;
	(void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME);
	cp->c_setname[MD_MAX_SETNAME] = '\0';
	cp->c_med = s->s_med;				/* struct assignment */

	/* Tag the data */
	if (err = set_dtag(s, ep)) {
		err = mdsyserror(ep, err);
		goto out;
	}

	/* If we had a BADTAG, it will be re-written, so clear the bit. */
	if (md_get_setstatus(setno) & MD_SET_BADTAG)
		md_clr_setstatus(setno, MD_SET_BADTAG);

	/* Write the tag out; s is still entered, out: will exit it */
	if (err = dt_write(s)) {
		err = mdsyserror(ep, err);
		goto out;
	}

	mddb_setexit(s);

	/* s is no longer entered; NULL keeps the exit path from exiting it */
	s = NULL;

	/* shorthand */
	setno = cp->c_setno;

	/* Clear the keeptag */
	md_clr_setstatus(setno, MD_SET_KEEPTAG);

	/* Release the set */
	if (err = release_set(cp, mode))
		goto out;

	/*
	 * release_set() returned 0 but left an error in c_mde: pass it up.
	 * Note that err stays 0 on this path.
	 */
	if (! mdisok(&cp->c_mde)) {
		(void) mdstealerror(ep, &cp->c_mde);
		goto out;
	}

	/* Re-init set using the saved mddb_config_t structure */
	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
		if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
			err = mddbstatus2error(ep, err, NODEV32, setno);
			goto out;
		}
	}

	ASSERT(s->s_rip == (mddb_ri_t *)NULL);

	/*
	 * Free the allocated rip structure.  The ASSERT above is compiled
	 * out of non-DEBUG kernels, so this check still matters there.
	 */
	if (s->s_rip != (mddb_ri_t *)NULL)
		free_rip(&s->s_rip);

	/* use the saved rip structure */
	s->s_rip = trip;
	/* Ownership moved to the set; prevents the free at out: */
	trip = (mddb_ri_t *)NULL;

	/* Let the set init code know an accept is in progress */
	md_set_setstatus(setno, MD_SET_ACCEPT);

	mddb_setexit(s);

	s = NULL;

	/* Take the set */
	if (err = take_set(cp, mode))
		goto out;

	if (! mdisok(&cp->c_mde))
		(void) mdstealerror(ep, &cp->c_mde);

out:
	/* Common exit: drop the accept flags and anything still held */
	md_clr_setstatus(setno, (MD_SET_ACCOK | MD_SET_ACCEPT));

	kmem_free(cp, sizeof (mddb_config_t));

	/* Hint information that was never handed to the set */
	if (trip)
		free_rip(&trip);

	if (s)
		mddb_setexit(s);

	return (err);
}
10593
10594/*
10595 * mddb_getinvlb_devid - cycles through the locator block and determines
10596 *		if the device id's for any of the replica disks are invalid.
10597 *		If so, it returns the diskname in the ctdptr.
10598 *	RETURN
10599 *		-1	Error
10600 *		cnt	number of invalid device id's
10601 */
10602int
10603mddb_getinvlb_devid(
10604	set_t	setno,
10605	int	count,
10606	int	size,
10607	char	**ctdptr
10608)
10609{
10610	mddb_set_t	*s;
10611	int		err = 0;
10612	mddb_lb_t	*lbp;
10613	int		li;
10614	mddb_did_blk_t	*did_blk;
10615	mddb_did_info_t	*did_info;
10616	int		len;
10617	int		cnt = 0;
10618	char		*cptr;
10619	md_name_suffix	*sn;
10620	int		i, dont_add_it;
10621	char		*tmpctd, *diskname;
10622	char		*tmpname;
10623
10624	cptr = *ctdptr;
10625	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
10626		return (-1);
10627	}
10628
10629	single_thread_start(s);
10630	lbp = s->s_lbp;
10631
10632	if (lbp->lb_setno != setno) {
10633		single_thread_end(s);
10634		mddb_setexit(s);
10635		return (-1);
10636	}
10637
10638	/* check for lb being devid style */
10639	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
10640		did_blk = s->s_did_icp->did_ic_blkp;
10641		for (li = 0; li < lbp->lb_loccnt; li++) {
10642			did_info = &(did_blk->blk_info[li]);
10643			/* Only if devid exists and isn't valid */
10644			if ((did_info->info_flags & MDDB_DID_EXISTS) &&
10645			    !(did_info->info_flags & MDDB_DID_VALID)) {
10646				/*
10647				 * if we count more invalid did's than
10648				 * was passed in there's an error somewhere
10649				 */
10650				if (cnt++ > count) {
10651					single_thread_end(s);
10652					mddb_setexit(s);
10653					return (-1);
10654				}
10655
10656				/*
10657				 * Future note: Need to do something here
10658				 * for the MN diskset case when device ids
10659				 * are supported in disksets.
10660				 * Can't add until merging devids_in_diskset
10661				 * code into code base.
10662				 */
10663
10664				sn = &s->s_lnp->ln_suffixes[0][li];
10665				/*
10666				 * check to make sure length of device name is
10667				 * not greater than computed first time through
10668				 */
10669				len = sn->suf_len;
10670				if (len > size) {
10671					single_thread_end(s);
10672					mddb_setexit(s);
10673					return (-1);
10674				}
10675				tmpctd = *ctdptr;
10676				/* strip off slice part */
10677				diskname = md_strdup(sn->suf_data);
10678				tmpname = strrchr(diskname, 's');
10679				*tmpname = '\0';
10680				dont_add_it = 0;
10681				/* look to see if diskname is already in list */
10682				for (i = 0; i < (cnt-1); i++) {
10683					if (strcmp(diskname, tmpctd) == 0) {
10684						/* already there, don't add */
10685						dont_add_it = 1;
10686						break;
10687					}
10688					/* point to next diskname in list */
10689					tmpctd += size;
10690				}
10691				if (dont_add_it == 0) {
10692					/* add diskname to list */
10693					(void) strcpy(cptr, diskname);
10694					cptr += size;
10695				}
10696				kmem_free(diskname, strlen(sn->suf_data) + 1);
10697			}
10698		}
10699	}
10700	/* null terminate the list */
10701	*cptr = '\0';
10702	/*
10703	 * need to save the new pointer so that calling routine can continue
10704	 * to add information onto the end.
10705	 */
10706	*ctdptr = cptr;
10707	single_thread_end(s);
10708	mddb_setexit(s);
10709	return (cnt);
10710}
10711
10712/*
10713 * mddb_validate_lb - count the number of lb's with invalid device id's. Keep
10714 *		track of length of longest devicename.
10715 *	RETURN
10716 *		-1	error
10717 *		 cnt	number of lb's with invalid devid's
10718 */
10719int
10720mddb_validate_lb(
10721	set_t	setno,
10722	int	*rmaxsz
10723)
10724{
10725	mddb_set_t	*s;
10726	int		err = 0;
10727	mddb_lb_t	*lbp;
10728	int		li;
10729	mddb_did_blk_t	*did_blk;
10730	mddb_did_info_t	*did_info;
10731	int		len;
10732	int		cnt = 0;
10733
10734	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10735		return (-1);
10736
10737	single_thread_start(s);
10738	lbp = s->s_lbp;
10739
10740	if (lbp->lb_setno != setno) {
10741		single_thread_end(s);
10742		mddb_setexit(s);
10743		return (-1);
10744	}
10745
10746	/* lb must be in devid style */
10747	if ((lbp->lb_flags & MDDB_DEVID_STYLE) == 0)
10748		goto mvl_out;
10749
10750	did_blk = s->s_did_icp->did_ic_blkp;
10751	for (li = 0; li < lbp->lb_loccnt; li++) {
10752		char		*minor_name;
10753		mddb_locator_t	*lp;
10754		dev_t		ddi_dev;
10755		ddi_devid_t	devid;
10756		ddi_devid_t	rtn_devid = NULL;
10757		int		get_rval;
10758
10759		did_info = &(did_blk->blk_info[li]);
10760		if (((did_info->info_flags & MDDB_DID_EXISTS) == 0) ||
10761		    (did_info->info_flags & MDDB_DID_VALID))
10762			continue;
10763
10764		/* Here we know, did exists but isn't valid */
10765
10766		lp = &lbp->lb_locators[li];
10767		ddi_dev = expldev(lp->l_dev);
10768		get_rval = mddb_devid_get(s, li, &devid, &minor_name);
10769		ASSERT(get_rval == 1);
10770		if ((ddi_lyr_get_devid(ddi_dev, &rtn_devid) == DDI_SUCCESS) &&
10771		    (ddi_devid_compare(rtn_devid, devid) == 0)) {
10772			did_info->info_flags = MDDB_DID_VALID |
10773			    MDDB_DID_EXISTS | MDDB_DID_UPDATED;
10774		} else {
10775			cnt++;
10776			/*
10777			 * Future note: Need to do something here
10778			 * for the MN diskset case when device ids
10779			 * are supported in disksets.
10780			 * Can't add until merging devids_in_diskset
10781			 * code into code base.
10782			 */
10783			len = (&s->s_lnp->ln_suffixes[0][li])-> suf_len;
10784			if (*rmaxsz < len)
10785				*rmaxsz = len;
10786		}
10787		if (rtn_devid != NULL)
10788			ddi_devid_free(rtn_devid);
10789	}
10790
10791mvl_out:
10792
10793	if (push_lb(s) != 0)
10794		cnt = -1;
10795	(void) upd_med(s, "mddb_validate_lb(0)");
10796	single_thread_end(s);
10797	mddb_setexit(s);
10798	return (cnt);
10799}
10800
10801int
10802check_active_locators()
10803{
10804	mddb_set_t	*s;
10805	mddb_lb_t	*lbp;
10806	int		li;
10807	int		active = 0;
10808
10809	mutex_enter(&mddb_lock);
10810	/* there is nothing here..so we can unload */
10811	if ((mddb_set_t *)md_set[MD_LOCAL_SET].s_db == NULL) {
10812		mutex_exit(&mddb_lock);
10813		return (0);
10814	}
10815	s = (mddb_set_t *)md_set[MD_LOCAL_SET].s_db;
10816	lbp = s->s_lbp;
10817	if (lbp == NULL) {
10818		mutex_exit(&mddb_lock);
10819		return (0);
10820	}
10821
10822	for (li = 0; li < lbp->lb_loccnt; li++) {
10823		mddb_locator_t *lp = &lbp->lb_locators[li];
10824		if (lp->l_flags & MDDB_F_ACTIVE) {
10825			active = 1;
10826			break;
10827		}
10828	}
10829	mutex_exit(&mddb_lock);
10830	return (active);
10831}
10832
/*
 * regetoptrecord:
 * --------------
 *	Update the in-core optimized resync record contents by re-reading the
 *	record from the on-disk metadb.
 *	The contents of the resync record will be overwritten by calling this
 *	routine. This means that callers that require the previous contents to
 *	be preserved must save the data before calling this routine.
 *
 *	Up to two copies are read, one per de_optinfo[] slot.  The first copy
 *	that passes all checks is read into the in-core record (dep->de_rb);
 *	a second copy, if any, is read into a scratch buffer that is freed
 *	before returning.
 *
 *	Arguments:
 *	s   - set the record belongs to
 *	dep - in-core directory entry of the optimized resync record
 *
 *	Return values:
 *	0 - successfully read in resync record from a mddb
 *	1 - failure.  Unable to read resync record from either mddb.
 */
static int
regetoptrecord(
	mddb_set_t	*s,
	mddb_de_ic_t	*dep
)
{
	mddb_lb_t	*lbp;
	mddb_locator_t	*lp;
	mddb_rb32_t	*rbp, *crbp;
	int		li;
	int		i;
	int		err = 0;
	size_t		recsize;

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

	/* Scratch buffer that receives the second copy of the record */
	recsize = dep->de_recsize;
	crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);

	single_thread_start(s);
	/* The first good copy is read straight into the in-core record */
	rbp = dep->de_rb;

	/* Mark both copies as bad data until a read proves otherwise */
	dep->de_optinfo[0].o_flags |= MDDB_F_EDATA;
	dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;

	lbp = s->s_lbp;

	for (i = 0; i < 2; i++) {
		/* Skip a copy that is not in use */
		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
			continue;
		li = dep->de_optinfo[i].o_li;
		lp = &lbp->lb_locators[li];

		/* Skip a replica that is inactive or has a master error */
		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
		    (lp->l_flags & MDDB_F_EMASTER))
			continue;

		/*
		 * re-read the optimized resync record with failfast set
		 * since a failed disk could lead to a very long wait.
		 */
		err = readblklst(s, (caddr_t)rbp, dep->de_blks,
		    dep->de_blkcount, li, B_FAILFAST);

		if (err)
			continue;

		if (rbp->rb_magic != MDDB_MAGIC_RB)
			continue;

		if (revchk(MDDB_REV_RB, rbp->rb_revision))
			continue;

		/* Check the crc for this record */
		if (rec_crcchk(s, dep, rbp)) {
			continue;
		}
		/* This copy is good: this also drops its EDATA mark */
		dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE;

		/*
		 * NOTE(review): inside this branch rbp and crbp are the same
		 * pointer, so the checksum comparison below can never be
		 * true.  Presumably the first copy (dep->de_rb) was meant to
		 * be compared against the second (crbp) - confirm.
		 */
		if (rbp == crbp) {
			if (rbp->rb_checksum != crbp->rb_checksum)
				dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
			break;
		}
		/* First good copy is in place; read the next into scratch */
		rbp = crbp;
	}

	single_thread_end(s);

	/* rbp only moves to crbp after one copy was read successfully */
	if (rbp == crbp) {
		/*
		 * NOTE(review): rbp equals crbp here, so this store lands in
		 * the scratch buffer that is freed on the next line.
		 * Presumably dep->de_rb->rb_private was meant - confirm.
		 */
		rbp->rb_private = 0;
		kmem_free((caddr_t)crbp, recsize);
		return (0);
	}
	/* No copy could be read: restamp the in-core record */
	uniqtime32(&rbp->rb_timestamp);
	/* Generate the crc for this record */
	rec_crcgen(s, dep, rbp);
	kmem_free((caddr_t)crbp, recsize);
	return (1);
}
10927
10928/*
10929 * mddb_reread_rr:
10930 *	Re-read the resync record from the on-disk copy. This is required for
10931 *	multi-node support so that a new mirror-owner can determine if a resync
10932 *	operation is required to guarantee data integrity.
10933 *
10934 * Arguments:
10935 *	setno	Associated set
10936 *	id	Resync record ID
10937 *
10938 * Return Value:
10939 *	0	successful reread
10940 *	-1	invalid set (not multi-node or non-existant)
10941 *	>0	metadb state invalid, failed to reread
10942 */
10943int
10944mddb_reread_rr(
10945	set_t		setno,
10946	mddb_recid_t	id
10947)
10948{
10949	mddb_set_t	*s;
10950	int		err = 0;
10951	mddb_db_t	*dbp;
10952	mddb_de_ic_t	*dep;
10953
10954	if (setno >= md_nsets)
10955		return (-1);
10956
10957	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10958		return (-1);
10959
10960	if ((setno == MD_LOCAL_SET) || !(s->s_lbp->lb_flags & MDDB_MNSET)) {
10961		mddb_setexit(s);
10962		return (-1);
10963	}
10964
10965	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
10966		dep = dbp->db_firstentry;
10967		while (dep && (dep->de_recid != DBID(id)))
10968			dep = dep->de_next;
10969		if (dep != NULL)
10970			break;
10971	}
10972
10973	if (dep != NULL) {
10974		err = regetoptrecord(s, dep);
10975	} else {
10976		err = -1;
10977	}
10978	mddb_setexit(s);
10979	return (err);
10980}
10981
10982/*
10983 * Set owner associated with MN optimized resync record.
10984 *
10985 * Optimized records have an owner node associated with them in
10986 * a MN diskset.  The owner is only set on a node that is actively
10987 * writing to that record.  The other nodes will show that record
10988 * as having an invalid owner.  The owner for an optimized record
10989 * is used during fixoptrecord to determine which node should
10990 * write out the record when the replicas associated with that
10991 * optimized record have been changed.
10992 *
10993 * Called directly from mirror driver and not from an ioctl.
10994 *
10995 * Returns
10996 *	NULL if successful.
10997 *	MDDB_E_NORECORD if record not found.
10998 */
10999int
11000mddb_setowner(
11001	mddb_recid_t		id,
11002	md_mn_nodeid_t		owner
11003)
11004{
11005	mddb_set_t		*s;
11006	mddb_db_t		*dbp;
11007	mddb_de_ic_t		*dep;
11008	int			found = 0;
11009
11010
11011	if (DBSET(id) >= md_nsets)
11012		return (MDDB_E_NORECORD);
11013
11014	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
11015		return (MDDB_E_NORECORD);
11016
11017	id = DBID(id);
11018	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11019		for (dep = dbp->db_firstentry;
11020		    dep != NULL; dep = dep->de_next) {
11021			if (dep->de_recid != id)
11022				continue;
11023			dep->de_owner_nodeid = owner;
11024			found = 1;
11025			break;
11026		}
11027		if (found)
11028			break;
11029	}
11030
11031	mddb_setexit(s);
11032
11033	if (!found) {
11034		return (MDDB_E_NORECORD);
11035	}
11036
11037	return (NULL);
11038}
11039
11040/*
11041 * mddb_parse re-reads portions of the mddb from disk given a list
11042 * of good replicas to read from and flags describing
11043 * which portion of the mddb to read in.
11044 *
11045 * Used in a MN diskset when the master has made a change to some part
11046 * of the mddb and wants to relay this information to the slaves.
11047 */
11048int
11049mddb_parse(mddb_parse_parm_t *mpp)
11050{
11051	mddb_set_t	*s;
11052	int		err = 0;
11053	mddb_locator_t	*lp, *old_lp;
11054	mddb_lb_t	*lbp, *old_lbp;
11055	int		rval = 0;
11056	int		i, li;
11057	int		found_good_one = 0;
11058	mddb_ln_t	*lnp;
11059	mddb_block_t	ln_blkcnt;
11060	md_error_t	*ep = &mpp->c_mde;
11061
11062	if (mpp->c_setno >= md_nsets)
11063		return (EINVAL);
11064
11065	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11066		return (0);
11067
11068	if ((s = mddb_setenter(mpp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
11069		return (mddbstatus2error(ep, err, NODEV32, mpp->c_setno));
11070	}
11071
11072	if (!(MD_MNSET_SETNO(mpp->c_setno))) {
11073		mddb_setexit_no_parse(s);
11074		return (EINVAL);
11075	}
11076
11077	/*
11078	 * Master node initiated this request, so there's no work for
11079	 * the master node to do.
11080	 */
11081	if (md_set[mpp->c_setno].s_am_i_master) {
11082		mddb_setexit_no_parse(s);
11083		return (rval);
11084	}
11085
11086	single_thread_start(s);
11087
11088	if (mpp->c_parse_flags & MDDB_PARSE_LOCBLK) {
11089		lbp = 0;
11090		for (i = 0; i < MDDB_NLB; i++) {
11091			/* Walk through master's active list */
11092			if (!(mpp->c_lb_flags[i] & MDDB_F_ACTIVE))
11093				continue;
11094			if (s->s_mbiarray[i] == NULL)
11095				continue;
11096
11097			/* Assumes master blocks are already setup */
11098			if (lbp == (mddb_lb_t *)NULL) {
11099				lbp = (mddb_lb_t *)kmem_zalloc(
11100				    dbtob(MDDB_MNLBCNT), KM_SLEEP);
11101			}
11102			err |= readblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, i);
11103
11104			if (err)
11105				continue;
11106
11107			if (lbp->lb_magic != MDDB_MAGIC_LB)
11108				continue;
11109			if (lbp->lb_blkcnt != MDDB_MNLBCNT)
11110				continue;
11111			if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
11112				continue;
11113			if (crcchk(lbp, &lbp->lb_checksum, dbtob(MDDB_MNLBCNT),
11114			    NULL))
11115				continue;
11116			if (lbp->lb_setno != s->s_setno)
11117				continue;
11118			/*
11119			 * a commit count of zero means this locator has
11120			 * been deleted
11121			 */
11122			if (lbp->lb_commitcnt == 0) {
11123				continue;
11124			}
11125			/* Found a good locator - keep it */
11126			found_good_one = 1;
11127			break;
11128		}
11129
11130		/*
11131		 * If found a good copy of the mddb, then read it into
11132		 * this node's locator block.  Fix up the set's s_mbiarray
11133		 * pointer (master block incore array pointer) to be
11134		 * in sync with the newly read in locator block.  If a
11135		 * new mddb was added, read in the master blocks associated
11136		 * with the new mddb.  If an mddb was deleted, free the
11137		 * master blocks associated with deleted mddb.
11138		 */
11139		if (found_good_one)  {
11140			/* Compare old and new view of mddb locator blocks */
11141			old_lbp = s->s_lbp;
11142			for (li = 0; li < lbp->lb_loccnt; li++) {
11143				int	mn_set;
11144
11145				lp = &lbp->lb_locators[li];
11146				old_lp = &old_lbp->lb_locators[li];
11147
11148				/* If old and new views match, continue */
11149				if ((lp->l_flags & MDDB_F_ACTIVE) ==
11150				    (old_lp->l_flags & MDDB_F_ACTIVE))
11151					continue;
11152
11153				if (lp->l_flags & MDDB_F_ACTIVE) {
11154					/*
11155					 * If new mddb has been added - delete
11156					 * old mbiarray and get new one.
11157					 *
11158					 * When devids are supported, will
11159					 * need to get dev from devid.
11160					 */
11161					if (s->s_mbiarray[li]) {
11162						free_mbipp(&s->s_mbiarray[li]);
11163					}
11164					/*
11165					 * If getmasters fails, getmasters
11166					 * will set appropriate error flags.
11167					 */
11168					s->s_mbiarray[li] = getmasters(s,
11169					    md_expldev(lp->l_dev), lp->l_blkno,
11170					    (uint_t *)&(lp->l_flags), &mn_set);
11171				} else if (lp->l_flags & MDDB_F_DELETED) {
11172					/*
11173					 * If old one has been deleted -
11174					 * delete old mbiarray.
11175					 */
11176					if (s->s_mbiarray[li]) {
11177						free_mbipp(&s->s_mbiarray[li]);
11178					}
11179				}
11180			}
11181
11182			/* Free this node's old view of mddb locator blocks */
11183			kmem_free((caddr_t)s->s_lbp,
11184			    dbtob(s->s_lbp->lb_blkcnt));
11185			s->s_lbp = lbp;
11186		} else {
11187			if (lbp)
11188				kmem_free(lbp, dbtob(MDDB_MNLBCNT));
11189		}
11190	}
11191
11192	if (mpp->c_parse_flags & MDDB_PARSE_LOCNM) {
11193		lnp = s->s_lnp;
11194		lbp = s->s_lbp;
11195		ln_blkcnt = lbp->lb_lnblkcnt;
11196		s->s_lnp = NULL; /* readlocnames does this anyway */
11197		for (li = 0; li < lbp->lb_loccnt; li++) {
11198			lp = &lbp->lb_locators[li];
11199
11200			if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
11201			    (lp->l_flags & MDDB_F_EMASTER))
11202				continue;
11203
11204			/* Successfully read the locator names */
11205			if (readlocnames(s, li) == 0)
11206				break;
11207		}
11208
11209		if (li == lbp->lb_loccnt) {
11210			/* Did not successfully read locnames; restore lnp */
11211			s->s_lnp = lnp;
11212		} else {
11213			/* readlocnames successful, free old struct */
11214			kmem_free((caddr_t)lnp, dbtob(ln_blkcnt));
11215		}
11216	}
11217
11218	if (mpp->c_parse_flags & MDDB_PARSE_OPTRECS) {
11219		mddb_de_ic_t	*dep, *tdep, *first_dep, *dep2;
11220		mddb_db_t	*dbp;
11221		mddb_db32_t	*db32p;
11222		mddb_de32_t	*de32p, *de32p2;
11223		int		writeout;
11224
11225		lbp = s->s_lbp;
11226		/*
11227		 * Walk through directory block and directory entry incore
11228		 * linked list looking for optimized resync records.
11229		 * For each opt record found, re-read in directory block.
11230		 * The directoy block consists of a number of directory
11231		 * entries.  The directory entry for this opt record will
11232		 * describe which 2 mddbs actually contain the resync record
11233		 * since it could have been relocated by the master node
11234		 * due to mddb failure or mddb deletion.  If this node
11235		 * is the record owner for this opt record, then write out
11236		 * the record to the 2 mddbs listed in the directory entry
11237		 * if the mddbs locations are different than previously known.
11238		 */
11239		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11240			for (dep = dbp->db_firstentry; dep;
11241			    dep = dep->de_next) {
11242				/* Found an opt record */
11243				if (dep->de_flags & MDDB_F_OPT)
11244					break;
11245			}
11246			/* If no opt records found, go to next dbp */
11247			if (dep == NULL)
11248				continue;
11249
11250			/*
11251			 * Reread directory block from disk since
11252			 * master could have rewritten in during fixoptrecord.
11253			 */
11254			db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE,
11255			    KM_SLEEP);
11256			create_db32rec(db32p, dbp);
11257			for (li = 0; li < lbp->lb_loccnt; li++) {
11258				lp = &lbp->lb_locators[li];
11259
11260				if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
11261				    (lp->l_flags & MDDB_F_EMASTER))
11262					continue;
11263
11264				err = readblks(s, (caddr_t)db32p,
11265				    db32p->db32_blknum, 1, li);
11266				if (err)
11267					continue;
11268
11269				/* Reverify db; go to next mddb if bad */
11270				if ((db32p->db32_magic != MDDB_MAGIC_DB) ||
11271				    (revchk(MDDB_REV_DB,
11272				    db32p->db32_revision)) ||
11273				    (crcchk(db32p, &db32p->db32_checksum,
11274				    MDDB_BSIZE, NULL))) {
11275					continue;
11276				} else {
11277					break;
11278				}
11279			}
11280			/*
11281			 * If all mddbs are unavailable then panic since
11282			 * this slave cannot be allowed to continue out-of-sync
11283			 * with the master node.  Since the optimized resync
11284			 * records are written by all nodes, all nodes must
11285			 * stay in sync with the master.
11286			 *
11287			 * This also handles the case when all storage
11288			 * connectivity to a slave node has failed.  The
11289			 * slave node will send an MDDB_OPTRECERR message to
11290			 * the master node when the slave node has been unable
11291			 * to write an optimized resync record to both
11292			 * designated mddbs.  After the master has fixed the
11293			 * optimized records to be on available mddbs, the
11294			 * MDDB_PARSE message (with the flag MDDB_PARSE_OPTRECS)
11295			 * is sent to all slave nodes.  If a slave node is
11296			 * unable to access any mddb in order to read in the
11297			 * relocated optimized resync record, then the slave
11298			 * node must panic.
11299			 */
11300			if (li == lbp->lb_loccnt) {
11301				kmem_free((caddr_t)db32p, MDDB_BSIZE);
11302				cmn_err(CE_PANIC, "md: mddb: Node unable to "
11303				    "access any SVM state database "
11304				    "replicas for diskset %s\n", s->s_setname);
11305			}
11306			/*
11307			 * Setup temp copy of linked list of de's.
11308			 * Already have an incore copy, but need to walk
11309			 * the directory entry list contained in the
11310			 * new directory block that was just read in above.
11311			 * After finding the directory entry of an opt record
11312			 * by walking the incore list, find the corresponding
11313			 * entry in the temporary list and then update
11314			 * the incore directory entry record with
11315			 * the (possibly changed) mddb location stored
11316			 * for the optimized resync records.
11317			 */
11318			de32p = (mddb_de32_t *)
11319			    ((void *) ((caddr_t)
11320			    (&db32p->db32_firstentry)
11321			    + sizeof (db32p->db32_firstentry)));
11322			tdep = (mddb_de_ic_t *)
11323			    kmem_zalloc(sizeof (mddb_de_ic_t) -
11324			    sizeof (mddb_block_t) +
11325			    sizeof (mddb_block_t) *
11326			    de32p->de32_blkcount, KM_SLEEP);
11327			de32tode(de32p, tdep);
11328			first_dep = tdep;
11329			while (de32p && de32p->de32_next) {
11330				de32p2 = nextentry(de32p);
11331				dep2 = (mddb_de_ic_t *)kmem_zalloc(
11332				    sizeof (mddb_de_ic_t) -
11333				    sizeof (mddb_block_t) +
11334				    sizeof (mddb_block_t) *
11335				    de32p2->de32_blkcount, KM_SLEEP);
11336				de32tode(de32p2, dep2);
11337				tdep->de_next = dep2;
11338				tdep = dep2;
11339				de32p = de32p2;
11340			}
11341
11342			/* Now, walk the incore directory entry list */
11343			for (dep = dbp->db_firstentry; dep;
11344			    dep = dep->de_next) {
11345				if (! (dep->de_flags & MDDB_F_OPT))
11346					continue;
11347				/*
11348				 * Found an opt record in the incore copy.
11349				 * Find the corresponding entry in the temp
11350				 * list.  If anything has changed in the
11351				 * opt record info between the incore copy
11352				 * and the temp copy, update the incore copy
11353				 * and set a flag to writeout the opt record
11354				 * to the new mddb locations.
11355				 */
11356				for (tdep = first_dep; tdep;
11357				    tdep = tdep->de_next) {
11358					if (dep->de_recid == tdep->de_recid) {
11359					    writeout = 0;
11360					    /* Check first mddb location */
11361					    if ((dep->de_optinfo[0].o_li !=
11362						tdep->de_optinfo[0].o_li) ||
11363						(dep->de_optinfo[0].o_flags !=
11364						tdep->de_optinfo[0].o_flags)) {
11365						    dep->de_optinfo[0] =
11366						    tdep->de_optinfo[0];
11367						    writeout = 1;
11368					    }
11369					    /* Check second mddb location */
11370					    if ((dep->de_optinfo[1].o_li !=
11371						tdep->de_optinfo[1].o_li) ||
11372						(dep->de_optinfo[1].o_flags !=
11373						tdep->de_optinfo[1].o_flags)) {
11374						    dep->de_optinfo[1] =
11375						    tdep->de_optinfo[1];
11376						    writeout = 1;
11377					    }
11378					    /* Record owner should rewrite it */
11379					    if ((writeout) &&
11380						(dep->de_owner_nodeid ==
11381						md_set[mpp->c_setno].
11382						s_nodeid)) {
11383						    (void) writeoptrecord(s,
11384							dep);
11385					    }
11386					    break;
11387					}
11388				}
11389			}
11390			/*
11391			 * Update the incore checksum information for this
11392			 * directory block to match the newly read in checksum.
11393			 * This should have only changed if the incore and
11394			 * temp directory entries differed, but it takes
11395			 * more code to do the check than to just update
11396			 * the information everytime.
11397			 */
11398			dbp->db_checksum = db32p->db32_checksum;
11399
11400			/* Now free everything */
11401			tdep = first_dep;
11402			while (tdep) {
11403				dep2 = tdep->de_next;
11404				kmem_free((caddr_t)tdep,
11405				    sizeofde(tdep));
11406				tdep = dep2;
11407			}
11408			kmem_free((caddr_t)db32p, MDDB_BSIZE);
11409		}
11410		rval = 0;
11411	}
11412out:
11413	single_thread_end(s);
11414	mddb_setexit_no_parse(s);
11415	return (rval);
11416}
11417
11418int
11419mddb_block(mddb_block_parm_t *mbp)
11420{
11421	mddb_set_t	*s;
11422	int		err = 0;
11423	md_error_t	*ep = &mbp->c_mde;
11424
11425	if (mbp->c_setno >= md_nsets)
11426		return (EINVAL);
11427
11428	/*
11429	 * If the new_master flag is set for this setno we are in the middle
11430	 * of a reconfig cycle, and blocking or unblocking is not needed.
11431	 * Hence we can return success immediately
11432	 */
11433	if (md_get_setstatus(mbp->c_setno) & MD_SET_MN_NEWMAS_RC) {
11434		return (0);
11435	}
11436
11437	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11438		return (0);
11439
11440	if ((s = mddb_setenter(mbp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
11441		return (mddbstatus2error(ep, err, NODEV32, mbp->c_setno));
11442	}
11443
11444	if (!(MD_MNSET_SETNO(mbp->c_setno))) {
11445		mddb_setexit_no_parse(s);
11446		return (EINVAL);
11447	}
11448
11449	single_thread_start(s);
11450
11451	if (mbp->c_blk_flags & MDDB_BLOCK_PARSE)
11452		md_set_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK);
11453
11454	if (mbp->c_blk_flags & MDDB_UNBLOCK_PARSE)
11455		md_clr_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK);
11456
11457	single_thread_end(s);
11458	mddb_setexit_no_parse(s);
11459	return (err);
11460}
11461
11462/*
11463 * mddb_optrecfix marks up to 2 mddbs as failed and calls fixoptrecords
11464 * to relocate any optimized resync records to available mddbs.
11465 * This routine is only called on the master node.
11466 *
11467 * Used in a MN diskset when a slave node has failed to write an optimized
11468 * resync record.  The failed mddb information is sent to the master node
11469 * so the master can relocate the optimized records, if possible.  If the
11470 * failed mddb information has a mddb marked as failed that was previously
11471 * marked active on the master, the master sets its incore mddb state to
11472 * EWRITE and sets the PARSE_LOCBLK flag.  The master node then attempts
11473 * to relocate any optimized records on the newly failed mddbs by calling
11474 * fixoptrecords.  (fixoptrecords will set the PARSE_OPTRECS flag if any
11475 * optimized records are relocated.)
11476 *
11477 * When mddb_optrecfix is finished, the ioctl exit code will notice the PARSE
11478 * flags and will send a PARSE message to the slave nodes.  The PARSE_LOCBLK
11479 * flag causes the slave node to re-read in the locator block from disk.
11480 * The PARSE_OPTRECS flag causes the slave node to re-read in the directory
11481 * blocks and write out any optimized resync records that have been
11482 * relocated to a different mddb.
11483 */
11484int
11485mddb_optrecfix(mddb_optrec_parm_t *mop)
11486{
11487	mddb_set_t		*s;
11488	int			err = 0;
11489	mddb_lb_t		*lbp;
11490	mddb_mnlb_t		*mnlbp;
11491	mddb_locator_t		*lp;
11492	int			li;
11493	mddb_mnsidelocator_t	*mnslp;
11494	mddb_drvnm_t		*dn;
11495	int			i, j;
11496	md_replica_recerr_t	*recerr;
11497	md_error_t		*ep = &mop->c_mde;
11498	int			something_changed = 0;
11499	int			alc, lc;
11500	int			setno;
11501
11502	setno = mop->c_setno;
11503	if (mop->c_setno >= md_nsets)
11504		return (EINVAL);
11505
11506	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11507		return (0);
11508
11509	if ((s = mddb_setenter(mop->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
11510		return (mddbstatus2error(ep, err, NODEV32, mop->c_setno));
11511	}
11512
11513	if (!(MD_MNSET_SETNO(mop->c_setno))) {
11514		mddb_setexit(s);
11515		return (EINVAL);
11516	}
11517
11518	single_thread_start(s);
11519	lbp = s->s_lbp;
11520	mnlbp = (mddb_mnlb_t *)lbp;
11521
11522	/*
11523	 * If slave node has seen an mddb failure, but the master node
11524	 * hasn't encountered this failure, mark the mddb as failed on
11525	 * the master node and set the something_changed flag to 1.
11526	 */
11527	for (i = 0; i < 2; i++) {
11528		recerr = &mop->c_recerr[i];
11529		if (recerr->r_flags & MDDB_F_EWRITE) {
11530			li = recerr->r_li;
11531			lp = &lbp->lb_locators[li];
11532			for (j = 0; j < MD_MNMAXSIDES; j++) {
11533				mnslp = &mnlbp->lb_mnsidelocators[j][li];
11534				if (mnslp->mnl_sideno == s->s_sideno)
11535					break;
11536			}
11537			/* Do quick check using li */
11538			if (j != MD_MNMAXSIDES)
11539				dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
11540
11541			if ((j != MD_MNMAXSIDES) &&
11542			    (strncmp(dn->dn_data, recerr->r_driver_name,
11543			    MD_MAXDRVNM) == 0) &&
11544			    (recerr->r_blkno == lp->l_blkno) &&
11545			    (recerr->r_mnum == mnslp->mnl_mnum)) {
11546				if ((lp->l_flags & MDDB_F_ACTIVE) ||
11547				    ((lp->l_flags & MDDB_F_EWRITE) == 0)) {
11548					something_changed = 1;
11549					lp->l_flags |= MDDB_F_EWRITE;
11550					lp->l_flags &= ~MDDB_F_ACTIVE;
11551				}
11552			} else {
11553				/*
11554				 * Passed in li from slave does not match
11555				 * the replica in the master's structures.
11556				 * This could have occurred if a delete
11557				 * mddb command was running when the
11558				 * optimized resync record had a failure.
11559				 * Search all replicas for this entry.
11560				 * If no match, just ignore.
11561				 * If a match, set replica in error.
11562				 */
11563				for (li = 0; li < lbp->lb_loccnt; li++) {
11564					lp = &lbp->lb_locators[li];
11565					if (lp->l_flags & MDDB_F_DELETED)
11566						continue;
11567
11568					for (j = 0; j < MD_MNMAXSIDES; j++) {
11569						mnslp =
11570						    &mnlbp->
11571						    lb_mnsidelocators[j][li];
11572						if (mnslp->mnl_sideno ==
11573						    s->s_sideno)
11574							break;
11575					}
11576					if (j == MD_MNMAXSIDES)
11577						continue;
11578
11579					dn = &lbp->
11580					    lb_drvnm[mnslp->mnl_drvnm_index];
11581					if ((strncmp(dn->dn_data,
11582					    recerr->r_driver_name,
11583					    MD_MAXDRVNM) == 0) &&
11584					    (recerr->r_blkno == lp->l_blkno) &&
11585					    (recerr->r_mnum ==
11586					    mnslp->mnl_mnum)) {
11587						if ((lp->l_flags &
11588						    MDDB_F_ACTIVE) ||
11589						    ((lp->l_flags &
11590						    MDDB_F_EWRITE) == 0)) {
11591							something_changed = 1;
11592							lp->l_flags |=
11593							    MDDB_F_EWRITE;
11594							lp->l_flags &=
11595							    ~MDDB_F_ACTIVE;
11596						}
11597						break;
11598					}
11599				}
11600			}
11601		}
11602	}
11603
11604	/*
11605	 * If this message changed nothing, then we're done since this
11606	 * failure has already been handled.
11607	 * If some mddb state has been changed, send a parse message to
11608	 * the slave nodes so that the slaves will re-read the locator
11609	 * block from disk.
11610	 */
11611	if (something_changed == 0) {
11612		single_thread_end(s);
11613		mddb_setexit(s);
11614		return (0);
11615	} else {
11616		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
11617	}
11618
11619	/*
11620	 * Scan replicas setting MD_SET_TOOFEW if
11621	 * 50% or more of the mddbs have seen errors.
11622	 * Note: Don't call selectreplicas or writeretry
11623	 * since these routines may end up setting the ACTIVE flag
11624	 * on a failed mddb if the master is able to access the mddb
11625	 * but the slave node couldn't.  Need to have the ACTIVE flag
11626	 * turned off in order to relocate the optimized records to
11627	 * mddbs that are (hopefully) available on all nodes.
11628	 */
11629	alc = 0;
11630	lc = 0;
11631	for (li = 0; li < lbp->lb_loccnt; li++) {
11632		lp = &lbp->lb_locators[li];
11633		if (lp->l_flags & MDDB_F_DELETED)
11634			continue;
11635		lc++;
11636		if (! (lp->l_flags & MDDB_F_ACTIVE))
11637			continue;
11638		alc++;
11639	}
11640
11641	/*
11642	 * If more than 50% mddbs have failed, then don't relocate opt recs.
11643	 * The node sending the mddb failure information will detect TOOFEW
11644	 * and will panic when it attempts to re-write the optimized record.
11645	 */
11646	if (alc < ((lc + 1) / 2)) {
11647		md_set_setstatus(setno, MD_SET_TOOFEW);
11648		(void) push_lb(s);
11649		(void) upd_med(s, "mddb_optrecfix(0)");
11650		single_thread_end(s);
11651		mddb_setexit(s);
11652		return (0);
11653	}
11654
11655	/* Attempt to relocate optimized records that are on failed mddbs */
11656	(void) fixoptrecords(s);
11657
11658	/* Push changed locator block out to disk */
11659	(void) push_lb(s);
11660	(void) upd_med(s, "mddb_optrecfix(1)");
11661
11662	/* Recheck for TOOFEW after writing out locator blocks */
11663	alc = 0;
11664	lc = 0;
11665	for (li = 0; li < lbp->lb_loccnt; li++) {
11666		lp = &lbp->lb_locators[li];
11667		if (lp->l_flags & MDDB_F_DELETED)
11668			continue;
11669		lc++;
11670		if (! (lp->l_flags & MDDB_F_ACTIVE))
11671			continue;
11672		alc++;
11673	}
11674
11675	/* If more than 50% mddbs have failed, then don't relocate opt recs */
11676	if (alc < ((lc + 1) / 2)) {
11677		md_set_setstatus(setno, MD_SET_TOOFEW);
11678		single_thread_end(s);
11679		mddb_setexit(s);
11680		return (0);
11681	}
11682
11683	single_thread_end(s);
11684	mddb_setexit(s);
11685	return (0);
11686}
11687
11688/*
11689 * Check if incore mddb on master node matches ondisk mddb.
11690 * If not, master writes out incore view to all mddbs.
11691 * Have previously verified that master is an owner of the
11692 * diskset (master has snarfed diskset) and that diskset is
11693 * not stale.
11694 *
11695 * Meant to be called during reconfig cycle during change of master.
11696 * Previous master in diskset may have changed the mddb and
11697 * panic'd before relaying information to slave nodes.  New
11698 * master node just writes out its incore view of the mddb and
11699 * the replay of the change log will resync all the nodes.
11700 *
11701 * Only supported for MN disksets.
11702 *
11703 * Return values:
11704 *	0 - success
11705 *	non-zero - failure
11706 */
11707int
11708mddb_check_write_ioctl(mddb_config_t *info)
11709{
11710	int			err = 0;
11711	set_t			setno = info->c_setno;
11712	mddb_set_t		*s;
11713	int			li;
11714	mddb_locator_t		*lp;
11715	mddb_lb_t		*lbp;
11716	mddb_mnlb_t		*mnlbp_od;
11717	mddb_ln_t		*lnp;
11718	mddb_mnln_t		*mnlnp_od;
11719	mddb_db_t		*dbp;
11720	mddb_de_ic_t		*dep;
11721	int			write_out_mddb;
11722	md_error_t		*ep = &info->c_mde;
11723	int			mddb_err = 0;
11724	int			prev_li = 0;
11725	int			rval = 0;
11726	int			alc, lc;
11727	int			mddbs_present = 0;
11728
11729	/* Verify that setno is in valid range */
11730	if (setno >= md_nsets)
11731		return (EINVAL);
11732
11733	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11734		return (0);
11735
11736	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
11737		return (mddbstatus2error(ep, err, NODEV32, setno));
11738	}
11739
11740	/* Calling diskset must be a MN diskset */
11741	if (!(MD_MNSET_SETNO(setno))) {
11742		mddb_setexit(s);
11743		return (EINVAL);
11744	}
11745
11746	/* Re-verify that set is not stale */
11747	if (md_get_setstatus(setno) & MD_SET_STALE) {
11748		mddb_setexit(s);
11749		return (mdmddberror(ep, MDE_DB_STALE, NODEV32, setno));
11750	}
11751
11752	lbp = s->s_lbp;
11753	lnp = s->s_lnp;
11754
11755	/*
11756	 * Previous master could have died during the write of data to
11757	 * the mddbs so that the ondisk mddbs may not be consistent.
11758	 * So, need to check the contents of the first and last active mddb
11759	 * to see if the mddbs need to be rewritten.
11760	 */
11761	for (li = 0; li < lbp->lb_loccnt; li++) {
11762		int	checkcopy_err;
11763
11764		lp = &lbp->lb_locators[li];
11765		/* Find replica that is active */
11766		if (lp->l_flags & MDDB_F_DELETED)
11767			continue;
11768		mddbs_present = 1;
11769		if (! (lp->l_flags & MDDB_F_ACTIVE))
11770			continue;
11771		if (s->s_mbiarray[li] == NULL)
11772			continue;
11773		/* Check locator block */
11774		mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT),
11775		    KM_SLEEP);
11776		/* read in on-disk locator block */
11777		err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li);
11778
11779		/* If err, try next mddb */
11780		if (err) {
11781			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
11782			continue;
11783		}
11784
11785		/*
11786		 * We resnarf all changelog entries for this set.
11787		 * They may have been altered by the previous master
11788		 */
11789		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11790			for (dep = dbp->db_firstentry; dep; dep =
11791			    dep->de_next) {
11792				if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) {
11793					continue;
11794				}
11795				/*
11796				 * This has been alloc'ed while
11797				 * joining the set
11798				 */
11799				if (dep->de_rb) {
11800					kmem_free(dep->de_rb, dep->de_recsize);
11801					dep->de_rb = (mddb_rb32_t *)NULL;
11802				}
11803				if (dep->de_rb_userdata) {
11804					kmem_free(dep->de_rb_userdata,
11805					    dep->de_reqsize);
11806					dep->de_rb_userdata = (caddr_t)NULL;
11807				}
11808
11809				err = getrecord(s, dep, li);
11810				if (err) {
11811					/*
11812					 * When we see on error while reading
11813					 * the changelog entries, we move on
11814					 * to the next mddb
11815					 */
11816					err = 1;
11817					break; /* out of inner for-loop */
11818				}
11819				allocuserdata(dep);
11820			}
11821			if (err)
11822				break; /* out of outer for-loop */
11823		}
11824
11825		/* If err, try next mddb */
11826		if (err) {
11827			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
11828			continue;
11829		}
11830
11831		/* Is incore locator block same as ondisk? */
11832		if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
11833		    == 1) {
11834			write_out_mddb = 1;
11835			kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11836			break;
11837		}
11838
11839		kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11840
11841		/* If lb ok, check locator names */
11842		mnlnp_od = (mddb_mnln_t *)kmem_zalloc(dbtob(MDDB_MNLNCNT),
11843		    KM_SLEEP);
11844		/* read in on-disk locator names */
11845		err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
11846		    lbp->lb_lnblkcnt, li);
11847
11848		/* If err, try next mddb */
11849		if (err) {
11850			kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT));
11851			continue;
11852		}
11853
11854		/* Are incore locator names same as ondisk? */
11855		if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
11856		    == 1) {
11857			kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11858			write_out_mddb = 1;
11859			break;
11860		}
11861
11862		kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11863
11864		/*
11865		 * Check records in mddb.
11866		 * If a read error is encountered, set the error flag and
11867		 * continue to the next mddb.  Otherwise, if incore data is
11868		 * different from ondisk, then set the flag to write out
11869		 * the mddb and break out.
11870		 */
11871		checkcopy_err = checkcopy(s, li);
11872		if (checkcopy_err == MDDB_F_EREAD) {
11873			lp->l_flags |= MDDB_F_EREAD;
11874			mddb_err = 1;
11875			continue;
11876		} else if (checkcopy_err == 1) {
11877			write_out_mddb = 1;
11878			break;
11879		}
11880		/*
11881		 * Have found first active mddb and the data is the same as
11882		 * incore - break out of loop
11883		 */
11884		write_out_mddb = 0;
11885		break;
11886	}
11887
11888	/*
11889	 * Skip checking for last active mddb if:
11890	 *	- already found a mismatch in the first active mddb
11891	 *		(write_out_mddb is 1)  OR
11892	 * 	- didn't find a readable mddb when looking for first
11893	 *	  active mddb (there are mddbs present but all failed
11894	 *	  when read was attempted).
11895	 *
11896	 * In either case, go to write_out_mddb label in order to attempt
11897	 * to write out the data. If < 50% mddbs are available, panic.
11898	 */
11899	if ((write_out_mddb == 1) ||
11900	    ((li == lbp->lb_loccnt) && mddbs_present)) {
11901		write_out_mddb = 1;
11902		goto write_out_mddb;
11903	}
11904
11905	/*
11906	 * Save which index was checked for the first active mddb.  If only 1
11907	 * active mddb, don't want to recheck the same mddb when looking for
11908	 * last active mddb.
11909	 */
11910	prev_li = li;
11911
11912	/*
11913	 * Now, checking for last active mddb.  If found same index as before
11914	 * (only 1 active mddb), then skip.
11915	 */
11916	for (li = (lbp->lb_loccnt - 1); li >= 0; li--) {
11917		int	checkcopy_err;
11918
11919		lp = &lbp->lb_locators[li];
11920		/* Find replica that is active */
11921		if (! (lp->l_flags & MDDB_F_ACTIVE))
11922			continue;
11923		if (lp->l_flags & MDDB_F_DELETED)
11924			continue;
11925		if (s->s_mbiarray[li] == NULL)
11926			continue;
11927		/* If already checked mddb, bail out */
11928		if (li == prev_li)
11929			break;
11930		/* Check locator block */
11931		mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT),
11932		    KM_SLEEP);
11933		/* read in on-disk locator block */
11934		err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li);
11935
11936		/* If err, try next mddb */
11937		if (err) {
11938			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
11939			continue;
11940		}
11941
11942
11943		/* Is incore locator block same as ondisk? */
11944		if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
11945		    == 1) {
11946			kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11947			write_out_mddb = 1;
11948			break;
11949		}
11950
11951		kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11952
11953		/* If lb ok, check locator names */
11954		mnlnp_od = (mddb_mnln_t *)
11955		    kmem_zalloc(dbtob(MDDB_MNLNCNT), KM_SLEEP);
11956
11957		/* read in on-disk locator names */
11958		err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
11959		    lbp->lb_lnblkcnt, li);
11960
11961		/* If err, try next mddb */
11962		if (err) {
11963			kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT));
11964			continue;
11965		}
11966
11967		/* Are incore locator names same as ondisk? */
11968		if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
11969		    == 1) {
11970			kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11971			write_out_mddb = 1;
11972			break;
11973		}
11974
11975		kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11976
11977		/*
11978		 * Check records in mddb.
11979		 * If a read error is encountered, set the error flag and
11980		 * continue to the next mddb.  Otherwise, if incore data is
11981		 * different from ondisk, then set the flag to write out
11982		 * the mddb and break out.
11983		 */
11984		checkcopy_err = checkcopy(s, li);
11985		if (checkcopy_err == MDDB_F_EREAD) {
11986			lp->l_flags |= MDDB_F_EREAD;
11987			mddb_err = 1;
11988			continue;
11989		} else if (checkcopy_err == 1) {
11990			write_out_mddb = 1;
11991			break;
11992		}
11993		/*
11994		 * Have found last active mddb and the data is the same as
11995		 * incore - break out of loop
11996		 */
11997		write_out_mddb = 0;
11998		break;
11999	}
12000
12001	/*
12002	 * If ondisk and incore versions of the mddb don't match, then
12003	 * write out this node's incore version to disk.
12004	 * Or, if unable to read a copy of the mddb, attempt to write
12005	 * out a new one.
12006	 */
12007write_out_mddb:
12008	if (write_out_mddb) {
12009		/* Recompute free blocks based on incore information */
12010		computefreeblks(s); /* set up free block bits */
12011
12012		/*
12013		 * Write directory entries and record blocks.
12014		 * Use flag MDDB_WRITECOPY_SYNC so that writecopy
12015		 * routine won't write out change log records.
12016		 */
12017		for (li = 0; li < lbp->lb_loccnt; li++) {
12018			lp = &lbp->lb_locators[li];
12019			/* Don't write to inactive or deleted mddbs */
12020			if (! (lp->l_flags & MDDB_F_ACTIVE))
12021				continue;
12022			if (lp->l_flags & MDDB_F_DELETED)
12023				continue;
12024			if (s->s_mbiarray[li] == NULL)
12025				continue;
12026			/* If encounter a write error, save it for later */
12027			if (writecopy(s, li, MDDB_WRITECOPY_SYNC)) {
12028				lp->l_flags |= MDDB_F_EWRITE;
12029				mddb_err = 1;
12030			}
12031		}
12032
12033		/*
12034		 * Write out locator blocks to all replicas.
12035		 * push_lb will set MDDB_F_EWRITE on replicas that fail.
12036		 */
12037		if (push_lb(s))
12038			mddb_err = 1;
12039		(void) upd_med(s, "mddb_check_write_ioctl(0)");
12040
12041		/* Write out locator names to all replicas */
12042		lnp = s->s_lnp;
12043		uniqtime32(&lnp->ln_timestamp);
12044		lnp->ln_revision = MDDB_REV_MNLN;
12045		crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
12046
12047		/* writeall sets MDDB_F_EWRITE if writes fails to replica */
12048		if (writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
12049		    lbp->lb_lnblkcnt, 0))
12050			mddb_err = 1;
12051
12052		/*
12053		 * The writes to the replicas above would have set
12054		 * the MDDB_F_EWRITE flags if any write error was
12055		 * encountered.
12056		 * If < 50% of the mddbs are available, panic.
12057		 */
12058		lc = alc = 0;
12059		for (li = 0; li < lbp->lb_loccnt; li++) {
12060			lp = &lbp->lb_locators[li];
12061			if (lp->l_flags & MDDB_F_DELETED)
12062				continue;
12063			lc++;
12064			/*
12065			 * If mddb:
12066			 *	- is not active (previously had an error)
12067			 *	- had an error reading the master blocks  or
12068			 *	- had an error in writing to the mddb
12069			 * then don't count this mddb in the active count.
12070			 */
12071			if (! (lp->l_flags & MDDB_F_ACTIVE) ||
12072			    (lp->l_flags & MDDB_F_EMASTER) ||
12073			    (lp->l_flags & MDDB_F_EWRITE))
12074				continue;
12075			alc++;
12076		}
12077		if (alc < ((lc + 1) / 2)) {
12078			cmn_err(CE_PANIC,
12079			    "md: Panic due to lack of DiskSuite state\n"
12080			    " database replicas. Fewer than 50%% of "
12081			    "the total were available,\n so panic to "
12082			    "ensure data integrity.");
12083		}
12084	}
12085
12086	/*
12087	 * If encountered an error during checking or writing of
12088	 * mddbs, call selectreplicas so that replica error can
12089	 * be properly handled. This will involve another attempt
12090	 * to write the mddb out to any mddb marked MDDB_F_EWRITE.
12091	 * If mddb still fails, it will have the MDDB_F_ACTIVE bit
12092	 * turned off. Set the MDDB_SCANALLSYNC flag so that
12093	 * selectreplicas doesn't overwrite the change log entries.
12094	 *
12095	 * Set the PARSE_LOCBLK flag in the mddb_set structure to show
12096	 * that the locator block has been changed.
12097	 */
12098	if (mddb_err) {
12099		(void) selectreplicas(s, MDDB_SCANALLSYNC);
12100		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
12101	}
12102
12103write_out_end:
12104	mddb_setexit(s);
12105	return (rval);
12106}
12107
12108/*
12109 * Set/reset/get set flags in set structure.
12110 * Used during reconfig cycle
12111 * Only supported for MN disksets.
12112 *
12113 * Return values:
12114 *	0 - success
12115 *	non-zero - failure
12116 */
12117int
12118mddb_setflags_ioctl(mddb_setflags_config_t *info)
12119{
12120	set_t			setno = info->sf_setno;
12121
12122	/* Verify that setno is in valid range */
12123	if (setno >= md_nsets)
12124		return (EINVAL);
12125
12126	/*
12127	 * When setting the flags, the set may not
12128	 * be snarfed yet. So, don't check for SNARFED or MNset
12129	 * and don't call mddb_setenter.
12130	 * In order to discourage bad ioctl calls,
12131	 * verify that magic field in structure is set correctly.
12132	 */
12133	if (info->sf_magic != MDDB_SETFLAGS_MAGIC)
12134		return (EINVAL);
12135
12136	switch (info->sf_flags) {
12137	case MDDB_NM_SET:
12138		if (info->sf_setflags & MD_SET_MN_NEWMAS_RC)
12139			md_set_setstatus(setno, MD_SET_MN_NEWMAS_RC);
12140		if (info->sf_setflags & MD_SET_MN_START_RC)
12141			md_set_setstatus(setno, MD_SET_MN_START_RC);
12142		if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC)
12143			md_set_setstatus(setno, MD_SET_MN_MIR_STATE_RC);
12144		break;
12145
12146	case MDDB_NM_RESET:
12147		if (info->sf_setflags & MD_SET_MN_NEWMAS_RC)
12148			md_clr_setstatus(setno, MD_SET_MN_NEWMAS_RC);
12149		if (info->sf_setflags & MD_SET_MN_START_RC)
12150			md_clr_setstatus(setno, MD_SET_MN_START_RC);
12151		if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC)
12152			md_clr_setstatus(setno, MD_SET_MN_MIR_STATE_RC);
12153		break;
12154
12155	case MDDB_NM_GET:
12156		info->sf_setflags = md_get_setstatus(setno) &
12157		    (MD_SET_MN_NEWMAS_RC|MD_SET_MN_START_RC|
12158		    MD_SET_MN_MIR_STATE_RC);
12159		break;
12160	}
12161
12162	return (0);
12163}
12164
12165/*
12166 * md_update_minor
12167 *
12168 * This function updates the minor in the namespace entry for an
12169 * underlying metadevice.  The function is called in mod_imp_set
12170 * where mod is sp, stripe, mirror and raid.
12171 *
12172 */
12173int
12174md_update_minor(
12175	set_t	setno,
12176	side_t	side,
12177	mdkey_t	key
12178)
12179{
12180	struct nm_next_hdr	*nh;
12181	struct nm_name		*n;
12182	char			*shn;
12183	int			retval = 1;
12184
12185	/*
12186	 * Load the devid name space if it exists
12187	 */
12188	(void) md_load_namespace(setno, NULL, NM_DEVID);
12189	if (! md_load_namespace(setno, NULL, 0L)) {
12190		/*
12191		 * Unload the devid namespace
12192		 */
12193		(void) md_unload_namespace(setno, NM_DEVID);
12194		return (0);
12195	}
12196
12197	rw_enter(&nm_lock.lock, RW_READER);
12198
12199	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
12200		retval = 0;
12201		goto out;
12202	}
12203
12204	/*
12205	 * Look up the key
12206	 */
12207	if ((n = lookup_entry(nh, setno, side, key, NODEV64, 0L)) != NULL) {
12208		/*
12209		 * Find the entry, update its n_minor if metadevice
12210		 */
12211		if ((shn = (char *)getshared_name(setno, n->n_drv_key, 0L))
12212		    == NULL) {
12213			retval = 0;
12214			goto out;
12215		}
12216
12217		if (strcmp(shn, "md") == 0) {
12218			n->n_minor = MD_MKMIN(setno, MD_MIN2UNIT(n->n_minor));
12219		}
12220	}
12221
12222out:
12223	rw_exit(&nm_lock.lock);
12224	return (retval);
12225}
12226
12227/*
12228 * md_update_top_device_minor
12229 *
12230 * This function updates the minor in the namespace entry for a top
12231 * level metadevice.  The function is called in mod_imp_set where
12232 * mod is sp, stripe, mirror and raid.
12233 *
12234 */
12235int
12236md_update_top_device_minor(
12237	set_t	setno,
12238	side_t	side,
12239	md_dev64_t dev
12240)
12241{
12242	struct nm_next_hdr	*nh;
12243	struct nm_name		*n;
12244	char			*shn;
12245	int			retval = 1;
12246
12247	/*
12248	 * Load the devid name space if it exists
12249	 */
12250	(void) md_load_namespace(setno, NULL, NM_DEVID);
12251	if (! md_load_namespace(setno, NULL, 0L)) {
12252		/*
12253		 * Unload the devid namespace
12254		 */
12255		(void) md_unload_namespace(setno, NM_DEVID);
12256		return (0);
12257	}
12258
12259	rw_enter(&nm_lock.lock, RW_READER);
12260
12261	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
12262		retval = 0;
12263		goto out;
12264	}
12265
12266	/*
12267	 * Look up the key
12268	 */
12269	if ((n = lookup_entry(nh, setno, side, MD_KEYWILD, dev, 0L)) != NULL) {
12270		/*
12271		 * Find the entry, update its n_minor if metadevice
12272		 */
12273		if ((shn = (char *)getshared_name(setno, n->n_drv_key, 0L))
12274		    == NULL) {
12275			retval = 0;
12276			goto out;
12277		}
12278
12279		if (strcmp(shn, "md") == 0) {
12280			n->n_minor = MD_MKMIN(setno, MD_MIN2UNIT(n->n_minor));
12281		}
12282	}
12283
12284out:
12285	rw_exit(&nm_lock.lock);
12286	return (retval);
12287}
12288
12289static void
12290md_imp_nm(
12291	mddb_set_t	*s
12292)
12293{
12294	mddb_db_t		*dbp;
12295	mddb_de_ic_t		*dep;
12296	struct nm_rec_hdr	*hdr;
12297	struct nm_header	*hhdr;
12298	set_t			setno = s->s_setno;
12299
12300	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
12301		for (dep = dbp->db_firstentry; dep != NULL;
12302		    dep = dep->de_next) {
12303			switch (dep->de_type1) {
12304
12305			case MDDB_NM_HDR:
12306			case MDDB_DID_NM_HDR:
12307
12308				hhdr = (struct nm_header *)
12309				    dep->de_rb_userdata;
12310
12311				hdr = &hhdr->h_names;
12312				if (hdr->r_next_recid > 0) {
12313					hdr->r_next_recid = MAKERECID(setno,
12314					    DBID(hdr->r_next_recid));
12315				}
12316
12317				hdr = &hhdr->h_shared;
12318				if (hdr->r_next_recid > 0) {
12319					hdr->r_next_recid = MAKERECID(setno,
12320					    DBID(hdr->r_next_recid));
12321				}
12322				break;
12323
12324			case MDDB_NM:
12325			case MDDB_DID_NM:
12326			case MDDB_SHR_NM:
12327			case MDDB_DID_SHR_NM:
12328
12329				hdr = (struct nm_rec_hdr *)
12330				    dep->de_rb_userdata;
12331
12332				if (hdr->r_next_recid > 0) {
12333					hdr->r_next_recid = MAKERECID
12334					    (setno, DBID(hdr->r_next_recid));
12335				}
12336				break;
12337
12338			default:
12339				break;
12340			}
12341		}
12342	}
12343}
12344
12345static int
12346update_db_rec(
12347	mddb_set_t	*s
12348)
12349{
12350	mddb_db_t	*dbp;
12351	mddb_de_ic_t	*dep;
12352	mddb_recid_t	ids[2];
12353
12354	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
12355		for (dep = dbp->db_firstentry; dep != NULL;
12356		    dep = dep->de_next) {
12357			if (! (dep->de_flags & MDDB_F_OPT)) {
12358				ids[0] = MAKERECID(s->s_setno, dep->de_recid);
12359				ids[1] = 0;
12360				if (mddb_commitrecs(ids)) {
12361					return (MDDB_E_NORECORD);
12362				}
12363			}
12364		}
12365	}
12366	return (0);
12367}
12368
12369static int
12370update_mb(
12371	mddb_set_t	*s
12372)
12373{
12374	mddb_ri_t	*rip;
12375	int	err = 0;
12376
12377	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
12378		if (rip->ri_flags & MDDB_F_EMASTER)
12379			/* disk is powered off or not there */
12380			continue;
12381
12382		if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) {
12383			/*
12384			 * It is a replicated set
12385			 */
12386			if (rip->ri_devid == (ddi_devid_t)NULL) {
12387				return (-1);
12388			}
12389			err = update_mb_devid(s, rip, rip->ri_devid);
12390		} else {
12391			/*
12392			 * It is a non-replicated set
12393			 * and there is no need to update
12394			 * devid
12395			 */
12396			err = update_mb_devid(s, rip, NULL);
12397		}
12398
12399		if (err)
12400			return (err);
12401	}
12402
12403	return (0);
12404}
12405
12406static int
12407update_setname(
12408	set_t	setno
12409)
12410{
12411	struct nm_next_hdr	*nh;
12412	struct nm_shared_name	*shn, *new_shn;
12413	char			*prefix = "/dev/md/";
12414	char			*shrname;
12415	int			len;
12416	mdkey_t			o_key;
12417	uint32_t		o_count, o_data;
12418	mddb_recid_t		recid, ids[3];
12419	int			err = 0;
12420	mddb_set_t		*dbp;
12421
12422	/* Import setname */
12423	dbp = (mddb_set_t *)md_set[setno].s_db;
12424	len = strlen(prefix) + strlen(dbp->s_setname) + strlen("/dsk/") + 1;
12425	shrname = kmem_zalloc(len, KM_SLEEP);
12426	(void) sprintf(shrname, "%s%s%s", prefix, dbp->s_setname, "/dsk/");
12427
12428	rw_enter(&nm_lock.lock, RW_WRITER);
12429	if ((nh = get_first_record(setno, 0, NM_SHARED)) == NULL) {
12430		/*
12431		 * No namespace is okay
12432		 */
12433		err = 0;
12434		goto out;
12435	}
12436
12437	if ((shn = (struct nm_shared_name *)lookup_shared_entry(nh,
12438	    0, prefix, NULL, NM_SHARED | NM_IMP_SHARED)) == NULL) {
12439		/*
12440		 * No metadevice is okay
12441		 */
12442		err = 0;
12443		goto out;
12444	}
12445
12446	/*
12447	 * We have it, go ahead and update the namespace.
12448	 */
12449	o_key = shn->sn_key;
12450	o_count = shn->sn_count;
12451	o_data = shn->sn_data;
12452
12453	if (remove_shared_entry(nh, o_key, NULL, 0L | NM_IMP_SHARED |
12454	    NM_NOCOMMIT)) {
12455		err = MDDB_E_NORECORD;
12456		goto out;
12457	}
12458	if ((new_shn = (struct nm_shared_name *)alloc_entry(
12459	    nh, md_set[setno].s_nmid, len, NM_SHARED |
12460	    NM_NOCOMMIT, &recid)) == NULL) {
12461		err = MDDB_E_NORECORD;
12462		goto out;
12463	}
12464
12465	new_shn->sn_key = o_key;
12466	new_shn->sn_count = o_count;
12467	new_shn->sn_data = o_data;
12468	new_shn->sn_namlen = (ushort_t)len;
12469	(void) strcpy(new_shn->sn_name, shrname);
12470
12471	ids[0] = recid;
12472	ids[1] = md_set[setno].s_nmid;
12473	ids[2] = 0;
12474	err = mddb_commitrecs(ids);
12475
12476out:
12477	if (shrname)
12478		kmem_free(shrname, len);
12479	rw_exit(&nm_lock.lock);
12480	return (err);
12481}
12482
12483/*
12484 * Returns 0 on success.
12485 * Returns -1 on failure with ep filled in.
12486 */
12487static int
12488md_imp_db(
12489	set_t		setno,
12490	int		stale_flag,
12491	md_error_t	*ep
12492)
12493{
12494	mddb_set_t	*s;
12495	int		err = 0;
12496	mddb_dt_t	*dtp;
12497	mddb_lb_t	*lbp;
12498	int		i;
12499	int		loccnt;
12500
12501	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
12502		return (mddbstatus2error(ep, err, NODEV32, setno));
12503	}
12504
12505	/* Update dt */
12506	if ((dtp = (mddb_dt_t *)md_set[setno].s_dtp) != NULL) {
12507		crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);
12508	}
12509
12510	if ((err = dt_write(s)) != 0) {
12511		err = mdsyserror(ep, err);
12512		mddb_setexit(s);
12513		return (err);
12514	}
12515
12516	/*
12517	 * Update lb, no need to update the mediator because
12518	 * the diskset will only exist on the importing node
12519	 * and as such a mediator adds no value.
12520	 */
12521
12522	/* Update lb */
12523	if (stale_flag & MD_IMP_STALE_SET) {
12524		lbp = s->s_lbp;
12525		loccnt = lbp->lb_loccnt;
12526		for (i = 0; i < loccnt; i++) {
12527			mddb_locator_t	*lp = &lbp->lb_locators[i];
12528			md_dev64_t	ndev = md_expldev(lp->l_dev);
12529			ddi_devid_t	devid_ptr;
12530
12531			devid_ptr = s->s_did_icp->did_ic_devid[i];
12532			if (devid_ptr == NULL) {
12533				/*
12534				 * Already deleted, go to next one.
12535				 */
12536				continue;
12537			}
12538			if (mddb_devid_validate((ddi_devid_t)devid_ptr, &ndev,
12539			    NULL)) {
12540				/* disk unavailable, mark deleted */
12541				lp->l_flags = MDDB_F_DELETED;
12542				/* then remove the device id from the list */
12543				free_mbipp(&s->s_mbiarray[i]);
12544				(void) mddb_devid_delete(s, i);
12545			}
12546		}
12547		md_clr_setstatus(setno, MD_SET_STALE);
12548	}
12549
12550	if ((err = writelocall(s)) != 0) {
12551		err = mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno);
12552		mddb_setexit(s);
12553		return (err);
12554	}
12555
12556	mddb_setexit(s);
12557
12558	/* Update db records */
12559	if ((err = update_db_rec(s)) != 0) {
12560		return (mddbstatus2error(ep, err, NODEV32, setno));
12561	}
12562
12563	/* Update setname embedded in the namespace */
12564	if ((err = update_setname(setno)) != 0)
12565		return (mddbstatus2error(ep, err, NODEV32, setno));
12566
12567	return (err);
12568}
12569
12570static void
12571md_dr_add(
12572	md_set_record	*sr,
12573	md_drive_record	*dr
12574)
12575{
12576	md_drive_record	*drv;
12577
12578	if (sr->sr_driverec == 0) {
12579		sr->sr_driverec = dr->dr_selfid;
12580		return;
12581	}
12582
12583	for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec);
12584	    drv->dr_nextrec != 0;
12585	    drv = (md_drive_record *)mddb_getrecaddr(drv->dr_nextrec))
12586		;
12587	drv->dr_nextrec = dr->dr_selfid;
12588}
12589
12590static void
12591md_setup_recids(
12592	md_set_record	*sr,
12593	mddb_recid_t	**ids,
12594	size_t		size
12595)
12596{
12597	md_drive_record	*drv;
12598	int		cnt;
12599	mddb_recid_t	*recids;
12600
12601	recids = (mddb_recid_t *)kmem_zalloc(sizeof (mddb_recid_t)
12602	    * size, KM_SLEEP);
12603	recids[0] = sr->sr_selfid;
12604	cnt = 1;
12605
12606	for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec);
12607	    /* CSTYLED */
12608	    drv != NULL;) {
12609		recids[cnt++] = drv->dr_selfid;
12610		if (drv->dr_nextrec != 0)
12611			drv = (md_drive_record *)mddb_getrecaddr
12612			    (drv->dr_nextrec);
12613		else
12614			drv = NULL;
12615	}
12616	recids[cnt] = 0;
12617	*ids = &recids[0];
12618}
12619
12620/*
12621 * The purpose of this function is to replace the old_devid with the
12622 * new_devid in the given namespace.   This is used for importing
12623 * remotely replicated drives.
12624 */
12625int
12626md_update_namespace_rr_did(
12627	mddb_config_t	*cp
12628)
12629{
12630	set_t			setno = cp->c_setno;
12631	struct nm_next_hdr	*nh;
12632	mdkey_t			key = MD_KEYWILD;
12633	side_t			side = MD_SIDEWILD;
12634	mddb_recid_t		recids[3];
12635	struct did_min_name	*n;
12636	struct nm_next_hdr	*did_shr_nh;
12637	struct did_shr_name	*shr_n;
12638	mdkey_t			ent_did_key;
12639	uint32_t		ent_did_count;
12640	uint32_t		ent_did_data;
12641	size_t			ent_size, size;
12642	ddi_devid_t		devid = NULL;
12643	struct did_shr_name	*shn;
12644	size_t			offset;
12645	struct nm_next_hdr	*this_did_shr_nh;
12646	void			*old_devid, *new_devid;
12647
12648	if (!(md_get_setstatus(setno) & MD_SET_NM_LOADED))
12649		return (EIO);
12650
12651	old_devid = (void *)(uintptr_t)cp->c_locator.l_old_devid;
12652	new_devid = (void *)(uintptr_t)cp->c_locator.l_devid;
12653
12654	/*
12655	 * It is okay if we dont have any configuration
12656	 */
12657	offset = (sizeof (struct devid_shr_rec) - sizeof (struct did_shr_name));
12658	if ((nh = get_first_record(setno, 0, NM_DEVID | NM_NOTSHARED))
12659	    == NULL) {
12660		return (0);
12661	}
12662	while ((key = md_getnextkey(setno, side, key, NULL)) != MD_KEYWILD) {
12663		/* check out every entry in the namespace */
12664		if ((n = (struct did_min_name *)lookup_entry(nh, setno,
12665		    side, key, NODEV64, NM_DEVID)) == NULL) {
12666			continue;
12667		} else {
12668			did_shr_nh = get_first_record(setno, 0, NM_DEVID |
12669			    NM_SHARED);
12670			if (did_shr_nh == NULL) {
12671				return (ENOENT);
12672			}
12673			this_did_shr_nh = did_shr_nh->nmn_nextp;
12674			shr_n = (struct did_shr_name *)lookup_shared_entry(
12675			    did_shr_nh, n->min_devid_key, (char *)0,
12676			    &recids[0], NM_DEVID);
12677			if (shr_n == NULL) {
12678				return (ENOENT);
12679			}
12680			rw_enter(&nm_lock.lock, RW_WRITER);
12681			devid = (ddi_devid_t)shr_n->did_devid;
12682			/* find this devid in the incore replica  */
12683			if (ddi_devid_compare(devid, old_devid) == 0) {
12684				/*
12685				 * found the corresponding entry
12686				 * update with new devid
12687				 */
12688				/* first remove old devid info */
12689				ent_did_key = shr_n ->did_key;
12690				ent_did_count = shr_n->did_count;
12691				ent_did_data = shr_n->did_data;
12692				ent_size = DID_SHR_NAMSIZ(shr_n);
12693				size = ((struct nm_rec_hdr *)
12694				    this_did_shr_nh->nmn_record)->
12695				    r_used_size - offset - ent_size;
12696				if (size == 0) {
12697					(void) bzero(shr_n, ent_size);
12698				} else {
12699					(void) ovbcopy((caddr_t)shr_n +
12700					    ent_size, shr_n, size);
12701					(void) bzero((caddr_t)shr_n +
12702					    size, ent_size);
12703				}
12704				((struct nm_rec_hdr *)this_did_shr_nh->
12705				    nmn_record)->r_used_size -=
12706				    ent_size;
12707				/* add in new devid info */
12708				if ((shn = (struct did_shr_name *)
12709				    alloc_entry(did_shr_nh,
12710				    md_set[setno].s_did_nmid,
12711				    cp->c_locator.l_devid_sz,
12712				    NM_DEVID | NM_SHARED | NM_NOCOMMIT,
12713				    &recids[0])) == NULL) {
12714						rw_exit(&nm_lock.lock);
12715						return (ENOMEM);
12716					}
12717					shn->did_key = ent_did_key;
12718					shn->did_count = ent_did_count;
12719					ent_did_data |= NM_DEVID_VALID;
12720					shn->did_data = ent_did_data;
12721					shn->did_size = ddi_devid_sizeof(
12722					    new_devid);
12723					bcopy((void *)new_devid, (void *)
12724					    shn->did_devid, shn->did_size);
12725					recids[1] = md_set[setno].s_nmid;
12726					recids[2] = 0;
12727					mddb_commitrecs_wrapper(recids);
12728			}
12729			rw_exit(&nm_lock.lock);
12730		}
12731	}
12732
12733	return (0);
12734}
12735
12736/*
12737 * namespace is loaded before this is called.
12738 * This function is a wrapper for md_update_namespace_rr_did.
12739 *
12740 * md_update_namespace_rr_did may be called twice if attempting to
12741 * resolve a replicated device id during the take of a diskset - once
12742 * for the diskset namespace and a second time for the local namespace.
12743 * The local namespace would need to be updated when a drive has been
12744 * found during a take of the diskset that hadn't been resolved during
12745 * the import (aka partial replicated import).
12746 *
12747 * If being called during the import of the diskset (IMPORT flag set)
12748 * md_update_namespace_rr_did will only be called once with the disket
12749 * namespace.
12750 */
12751int
12752md_update_nm_rr_did_ioctl(
12753	mddb_config_t	*cp
12754)
12755{
12756	int	rval = 0;
12757
12758	/* If update of diskset namespace fails, stop and return failure */
12759	if ((rval = md_update_namespace_rr_did(cp)) != 0)
12760		return (rval);
12761
12762	if (cp->c_flags & MDDB_C_IMPORT)
12763		return (0);
12764
12765	/* If update of local namespace fails, return failure */
12766	cp->c_setno = MD_LOCAL_SET;
12767	rval = md_update_namespace_rr_did(cp);
12768	return (rval);
12769}
12770
12771/*ARGSUSED*/
12772int
12773md_imp_snarf_set(
12774	mddb_config_t	*cp
12775)
12776{
12777	set_t		setno;
12778	int		stale_flag;
12779	mddb_set_t	*s;
12780	int		i, err = 0;
12781	md_ops_t	*ops;
12782	md_error_t	*ep = &cp->c_mde;
12783
12784	setno = cp->c_setno;
12785	stale_flag = cp->c_flags;
12786
12787	mdclrerror(ep);
12788	if (setno >= md_nsets) {
12789		return (mdsyserror(ep, EINVAL));
12790	}
12791
12792	md_haltsnarf_enter(setno);
12793	if (md_get_setstatus(setno) & MD_SET_IMPORT) {
12794		goto out;
12795	}
12796
12797	/* Set the bit first otherwise load_old_replicas can fail */
12798	md_set_setstatus(setno, MD_SET_IMPORT);
12799
12800	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
12801		err = mddbstatus2error(ep, err, NODEV32, setno);
12802		goto out;
12803	}
12804
12805	/*
12806	 * Upon completion of load_old_replicas, the old setno is
12807	 * restored from the disk so we need to reset
12808	 */
12809	s->s_lbp->lb_setno = setno;
12810
12811	/*
12812	 * Fixup the NM records before loading namespace
12813	 */
12814	(void) md_imp_nm(s);
12815	mddb_setexit(s);
12816
12817	/*
12818	 * Load the devid name space if it exists
12819	 * and ask each module to fixup unit records
12820	 */
12821	if (!md_load_namespace(setno, NULL, NM_DEVID)) {
12822		err = mdsyserror(ep, ENOENT);
12823		goto cleanup;
12824	}
12825	if (!md_load_namespace(setno, NULL, 0L)) {
12826		(void) md_unload_namespace(setno, NM_DEVID);
12827		err = mdsyserror(ep, ENOENT);
12828		goto cleanup;
12829	}
12830
12831	do {
12832		i = 0;
12833		for (ops = md_opslist; ops != NULL; ops = ops->md_next)
12834			if (ops->md_imp_set != NULL)
12835				i += ops->md_imp_set(setno);
12836	} while (i);
12837
12838	/*
12839	 * Fixup
12840	 *	(1) locator block
12841	 *	(2) locator name block if necessary
12842	 *	(3) master block
12843	 *	(4) directory block
12844	 * calls appropriate writes to push changes out
12845	 */
12846	if ((err = md_imp_db(setno, stale_flag, ep)) != 0) {
12847		goto cleanup;
12848	}
12849
12850	/*
12851	 * Don't unload namespace if importing a replicated diskset.
12852	 * Namespace will be unloaded with an explicit RELEASE_SET ioctl.
12853	 */
12854	if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) {
12855		md_haltsnarf_exit(setno);
12856		return (err);
12857	}
12858
12859cleanup:
12860	/*
12861	 * Halt the set
12862	 */
12863	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
12864	(void) md_halt_set(setno, MD_HALT_ALL);
12865	rw_exit(&md_unit_array_rw.lock);
12866
12867	/*
12868	 * Unload the namespace for the imported set
12869	 */
12870	mutex_enter(&mddb_lock);
12871	mddb_unload_set(setno);
12872	mutex_exit(&mddb_lock);
12873
12874out:
12875	md_haltsnarf_exit(setno);
12876	md_clr_setstatus(setno, MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);
12877	return (err);
12878}
12879#endif	/* MDDB_FAKE */
12880