1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * Just in case we're not in a build environment, make sure that
29 * TEXT_DOMAIN gets set to something.
30 */
31#if !defined(TEXT_DOMAIN)
32#define	TEXT_DOMAIN "SYS_TEST"
33#endif
34
35/*
36 * Metadevice database interfaces.
37 */
38
39#define	MDDB
40
41#include <meta.h>
42#include <sys/lvm/md_mddb.h>
43#include <sys/lvm/md_crc.h>
44#include <sys/lvm/mdio.h>
45#include <string.h>
46#include <strings.h>
47#include <ctype.h>
48
/*
 * One entry of the daemon table below: a daemon's name paired with the
 * name of a signal ("HUP", "KILL", ...).
 */
struct svm_daemon {
	char	*svmd_name;
	char	*svmd_kill_val;
};

/*
 * Daemons that the SVM smf(5) services do not stop.  mdmonitord is
 * started via svc:/system/mdmonitor:default, but since no contract(4)
 * is constructed for it, smf(5) does not stop it.
 */
struct svm_daemon svmd_kill_list[] = {
	{ "mdmonitord",	"HUP" },
	{ "mddoors",	"KILL" }
};

/* Number of entries in svmd_kill_list. */
#define	DAEMON_COUNT	(sizeof (svmd_kill_list) / sizeof (svmd_kill_list[0]))
65
/*
 * Defined outside this file.  It is called here with block == TRUE to
 * make sure all signals are blocked before the nodes of a multi-node set
 * are locked.
 */
extern int procsigs(int block, sigset_t *oldsigs, md_error_t *ep);

/*
 * Are the locator blocks for the replicas using devids?
 *
 * While this is FALSE, a device name that splitname() reports as
 * METASPLIT_LONGDISKNAME is rejected with MDE_DISKNAMETOOLONG.
 */
static int	devid_in_use = FALSE;
72
73static char *
74getlongname(
75	struct mddb_config	*c,
76	md_error_t		*ep
77)
78{
79	char		*diskname = NULL;
80	char		*devid_str;
81	devid_nmlist_t	*disklist = NULL;
82
83	c->c_locator.l_devid_flags = MDDB_DEVID_GETSZ;
84	if (metaioctl(MD_DB_ENDDEV, c, &c->c_mde, NULL) != 0) {
85		(void) mdstealerror(ep, &c->c_mde);
86		return (NULL);
87	}
88
89	if (c->c_locator.l_devid_flags & MDDB_DEVID_SZ) {
90		c->c_locator.l_devid = (uintptr_t)
91		    Malloc(c->c_locator.l_devid_sz);
92		c->c_locator.l_devid_flags =
93		    MDDB_DEVID_SPACE | MDDB_DEVID_SZ;
94	} else {
95		(void) mderror(ep, MDE_NODEVID, "");
96		goto out;
97	}
98
99	if (metaioctl(MD_DB_ENDDEV, c, &c->c_mde, NULL) != 0) {
100		(void) mdstealerror(ep, &c->c_mde);
101		goto out;
102	}
103
104	if (c->c_locator.l_devid_flags & MDDB_DEVID_NOSPACE) {
105		(void) mderror(ep, MDE_NODEVID, "");
106		goto out;
107	}
108
109	if (metaioctl(MD_DB_GETDEV, c, &c->c_mde, NULL) != 0) {
110		(void) mdstealerror(ep, &c->c_mde);
111		goto out;
112	}
113
114	if (c->c_locator.l_devid != NULL) {
115		if (meta_deviceid_to_nmlist("/dev/dsk",
116		    (ddi_devid_t)(uintptr_t)c->c_locator.l_devid,
117		    c->c_locator.l_minor_name, &disklist) != 0) {
118			devid_str = devid_str_encode(
119			    (ddi_devid_t)(uintptr_t)c->c_locator.l_devid, NULL);
120			(void) mderror(ep, MDE_MISSING_DEVID_DISK, "");
121			mderrorextra(ep, devid_str);
122			if (devid_str != NULL)
123				devid_str_free(devid_str);
124			goto out;
125		}
126		diskname = Strdup(disklist[0].devname);
127	}
128
129out:
130	if (disklist != NULL)
131		devid_free_nmlist(disklist);
132
133	if (c->c_locator.l_devid != NULL)
134		Free((void *)(uintptr_t)c->c_locator.l_devid);
135
136	return (diskname);
137}
138
139/*
140 * meta_get_lb_inittime sends a request for the lb_inittime to the kernel
141 */
142md_timeval32_t
143meta_get_lb_inittime(
144	mdsetname_t	*sp,
145	md_error_t	*ep
146)
147{
148	mddb_config_t	c;
149
150	(void) memset(&c, 0, sizeof (c));
151
152	/* Fill in setno, setname, and sideno */
153	c.c_setno = sp->setno;
154
155	if (metaioctl(MD_DB_LBINITTIME, &c, &c.c_mde, NULL) != 0) {
156		(void) mdstealerror(ep, &c.c_mde);
157	}
158
159	return (c.c_timestamp);
160}
161
162/*
163 * mkmasterblks writes out the master blocks of the mddb to the replica.
164 *
165 * In a MN diskset, this is called by the node that is adding this replica
166 * to the diskset.
167 */
168
169#define	MDDB_VERIFY_SIZE	8192
170
171static int
172mkmasterblks(
173	mdsetname_t	*sp,
174	mdname_t	*np,
175	int		fd,
176	daddr_t		firstblk,
177	int		dbsize,
178	md_timeval32_t	inittime,
179	md_error_t	*ep
180)
181{
182	int		consecutive;
183	md_timeval32_t	tp;
184	struct mddb_mb	*mb;
185	char		*buffer;
186	int		iosize;
187	md_set_desc	*sd;
188	int		mn_set = 0;
189	daddr_t		startblk;
190	int		cnt;
191	ddi_devid_t	devid;
192
193	if (! metaislocalset(sp)) {
194		if ((sd = metaget_setdesc(sp, ep)) == NULL)
195			return (-1);
196
197		if (MD_MNSET_DESC(sd)) {
198			mn_set = 1;		/* Used later */
199		}
200	}
201
202	/*
203	 * Loop to verify the entire mddb region on disk is read/writable.
204	 * buffer is used to write/read in at most MDDB_VERIFY_SIZE block
205	 * chunks.
206	 *
207	 * A side-effect of this loop is to zero out the entire mddb region
208	 */
209	if ((buffer = Zalloc(MDDB_VERIFY_SIZE * DEV_BSIZE)) == NULL)
210		return (mdsyserror(ep, ENOMEM, np->rname));
211
212	startblk = firstblk;
213	for (cnt = dbsize; cnt > 0; cnt -= consecutive) {
214
215		if (cnt > MDDB_VERIFY_SIZE)
216			consecutive = MDDB_VERIFY_SIZE;
217		else
218			consecutive = cnt;
219
220		if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) {
221			Free(buffer);
222			return (mdsyserror(ep, errno, np->rname));
223		}
224
225		iosize = DEV_BSIZE * consecutive;
226		if (write(fd, buffer, iosize) != iosize) {
227			Free(buffer);
228			return (mdsyserror(ep, errno, np->rname));
229		}
230
231		if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) {
232			Free(buffer);
233			return (mdsyserror(ep, errno, np->rname));
234		}
235
236		if (read(fd, buffer, iosize) != iosize) {
237			Free(buffer);
238			return (mdsyserror(ep, errno, np->rname));
239		}
240
241		startblk += consecutive;
242	}
243
244	Free(buffer);
245	if ((mb = Zalloc(DEV_BSIZE)) == NULL)
246		return (mdsyserror(ep, ENOMEM, np->rname));
247
248	if (meta_gettimeofday(&tp) == -1) {
249		Free(mb);
250		return (mdsyserror(ep, errno, np->rname));
251	}
252
253	mb->mb_magic = MDDB_MAGIC_MB;
254	/*
255	 * If a MN diskset, set master block revision for a MN set.
256	 * Even though the master block structure is no different
257	 * for a MN set, setting the revision field to a different
258	 * number keeps any pre-MN_diskset code from accessing
259	 * this diskset.  It also allows for an early determination
260	 * of a MN diskset when reading in from disk so that the
261	 * proper size locator block and locator names structure
262	 * can be read in thus saving time on diskset startup.
263	 */
264	if (mn_set)
265		mb->mb_revision = MDDB_REV_MNMB;
266	else
267		mb->mb_revision = MDDB_REV_MB;
268	mb->mb_timestamp = tp;
269	mb->mb_setno = sp->setno;
270	mb->mb_blkcnt = dbsize - 1;
271	mb->mb_blkno = firstblk;
272	mb->mb_nextblk = 0;
273
274	mb->mb_blkmap.m_firstblk = firstblk + 1;
275	mb->mb_blkmap.m_consecutive = dbsize - 1;
276	if (! metaislocalset(sp)) {
277		mb->mb_setcreatetime = inittime;
278	}
279
280	/*
281	 * We try to save the disks device ID into the remaining bytes in
282	 * the master block. The saved devid is used to provide a mapping
283	 * between this disk's devid and the devid stored into the master
284	 * block. This allows the disk image to be self-identifying
285	 * if it gets copied (e.g. SNDR, True Copy, etc.).  This is used
286	 * when we try to import these disks on the remote copied image.
287	 * If we cannot save the disks device ID onto the master block that is
288	 * ok.  The disk is just not self-identifying and won't be importable
289	 * in the remote copy scenario.
290	 */
291	if (devid_get(fd, &devid) == 0) {
292		size_t len;
293
294		len = devid_sizeof(devid);
295		if (len <= DEV_BSIZE - sizeof (*mb)) {
296			/* there is enough space to store the devid */
297			mb->mb_devid_magic = MDDB_MAGIC_DE;
298			mb->mb_devid_len = len;
299			(void) memcpy(mb->mb_devid, devid, len);
300		}
301		devid_free(devid);
302	}
303
304	crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE,
305	    (crc_skip_t *)NULL);
306
307	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) {
308		Free(mb);
309		return (mdsyserror(ep, errno, np->rname));
310	}
311
312	if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) {
313		Free(mb);
314		return (mdsyserror(ep, errno, np->rname));
315	}
316
317	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) {
318		Free(mb);
319		return (mdsyserror(ep, errno, np->rname));
320	}
321
322	if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) {
323		Free(mb);
324		return (mdsyserror(ep, errno, np->rname));
325	}
326
327	if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum,
328	    (uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) {
329		Free(mb);
330		return (mdmddberror(ep, MDE_NOTVERIFIED,
331		    meta_getminor(np->dev), sp->setno, 0, np->rname));
332	}
333
334	Free(mb);
335	return (0);
336}
337
338void
339meta_mkdummymaster(
340	mdsetname_t	*sp,
341	int		fd,
342	daddr_t		firstblk
343)
344{
345	md_timeval32_t	tp;
346	struct mddb_mb	*mb;
347	ddi_devid_t	devid;
348	md_set_desc	*sd;
349	md_error_t	ep = mdnullerror;
350	md_timeval32_t	inittime;
351
352	/*
353	 * No dummy master blocks are written for a MN diskset since devids
354	 * are not supported in MN disksets.
355	 */
356	if (! metaislocalset(sp)) {
357		if ((sd = metaget_setdesc(sp, &ep)) == NULL)
358			return;
359
360		if (MD_MNSET_DESC(sd))
361			return;
362	}
363
364	if ((mb = Zalloc(DEV_BSIZE)) == NULL)
365		return;
366
367	mb->mb_magic = MDDB_MAGIC_DU;
368	mb->mb_revision = MDDB_REV_MB;
369	mb->mb_setno = sp->setno;
370	inittime = meta_get_lb_inittime(sp, &ep);
371	mb->mb_setcreatetime = inittime;
372
373	if (meta_gettimeofday(&tp) != -1)
374		mb->mb_timestamp = tp;
375
376	/*
377	 * We try to save the disks device ID into the remaining bytes in
378	 * the master block.  This allows the disk image to be self-identifying
379	 * if it gets copied (e.g. SNDR, True Copy, etc.).  This is used
380	 * when we try to import these disks on the remote copied image.
381	 * If we cannot save the disks device ID onto the master block that is
382	 * ok.  The disk is just not self-identifying and won't be importable
383	 * in the remote copy scenario.
384	 */
385	if (devid_get(fd, &devid) == 0) {
386		int len;
387
388		len = devid_sizeof(devid);
389		if (len <= DEV_BSIZE - sizeof (*mb)) {
390			/* there is enough space to store the devid */
391			mb->mb_devid_magic = MDDB_MAGIC_DE;
392			mb->mb_devid_len = len;
393			(void) memcpy(mb->mb_devid, (char *)devid, len);
394		}
395		devid_free(devid);
396	}
397
398	crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE,
399	    (crc_skip_t *)NULL);
400
401	/*
402	 * If any of these operations fail, we need to inform the
403	 * user that the disk won't be self identifying. When support
404	 * for importing remotely replicated disksets is added, we
405	 * want to add the error messages here.
406	 */
407	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0)
408		goto out;
409
410	if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE)
411		goto out;
412
413	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0)
414		goto out;
415
416	if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE)
417		goto out;
418
419	if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum,
420	    (uint_t)DEV_BSIZE, (crc_skip_t *)NULL))
421		goto out;
422
423out:
424	Free(mb);
425}
426
427static int
428buildconf(mdsetname_t *sp, md_error_t *ep)
429{
430	md_replicalist_t	*rlp = NULL;
431	md_replicalist_t	*rl;
432	FILE			*cfp = NULL;
433	FILE			*mfp = NULL;
434	struct stat		sbuf;
435	int			rval = 0;
436	int			in_miniroot = 0;
437	char			line[MDDB_BOOTLIST_MAX_LEN];
438	char			*tname = NULL;
439
440	/* get list of local replicas */
441	if (! metaislocalset(sp))
442		return (0);
443
444	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0)
445		return (-1);
446
447	/* open tempfile, copy permissions of original file */
448	if ((cfp = fopen(META_DBCONFTMP, "w+")) == NULL) {
449		/*
450		 * On the miniroot tmp files must be created in /var/tmp.
451		 * If we get a EROFS error, we assume that we are in the
452		 * miniroot.
453		 */
454		if (errno != EROFS)
455			goto error;
456		in_miniroot = 1;
457		errno = 0;
458		tname = tempnam("/var/tmp", "slvm_");
459		if (tname == NULL && errno == EROFS) {
460			/*
461			 * If we are booted on a read-only root because
462			 * of mddb quorum problems we don't want to emit
463			 * any scary error messages.
464			 */
465			errno = 0;
466			goto out;
467		}
468
469		/* open tempfile, copy permissions of original file */
470		if ((cfp = fopen(tname, "w+")) == NULL)
471			goto error;
472	}
473	if (stat(META_DBCONF, &sbuf) == 0) {
474		if (fchmod(fileno(cfp), (sbuf.st_mode & 0666)) != 0)
475			goto error;
476		if (fchown(fileno(cfp), sbuf.st_uid, sbuf.st_gid) != 0)
477			goto error;
478	}
479
480	/* print header */
481	if (fprintf(cfp, "#metadevice database location file ") == EOF)
482		goto error;
483	if (fprintf(cfp, "do not hand edit\n") < 0)
484		goto error;
485	if (fprintf(cfp,
486	    "#driver\tminor_t\tdaddr_t\tdevice id\tchecksum\n") < 0)
487		goto error;
488
489	/* dump replicas */
490	for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
491		md_replica_t	*r = rl->rl_repp;
492		int		checksum = 42;
493		int		i;
494		char		*devidp;
495		minor_t		min;
496
497		devidp = devid_str_encode(r->r_devid, r->r_minor_name);
498		/* If devid code can't encode devidp - skip entry */
499		if (devidp == NULL) {
500			continue;
501		}
502
503		/* compute checksum */
504		for (i = 0; ((r->r_driver_name[i] != '\0') &&
505		    (i < sizeof (r->r_driver_name))); i++) {
506			checksum -= r->r_driver_name[i];
507		}
508		min = meta_getminor(r->r_namep->dev);
509		checksum -= min;
510		checksum -= r->r_blkno;
511
512		for (i = 0; i < strlen(devidp); i++) {
513			checksum -= devidp[i];
514		}
515		/* print info */
516		if (fprintf(cfp, "%s\t%lu\t%ld\t%s\t%d\n",
517		    r->r_driver_name, min, r->r_blkno, devidp, checksum) < 0) {
518			goto error;
519		}
520
521		devid_str_free(devidp);
522	}
523
524	/* close and rename to real file */
525	if (fflush(cfp) != 0)
526		goto error;
527	if (fsync(fileno(cfp)) != 0)
528		goto error;
529	if (fclose(cfp) != 0) {
530		cfp = NULL;
531		goto error;
532	}
533	cfp = NULL;
534
535	/*
536	 * Renames don't work in the miniroot since tmpfiles are
537	 * created in /var/tmp. Hence we copy the data out.
538	 */
539
540	if (! in_miniroot) {
541		if (rename(META_DBCONFTMP, META_DBCONF) != 0)
542			goto error;
543	} else {
544		if ((cfp = fopen(tname, "r")) == NULL)
545			goto error;
546		if ((mfp = fopen(META_DBCONF, "w+")) == NULL)
547			goto error;
548		while (fgets(line, MDDB_BOOTLIST_MAX_LEN, cfp) != NULL) {
549			if (fputs(line, mfp) == NULL)
550				goto error;
551		}
552		(void) fclose(cfp);
553		cfp = NULL;
554		if (fflush(mfp) != 0)
555			goto error;
556		if (fsync(fileno(mfp)) != 0)
557			goto error;
558		if (fclose(mfp) != 0) {
559			mfp = NULL;
560			goto error;
561		}
562		/* delete the tempfile */
563		(void) unlink(tname);
564	}
565	/* success */
566	rval = 0;
567	goto out;
568
569	/* tempfile error */
570error:
571	rval = (in_miniroot) ? mdsyserror(ep, errno, tname):
572	    mdsyserror(ep, errno, META_DBCONFTMP);
573
574
575	/* cleanup, return success */
576out:
577	if (rlp != NULL)
578		metafreereplicalist(rlp);
579	if ((cfp != NULL) && (fclose(cfp) != 0) && (rval == 0)) {
580		rval = (in_miniroot) ? mdsyserror(ep, errno, tname):
581		    mdsyserror(ep, errno, META_DBCONFTMP);
582	}
583	free(tname);
584	return (rval);
585}
586
587/*
588 * check replica for dev
589 */
590static int
591in_replica(
592	mdsetname_t	*sp,
593	md_replica_t	*rp,
594	mdname_t	*np,
595	diskaddr_t	slblk,
596	diskaddr_t	nblks,
597	md_error_t	*ep
598)
599{
600	mdname_t	*repnp = rp->r_namep;
601	diskaddr_t	rep_sblk = rp->r_blkno;
602	diskaddr_t	rep_nblks = rp->r_nblk;
603
604	/* should be in the same set */
605	assert(sp != NULL);
606
607	/* if error in master block, assume whole partition */
608	if ((rep_sblk == MD_DISKADDR_ERROR) ||
609	    (rep_nblks == MD_DISKADDR_ERROR)) {
610		rep_sblk = 0;
611		rep_nblks = MD_DISKADDR_ERROR;
612	}
613
614	/* check overlap */
615	if (meta_check_overlap(
616	    MDB_STR, np, slblk, nblks, repnp, rep_sblk, rep_nblks, ep) != 0) {
617		return (-1);
618	}
619
620	/* return success */
621	return (0);
622}
623
624/*
625 * check to see if we're in a replica
626 */
627int
628meta_check_inreplica(
629	mdsetname_t		*sp,
630	mdname_t		*np,
631	diskaddr_t		slblk,
632	diskaddr_t		nblks,
633	md_error_t		*ep
634)
635{
636	md_replicalist_t	*rlp = NULL;
637	md_replicalist_t	*rl;
638	int			rval = 0;
639
640	/* should have a set */
641	assert(sp != NULL);
642
643	/* for each replica */
644	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0)
645		return (-1);
646	for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
647		md_replica_t	*rp = rl->rl_repp;
648
649		/* check replica */
650		if (in_replica(sp, rp, np, slblk, nblks, ep) != 0) {
651			rval = -1;
652			break;
653		}
654	}
655
656	/* cleanup, return success */
657	metafreereplicalist(rlp);
658	return (rval);
659}
660
661/*
662 * check replica
663 */
664int
665meta_check_replica(
666	mdsetname_t	*sp,		/* set to check against */
667	mdname_t	*np,		/* component to check against */
668	mdchkopts_t	options,	/* option flags */
669	diskaddr_t	slblk,		/* start logical block */
670	diskaddr_t	nblks,		/* number of blocks (-1,rest of them) */
671	md_error_t	*ep		/* error packet */
672)
673{
674	mdchkopts_t	chkoptions = MDCHK_ALLOW_REPSLICE;
675
676	/* make sure we have a disk */
677	if (metachkcomp(np, ep) != 0)
678		return (-1);
679
680	/* check to ensure that it is not already in use */
681	if (meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0) {
682		return (-1);
683	}
684
685	if (options & MDCHK_ALLOW_NODBS)
686		return (0);
687
688	if (options & MDCHK_DRVINSET)
689		return (0);
690
691	/* make sure it is in the set */
692	if (meta_check_inset(sp, np, ep) != 0)
693		return (-1);
694
695	/* make sure its not in a metadevice */
696	if (meta_check_inmeta(sp, np, chkoptions, slblk, nblks, ep) != 0)
697		return (-1);
698
699	/* return success */
700	return (0);
701}
702
703static int
704update_dbinfo_on_drives(
705	mdsetname_t	*sp,
706	md_drive_desc	*dd,
707	int		set_locked,
708	int		force,
709	md_error_t	*ep
710)
711{
712	md_set_desc		*sd;
713	int			i;
714	md_setkey_t		*cl_sk;
715	int			rval = 0;
716	md_mnnode_desc		*nd;
717
718	if ((sd = metaget_setdesc(sp, ep)) == NULL)
719		return (-1);
720
721	if (! set_locked) {
722		if (MD_MNSET_DESC(sd)) {
723			md_error_t xep = mdnullerror;
724			sigset_t sigs;
725			/* Make sure we are blocking all signals */
726			if (procsigs(TRUE, &sigs, &xep) < 0)
727				mdclrerror(&xep);
728
729			nd = sd->sd_nodelist;
730			while (nd) {
731				if (force && strcmp(nd->nd_nodename,
732				    mynode()) != 0) {
733					nd = nd->nd_next;
734					continue;
735				}
736
737				if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
738					nd = nd->nd_next;
739					continue;
740				}
741
742				if (clnt_lock_set(nd->nd_nodename, sp, ep))
743					return (-1);
744				nd = nd->nd_next;
745			}
746		} else {
747			for (i = 0; i < MD_MAXSIDES; i++) {
748				/* Skip empty slots */
749				if (sd->sd_nodes[i][0] == '\0')
750					continue;
751
752				if (force && strcmp(sd->sd_nodes[i],
753				    mynode()) != 0)
754					continue;
755
756				if (clnt_lock_set(sd->sd_nodes[i], sp, ep))
757					return (-1);
758			}
759		}
760	}
761
762	if (MD_MNSET_DESC(sd)) {
763		nd = sd->sd_nodelist;
764		while (nd) {
765			if (force && strcmp(nd->nd_nodename, mynode()) != 0) {
766				nd = nd->nd_next;
767				continue;
768			}
769
770			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
771				nd = nd->nd_next;
772				continue;
773			}
774
775			if (clnt_upd_dr_dbinfo(nd->nd_nodename, sp, dd, ep)
776			    == -1) {
777				rval = -1;
778				break;
779			}
780			nd = nd->nd_next;
781		}
782	} else {
783		for (i = 0; i < MD_MAXSIDES; i++) {
784			/* Skip empty slots */
785			if (sd->sd_nodes[i][0] == '\0')
786				continue;
787
788			if (force && strcmp(sd->sd_nodes[i], mynode()) != 0)
789				continue;
790
791			if (clnt_upd_dr_dbinfo(sd->sd_nodes[i], sp, dd, ep)
792			    == -1) {
793				rval = -1;
794				break;
795			}
796		}
797	}
798
799	if (! set_locked) {
800		cl_sk = cl_get_setkey(sp->setno, sp->setname);
801		if (MD_MNSET_DESC(sd)) {
802			nd = sd->sd_nodelist;
803			while (nd) {
804				if (force &&
805				    strcmp(nd->nd_nodename, mynode()) != 0) {
806					nd = nd->nd_next;
807					continue;
808				}
809
810				if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
811					nd = nd->nd_next;
812					continue;
813				}
814
815				if (clnt_unlock_set(nd->nd_nodename, cl_sk,
816				    ep)) {
817					rval = -1;
818					break;
819				}
820				nd = nd->nd_next;
821			}
822		} else {
823			for (i = 0; i < MD_MAXSIDES; i++) {
824				/* Skip empty slots */
825				if (sd->sd_nodes[i][0] == '\0')
826					continue;
827
828				if (force &&
829				    strcmp(sd->sd_nodes[i], mynode()) != 0)
830					continue;
831
832				if (clnt_unlock_set(sd->sd_nodes[i], cl_sk,
833				    ep)) {
834					rval = -1;
835					break;
836				}
837			}
838
839		}
840		cl_set_setkey(NULL);
841	}
842
843	return (rval);
844}
845
846int
847meta_db_addsidenms(
848	mdsetname_t	*sp,
849	mdname_t	*np,
850	daddr_t		blkno,
851	int		bcast,
852	md_error_t	*ep
853)
854{
855	side_t		sideno;
856	char		*bname = NULL;
857	char		*dname = NULL;
858	minor_t		mnum;
859	mddb_config_t	c;
860	int		done;
861	int		rval = 0;
862	md_set_desc	*sd;
863
864	sideno = MD_SIDEWILD;
865	/*CONSTCOND*/
866	while (1) {
867		if (bname != NULL) {
868			Free(bname);
869			bname = NULL;
870		}
871		if (dname != NULL) {
872			Free(dname);
873			dname = NULL;
874		}
875		if ((done = meta_getnextside_devinfo(sp, np->bname,
876		    &sideno, &bname, &dname, &mnum, ep)) == -1) {
877			rval = -1;
878			break;
879		}
880
881		if (done == 0)
882			break;
883
884		if (! metaislocalset(sp)) {
885			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
886				rval = -1;
887				break;
888			}
889		}
890
891		/*
892		 * Send addsidenms to all nodes using rpc.mdcommd if
893		 * sidename is being added to MN diskset.
894		 *
895		 *   It's ok to broadcast this call to other nodes.
896		 *
897		 *   Note: The broadcast to other nodes isn't needed during
898		 *   the addition of the first mddbs to the set since the
899		 *   other nodes haven't been joined to the set yet.  All
900		 *   nodes in a MN diskset are (implicitly) joined to the set
901		 *   on the addition of the first mddb.
902		 */
903		if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
904		    (bcast == DB_ADDSIDENMS_BCAST)) {
905			md_mn_result_t			*resultp = NULL;
906			md_mn_msg_meta_db_newside_t	db_ns;
907			int				send_rval;
908
909			db_ns.msg_l_dev = np->dev;
910			db_ns.msg_sideno = sideno;
911			db_ns.msg_blkno = blkno;
912			(void) strncpy(db_ns.msg_dname, dname,
913			    sizeof (db_ns.msg_dname));
914			(void) splitname(np->bname, &db_ns.msg_splitname);
915			db_ns.msg_mnum = mnum;
916
917			/* Set devid to NULL until devids are supported */
918			db_ns.msg_devid[0] = NULL;
919
920			/*
921			 * If reconfig cycle has been started, this node is
922			 * stuck in in the return step until this command has
923			 * completed.  If mdcommd is suspended, ask
924			 * send_message to fail (instead of retrying)
925			 * so that metaset can finish allowing the reconfig
926			 * cycle to proceed.
927			 */
928			send_rval = mdmn_send_message(sp->setno,
929			    MD_MN_MSG_META_DB_NEWSIDE, MD_MSGF_FAIL_ON_SUSPEND |
930			    MD_MSGF_PANIC_WHEN_INCONSISTENT, 0, (char *)&db_ns,
931			    sizeof (md_mn_msg_meta_db_newside_t),
932			    &resultp, ep);
933			if (send_rval != 0) {
934				rval = -1;
935				if (resultp == NULL)
936					(void) mddserror(ep,
937					    MDE_DS_COMMD_SEND_FAIL,
938					    sp->setno, NULL, NULL,
939					    sp->setname);
940				else {
941					(void) mdstealerror(ep,
942					    &(resultp->mmr_ep));
943					if (mdisok(ep)) {
944						(void) mddserror(ep,
945						    MDE_DS_COMMD_SEND_FAIL,
946						    sp->setno, NULL, NULL,
947						    sp->setname);
948					}
949					free_result(resultp);
950				}
951				break;
952			}
953			if (resultp)
954				free_result(resultp);
955		} else {
956			/*
957			 * Let this side's  device name, minor # and driver name
958			 * be known to the database replica.
959			 */
960			(void) memset(&c, 0, sizeof (c));
961
962			/* Fill in device/replica info */
963			c.c_locator.l_dev = meta_cmpldev(np->dev);
964			c.c_locator.l_blkno = blkno;
965			(void) strncpy(c.c_locator.l_driver, dname,
966			    sizeof (c.c_locator.l_driver));
967			if (splitname(np->bname, &c.c_devname) ==
968			    METASPLIT_LONGDISKNAME && devid_in_use == FALSE) {
969				rval = mddeverror(ep, MDE_DISKNAMETOOLONG,
970				    NODEV64, np->rname);
971				break;
972			}
973
974			c.c_locator.l_mnum = mnum;
975
976			/* Fill in setno, setname, and sideno */
977			c.c_setno = sp->setno;
978			(void) strncpy(c.c_setname, sp->setname,
979			    sizeof (c.c_setname));
980			c.c_sideno = sideno;
981
982			/*
983			 * Don't need device id information from this ioctl
984			 * Kernel determines device id from dev_t, which
985			 * is just what this code would do.
986			 */
987			c.c_locator.l_devid = (uint64_t)0;
988			c.c_locator.l_devid_flags = 0;
989
990			if (metaioctl(MD_DB_NEWSIDE, &c, &c.c_mde, NULL) != 0) {
991				rval = mdstealerror(ep, &c.c_mde);
992				break;
993			}
994		}
995	}
996
997	/* cleanup, return success */
998	if (bname != NULL) {
999		Free(bname);
1000		bname = NULL;
1001	}
1002	if (dname != NULL) {
1003		Free(dname);
1004		dname = NULL;
1005	}
1006	return (rval);
1007}
1008
1009
1010int
1011meta_db_delsidenm(
1012	mdsetname_t	*sp,
1013	side_t		sideno,
1014	mdname_t	*np,
1015	daddr_t		blkno,
1016	md_error_t	*ep
1017)
1018{
1019	mddb_config_t	c;
1020	md_set_desc	*sd;
1021
1022	if (! metaislocalset(sp)) {
1023		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1024			return (-1);
1025	}
1026	/* Use rpc.mdcommd to delete mddb side from all nodes */
1027	if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1028	    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1029		md_mn_result_t			*resultp = NULL;
1030		md_mn_msg_meta_db_delside_t	db_ds;
1031		int				send_rval;
1032
1033		db_ds.msg_l_dev = np->dev;
1034		db_ds.msg_blkno = blkno;
1035		db_ds.msg_sideno = sideno;
1036
1037		/* Set devid to NULL until devids are supported */
1038		db_ds.msg_devid[0] = NULL;
1039
1040		/*
1041		 * If reconfig cycle has been started, this node is
1042		 * stuck in in the return step until this command has
1043		 * completed.  If mdcommd is suspended, ask
1044		 * send_message to fail (instead of retrying)
1045		 * so that metaset can finish allowing the reconfig
1046		 * cycle to proceed.
1047		 */
1048		send_rval = mdmn_send_message(sp->setno,
1049		    MD_MN_MSG_META_DB_DELSIDE, MD_MSGF_FAIL_ON_SUSPEND |
1050		    MD_MSGF_PANIC_WHEN_INCONSISTENT, 0, (char *)&db_ds,
1051		    sizeof (md_mn_msg_meta_db_delside_t), &resultp, ep);
1052		if (send_rval != 0) {
1053			if (resultp == NULL)
1054				(void) mddserror(ep,
1055				    MDE_DS_COMMD_SEND_FAIL,
1056				    sp->setno, NULL, NULL,
1057				    sp->setname);
1058			else {
1059				(void) mdstealerror(ep, &(resultp->mmr_ep));
1060				if (mdisok(ep)) {
1061					(void) mddserror(ep,
1062					    MDE_DS_COMMD_SEND_FAIL,
1063					    sp->setno, NULL, NULL,
1064					    sp->setname);
1065				}
1066				free_result(resultp);
1067			}
1068			return (-1);
1069		}
1070		if (resultp)
1071			free_result(resultp);
1072
1073	} else {
1074		/*
1075		 * Let this side's  device name, minor # and driver name
1076		 * be known to the database replica.
1077		 */
1078		(void) memset(&c, 0, sizeof (c));
1079
1080		/* Fill in device/replica info */
1081		c.c_locator.l_dev = meta_cmpldev(np->dev);
1082		c.c_locator.l_blkno = blkno;
1083
1084		/* Fill in setno, setname, and sideno */
1085		c.c_setno = sp->setno;
1086		(void) strcpy(c.c_setname, sp->setname);
1087		c.c_sideno = sideno;
1088
1089		/*
1090		 * Don't need device id information from this ioctl
1091		 * Kernel determines device id from dev_t, which
1092		 * is just what this code would do.
1093		 */
1094		c.c_locator.l_devid = (uint64_t)0;
1095		c.c_locator.l_devid_flags = 0;
1096
1097		if (metaioctl(MD_DB_DELSIDE, &c, &c.c_mde, NULL) != 0)
1098			return (mdstealerror(ep, &c.c_mde));
1099	}
1100	return (0);
1101}
1102
1103
1104static int
1105mdnamesareunique(mdnamelist_t *nlp, md_error_t *ep)
1106{
1107	mdnamelist_t		*dnp1, *dnp2;
1108
1109	for (dnp1 = nlp; dnp1 != NULL; dnp1 = dnp1->next) {
1110		for (dnp2 = dnp1->next; dnp2 != NULL; dnp2 = dnp2->next) {
1111			if (strcmp(dnp1->namep->cname, dnp2->namep->cname) == 0)
1112				return (mderror(ep, MDE_DUPDRIVE,
1113				    dnp1->namep->cname));
1114		}
1115	}
1116	return (0);
1117}
1118
1119
/*
 * Read exactly len bytes of the file named path into dst.
 * Returns 0 on success, -1 if the file cannot be opened or yields fewer
 * than len bytes.
 */
static int
filediff_read(const char *path, char *dst, size_t len)
{
	size_t	done = 0;
	int	fd;

	if ((fd = open(path, O_RDONLY)) == -1)
		return (-1);

	/* read() may return less than requested; keep going until done */
	while (done < len) {
		ssize_t	n = read(fd, dst + done, len - done);

		if (n <= 0)
			break;		/* error or premature end of file */
		done += (size_t)n;
	}
	(void) close(fd);

	return ((done == len) ? 0 : -1);
}

/*
 * Return 1 if files are different, else return 0
 *
 * Files that cannot be examined (stat, open, read or memory allocation
 * failure) are reported as different.  Two files of length zero are
 * reported as identical without being opened.
 */
static int
filediff(char *tsname, char *sname)
{
	struct stat	tsbuf, sbuf;
	size_t		len;
	char		*tbuf, *buf;
	int		ret = 1;

	if (stat(tsname, &tsbuf) != 0 || stat(sname, &sbuf) != 0)
		return (1);
	if (tsbuf.st_size != sbuf.st_size)
		return (1);

	/* nothing to compare; also avoids depending on malloc(0) */
	len = (size_t)tsbuf.st_size;
	if (len == 0)
		return (0);

	/* allocate memory and read both files into buffer */
	tbuf = malloc(len);
	buf = malloc(len);
	if (tbuf != NULL && buf != NULL &&
	    filediff_read(tsname, tbuf, len) == 0 &&
	    filediff_read(sname, buf, len) == 0) {
		/* compare content */
		ret = (memcmp(tbuf, buf, len) != 0);
	}

	free(tbuf);
	free(buf);
	return (ret);
}
1171
1172/*
1173 * patch md.conf file with mddb locations
1174 */
1175int
1176meta_db_patch(
1177	char		*sname,		/* system file name */
1178	char		*cname,		/* mddb.cf file name */
1179	int		patch,		/* patching locally */
1180	md_error_t	*ep
1181)
1182{
1183	char		*tsname = NULL;
1184	char		line[MDDB_BOOTLIST_MAX_LEN];
1185	FILE		*tsfp = NULL;
1186	FILE		*mfp = NULL;
1187	int		rval = -1;
1188
1189	/* check names */
1190	if (sname == NULL) {
1191		if (patch)
1192			sname = "md.conf";
1193		else
1194			sname = "/kernel/drv/md.conf";
1195	}
1196	if (cname == NULL)
1197		cname = META_DBCONF;
1198
1199	/*
1200	 * edit file
1201	 */
1202	if (meta_systemfile_copy(sname, 0, 1, 1, 0, &tsname, &tsfp, ep) != 0) {
1203		if (mdissyserror(ep, EROFS)) {
1204			/*
1205			 * If we are booted on a read-only root because
1206			 * of mddb quorum problems we don't want to emit
1207			 * any scary error messages.
1208			 */
1209			mdclrerror(ep);
1210			rval = 0;
1211		}
1212		goto out;
1213	}
1214
1215	if (meta_systemfile_append_mddb(cname, sname, tsname, tsfp, 1, 0, 0,
1216	    ep) != 0)
1217		goto out;
1218
1219	/* if file content is identical, skip rename */
1220	if (filediff(tsname, sname) == 0) {
1221		rval = 0;
1222		goto out;
1223	}
1224
1225	if ((fflush(tsfp) != 0) || (fsync(fileno(tsfp)) != 0) ||
1226	    (fclose(tsfp) != 0)) {
1227		(void) mdsyserror(ep, errno, tsname);
1228		goto out;
1229	}
1230
1231	tsfp = NULL;
1232
1233	/*
1234	 * rename file. If we get a Cross Device error then it
1235	 * is because we are in the miniroot.
1236	 */
1237	if (rename(tsname, sname) != 0 && errno != EXDEV) {
1238		(void) mdsyserror(ep, errno, sname);
1239		goto out;
1240	}
1241
1242	if (errno == EXDEV) {
1243		if ((tsfp = fopen(tsname, "r")) == NULL)
1244			goto out;
1245		if ((mfp = fopen(sname, "w+")) == NULL)
1246			goto out;
1247		while (fgets(line, sizeof (line), tsfp) != NULL) {
1248			if (fputs(line, mfp) == NULL)
1249				goto out;
1250		}
1251		(void) fclose(tsfp);
1252		tsfp = NULL;
1253		if (fflush(mfp) != 0)
1254			goto out;
1255		if (fsync(fileno(mfp)) != 0)
1256			goto out;
1257		if (fclose(mfp) != 0) {
1258			mfp = NULL;
1259			goto out;
1260		}
1261	}
1262
1263	Free(tsname);
1264	tsname = NULL;
1265	rval = 0;
1266
1267	/* cleanup, return error */
1268out:
1269	if (tsfp != NULL)
1270		(void) fclose(tsfp);
1271	if (tsname != NULL) {
1272		(void) unlink(tsname);
1273		Free(tsname);
1274	}
1275	return (rval);
1276}
1277
1278/*
1279 * Add replicas to set.  This happens as a result of:
1280 *	- metadb [-s set_name] -a
1281 *	- metaset -s set_name -a disk
1282 *	- metaset -s set_name -d disk	 (causes a rebalance of mddbs)
1283 *	- metaset -s set_name -b
1284 *
1285 * For a local set, this routine is run on the local set host.
1286 *
1287 * For a traditional diskset, this routine is run on the node that
1288 * is running the metaset command.
1289 *
1290 * For a multinode diskset, this routine is run by the node that is
1291 * running the metaset command.  If this is the first mddb added to
1292 * the MN diskset, then no communication is made to other nodes via commd
1293 * since the other nodes will be in-sync with respect to the mddbs when
1294 * those other nodes join the set and snarf in the newly created mddb.
1295 * If this is not the first mddb added to the MN diskset, then this
1296 * attach command is sent to all of the nodes using commd.  This keeps
1297 * the nodes in-sync.
1298 */
1299int
1300meta_db_attach(
1301	mdsetname_t		*sp,
1302	mdnamelist_t		*db_nlp,
1303	mdchkopts_t		options,
1304	md_timeval32_t		*timeval,
1305	int			dbcnt,
1306	int			dbsize,
1307	char			*sysfilename,
1308	md_error_t		*ep
1309)
1310{
1311	struct mddb_config	c;
1312	mdnamelist_t		*nlp;
1313	mdname_t		*np;
1314	md_drive_desc		*dd = NULL;
1315	md_drive_desc		*p;
1316	int			i;
1317	int			fd;
1318	side_t			sideno;
1319	daddr_t			blkno;
1320	int			replicacount = 0;
1321	int			start_svmdaemons = 0;
1322	int			rval = 0;
1323	md_error_t		status = mdnullerror;
1324	md_set_desc		*sd;
1325	int			stale_bool = FALSE;
1326	int			flags;
1327	int			firstmddb = 1;
1328	md_timeval32_t		inittime = {0, 0};
1329
1330	/*
1331	 * Error if we don't get some work to do.
1332	 */
1333	if (db_nlp == NULL)
1334		return (mdsyserror(ep, EINVAL, NULL));
1335
1336	if (mdnamesareunique(db_nlp, ep) != 0)
1337		return (-1);
1338	(void) memset(&c, 0, sizeof (c));
1339	c.c_id = 0;
1340	c.c_setno = sp->setno;
1341
1342	/* Don't need device id information from this ioctl */
1343	c.c_locator.l_devid = (uint64_t)0;
1344	c.c_locator.l_devid_flags = 0;
1345	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1346		if (metaislocalset(sp)) {
1347			if (mdismddberror(&c.c_mde, MDE_DB_INVALID))
1348				mdclrerror(&c.c_mde);
1349			else if (! mdismddberror(&c.c_mde, MDE_DB_NODB) ||
1350			    (! (options & MDCHK_ALLOW_NODBS)))
1351				return (mdstealerror(ep, &c.c_mde));
1352		} else {
1353			if (! mdismddberror(&c.c_mde, MDE_DB_NOTOWNER))
1354				return (mdstealerror(ep, &c.c_mde));
1355		}
1356		mdclrerror(&c.c_mde);
1357	}
1358	/*
1359	 * Is current set STALE?
1360	 */
1361	if (c.c_flags & MDDB_C_STALE) {
1362		stale_bool = TRUE;
1363	}
1364
1365	assert(db_nlp != NULL);
1366
1367	/* if these are the first replicas then the SVM daemons need to run */
1368	if (c.c_dbcnt == 0)
1369		start_svmdaemons = 1;
1370
1371	/*
1372	 * check to see if we will go over the total possible number
1373	 * of data bases
1374	 */
1375	nlp = db_nlp;
1376	while (nlp) {
1377		replicacount += dbcnt;
1378		nlp = nlp->next;
1379	}
1380
1381	if ((replicacount + c.c_dbcnt) > c.c_dbmax)
1382		return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32,
1383		    sp->setno, c.c_dbcnt + replicacount, NULL));
1384
1385	/*
1386	 * go through and check to make sure all locations specified
1387	 * are legal also pick out driver name;
1388	 */
1389	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1390		diskaddr_t devsize;
1391
1392		np = nlp->namep;
1393
1394		if (! metaislocalset(sp)) {
1395			uint_t	partno;
1396			uint_t	rep_partno;
1397			mddrivename_t	*dnp = np->drivenamep;
1398
1399			/*
1400			 * make sure that non-local database replicas
1401			 * are always on the replica slice.
1402			 */
1403			if (meta_replicaslice(dnp,
1404			    &rep_partno, ep) != 0)
1405				return (-1);
1406			if (metagetvtoc(np, FALSE, &partno, ep) == NULL)
1407				return (-1);
1408			if (partno != rep_partno)
1409				return (mddeverror(ep, MDE_REPCOMP_ONLY,
1410				    np->dev, sp->setname));
1411		}
1412
1413		if (meta_check_replica(sp, np, options, 0, (dbcnt * dbsize),
1414		    ep)) {
1415			return (-1);
1416		}
1417
1418		if ((devsize = metagetsize(np, ep)) == -1)
1419			return (-1);
1420
1421		if (devsize < (diskaddr_t)((dbcnt * dbsize) + 16))
1422			return (mdmddberror(ep, MDE_REPLICA_TOOSMALL,
1423			    meta_getminor(np->dev), sp->setno, devsize,
1424			    np->cname));
1425	}
1426
1427	/*
1428	 * If first disk in set we don't have lb_inittime yet for use as
1429	 * mb_setcreatetime so don't go looking for it. WE'll come back
1430	 * later and update after the locator block has been created.
1431	 * If this isn't the first disk in the set, we have a locator
1432	 * block and thus we have lb_inittime. Set mb_setcreatetime to
1433	 * lb_inittime.
1434	 */
1435	if (! metaislocalset(sp)) {
1436		if (c.c_dbcnt != 0) {
1437			firstmddb = 0;
1438			inittime = meta_get_lb_inittime(sp, ep);
1439		}
1440	}
1441
1442	/*
1443	 * go through and write all master blocks
1444	 */
1445
1446	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1447		np = nlp->namep;
1448
1449		if ((fd = open(np->rname, O_RDWR)) < 0)
1450			return (mdsyserror(ep, errno, np->rname));
1451
1452		for (i = 0; i < dbcnt; i++) {
1453			if (mkmasterblks(sp, np, fd, (i * dbsize + 16), dbsize,
1454			    inittime, ep)) {
1455				(void) close(fd);
1456				return (-1);
1457			}
1458		}
1459		(void) close(fd);
1460	}
1461
1462	if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
1463		return (-1);
1464
1465	if (! metaislocalset(sp)) {
1466		dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep);
1467		if (! mdisok(ep))
1468			return (-1);
1469		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1470			return (-1);
1471
1472	}
1473
1474	/*
1475	 * go through and tell kernel to add them
1476	 */
1477	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1478		mdcinfo_t	*cinfo;
1479
1480		np = nlp->namep;
1481
1482		if ((cinfo = metagetcinfo(np, ep)) == NULL) {
1483			rval = -1;
1484			goto out;
1485		}
1486
1487		/*
1488		 * If mddb is being added to MN diskset and there already
1489		 * exists a valid mddb in the set (which equates to this
1490		 * node being an owner of the set) then use rpc.mdcommd
1491		 * mechanism to add mddb(s) so that all nodes stay in sync.
1492		 * If set is stale, don't log the message since rpc.mdcommd
1493		 * can't write the message to the mddb.
1494		 *
1495		 * Otherwise, just add mddb to this node.
1496		 */
1497		if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1498		    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1499			md_mn_result_t			*resultp = NULL;
1500			md_mn_msg_meta_db_attach_t	attach;
1501			int 				send_rval;
1502
1503			/*
1504			 * In a scenario where new replicas had been added on
1505			 * the master, and then all of the old replicas failed
1506			 * before the slaves had knowledge of the new replicas,
1507			 * the slaves are unable to re-parse in the mddb
1508			 * from the new replicas since the slaves have no
1509			 * knowledge of the new replicas.  The following
1510			 * algorithm solves this problem:
1511			 * 	- META_DB_ATTACH message generates submsgs
1512			 * 		- BLOCK parse (master)
1513			 * 		- MDDB_ATTACH new replicas
1514			 * 		- UNBLOCK parse (master) causing parse
1515			 *		information to be sent from master
1516			 *		to slaves at a higher class than the
1517			 *		unblock so the parse message will
1518			 *		reach slaves before unblock message.
1519			 */
1520			attach.msg_l_dev = np->dev;
1521			attach.msg_cnt = dbcnt;
1522			attach.msg_dbsize = dbsize;
1523			(void) strncpy(attach.msg_dname, cinfo->dname,
1524			    sizeof (attach.msg_dname));
1525			(void) splitname(np->bname, &attach.msg_splitname);
1526			attach.msg_options = options;
1527
1528			/* Set devid to NULL until devids are supported */
1529			attach.msg_devid[0] = NULL;
1530
1531			/*
1532			 * If reconfig cycle has been started, this node is
1533			 * stuck in in the return step until this command has
1534			 * completed.  If mdcommd is suspended, ask
1535			 * send_message to fail (instead of retrying)
1536			 * so that metaset can finish allowing the reconfig
1537			 * cycle to proceed.
1538			 */
1539			flags = MD_MSGF_FAIL_ON_SUSPEND;
1540			if (stale_bool == TRUE)
1541				flags |= MD_MSGF_NO_LOG;
1542			send_rval = mdmn_send_message(sp->setno,
1543			    MD_MN_MSG_META_DB_ATTACH,
1544			    flags, 0, (char *)&attach,
1545			    sizeof (md_mn_msg_meta_db_attach_t),
1546			    &resultp, ep);
1547			if (send_rval != 0) {
1548				rval = -1;
1549				if (resultp == NULL)
1550					(void) mddserror(ep,
1551					    MDE_DS_COMMD_SEND_FAIL,
1552					    sp->setno, NULL, NULL,
1553					    sp->setname);
1554				else {
1555					(void) mdstealerror(ep,
1556					    &(resultp->mmr_ep));
1557					if (mdisok(ep)) {
1558						(void) mddserror(ep,
1559						    MDE_DS_COMMD_SEND_FAIL,
1560						    sp->setno, NULL, NULL,
1561						    sp->setname);
1562					}
1563					free_result(resultp);
1564				}
1565				goto out;
1566			}
1567			if (resultp)
1568				free_result(resultp);
1569		} else {
1570			/* Adding mddb(s) to just this node */
1571			for (i = 0; i < dbcnt; i++) {
1572				(void) memset(&c, 0, sizeof (c));
1573				/* Fill in device/replica info */
1574				c.c_locator.l_dev = meta_cmpldev(np->dev);
1575				c.c_locator.l_blkno = i * dbsize + 16;
1576				blkno = c.c_locator.l_blkno;
1577				(void) strncpy(c.c_locator.l_driver,
1578				    cinfo->dname,
1579				    sizeof (c.c_locator.l_driver));
1580
1581				if (splitname(np->bname, &c.c_devname) ==
1582				    METASPLIT_LONGDISKNAME && devid_in_use ==
1583				    FALSE) {
1584					rval = mddeverror(ep,
1585					    MDE_DISKNAMETOOLONG,
1586					    NODEV64, np->rname);
1587					goto out;
1588				}
1589
1590				c.c_locator.l_mnum = meta_getminor(np->dev);
1591
1592				/* Fill in setno, setname, and sideno */
1593				c.c_setno = sp->setno;
1594				if (! metaislocalset(sp)) {
1595					if (MD_MNSET_DESC(sd)) {
1596						c.c_multi_node = 1;
1597					}
1598				}
1599				(void) strcpy(c.c_setname, sp->setname);
1600				c.c_sideno = sideno;
1601
1602				/*
1603				 * Don't need device id information from this
1604				 * ioctl Kernel determines device id from
1605				 * dev_t, which is just what this code would do.
1606				 */
1607				c.c_locator.l_devid = (uint64_t)0;
1608				c.c_locator.l_devid_flags = 0;
1609
1610				if (timeval != NULL)
1611					c.c_timestamp = *timeval;
1612
1613				if (setup_med_cfg(sp, &c,
1614				    (options & MDCHK_SET_FORCE), ep)) {
1615					rval = -1;
1616					goto out;
1617				}
1618
1619				if (metaioctl(MD_DB_NEWDEV, &c, &c.c_mde,
1620				    NULL) != 0) {
1621					rval = mdstealerror(ep, &c.c_mde);
1622					goto out;
1623				}
1624				/*
1625				 * This is either a traditional diskset OR this
1626				 * is the first replica added to a MN diskset.
1627				 * In either case, set broadcast to NO_BCAST so
1628				 * that message won't go through rpc.mdcommd.
1629				 * If this is a traditional diskset, the bcast
1630				 * flag is ignored since traditional disksets
1631				 * don't use the rpc.mdcommd.
1632				 */
1633				if (meta_db_addsidenms(sp, np, blkno,
1634				    DB_ADDSIDENMS_NO_BCAST, ep))
1635					goto out;
1636			}
1637		}
1638		if (! metaislocalset(sp)) {
1639			/* update the dbcnt and size in dd */
1640			for (p = dd; p != NULL; p = p->dd_next)
1641				if (p->dd_dnp == np->drivenamep) {
1642					p->dd_dbcnt = dbcnt;
1643					p->dd_dbsize  = dbsize;
1644					break;
1645				}
1646		}
1647
1648		/*
1649		 * If this was the first addition of disks to the
1650		 * diskset you now need to update the mb_setcreatetime
1651		 * which needed lb_inittime which wasn't there until now.
1652		 */
1653		if (firstmddb) {
1654			if (meta_update_mb(sp, dd, ep) != 0) {
1655				return (-1);
1656			}
1657		}
1658		(void) close(fd);
1659	}
1660
1661out:
1662	if (metaislocalset(sp)) {
1663
1664		/* everything looks fine. Start mdmonitord */
1665		if (rval == 0 && start_svmdaemons == 1) {
1666			if (meta_smf_enable(META_SMF_CORE, &status) == -1) {
1667				mde_perror(&status, "");
1668				mdclrerror(&status);
1669			}
1670		}
1671
1672		if (buildconf(sp, &status)) {
1673			/* Don't mask any previous errors */
1674			if (rval == 0)
1675				rval = mdstealerror(ep, &status);
1676			return (rval);
1677		}
1678
1679		if (meta_db_patch(sysfilename, NULL, 0, &status)) {
1680			/* Don't mask any previous errors */
1681			if (rval == 0)
1682				rval = mdstealerror(ep, &status);
1683		}
1684	} else {
1685		if (update_dbinfo_on_drives(sp, dd,
1686		    (options & MDCHK_SET_LOCKED),
1687		    (options & MDCHK_SET_FORCE),
1688		    &status)) {
1689			/* Don't mask any previous errors */
1690			if (rval == 0)
1691				rval = mdstealerror(ep, &status);
1692			else
1693				mdclrerror(&status);
1694		}
1695		metafreedrivedesc(&dd);
1696	}
1697	/*
1698	 * For MN disksets that already had already had nodes joined
1699	 * before the attach of this mddb(s), the name invalidation is
1700	 * done by the commd handler routine.  Otherwise, if this
1701	 * is the first attach of a MN diskset mddb, the invalidation
1702	 * must be done here since the first attach cannot be sent
1703	 * via the commd since there are no nodes joined to the set yet.
1704	 */
1705	if ((metaislocalset(sp)) || (!MD_MNSET_DESC(sd)) ||
1706	    (MD_MNSET_DESC(sd) &&
1707	    (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)))) {
1708		for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) {
1709			meta_invalidate_name(nlp->namep);
1710		}
1711	}
1712	return (rval);
1713}
1714
1715/*
1716 * deletelist_length
1717 *
1718 *	return the number of slices that have been specified for deletion
1719 *	on the metadb command line.  This does not calculate the number
1720 *	of replicas because there may be multiple replicas per slice.
1721 */
1722static int
1723deletelist_length(mdnamelist_t *db_nlp)
1724{
1725
1726	mdnamelist_t		*nlp;
1727	int			list_length = 0;
1728
1729	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1730		list_length++;
1731	}
1732
1733	return (list_length);
1734}
1735
1736static int
1737in_deletelist(char *devname, mdnamelist_t *db_nlp)
1738{
1739
1740	mdnamelist_t		*nlp;
1741	mdname_t		*np;
1742	int			index = 0;
1743
1744	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1745		np = nlp->namep;
1746
1747		if (strcmp(devname, np->bname) == 0)
1748			return (index);
1749		index++;
1750	}
1751
1752	return (-1);
1753}
1754
1755/*
1756 * Delete replicas from set.  This happens as a result of:
1757 *	- metadb [-s set_name] -d
1758 *	- metaset -s set_name -a disk	(causes a rebalance of mddbs)
1759 *	- metaset -s set_name -d disk
1760 *	- metaset -s set_name -b
1761 *
1762 * For a local set, this routine is run on the local set host.
1763 *
1764 * For a traditional diskset, this routine is run on the node that
1765 * is running the metaset command.
1766 *
1767 * For a multinode diskset, this routine is run by the node that is
1768 * running the metaset command.  This detach routine is sent to all
1769 * of the joined nodes in the diskset using commd.  This keeps
1770 * the nodes in-sync.
1771 */
1772int
1773meta_db_detach(
1774	mdsetname_t		*sp,
1775	mdnamelist_t		*db_nlp,
1776	mdforceopts_t		force_option,
1777	char			*sysfilename,
1778	md_error_t		*ep
1779)
1780{
1781	struct mddb_config	c;
1782	mdnamelist_t		*nlp;
1783	mdname_t		*np;
1784	md_drive_desc		*dd = NULL;
1785	md_drive_desc		*p;
1786	int			replicacount;
1787	int			replica_delete_count;
1788	int			nr_replica_slices;
1789	int			i;
1790	int			stop_svmdaemons = 0;
1791	int			rval = 0;
1792	int			index;
1793	int			valid_replicas_nottodelete = 0;
1794	int			invalid_replicas_nottodelete = 0;
1795	int			invalid_replicas_todelete = 0;
1796	int			errored = 0;
1797	int			*tag_array;
1798	int			fd = -1;
1799	md_error_t		status = mdnullerror;
1800	md_set_desc		*sd;
1801	int			stale_bool = FALSE;
1802	int			flags;
1803
1804	/*
1805	 * Error if we don't get some work to do.
1806	 */
1807	if (db_nlp == NULL)
1808		return (mdsyserror(ep, EINVAL, NULL));
1809
1810	if (mdnamesareunique(db_nlp, ep) != 0)
1811		return (-1);
1812
1813	(void) memset(&c, 0, sizeof (c));
1814	c.c_id = 0;
1815	c.c_setno = sp->setno;
1816
1817	/* Don't need device id information from this ioctl */
1818	c.c_locator.l_devid = (uint64_t)0;
1819	c.c_locator.l_devid_flags = 0;
1820
1821	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1822		return (mdstealerror(ep, &c.c_mde));
1823
1824	/*
1825	 * Is current set STALE?
1826	 */
1827	if (c.c_flags & MDDB_C_STALE) {
1828		stale_bool = TRUE;
1829	}
1830
1831	replicacount = c.c_dbcnt;
1832
1833	assert(db_nlp != NULL);
1834
1835	/*
1836	 * go through and gather how many data bases are on each
1837	 * device specified.
1838	 */
1839
1840	nr_replica_slices = deletelist_length(db_nlp);
1841	tag_array = (int *)calloc(nr_replica_slices, sizeof (int));
1842
1843	replica_delete_count = 0;
1844	for (i = 0; i < replicacount; i++) {
1845		char	*devname;
1846		int	found = 0;
1847
1848		c.c_id = i;
1849
1850		/* Don't need device id information from this ioctl */
1851		c.c_locator.l_devid = (uint64_t)0;
1852		c.c_locator.l_devid_flags = 0;
1853
1854		if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1855			return (mdstealerror(ep, &c.c_mde));
1856
1857		devname = splicename(&c.c_devname);
1858
1859		if (strstr(devname, META_LONGDISKNAME_STR) != NULL) {
1860			Free(devname);
1861			devname = getlongname(&c, ep);
1862			if (devname == NULL) {
1863				return (-1);
1864			}
1865		}
1866
1867		if ((index = in_deletelist(devname, db_nlp)) != -1) {
1868			found = 1;
1869			tag_array[index] = 1;
1870			replica_delete_count++;
1871		}
1872
1873		errored = c.c_locator.l_flags & (MDDB_F_EREAD |
1874		    MDDB_F_EWRITE | MDDB_F_TOOSMALL | MDDB_F_EFMT |
1875		    MDDB_F_EDATA | MDDB_F_EMASTER);
1876
1877		/*
1878		 * There are four combinations of "errored" and "found"
1879		 * and they are used to find the number of
1880		 * (a) valid/invalid replicas that are not in the delete
1881		 * list and are available in the system.
1882		 * (b) valid/invalid replicas that are to be deleted.
1883		 */
1884
1885		if (errored && !found)		/* errored and !found */
1886			invalid_replicas_nottodelete++;
1887		else if (!found)		/* !errored and !found */
1888			valid_replicas_nottodelete++;
1889		else if (errored)		/* errored and found */
1890			invalid_replicas_todelete++;
1891		/*
1892		 * else it is !errored and found. This means
1893		 * valid_replicas_todelete++; But this variable will not
1894		 * be used anywhere
1895		 */
1896
1897		Free(devname);
1898	}
1899
1900	index = 0;
1901	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1902		np = nlp->namep;
1903		if (tag_array[index++] != 1) {
1904			Free(tag_array);
1905			return (mddeverror(ep, MDE_NO_DB, np->dev, np->cname));
1906		}
1907	}
1908
1909	Free(tag_array);
1910
1911
1912	/* if all replicas are deleted stop mdmonitord */
1913	if ((replicacount - replica_delete_count) == 0)
1914		stop_svmdaemons = 1;
1915
1916	if (((replicacount - replica_delete_count) < MD_MINREPLICAS)) {
1917		if (force_option & MDFORCE_NONE)
1918			return (mderror(ep, MDE_NOTENOUGH_DB, sp->setname));
1919		if (! metaislocalset(sp) && ! (force_option & MDFORCE_DS))
1920			return (mderror(ep, MDE_DELDB_NOTALLOWED, sp->setname));
1921	}
1922
1923	/*
1924	 * The following algorithms are followed to check for deletion:
1925	 * (a) If the delete list(db_nlp) has all invalid replicas and no valid
1926	 * replicas, then deletion should be allowed.
1927	 * (b) Deletion should be allowed only if valid replicas that are "not"
1928	 * to be deleted is always greater than the invalid replicas that
1929	 * are "not" to be deleted.
1930	 * (c) If the user uses -f option, then deletion should be allowed.
1931	 */
1932
1933	if ((invalid_replicas_todelete != replica_delete_count) &&
1934	    (invalid_replicas_nottodelete > valid_replicas_nottodelete) &&
1935	    (force_option != MDFORCE_LOCAL))
1936		return (mderror(ep, MDE_DEL_VALIDDB_NOTALLOWED, sp->setname));
1937
1938	/*
1939	 * go through and tell kernel to delete them
1940	 */
1941
1942	/* Don't need device id information from this ioctl */
1943	c.c_locator.l_devid = (uint64_t)0;
1944	c.c_locator.l_devid_flags = 0;
1945
1946	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1947		return (mdstealerror(ep, &c.c_mde));
1948
1949	if (! metaislocalset(sp)) {
1950		dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep);
1951		if (! mdisok(ep))
1952			return (-1);
1953		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1954			return (-1);
1955	}
1956
1957	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1958		np = nlp->namep;
1959
1960		/*
1961		 * If mddb is being deleted from MN diskset and node is
1962		 * an owner of the diskset then use rpc.mdcommd
1963		 * mechanism to add mddb(s) so that all nodes stay in sync.
1964		 * If set is stale, don't log the message since rpc.mdcommd
1965		 * can't write the message to the mddb.
1966		 *
1967		 * When mddbs are first being added to set, a detach can
1968		 * be called before any node has joined the diskset, so
1969		 * must check to see if node is an owner of the diskset.
1970		 *
1971		 * Otherwise, just delete mddb from this node.
1972		 */
1973
1974		if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1975		    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1976			md_mn_result_t			*resultp;
1977			md_mn_msg_meta_db_detach_t	detach;
1978			int				send_rval;
1979
1980			/*
1981			 * The following algorithm is used to detach replicas.
1982			 * 	- META_DB_DETACH message generates submsgs
1983			 * 		- BLOCK parse (master)
1984			 * 		- MDDB_DETACH replicas
1985			 * 		- UNBLOCK parse (master) causing parse
1986			 *		information to be sent from master
1987			 *		to slaves at a higher class than the
1988			 *		unblock so the parse message will
1989			 *		reach slaves before unblock message.
1990			 */
1991			(void) splitname(np->bname, &detach.msg_splitname);
1992
1993			/* Set devid to NULL until devids are supported */
1994			detach.msg_devid[0] = NULL;
1995
1996			/*
1997			 * If reconfig cycle has been started, this node is
1998			 * stuck in in the return step until this command has
1999			 * completed.  If mdcommd is suspended, ask
2000			 * send_message to fail (instead of retrying)
2001			 * so that metaset can finish allowing the reconfig
2002			 * cycle to proceed.
2003			 */
2004			flags = MD_MSGF_FAIL_ON_SUSPEND;
2005			if (stale_bool == TRUE)
2006				flags |= MD_MSGF_NO_LOG;
2007			send_rval = mdmn_send_message(sp->setno,
2008			    MD_MN_MSG_META_DB_DETACH,
2009			    flags, 0, (char *)&detach,
2010			    sizeof (md_mn_msg_meta_db_detach_t),
2011			    &resultp, ep);
2012			if (send_rval != 0) {
2013				rval = -1;
2014				if (resultp == NULL)
2015					(void) mddserror(ep,
2016					    MDE_DS_COMMD_SEND_FAIL,
2017					    sp->setno, NULL, NULL,
2018					    sp->setname);
2019				else {
2020					(void) mdstealerror(ep,
2021					    &(resultp->mmr_ep));
2022					if (mdisok(ep)) {
2023						(void) mddserror(ep,
2024						    MDE_DS_COMMD_SEND_FAIL,
2025						    sp->setno, NULL, NULL,
2026						    sp->setname);
2027					}
2028					free_result(resultp);
2029				}
2030				goto out;
2031			}
2032			if (resultp)
2033				free_result(resultp);
2034		} else {
2035			i = 0;
2036			while (i < c.c_dbcnt) {
2037				char	*devname;
2038
2039				c.c_id = i;
2040
2041				/* Don't need devid info from this ioctl */
2042				c.c_locator.l_devid = (uint64_t)0;
2043				c.c_locator.l_devid_flags = 0;
2044
2045				if (metaioctl(MD_DB_GETDEV, &c,
2046				    &c.c_mde, NULL)) {
2047					rval = mdstealerror(ep, &c.c_mde);
2048					goto out;
2049				}
2050
2051				devname = splicename(&c.c_devname);
2052
2053				if (strstr(devname, META_LONGDISKNAME_STR)
2054				    != NULL) {
2055					Free(devname);
2056					devname = getlongname(&c, ep);
2057					if (devname == NULL) {
2058						return (-1);
2059					}
2060				}
2061
2062				if (strcmp(devname, np->bname) != 0) {
2063					Free(devname);
2064					i++;
2065					continue;
2066				}
2067				Free(devname);
2068
2069				/* Don't need devid info from this ioctl */
2070				c.c_locator.l_devid = (uint64_t)0;
2071				c.c_locator.l_devid_flags = 0;
2072
2073				if (metaioctl(MD_DB_DELDEV, &c,
2074				    &c.c_mde, NULL) != 0) {
2075					rval = mdstealerror(ep, &c.c_mde);
2076					goto out;
2077				}
2078
2079				/* Not incrementing "i" intentionally */
2080			}
2081		}
2082		if (! metaislocalset(sp)) {
2083			/* update the dbcnt and size in dd */
2084			for (p = dd; p != NULL; p = p->dd_next) {
2085				if (p->dd_dnp == np->drivenamep) {
2086					p->dd_dbcnt = 0;
2087					p->dd_dbsize  = 0;
2088					break;
2089				}
2090			}
2091
2092			/*
2093			 * Slam a dummy master block and make it self
2094			 * identifying
2095			 */
2096			if ((fd = open(np->rname, O_RDWR)) >= 0) {
2097				meta_mkdummymaster(sp, fd, 16);
2098				(void) close(fd);
2099			}
2100		}
2101	}
2102out:
2103	if (metaislocalset(sp)) {
2104		/*
2105		 * Stop all the daemons if there are
2106		 * no more replicas so that the module can be
2107		 * unloaded.
2108		 */
2109		if (rval == 0 && stop_svmdaemons == 1) {
2110			char buf[MAXPATHLEN];
2111			int i;
2112
2113			for (i = 0; i < DAEMON_COUNT; i++) {
2114				(void) snprintf(buf, MAXPATHLEN,
2115				    "/usr/bin/pkill -%s -x %s",
2116				    svmd_kill_list[i].svmd_kill_val,
2117				    svmd_kill_list[i].svmd_name);
2118				if (pclose(popen(buf, "w")) == -1)
2119					md_perror(buf);
2120			}
2121
2122			if (meta_smf_disable(META_SMF_ALL, &status) == -1) {
2123				mde_perror(&status, "");
2124				mdclrerror(&status);
2125			}
2126		}
2127		if (buildconf(sp, &status)) {
2128			/* Don't mask any previous errors */
2129			if (rval == 0)
2130				rval = mdstealerror(ep, &status);
2131			else
2132				mdclrerror(&status);
2133			return (rval);
2134		}
2135
2136		if (meta_db_patch(sysfilename, NULL, 0, &status)) {
2137			/* Don't mask any previous errors */
2138			if (rval == 0)
2139				rval = mdstealerror(ep, &status);
2140			else
2141				mdclrerror(&status);
2142		}
2143	} else {
2144		if (update_dbinfo_on_drives(sp, dd,
2145		    (force_option & MDFORCE_SET_LOCKED),
2146		    ((force_option & MDFORCE_LOCAL) |
2147		    (force_option & MDFORCE_DS)), &status)) {
2148			/* Don't mask any previous errors */
2149			if (rval == 0)
2150				rval = mdstealerror(ep, &status);
2151			else
2152				mdclrerror(&status);
2153		}
2154		metafreedrivedesc(&dd);
2155	}
2156	if ((metaislocalset(sp)) || (!(MD_MNSET_DESC(sd)))) {
2157		for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) {
2158			meta_invalidate_name(nlp->namep);
2159		}
2160	}
2161	return (rval);
2162}
2163
2164static md_replica_t *
2165metareplicaname(
2166	mdsetname_t		*sp,
2167	int			flags,
2168	struct mddb_config	*c,
2169	md_error_t		*ep
2170)
2171{
2172	md_replica_t	*rp;
2173	char		*devname;
2174	size_t		sz;
2175	devid_nmlist_t	*disklist = NULL;
2176	char		*devid_str;
2177
2178	/* allocate replicaname */
2179	rp = Zalloc(sizeof (*rp));
2180
2181	/* get device name */
2182	devname = splicename(&c->c_devname);
2183
2184	/*
2185	 * Check if the device has a long name (>40 characters) and
2186	 * if so then we have to use devids to get the device name.
2187	 * If this cannot be done then we have to fail the request.
2188	 */
2189	if (strstr(devname, META_LONGDISKNAME_STR) != NULL) {
2190		if (c->c_locator.l_devid != NULL) {
2191			if (meta_deviceid_to_nmlist("/dev/dsk",
2192			    (ddi_devid_t)(uintptr_t)c->c_locator.l_devid,
2193			    c->c_locator.l_minor_name, &disklist) != 0) {
2194				devid_str = devid_str_encode(
2195				    (ddi_devid_t)(uintptr_t)
2196				    c->c_locator.l_devid, NULL);
2197				(void) mderror(ep, MDE_MISSING_DEVID_DISK, "");
2198				mderrorextra(ep, devid_str);
2199				if (devid_str != NULL)
2200					devid_str_free(devid_str);
2201				Free(rp);
2202				Free(devname);
2203				return (NULL);
2204			}
2205		} else {
2206			(void) mderror(ep, MDE_NODEVID, "");
2207			Free(rp);
2208			Free(devname);
2209			return (NULL);
2210		}
2211		Free(devname);
2212		devname = disklist[0].devname;
2213	}
2214
2215	if (flags & PRINT_FAST) {
2216		if ((rp->r_namep = metaname_fast(&sp, devname,
2217		    LOGICAL_DEVICE, ep)) == NULL) {
2218			Free(devname);
2219			Free(rp);
2220			return (NULL);
2221		}
2222	} else {
2223		if ((rp->r_namep = metaname(&sp, devname,
2224		    LOGICAL_DEVICE, ep)) == NULL) {
2225			Free(devname);
2226			Free(rp);
2227			return (NULL);
2228		}
2229	}
2230	Free(devname);
2231
2232	/* make sure it's OK */
2233	if ((! (flags & MD_BASICNAME_OK)) &&
2234	    (metachkcomp(rp->r_namep, ep) != 0)) {
2235		Free(rp);
2236		return (NULL);
2237	}
2238
2239	rp->r_blkno = (daddr_t)MD_DISKADDR_ERROR;
2240	rp->r_nblk = (daddr_t)MD_DISKADDR_ERROR;
2241	rp->r_flags = c->c_locator.l_flags | MDDB_F_NODEVID;
2242	if (c->c_locator.l_devid_flags & MDDB_DEVID_VALID) {
2243		sz = devid_sizeof((ddi_devid_t)(uintptr_t)
2244		    (c->c_locator.l_devid));
2245		if ((rp->r_devid = (ddi_devid_t)malloc(sz)) ==
2246		    (ddi_devid_t)NULL) {
2247			Free(rp);
2248			return (NULL);
2249		}
2250		(void) memcpy((void *)rp->r_devid,
2251		    (void *)(uintptr_t)c->c_locator.l_devid, sz);
2252		(void) strcpy(rp->r_minor_name, c->c_locator.l_minor_name);
2253		rp->r_flags &= ~MDDB_F_NODEVID;
2254		/* Overwrite dev derived from name with dev from devid */
2255		rp->r_namep->dev = meta_expldev(c->c_locator.l_dev);
2256	}
2257	(void) strcpy(rp->r_driver_name, c->c_locator.l_driver);
2258
2259	rp->r_blkno = c->c_locator.l_blkno;
2260	if (c->c_dbend != 0)
2261		rp->r_nblk = c->c_dbend - c->c_locator.l_blkno + 1;
2262
2263	/* return replica */
2264	return (rp);
2265}
2266
2267/*
2268 * free replica list
2269 */
2270void
2271metafreereplicalist(
2272	md_replicalist_t	*rlp
2273)
2274{
2275	md_replicalist_t	*rl = NULL;
2276
2277	for (/* void */; (rlp != NULL); rlp = rl) {
2278		rl = rlp->rl_next;
2279		if (rlp->rl_repp->r_devid != (ddi_devid_t)0) {
2280			free(rlp->rl_repp->r_devid);
2281		}
2282		Free(rlp->rl_repp);
2283		Free(rlp);
2284	}
2285}
2286
2287/*
2288 * return list of all replicas in set
2289 */
2290int
2291metareplicalist(
2292	mdsetname_t		*sp,
2293	int			flags,
2294	md_replicalist_t	**rlpp,
2295	md_error_t		*ep
2296)
2297{
2298	md_replicalist_t	**tail = rlpp;
2299	int			count = 0;
2300	struct mddb_config	c;
2301	int			i;
2302	char			*devid;
2303
2304	/* for each replica */
2305	i = 0;
2306	do {
2307		md_replica_t	*rp;
2308
2309		/* get next replica */
2310		(void) memset(&c, 0, sizeof (c));
2311		c.c_id = i;
2312		c.c_setno = sp->setno;
2313
2314		c.c_locator.l_devid_flags = MDDB_DEVID_GETSZ;
2315		if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) {
2316			if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) {
2317				mdclrerror(&c.c_mde);
2318				break;	/* handle none at all */
2319			}
2320			(void) mdstealerror(ep, &c.c_mde);
2321			goto out;
2322		}
2323
2324		if (c.c_locator.l_devid_flags & MDDB_DEVID_SZ) {
2325			if ((devid = malloc(c.c_locator.l_devid_sz)) == NULL) {
2326				(void) mdsyserror(ep, ENOMEM, META_DBCONF);
2327				goto out;
2328			}
2329			c.c_locator.l_devid = (uintptr_t)devid;
2330			/*
2331			 * Turn on space and sz flags since 'sz' amount of
2332			 * space has been alloc'd.
2333			 */
2334			c.c_locator.l_devid_flags =
2335			    MDDB_DEVID_SPACE | MDDB_DEVID_SZ;
2336		}
2337
2338		if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) {
2339			if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) {
2340				mdclrerror(&c.c_mde);
2341				break;	/* handle none at all */
2342			}
2343			(void) mdstealerror(ep, &c.c_mde);
2344			goto out;
2345		}
2346
2347		/*
2348		 * Paranoid check - shouldn't happen, but is left as
2349		 * a place holder for changes that will be needed after
2350		 * dynamic reconfiguration changes are added to SVM (to
2351		 * support movement of disks at any point in time).
2352		 */
2353		if (c.c_locator.l_devid_flags & MDDB_DEVID_NOSPACE) {
2354			(void) fprintf(stderr,
2355			    dgettext(TEXT_DOMAIN,
2356			    "Error: Relocation Information "
2357			    "(drvnm=%s, mnum=0x%lx) \n"
2358			    "relocation information size changed - \n"
2359			    "rerun command\n"),
2360			    c.c_locator.l_driver, c.c_locator.l_mnum);
2361			(void) mderror(ep, MDE_DEVID_TOOBIG, NULL);
2362			goto out;
2363		}
2364
2365		if (c.c_dbcnt == 0)
2366			break;		/* handle none at all */
2367
2368		/* get info */
2369		if ((rp = metareplicaname(sp, flags, &c, ep)) == NULL)
2370			goto out;
2371
2372		/* append to list */
2373		*tail = Zalloc(sizeof (**tail));
2374		(*tail)->rl_repp = rp;
2375		tail = &(*tail)->rl_next;
2376		++count;
2377
2378		if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2379			free(devid);
2380			c.c_locator.l_devid_flags = 0;
2381		}
2382
2383	} while (++i < c.c_dbcnt);
2384
2385	if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2386		free(devid);
2387	}
2388
2389	/* return count */
2390	return (count);
2391
2392	/* cleanup, return error */
2393out:
2394	if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2395		free(devid);
2396	}
2397	metafreereplicalist(*rlpp);
2398	*rlpp = NULL;
2399	return (-1);
2400}
2401
2402/*
2403 * meta_sync_db_locations - get list of replicas from kernel and write
2404 * 	out to mddb.cf and md.conf.  'Syncs up' the replica list in
2405 * 	the kernel with the replica list in the conf files.
2406 *
2407 */
2408void
2409meta_sync_db_locations(
2410	mdsetname_t	*sp,
2411	md_error_t	*ep
2412)
2413{
2414	char		*sname = 0;		/* system file name */
2415	char 		*cname = 0;		/* config file name */
2416
2417	if (!metaislocalset(sp))
2418		return;
2419
2420	/* Updates backup of configuration file (aka mddb.cf) */
2421	if (buildconf(sp, ep) != 0)
2422		return;
2423
2424	/* Updates system configuration file (aka md.conf) */
2425	(void) meta_db_patch(sname, cname, 0, ep);
2426}
2427
2428/*
2429 * setup_db_locations - parse the mddb.cf file and
2430 *			tells the driver which db locations to use.
2431 */
2432int
2433meta_setup_db_locations(
2434	md_error_t	*ep
2435)
2436{
2437	mddb_config_t	c;
2438	FILE		*fp;
2439	char		inbuff[1024];
2440	char		*buff;
2441	uint_t		i;
2442	size_t		sz;
2443	int		rval = 0;
2444	char		*devidp;
2445	uint_t		devid_size;
2446	char		*minor_name = NULL;
2447	ddi_devid_t	devid_decode;
2448	int		checksum;
2449
2450	/* do mddb.cf file */
2451	(void) memset(&c, '\0', sizeof (c));
2452	if ((fp = fopen(META_DBCONF, "r")) == NULL) {
2453		if (errno != ENOENT)
2454			return (mdsyserror(ep, errno, META_DBCONF));
2455	}
2456	while ((fp != NULL) && ((buff = fgets(inbuff, (sizeof (inbuff) - 1),
2457	    fp)) != NULL)) {
2458
2459		/* ignore comments */
2460		if (*buff == '#')
2461			continue;
2462
2463		/* parse locator */
2464		(void) memset(&c, 0, sizeof (c));
2465		c.c_setno = MD_LOCAL_SET;
2466		i = strcspn(buff, " \t");
2467		if (i > sizeof (c.c_locator.l_driver))
2468			i = sizeof (c.c_locator.l_driver);
2469		(void) strncpy(c.c_locator.l_driver, buff, i);
2470		buff += i;
2471		c.c_locator.l_dev =
2472		    makedev((major_t)0, (minor_t)strtol(buff, &buff, 10));
2473		c.c_locator.l_blkno = (daddr_t)strtol(buff, &buff, 10);
2474		c.c_locator.l_mnum = minor(c.c_locator.l_dev);
2475
2476		/* parse out devid */
2477		while (isspace((int)(*buff)))
2478			buff += 1;
2479		i = strcspn(buff, " \t");
2480		if ((devidp = (char *)malloc(i+1)) == NULL)
2481			return (mdsyserror(ep, ENOMEM, META_DBCONF));
2482
2483		(void) strncpy(devidp, buff, i);
2484		devidp[i] = '\0';
2485		if (devid_str_decode(devidp, &devid_decode,
2486		    &minor_name) == -1) {
2487			free(devidp);
2488			continue;
2489		}
2490
2491		/* Conf file must have minor name associated with devid */
2492		if (minor_name == NULL) {
2493			free(devidp);
2494			devid_free(devid_decode);
2495			continue;
2496		}
2497
2498		sz = devid_sizeof(devid_decode);
2499		/* Copy to devid size buffer that ioctl expects */
2500		if ((c.c_locator.l_devid = (uintptr_t)malloc(sz)) == NULL) {
2501			devid_free(devid_decode);
2502			free(minor_name);
2503			free(devidp);
2504			return (mdsyserror(ep, ENOMEM, META_DBCONF));
2505		}
2506
2507		(void) memcpy((void *)(uintptr_t)c.c_locator.l_devid,
2508		    (void *)devid_decode, sz);
2509
2510		devid_free(devid_decode);
2511
2512		if (strlen(minor_name) > MDDB_MINOR_NAME_MAX) {
2513			free(minor_name);
2514			free(devidp);
2515			free((void *)(uintptr_t)c.c_locator.l_devid);
2516			return (mdsyserror(ep, ENOMEM, META_DBCONF));
2517		}
2518		(void) strcpy(c.c_locator.l_minor_name, minor_name);
2519		free(minor_name);
2520		c.c_locator.l_devid_flags = MDDB_DEVID_VALID |
2521		    MDDB_DEVID_SPACE | MDDB_DEVID_SZ;
2522		c.c_locator.l_devid_sz = sz;
2523
2524		devid_size = strlen(devidp);
2525		buff += devid_size;
2526
2527		checksum = strtol(buff, &buff, 10);
2528		for (i = 0; c.c_locator.l_driver[i] != 0; i++)
2529			checksum += c.c_locator.l_driver[i];
2530		for (i = 0; i < devid_size; i++) {
2531			checksum += devidp[i];
2532		}
2533		free(devidp);
2534
2535		checksum += minor(c.c_locator.l_dev);
2536		checksum += c.c_locator.l_blkno;
2537		if (checksum != 42) {
2538			/* overwritten later for more serious problems */
2539			rval = mderror(ep, MDE_MDDB_CKSUM, META_DBCONF);
2540			free((void *)(uintptr_t)c.c_locator.l_devid);
2541			continue;
2542		}
2543		c.c_locator.l_flags = 0;
2544
2545		/* use db location */
2546		if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) {
2547			free((void *)(uintptr_t)c.c_locator.l_devid);
2548			return (mdstealerror(ep, &c.c_mde));
2549		}
2550
2551		/* free up devid if in use */
2552		free((void *)(uintptr_t)c.c_locator.l_devid);
2553		c.c_locator.l_devid = (uint64_t)0;
2554		c.c_locator.l_devid_flags = 0;
2555	}
2556	if ((fp) && (fclose(fp) != 0))
2557		return (mdsyserror(ep, errno, META_DBCONF));
2558
2559	/* check for stale database */
2560	(void) memset((char *)&c, 0, sizeof (struct mddb_config));
2561	c.c_id = 0;
2562	c.c_setno = MD_LOCAL_SET;
2563
2564	/*
2565	 * While we do not need the devid here we may need to
2566	 * know if devid's are being used by the kernel for
2567	 * the replicas. This is because under some circumstances
2568	 * we can only manipulate the SVM configuration if the
2569	 * kernel is using devid's.
2570	 */
2571	c.c_locator.l_devid = (uint64_t)0;
2572	c.c_locator.l_devid_flags = MDDB_DEVID_GETSZ;
2573	c.c_locator.l_devid_sz = 0;
2574
2575	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
2576		if (! mdismddberror(&c.c_mde, MDE_DB_INVALID))
2577			return (mdstealerror(ep, &c.c_mde));
2578		mdclrerror(&c.c_mde);
2579	}
2580
2581	if (c.c_flags & MDDB_C_STALE)
2582		return (mdmddberror(ep, MDE_DB_STALE, NODEV32, MD_LOCAL_SET,
2583		    0, NULL));
2584
2585	if (c.c_locator.l_devid_sz != 0) {
2586		/*
2587		 * Devid's are being used to track the replicas because
2588		 * there is space for a devid.
2589		 */
2590		devid_in_use = TRUE;
2591	}
2592
2593	/* success */
2594	return (rval);
2595}
2596
2597/*
2598 * meta_db_minreplica - returns the minimum size replica currently in use.
2599 */
2600daddr_t
2601meta_db_minreplica(
2602	mdsetname_t	*sp,
2603	md_error_t	*ep
2604)
2605{
2606	md_replica_t		*r;
2607	md_replicalist_t	*rl, *rlp = NULL;
2608	daddr_t			nblks = 0;
2609
2610	if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, ep) < 0)
2611		return (-1);
2612
2613	if (rlp == NULL)
2614		return (-1);
2615
2616	/* find the smallest existing replica */
2617	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
2618		r = rl->rl_repp;
2619		nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
2620	}
2621
2622	metafreereplicalist(rlp);
2623	return (nblks);
2624}
2625
2626/*
2627 * meta_get_replica_names
2628 *  returns an mdnamelist_t of replica slices
2629 */
2630/*ARGSUSED*/
2631int
2632meta_get_replica_names(
2633	mdsetname_t	*sp,
2634	mdnamelist_t	**nlpp,
2635	int		options,
2636	md_error_t	*ep
2637)
2638{
2639	md_replicalist_t	*rlp = NULL;
2640	md_replicalist_t	*rl;
2641	mdnamelist_t		**tailpp = nlpp;
2642	int			cnt = 0;
2643
2644	assert(nlpp != NULL);
2645
2646	if (!metaislocalset(sp))
2647		goto out;
2648
2649	/* get replicas */
2650	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) {
2651		cnt = -1;
2652		goto out;
2653	}
2654
2655	/* build name list */
2656	for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
2657		/*
2658		 * Add the name struct to the end of the
2659		 * namelist but keep a pointer to the last
2660		 * element so that we don't incur the overhead
2661		 * of traversing the list each time
2662		 */
2663		tailpp = meta_namelist_append_wrapper(
2664		    tailpp, rl->rl_repp->r_namep);
2665		++cnt;
2666	}
2667
2668	/* cleanup, return count or error */
2669out:
2670	metafreereplicalist(rlp);
2671	return (cnt);
2672}
2673