1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28/*
29 * Metadevice diskset interfaces
30 */
31
32#include <meta.h>
33#include <mdmn_changelog.h>
34#include "meta_set_prv.h"
35#include "meta_repartition.h"
36
37static int
38check_setnodes_againstdrivelist(
39	mdsetname_t		*sp,
40	mddrivenamelist_t	*dnlp,
41	md_error_t		*ep
42)
43{
44	md_set_desc		*sd;
45	mddrivenamelist_t	*p;
46	int 			i;
47	md_mnnode_desc		*nd;
48
49	if ((sd = metaget_setdesc(sp, ep)) == NULL)
50		return (-1);
51
52	if (MD_MNSET_DESC(sd)) {
53		nd = sd->sd_nodelist;
54		while (nd) {
55			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
56				nd = nd->nd_next;
57				continue;
58			}
59			for (p = dnlp; p != NULL; p = p->next)
60				if (checkdrive_onnode(sp, p->drivenamep,
61				    nd->nd_nodename, ep))
62					return (-1);
63			nd = nd->nd_next;
64		}
65	} else {
66		for (i = 0; i < MD_MAXSIDES; i++) {
67			/* Skip empty slots */
68			if (sd->sd_nodes[i][0] == '\0')
69				continue;
70
71			for (p = dnlp; p != NULL; p = p->next)
72				if (checkdrive_onnode(sp, p->drivenamep,
73				    sd->sd_nodes[i], ep))
74					return (-1);
75		}
76	}
77	return (0);
78}
79
80static int
81drvsuniq(mdsetname_t *sp, mddrivenamelist_t *dnlp, md_error_t *ep)
82{
83	mddrivenamelist_t *dl1, *dl2;
84	mddrivename_t *dn1, *dn2;
85
86	for (dl1 = dnlp; dl1 != NULL; dl1 = dl1->next) {
87		dn1 = dl1->drivenamep;
88
89		for (dl2 = dl1->next; dl2 != NULL; dl2 = dl2->next) {
90			dn2 = dl2->drivenamep;
91			if (strcmp(dn1->cname, dn2->cname) != 0)
92				continue;
93
94			return (mddserror(ep, MDE_DS_DUPDRIVE, sp->setno,
95			    NULL, dn1->cname, sp->setname));
96		}
97	}
98	return (0);
99}
100
101static md_drive_desc *
102metaget_drivedesc_fromdrivelist(
103	mdsetname_t		*sp,
104	mddrivenamelist_t	*dnlp,
105	uint_t			flags,
106	md_error_t		*ep
107)
108{
109	mddrivenamelist_t	*p;
110	md_drive_desc		*dd = NULL;
111	md_set_desc		*sd;
112
113	if ((sd = metaget_setdesc(sp, ep)) == NULL)
114		return (NULL);
115
116	for (p = dnlp; p != NULL; p = p->next) {
117		(void) metadrivedesc_append(&dd, p->drivenamep, 0, 0,
118		    sd->sd_ctime, sd->sd_genid, flags);
119	}
120
121	return (dd);
122}
123
124/*
125 * Exported Entry Points
126 */
127
128int
129meta_make_sidenmlist(
130	mdsetname_t		*sp,
131	mddrivename_t		*dnp,
132	int			import_flag, /* flags partial import */
133	md_im_drive_info_t	*midp,	/* import drive information */
134	md_error_t		*ep
135)
136{
137	mdsidenames_t		*sn, **sn_next;
138	mdname_t		*np;
139	int			done;
140	side_t			sideno = MD_SIDEWILD;
141	uint_t			rep_slice;
142	char			*bname;
143
144	if (!import_flag) {
145		/*
146		 * Normal (aka NOT partial import) code path.
147		 */
148		if (meta_replicaslice(dnp, &rep_slice, ep) != 0) {
149			return (-1);
150		}
151
152		dnp->side_names_key = MD_KEYWILD;
153
154		if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
155			return (-1);
156		bname = Strdup(np->bname);
157	} else {
158		/*
159		 * When doing a partial import, we'll get the needed
160		 * information from somewhere other than the system.
161		 */
162		dnp->side_names_key = MD_KEYWILD;
163		bname = Strdup(midp->mid_devname);
164	}
165	metaflushsidenames(dnp);
166	sn_next = &dnp->side_names;
167	/*CONSTCOND*/
168	while (1) {
169		sn = Zalloc(sizeof (*sn));
170
171		if ((done = meta_getnextside_devinfo(sp, bname, &sideno,
172		    &sn->cname, &sn->dname, &sn->mnum, ep)) == -1) {
173			if (import_flag) {
174				mdclrerror(ep);
175				sn->dname = Strdup(midp->mid_driver_name);
176				sn->mnum = midp->mid_mnum;
177			} else {
178				Free(sn);
179				Free(bname);
180				return (-1);
181			}
182		}
183
184		if (done == 0) {
185			Free(sn);
186			Free(bname);
187			return (0);
188		}
189
190		sn->sideno = sideno;
191
192		/* Add to the end of the linked list */
193		assert(*sn_next == NULL);
194		*sn_next = sn;
195		sn_next = &sn->next;
196	}
197	/*NOTREACHED*/
198}
199
200int
201meta_set_adddrives(
202	mdsetname_t		*sp,
203	mddrivenamelist_t	*dnlp,
204	daddr_t			dbsize,
205	int			force_label,
206	md_error_t		*ep
207)
208{
209	md_set_desc		*sd;
210	md_drive_desc		*dd = NULL, *curdd = NULL, *ddp;
211	int			i;
212	mddrivenamelist_t	*p;
213	mhd_mhiargs_t		mhiargs;
214	int			rval = 0;
215	md_timeval32_t		now;
216	sigset_t		oldsigs;
217	ulong_t			genid;
218	ulong_t			max_genid = 0;
219	md_setkey_t		*cl_sk;
220	int			rb_level = 0;
221	md_error_t		xep = mdnullerror;
222	md_mnnode_desc		*nd;
223	int			suspendall_flag = 0;
224	int			suspend1_flag = 0;
225	int			lock_flag = 0;
226	int			flush_set_onerr = 0;
227	md_replicalist_t	*rlp = NULL, *rl;
228
229	if ((sd = metaget_setdesc(sp, ep)) == NULL)
230		return (-1);
231
232	/* Make sure we own the set */
233	if (meta_check_ownership(sp, ep) != 0)
234		return (-1);
235
236	/*
237	 * The drive and node records are stored in the local mddbs of each
238	 * node in the diskset.  Each node's rpc.metad daemon reads in the set,
239	 * drive and node records from that node's local mddb and caches them
240	 * internally. Any process needing diskset information contacts its
241	 * local rpc.metad to get this information.  Since each node in the
242	 * diskset is independently reading the set information from its local
243	 * mddb, the set, drive and node records in the local mddbs must stay
244	 * in-sync, so that all nodes have a consistent view of the diskset.
245	 *
246	 * For a multinode diskset, explicitly verify that all nodes in the
247	 * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
248	 * fail this operation since all nodes must be ALIVE in order to add
249	 * the new drive record to their local mddb.  If a panic of this node
250	 * leaves the local mddbs set, node and drive records out-of-sync, the
251	 * reconfig cycle will fix the local mddbs and force them back into
252	 * synchronization.
253	 */
254	if (MD_MNSET_DESC(sd)) {
255		nd = sd->sd_nodelist;
256		while (nd) {
257			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
258				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
259					sp->setno,
260					nd->nd_nodename, NULL, sp->setname);
261				return (-1);
262			}
263			nd = nd->nd_next;
264		}
265	}
266
267	if (drvsuniq(sp, dnlp, ep) == -1)
268		return (-1);
269
270	/*
271	 * Lock the set on current set members.
272	 * Set locking done much earlier for MN diskset than for traditional
273	 * diskset since lock_set and SUSPEND are used to protect against
274	 * other meta* commands running on the other nodes.
275	 */
276	if (MD_MNSET_DESC(sd)) {
277		/* Make sure we are blocking all signals */
278		if (procsigs(TRUE, &oldsigs, &xep) < 0)
279			mdclrerror(&xep);
280
281		nd = sd->sd_nodelist;
282		/* All nodes are guaranteed to be ALIVE */
283		while (nd) {
284			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
285				rval = -1;
286				goto out;
287			}
288			lock_flag = 1;
289			nd = nd->nd_next;
290		}
291		/*
292		 * Lock out other meta* commands by suspending
293		 * class 1 messages across the diskset.
294		 */
295		nd = sd->sd_nodelist;
296		/* All nodes are guaranteed to be ALIVE */
297		while (nd) {
298			if (clnt_mdcommdctl(nd->nd_nodename,
299			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
300			    MD_MSCF_NO_FLAGS, ep)) {
301				rval = -1;
302				goto out;
303			}
304			suspend1_flag = 1;
305			nd = nd->nd_next;
306		}
307	}
308
309	if (check_setnodes_againstdrivelist(sp, dnlp, ep)) {
310		rval = -1;
311		goto out;
312	}
313
314	for (p = dnlp; p != NULL; p = p->next) {
315		mdsetname_t	*tmp;
316
317		if (meta_is_drive_in_anyset(p->drivenamep, &tmp, FALSE,
318		    ep) == -1) {
319			rval = -1;
320			goto out;
321		}
322
323		if (tmp != NULL) {
324			(void) mddserror(ep, MDE_DS_DRIVEINSET, sp->setno,
325			    tmp->setname, p->drivenamep->cname, sp->setname);
326			rval = -1;
327			goto out;
328		}
329	}
330
331	/* END CHECK CODE */
332
333	/*
334	 * This is a separate loop (from above) so that we validate all the
335	 * drives handed to us before we repartition any one drive.
336	 */
337	for (p = dnlp; p != NULL; p = p->next) {
338		if (meta_repartition_drive(sp,
339		    p->drivenamep, force_label == TRUE ? MD_REPART_FORCE : 0,
340		    NULL, /* Don't return the VTOC. */
341		    ep) != 0) {
342			rval = -1;
343			goto out;
344		}
345		/*
346		 * Create the names for the drives we are adding per side.
347		 */
348		if (meta_make_sidenmlist(sp, p->drivenamep, 0, NULL,
349		    ep) == -1) {
350			rval = -1;
351			goto out;
352		}
353	}
354
355	/*
356	 * Get the list of drives descriptors that we are adding.
357	 */
358	dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep);
359
360	if (! mdisok(ep)) {
361		rval = -1;
362		goto out;
363	}
364
365	/*
366	 * Get the set timeout information.
367	 */
368	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
369	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
370		rval = -1;
371		goto out;
372	}
373
374	/*
375	 * Get timestamp and generation id for new records
376	 */
377	now = sd->sd_ctime;
378	genid = sd->sd_genid;
379
380
381	/* At this point, in case of error, set should be flushed. */
382	flush_set_onerr = 1;
383
384	/* Lock the set on current set members */
385	if (!(MD_MNSET_DESC(sd))) {
386		md_rb_sig_handling_on();
387		for (i = 0; i < MD_MAXSIDES; i++) {
388			/* Skip empty slots */
389			if (sd->sd_nodes[i][0] == '\0')
390				continue;
391
392			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
393				rval = -1;
394				goto out;
395			}
396			lock_flag = 1;
397		}
398	}
399
400	/*
401	 * Get drive descriptors for the drives that are currently in the set.
402	 */
403	curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
404	if (! mdisok(ep))
405		goto rollback;
406
407	/*
408	 * If first drive being added to set, set the mastership
409	 * of the multinode diskset to be this node.
410	 * Only set it on this node.  If all goes well
411	 * and there are no errors, the mastership of this node will be set
412	 * on all nodes in user space and in the kernel.
413	 */
414	if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
415		if (clnt_mnsetmaster(mynode(), sp,
416		    sd->sd_mn_mynode->nd_nodename,
417		    sd->sd_mn_mynode->nd_nodeid, ep)) {
418			goto rollback;
419		}
420		/*
421		 * Set this up in my local cache of the set desc so that
422		 * the set descriptor won't have to be gotten again from
423		 * rpc.metad.  If it is flushed and gotten again, these
424		 * values will be set in sr2setdesc.
425		 */
426		sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
427		(void) strcpy(sd->sd_mn_master_nodenm,
428		    sd->sd_mn_mynode->nd_nodename);
429		sd->sd_mn_am_i_master = 1;
430	}
431
432	RB_TEST(1, "adddrives", ep)
433
434	RB_PREEMPT;
435	rb_level = 1;	/* level 1 */
436
437	RB_TEST(2, "adddrives", ep)
438
439	/*
440	 * Add the drive records for the drives that we are adding to
441	 * each host in the set.  Marks the drive as MD_DR_ADD.
442	 */
443	if (MD_MNSET_DESC(sd)) {
444		nd = sd->sd_nodelist;
445		/* All nodes are guaranteed to be ALIVE */
446		while (nd) {
447			if (clnt_adddrvs(nd->nd_nodename, sp, dd, now, genid,
448			    ep) == -1)
449				goto rollback;
450
451			RB_TEST(3, "adddrives", ep)
452			nd = nd->nd_next;
453		}
454	} else {
455		for (i = 0; i < MD_MAXSIDES; i++) {
456			/* Skip empty slots */
457			if (sd->sd_nodes[i][0] == '\0')
458				continue;
459
460			if (clnt_adddrvs(sd->sd_nodes[i], sp, dd, now, genid,
461			    ep) == -1)
462				goto rollback;
463
464			RB_TEST(3, "adddrives", ep)
465		}
466	}
467
468	RB_TEST(4, "adddrives", ep)
469
470	RB_PREEMPT;
471	rb_level = 2;	/* level 2 */
472
473	RB_TEST(5, "adddrives", ep)
474
475	/*
476	 * Take ownership of the added drives.
477	 */
478	if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
479		if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep))
480			goto rollback;
481	}
482
483	/*
484	 * If this is not a MN set and the state flags do not indicate the
485	 * presence of devids, update the set records on all nodes.
486	 */
487	if (!(sd->sd_flags & MD_SR_MB_DEVID) && !(MD_MNSET_DESC(sd))) {
488		if (meta_update_mb(sp, dd, ep) == 0) {
489			mdclrerror(ep);
490
491			/* update the sr_flags on all hosts */
492			for (i = 0; i < MD_MAXSIDES; i++) {
493				if (sd->sd_nodes[i][0] == '\0')
494					continue;
495
496				if (clnt_upd_sr_flags(sd->sd_nodes[i],
497				    sp, (sd->sd_flags | MD_SR_MB_DEVID), ep))
498					goto rollback;
499			}
500		}
501	}
502
503	RB_TEST(6, "adddrives", ep)
504
505	RB_PREEMPT;
506	rb_level = 3;	/* level 3 */
507
508	RB_TEST(7, "adddrives", ep)
509
510	/*
511	 * Balance the DB's according to the list of existing drives and the
512	 * list of added drives.
513	 */
514	if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
515		goto rollback;
516
517	/*
518	 * Slam a dummy master block on all the disks that we are adding
519	 * that don't have replicas on them.
520	 * Used by diskset import if the disksets are remotely replicated
521	 */
522	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) {
523		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
524			uint_t		rep_slice;
525			int		fd = -1;
526			mdname_t	*np = NULL;
527			char		*drive_name;
528
529			drive_name = ddp->dd_dnp->cname;
530
531			for (rl = rlp; rl != NULL; rl = rl->rl_next) {
532				char	*rep_name;
533
534				rep_name =
535				    rl->rl_repp->r_namep->drivenamep->cname;
536
537				if (strcmp(drive_name, rep_name) == 0) {
538					/*
539					 * Disk has a replica on it so don't
540					 * add dummy master block.
541					 */
542					break;
543				}
544			}
545			if (rl == NULL) {
546				/*
547				 * Drive doesn't have a replica on it so
548				 * we need a dummy master block. Add it.
549				 */
550				if (meta_replicaslice(ddp->dd_dnp, &rep_slice,
551				    &xep) != 0) {
552					mdclrerror(&xep);
553					continue;
554				}
555
556				if ((np = metaslicename(ddp->dd_dnp, rep_slice,
557				    &xep)) == NULL) {
558					mdclrerror(&xep);
559					continue;
560				}
561
562				if ((fd = open(np->rname, O_RDWR)) >= 0) {
563					meta_mkdummymaster(sp, fd, 16);
564					(void) close(fd);
565				}
566			}
567		}
568	}
569
570	if ((curdd == NULL) && (MD_MNSET_DESC(sd))) {
571		/*
572		 * Notify rpc.mdcommd on all nodes of a nodelist change.
573		 * Start by suspending rpc.mdcommd (which drains it of all
574		 * messages), then change the nodelist followed by a reinit
575		 * and resume.
576		 */
577		nd = sd->sd_nodelist;
578		/* All nodes are guaranteed to be ALIVE */
579		while (nd) {
580			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
581			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
582				rval = -1;
583				goto out;
584			}
585			suspendall_flag = 1;
586			nd = nd->nd_next;
587		}
588	}
589
590	/*
591	 * If a MN diskset and this is the first disk(s) being added
592	 * to set, then pre-allocate change log records here.
593	 * When the other nodes are joined into the MN diskset, the
594	 * USER records will just be snarfed in.
595	 */
596	if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
597		if (mdmn_allocate_changelog(sp, ep) != 0)
598			goto rollback;
599	}
600
601	/*
602	 * Mark the drives MD_DR_OK.
603	 * If first drive being added to MN diskset, then set
604	 * master on all nodes to be this node and then join
605	 * all alive nodes (nodes in membership list) to set.
606	 */
607	if (MD_MNSET_DESC(sd)) {
608		nd = sd->sd_nodelist;
609		/* All nodes are guaranteed to be ALIVE */
610		while (nd) {
611			/* don't set master on this node - done earlier */
612			if ((curdd == NULL) && (nd->nd_nodeid !=
613			    sd->sd_mn_mynode->nd_nodeid)) {
614				/*
615				 * Set master on all alive nodes since
616				 * all alive nodes will become joined nodes.
617				 */
618				if (clnt_mnsetmaster(nd->nd_nodename, sp,
619				    sd->sd_mn_mynode->nd_nodename,
620				    sd->sd_mn_mynode->nd_nodeid, ep)) {
621					goto rollback;
622				}
623			}
624
625			if (curdd == NULL) {
626				/*
627				 * No special flags for join set.  Since
628				 * all nodes are joining if 1st drive is being
629				 * added to set then all nodes will be either
630				 * STALE or non-STALE and each node can
631				 * determine this on its own.
632				 */
633				if (clnt_joinset(nd->nd_nodename, sp,
634				    NULL, ep)) {
635					goto rollback;
636				}
637				/* Sets join node flag on all nodes in list */
638				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
639				    sd->sd_nodelist, MD_NR_JOIN, NULL, ep)) {
640					goto rollback;
641				}
642			}
643
644			/*
645			 * Set MD_DR_OK as last thing before unlock.
646			 * In case of panic on this node, recovery
647			 * code can check for MD_DR_OK to determine
648			 * status of diskset.
649			 */
650			if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
651			    MD_DR_OK, ep) == -1)
652				goto rollback;
653
654
655			RB_TEST(8, "adddrives", ep)
656			nd = nd->nd_next;
657		}
658	} else {
659		for (i = 0; i < MD_MAXSIDES; i++) {
660			/* Skip empty slots */
661			if (sd->sd_nodes[i][0] == '\0')
662				continue;
663
664			if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd, MD_DR_OK,
665			    ep) == -1)
666				goto rollback;
667
668			RB_TEST(8, "adddrives", ep)
669		}
670	}
671
672	RB_TEST(9, "adddrives", ep)
673
674out:
675	/*
676	 * Notify rpc.mdcommd on all nodes of a nodelist change.
677	 * Send reinit command to mdcommd which forces it to get
678	 * fresh set description.
679	 */
680	if (suspendall_flag) {
681		/* Send reinit */
682		nd = sd->sd_nodelist;
683		/* All nodes are guaranteed to be ALIVE */
684		while (nd) {
685			/* Class is ignored for REINIT */
686			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
687			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
688				if (rval == 0)
689					(void) mdstealerror(ep, &xep);
690				rval = -1;
691				mde_perror(ep, dgettext(TEXT_DOMAIN,
692				    "Unable to reinit rpc.mdcommd.\n"));
693			}
694			nd = nd->nd_next;
695		}
696	}
697	/*
698	 * Unlock diskset by resuming messages across the diskset.
699	 * Just resume all classes so that resume is the same whether
700	 * just one class was locked or all classes were locked.
701	 */
702	if ((suspend1_flag) || (suspendall_flag)) {
703		nd = sd->sd_nodelist;
704		/* All nodes are guaranteed to be ALIVE */
705		while (nd) {
706			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
707			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
708				if (rval == 0)
709					(void) mdstealerror(ep, &xep);
710				rval = -1;
711				mde_perror(ep, dgettext(TEXT_DOMAIN,
712				    "Unable to resume rpc.mdcommd.\n"));
713			}
714			nd = nd->nd_next;
715		}
716		meta_ping_mnset(sp->setno);
717	}
718
719	if (lock_flag) {
720		cl_sk = cl_get_setkey(sp->setno, sp->setname);
721		if (MD_MNSET_DESC(sd)) {
722			nd = sd->sd_nodelist;
723			/* All nodes are guaranteed to be ALIVE */
724			while (nd) {
725				if (clnt_unlock_set(nd->nd_nodename,
726				    cl_sk, &xep)) {
727					if (rval == 0)
728						(void) mdstealerror(ep, &xep);
729					rval = -1;
730				}
731				nd = nd->nd_next;
732			}
733		} else {
734			for (i = 0; i < MD_MAXSIDES; i++) {
735				/* Skip empty slots */
736				if (sd->sd_nodes[i][0] == '\0')
737					continue;
738
739				if (clnt_unlock_set(sd->sd_nodes[i],
740				    cl_sk, &xep)) {
741					if (rval == 0)
742						(void) mdstealerror(ep, &xep);
743					rval = -1;
744				}
745			}
746		}
747		cl_set_setkey(NULL);
748	}
749
750	metafreedrivedesc(&dd);
751
752	if (flush_set_onerr) {
753		metaflushsetname(sp);
754		if (!(MD_MNSET_DESC(sd))) {
755			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
756		}
757	}
758
759	if (MD_MNSET_DESC(sd)) {
760		/* release signals back to what they were on entry */
761		if (procsigs(FALSE, &oldsigs, &xep) < 0)
762			mdclrerror(&xep);
763	}
764
765	return (rval);
766
767rollback:
768	/* all signals already blocked for MN disket */
769	if (!(MD_MNSET_DESC(sd))) {
770		/* Make sure we are blocking all signals */
771		if (procsigs(TRUE, &oldsigs, &xep) < 0)
772			mdclrerror(&xep);
773	}
774
775	rval = -1;
776
777	max_genid = sd->sd_genid;
778
779	/* level 3 */
780	if (rb_level > 2) {
781		/*
782		 * Since the add drive operation is failing, need
783		 * to reset config back to the way it was
784		 * before the add drive opration.
785		 * If a MN diskset and this is the first drive being added,
786		 * then reset master on all ALIVE nodes (which is all nodes)
787		 * since the master would have not been set previously.
788		 * Don't reset master on this node, since this
789		 * is done later.
790		 * This is ok to fail since next node to add first
791		 * disk to diskset will also set the master on all nodes.
792		 *
793		 * Also, if this is the first drive being added,
794		 * need to have each node withdraw itself from the set.
795		 */
796		if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
797			nd = sd->sd_nodelist;
798			/* All nodes are guaranteed to be ALIVE */
799			while (nd) {
800				/*
801				 * Be careful with ordering in case of
802				 * panic between the steps and the
803				 * effect on recovery during reconfig.
804				 */
805				if (clnt_withdrawset(nd->nd_nodename, sp, &xep))
806					mdclrerror(&xep);
807
808				/* Sets withdraw flag on all nodes in list */
809				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
810				    sd->sd_nodelist, MD_NR_WITHDRAW,
811				    NULL, &xep)) {
812					mdclrerror(&xep);
813				}
814
815				/* Skip this node */
816				if (nd->nd_nodeid ==
817				    sd->sd_mn_mynode->nd_nodeid) {
818					nd = nd->nd_next;
819					continue;
820				}
821				/* Reset master on all of the other nodes. */
822				if (clnt_mnsetmaster(nd->nd_nodename, sp,
823				    "", MD_MN_INVALID_NID, &xep))
824					mdclrerror(&xep);
825				nd = nd->nd_next;
826			}
827		}
828	}
829
830	/*
831	 * Send resume command to mdcommd.  Don't send reinit command
832	 * since nodelist should not have changed.
833	 * If suspendall_flag is set, then user would have been adding
834	 * first drives to set.  Since this failed, there is certainly
835	 * no reinit message to send to rpc.commd since no nodes will
836	 * be joined to set at the end of this metaset command.
837	 */
838	if (suspendall_flag) {
839		/* Send resume */
840		nd = sd->sd_nodelist;
841		/* All nodes are guaranteed to be ALIVE */
842		while (nd) {
843			/*
844			 * Resume all classes but class 1 so that lock is held
845			 * against meta* commands.
846			 * To later resume class1, must issue a class0 resume.
847			 */
848			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
849			    sp, MD_MSG_CLASS0,
850			    MD_MSCF_DONT_RESUME_CLASS1, &xep)) {
851				mde_perror(&xep, dgettext(TEXT_DOMAIN,
852				    "Unable to resume rpc.mdcommd.\n"));
853				mdclrerror(&xep);
854			}
855			nd = nd->nd_next;
856		}
857		meta_ping_mnset(sp->setno);
858	}
859
860	/* level 3 */
861	if (rb_level > 2) {
862		mdnamelist_t	*nlp;
863		mdname_t	*np;
864
865		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
866			uint_t	rep_slice;
867
868			if ((meta_replicaslice(ddp->dd_dnp,
869			    &rep_slice, &xep) != 0) ||
870			    ((np = metaslicename(ddp->dd_dnp, rep_slice,
871				&xep)) == NULL)) {
872				mdclrerror(&xep);
873				continue;
874			}
875			nlp = NULL;
876			(void) metanamelist_append(&nlp, np);
877
878			if (meta_db_detach(sp, nlp,
879			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, &xep))
880				mdclrerror(&xep);
881
882			metafreenamelist(nlp);
883		}
884
885		/* Re-balance */
886		if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1)
887			mdclrerror(&xep);
888
889		/* Only if we are adding the first drive */
890		/* Handled MN diskset above. */
891		if ((curdd == NULL) && !(MD_MNSET_DESC(sd))) {
892			if (clnt_stimeout(mynode(), sp, &defmhiargs,
893			    &xep) == -1)
894				mdclrerror(&xep);
895
896			/* This is needed because of a corner case */
897			if (halt_set(sp, &xep))
898				mdclrerror(&xep);
899		}
900		max_genid++;
901	}
902
903	/* level 2 */
904	if (rb_level > 1) {
905		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
906			if (rel_own_bydd(sp, dd, TRUE, &xep))
907				mdclrerror(&xep);
908		}
909	}
910
911	/* level 1 */
912	if (rb_level > 0) {
913		if (MD_MNSET_DESC(sd)) {
914			nd = sd->sd_nodelist;
915			/* All nodes are guaranteed to be ALIVE */
916			while (nd) {
917				if (clnt_deldrvs(nd->nd_nodename, sp, dd,
918				    &xep) == -1)
919					mdclrerror(&xep);
920				nd = nd->nd_next;
921			}
922		} else {
923			for (i = 0; i < MD_MAXSIDES; i++) {
924				/* Skip empty slots */
925				if (sd->sd_nodes[i][0] == '\0')
926					continue;
927
928				if (clnt_deldrvs(sd->sd_nodes[i], sp, dd,
929				    &xep) == -1)
930					mdclrerror(&xep);
931			}
932		}
933		max_genid += 2;
934		resync_genid(sp, sd, max_genid, 0, NULL);
935	}
936
937	if ((suspend1_flag) || (suspendall_flag)) {
938		/* Send resume */
939		nd = sd->sd_nodelist;
940		/* All nodes are guaranteed to be ALIVE */
941		while (nd) {
942			/*
943			 * Just resume all classes so that resume is the
944			 * same whether just one class was locked or all
945			 * classes were locked.
946			 */
947			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
948			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
949				mdclrerror(&xep);
950			}
951			nd = nd->nd_next;
952		}
953		meta_ping_mnset(sp->setno);
954	}
955
956	/* level 0 */
957	cl_sk = cl_get_setkey(sp->setno, sp->setname);
958	/* Don't test lock flag since guaranteed to be set if in rollback */
959	if (MD_MNSET_DESC(sd)) {
960		/*
961		 * Since the add drive operation is failing, need
962		 * to reset config back to the way it was
963		 * before the add drive opration.
964		 * If a MN diskset and this is the first drive being
965		 * added, then reset master on this node since
966		 * the master would have not been set previously.
967		 * This is ok to fail since next node to add first
968		 * disk to diskset will also set the master on all nodes.
969		 */
970		if (curdd == NULL) {
971			/* Reset master on mynode */
972			if (clnt_mnsetmaster(mynode(), sp, "",
973			    MD_MN_INVALID_NID, &xep))
974				mdclrerror(&xep);
975		}
976		nd = sd->sd_nodelist;
977		/* All nodes are guaranteed to be ALIVE */
978		while (nd) {
979			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
980				mdclrerror(&xep);
981			nd = nd->nd_next;
982		}
983	} else {
984		for (i = 0; i < MD_MAXSIDES; i++) {
985			/* Skip empty slots */
986			if (sd->sd_nodes[i][0] == '\0')
987				continue;
988
989			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
990				mdclrerror(&xep);
991		}
992	}
993	cl_set_setkey(NULL);
994
995	/* release signals back to what they were on entry */
996	if (procsigs(FALSE, &oldsigs, &xep) < 0)
997		mdclrerror(&xep);
998
999	metafreedrivedesc(&dd);
1000
1001	if (flush_set_onerr) {
1002		metaflushsetname(sp);
1003		if (!(MD_MNSET_DESC(sd))) {
1004			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1005		}
1006	}
1007
1008	return (rval);
1009}
1010
1011/*
1012 * Add drives routine used during import of a diskset.
1013 */
1014int
1015meta_imp_set_adddrives(
1016	mdsetname_t		*sp,
1017	mddrivenamelist_t	*dnlp,
1018	md_im_set_desc_t	*misp,
1019	md_error_t		*ep
1020)
1021{
1022	md_set_desc		*sd;
1023	mddrivenamelist_t	*p;
1024	md_drive_desc		*dd = NULL, *ddp;
1025	int			flush_set_onerr = 0;
1026	md_timeval32_t		now;
1027	ulong_t			genid;
1028	mhd_mhiargs_t		mhiargs;
1029	md_im_replica_info_t	*mirp;
1030	md_im_drive_info_t	*midp;
1031	int			rval = 0;
1032	sigset_t		oldsigs;
1033	ulong_t			max_genid = 0;
1034	int			rb_level = 0;
1035	md_error_t		xep = mdnullerror;
1036
1037	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1038		return (-1);
1039
1040	for (p = dnlp; p != NULL; p = p->next) {
1041		int		imp_flag = 0;
1042
1043		/*
1044		 * If we have a partial diskset, meta_make_sidenmlist will
1045		 * need information from midp to complete making the
1046		 * side name structure.
1047		 */
1048		if (misp->mis_partial) {
1049			imp_flag = MDDB_C_IMPORT;
1050			for (midp = misp->mis_drives; midp != NULL;
1051			    midp = midp->mid_next) {
1052				if (midp->mid_dnp == p->drivenamep)
1053					break;
1054			}
1055			if (midp == NULL) {
1056				(void) mddserror(ep, MDE_DS_SETNOTIMP,
1057				    MD_SET_BAD, mynode(), NULL, sp->setname);
1058				rval = -1;
1059				goto out;
1060			}
1061		}
1062		/*
1063		 * Create the names for the drives we are adding per side.
1064		 */
1065		if (meta_make_sidenmlist(sp, p->drivenamep, imp_flag,
1066		    midp, ep) == -1) {
1067			rval = -1;
1068			goto out;
1069		}
1070	}
1071
1072	/*
1073	 * Get the list of drives descriptors that we are adding.
1074	 */
1075	dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep);
1076
1077	if (! mdisok(ep)) {
1078		rval = -1;
1079		goto out;
1080	}
1081
1082	/*
1083	 * Get the set timeout information.
1084	 */
1085	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
1086	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
1087		rval = -1;
1088		goto out;
1089	}
1090
1091	/*
1092	 * Get timestamp and generation id for new records
1093	 */
1094	now = sd->sd_ctime;
1095	genid = sd->sd_genid;
1096
1097	/* At this point, in case of error, set should be flushed. */
1098	flush_set_onerr = 1;
1099
1100	rb_level = 1;   /* level 1 */
1101
1102	for (midp = misp->mis_drives; midp != NULL; midp = midp->mid_next) {
1103		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
1104			if (ddp->dd_dnp == midp->mid_dnp) {
1105				/* same disk */
1106				ddp->dd_dnp->devid =
1107				    devid_str_encode(midp->mid_devid,
1108				    midp->mid_minor_name);
1109
1110				ddp->dd_dbcnt = 0;
1111				mirp = midp->mid_replicas;
1112				if (mirp) {
1113					ddp->dd_dbsize = mirp->mir_length;
1114					for (; mirp != NULL;
1115					    mirp = mirp->mir_next) {
1116						ddp->dd_dbcnt++;
1117					}
1118				}
1119				if ((midp->mid_available &
1120				    MD_IM_DISK_NOT_AVAILABLE) &&
1121				    (misp->mis_flags & MD_IM_SET_REPLICATED)) {
1122					ddp->dd_flags = MD_DR_UNRSLV_REPLICATED;
1123				}
1124			}
1125		}
1126	}
1127
1128	/*
1129	 * Add the drive records for the drives that we are adding to
1130	 * each host in the set.  Marks the drive records as MD_DR_ADD.
1131	 * May also mark a drive record as MD_DR_UNRSLV_REPLICATED if
1132	 * this flag was set in the dd_flags for that drive.
1133	 */
1134	if (clnt_imp_adddrvs(mynode(), sp, dd, now, genid, ep) == -1)
1135		goto rollback;
1136
1137	rb_level = 2;   /* level 2 */
1138
1139	/*
1140	 * Take ownership of the added drives.
1141	 */
1142	if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep))
1143		goto rollback;
1144
1145out:
1146	metafreedrivedesc(&dd);
1147
1148	if (flush_set_onerr) {
1149		metaflushsetname(sp);
1150	}
1151
1152	return (rval);
1153
1154rollback:
1155	/* Make sure we are blocking all signals */
1156	if (procsigs(TRUE, &oldsigs, &xep) < 0)
1157		mdclrerror(&xep);
1158
1159	rval = -1;
1160
1161	max_genid = sd->sd_genid;
1162
1163	/* level 2 */
1164	if (rb_level > 1) {
1165		if (!MD_ATSET_DESC(sd)) {
1166			if (rel_own_bydd(sp, dd, TRUE, &xep)) {
1167				mdclrerror(&xep);
1168			}
1169		}
1170	}
1171
1172	/* level 1 */
1173	if (rb_level > 0) {
1174		if (clnt_deldrvs(mynode(), sp, dd, &xep) == -1) {
1175			mdclrerror(&xep);
1176		}
1177		max_genid += 2;
1178		resync_genid(sp, sd, max_genid, 0, NULL);
1179	}
1180
1181	/* level 0 */
1182
1183	/* release signals back to what they were on entry */
1184	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1185		mdclrerror(&xep);
1186
1187	metafreedrivedesc(&dd);
1188
1189	if (flush_set_onerr) {
1190		metaflushsetname(sp);
1191		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1192	}
1193
1194	return (rval);
1195}
1196
1197int
1198meta_set_deletedrives(
1199	mdsetname_t		*sp,
1200	mddrivenamelist_t	*dnlp,
1201	int			forceflg,
1202	md_error_t		*ep
1203)
1204{
1205	md_set_desc		*sd;
1206	md_drive_desc		*ddp, *dd = NULL, *curdd = NULL;
1207	md_replicalist_t	*rlp = NULL, *rl;
1208	mddrivenamelist_t	*p;
1209	int			deldrvcnt = 0;
1210	int			rval = 0;
1211	mhd_mhiargs_t		mhiargs;
1212	int			i;
1213	sigset_t		oldsigs;
1214	md_setkey_t		*cl_sk;
1215	ulong_t			max_genid = 0;
1216	int			rb_level = 0;
1217	md_error_t		xep = mdnullerror;
1218	md_mnnode_desc		*nd;
1219	int			has_set;
1220	int			current_drv_cnt = 0;
1221	int			suspendall_flag = 0, suspendall_flag_rb = 0;
1222	int			suspend1_flag = 0;
1223	int			lock_flag = 0;
1224	bool_t			stale_bool = FALSE;
1225	int			flush_set_onerr = 0;
1226	mdnamelist_t		*nlp;
1227	mdname_t		*np;
1228
1229	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1230		return (-1);
1231
1232	/* Make sure we own the set */
1233	if (meta_check_ownership(sp, ep) != 0)
1234		return (-1);
1235
1236	if (drvsuniq(sp, dnlp, ep) == -1)
1237		return (-1);
1238
1239	/*
1240	 * Check and see if all the nodes have the set.
1241	 *
1242	 * The drive and node records are stored in the local mddbs of each
1243	 * node in the diskset.  Each node's rpc.metad daemon reads in the set,
1244	 * drive and node records from that node's local mddb and caches them
1245	 * internally. Any process needing diskset information contacts its
1246	 * local rpc.metad to get this information.  Since each node in the
1247	 * diskset is independently reading the set information from its local
1248	 * mddb, the set, drive and node records in the local mddbs must stay
1249	 * in-sync, so that all nodes have a consistent view of the diskset.
1250	 *
1251	 * For a multinode diskset, explicitly verify that all nodes in the
1252	 * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
1253	 * fail this operation since all nodes must be ALIVE in order to delete
1254	 * a drive record from their local mddb.  If a panic of this node
1255	 * leaves the local mddbs set, node and drive records out-of-sync, the
1256	 * reconfig cycle will fix the local mddbs and force them back into
1257	 * synchronization.
1258	 */
1259	if (MD_MNSET_DESC(sd)) {
1260		nd = sd->sd_nodelist;
1261		while (nd) {
1262			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1263				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1264					sp->setno,
1265					nd->nd_nodename, NULL, sp->setname);
1266				return (-1);
1267			}
1268			nd = nd->nd_next;
1269		}
1270
1271		/* Make sure we are blocking all signals */
1272		if (procsigs(TRUE, &oldsigs, &xep) < 0)
1273			mdclrerror(&xep);
1274
1275		/*
1276		 * Lock the set on current set members.
1277		 * Set locking done much earlier for MN diskset than for
1278		 * traditional diskset since lock_set and SUSPEND are used
1279		 * to protect against other meta* commands running on the
1280		 * other nodes.
1281		 */
1282		nd = sd->sd_nodelist;
1283		/* All nodes are guaranteed to be ALIVE */
1284		while (nd) {
1285			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
1286				rval = -1;
1287				goto out;
1288			}
1289			lock_flag = 1;
1290			nd = nd->nd_next;
1291		}
1292		/*
1293		 * Lock out other meta* commands by suspending
1294		 * class 1 messages across the diskset.
1295		 */
1296		nd = sd->sd_nodelist;
1297		/* All nodes are guaranteed to be ALIVE */
1298		while (nd) {
1299			if (clnt_mdcommdctl(nd->nd_nodename,
1300			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
1301			    MD_MSCF_NO_FLAGS, ep)) {
1302				rval = -1;
1303				goto out;
1304			}
1305			suspend1_flag = 1;
1306			nd = nd->nd_next;
1307		}
1308
1309		nd = sd->sd_nodelist;
1310		/* All nodes are guaranteed to be ALIVE */
1311		while (nd) {
1312			if (strcmp(nd->nd_nodename, mynode()) == 0) {
1313				nd = nd->nd_next;
1314				continue;
1315			}
1316
1317			has_set = nodehasset(sp, nd->nd_nodename,
1318				    NHS_NSTG_EQ, ep);
1319			if (has_set < 0) {
1320				rval = -1;
1321				goto out;
1322			}
1323
1324			if (! has_set) {
1325				(void) mddserror(ep, MDE_DS_NODENOSET,
1326					sp->setno, nd->nd_nodename,
1327					NULL, sp->setname);
1328				rval = -1;
1329				goto out;
1330			}
1331			nd = nd->nd_next;
1332		}
1333	} else {
1334		for (i = 0; i < MD_MAXSIDES; i++) {
1335			/* Skip empty slots */
1336			if (sd->sd_nodes[i][0] == '\0')
1337				continue;
1338
1339			if (strcmp(sd->sd_nodes[i], mynode()) == 0)
1340				continue;
1341
1342			has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NSTG_EQ,
1343				ep);
1344			if (has_set < 0) {
1345				/*
1346				 * Can directly return since !MN diskset;
1347				 * nothing to unlock.
1348				 */
1349				return (-1);
1350			}
1351
1352			if (! has_set) {
1353				/*
1354				 * Can directly return since !MN diskset;
1355				 * nothing to unlock.
1356				 */
1357				return (mddserror(ep, MDE_DS_NODENOSET,
1358				    sp->setno, sd->sd_nodes[i], NULL,
1359				    sp->setname));
1360			}
1361		}
1362	}
1363
1364	for (p = dnlp; p != NULL; p = p->next) {
1365		int		is_it;
1366		mddrivename_t	*dnp;
1367
1368		dnp = p->drivenamep;
1369
1370		if ((is_it = meta_is_drive_in_thisset(sp, dnp, FALSE, ep))
1371		    == -1) {
1372			rval = -1;
1373			goto out;
1374		}
1375
1376		if (! is_it) {
1377			(void) mddserror(ep, MDE_DS_DRIVENOTINSET, sp->setno,
1378			    NULL, dnp->cname, sp->setname);
1379			rval = -1;
1380			goto out;
1381		}
1382
1383		if ((meta_check_drive_inuse(sp, dnp, FALSE, ep)) == -1) {
1384			rval = -1;
1385			goto out;
1386		}
1387
1388		deldrvcnt++;
1389	}
1390	current_drv_cnt = deldrvcnt;
1391
1392	/*
1393	 * Get drive descriptors for the drives that are currently in the set.
1394	 */
1395	curdd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
1396	if (! mdisok(ep)) {
1397		rval = -1;
1398		goto out;
1399	}
1400
1401	/*
1402	 * Decrement the the delete drive count for each drive currently in the
1403	 * set.
1404	 */
1405	for (ddp = curdd; ddp != NULL; ddp = ddp->dd_next)
1406		deldrvcnt--;
1407
1408	/*
1409	 * If the count of drives we are deleting is equal to the drives in the
1410	 * set, and we haven't specified forceflg, return an error
1411	 */
1412	if (deldrvcnt == 0 && forceflg == FALSE) {
1413		(void) mderror(ep, MDE_FORCE_DEL_ALL_DRV, NULL);
1414		rval = -1;
1415		goto out;
1416	}
1417
1418	/*
1419	 * Get the list of drive descriptors that we are deleting.
1420	 */
1421	dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_DEL, ep);
1422	if (! mdisok(ep)) {
1423		rval = -1;
1424		goto out;
1425	}
1426
1427	/*
1428	 * Get the set timeout information in case we have to roll back.
1429	 */
1430	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
1431	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
1432		rval = -1;
1433		goto out;
1434	}
1435
1436	/* At this point, in case of error, set should be flushed. */
1437	flush_set_onerr = 1;
1438
1439	/* END CHECK CODE */
1440
1441	/* Lock the set on current set members */
1442	if (!(MD_MNSET_DESC(sd))) {
1443		md_rb_sig_handling_on();
1444		for (i = 0; i < MD_MAXSIDES; i++) {
1445			/* Skip empty slots */
1446			if (sd->sd_nodes[i][0] == '\0')
1447				continue;
1448
1449			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1450				rval = -1;
1451				goto out;
1452			}
1453			lock_flag = 1;
1454		}
1455	}
1456
1457	if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) {
1458		mddb_config_t		c;
1459		/*
1460		 * Is current set STALE?
1461		 */
1462		(void) memset(&c, 0, sizeof (c));
1463		c.c_id = 0;
1464		c.c_setno = sp->setno;
1465		if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1466			(void) mdstealerror(ep, &c.c_mde);
1467			rval = -1;
1468			goto out;
1469		}
1470		if (c.c_flags & MDDB_C_STALE) {
1471			stale_bool = TRUE;
1472		}
1473	}
1474
1475	RB_TEST(1, "deletedrives", ep)
1476
1477	RB_PREEMPT;
1478	rb_level = 1;	/* level 1 */
1479
1480	RB_TEST(2, "deletedrives", ep)
1481
1482	/*
1483	 * Mark the drives MD_DR_DEL
1484	 */
1485	if (MD_MNSET_DESC(sd)) {
1486		nd = sd->sd_nodelist;
1487		/* All nodes are guaranteed to be ALIVE */
1488		while (nd) {
1489			if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
1490			    MD_DR_DEL, ep) == -1)
1491				goto rollback;
1492
1493			RB_TEST(3, "deletedrives", ep)
1494			nd = nd->nd_next;
1495		}
1496	} else {
1497		for (i = 0; i < MD_MAXSIDES; i++) {
1498			/* Skip empty slots */
1499			if (sd->sd_nodes[i][0] == '\0')
1500				continue;
1501
1502			if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
1503			    MD_DR_DEL, ep) == -1)
1504				goto rollback;
1505
1506			RB_TEST(3, "deletedrives", ep)
1507		}
1508	}
1509
1510	RB_TEST(4, "deletedrives", ep)
1511
1512	RB_PREEMPT;
1513	rb_level = 2;	/* level 2 */
1514
1515	RB_TEST(5, "deletedrives", ep)
1516
1517	/*
1518	 * Balance the DB's according to the list of existing drives and the
1519	 * list of deleted drives.
1520	 */
1521	if (meta_db_balance(sp, dd, curdd, 0, ep) == -1)
1522		goto rollback;
1523
1524	/*
1525	 * If the drive(s) to be deleted cannot be accessed,
1526	 * they haven't really been deleted yet. Check and delete now
1527	 * if need be.
1528	 */
1529	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) {
1530		nlp = NULL;
1531		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
1532			char	*delete_name;
1533
1534			delete_name = ddp->dd_dnp->cname;
1535
1536			for (rl = rlp; rl != NULL; rl = rl->rl_next) {
1537				char	*cur_name;
1538
1539				cur_name =
1540				    rl->rl_repp->r_namep->drivenamep->cname;
1541
1542				if (strcmp(delete_name, cur_name) == 0) {
1543					/* put it on the delete list */
1544					np = rl->rl_repp->r_namep;
1545					(void) metanamelist_append(&nlp, np);
1546
1547				}
1548			}
1549		}
1550
1551		if (nlp != NULL) {
1552			if (meta_db_detach(sp, nlp,
1553			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL,
1554			    ep) == -1) {
1555				metafreenamelist(nlp);
1556				goto rollback;
1557			}
1558			metafreenamelist(nlp);
1559		}
1560	}
1561
1562	RB_TEST(6, "deletedrives", ep)
1563
1564	RB_PREEMPT;
1565	rb_level = 3;	/* level 3 */
1566
1567	RB_TEST(7, "deletedrives", ep)
1568
1569	/*
1570	 * Cannot suspend set until after meta_db_balance since
1571	 * meta_db_balance uses META_DB_ATTACH/DETACH messages.
1572	 */
1573	if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) {
1574		/*
1575		 * Notify rpc.mdcommd on all nodes of a nodelist change.
1576		 * Start by suspending rpc.mdcommd (which drains it of all
1577		 * messages), then change the nodelist followed by a reinit
1578		 * and resume.
1579		 */
1580		nd = sd->sd_nodelist;
1581		/* All nodes are guaranteed to be ALIVE */
1582		while (nd) {
1583			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
1584			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
1585				rval = -1;
1586				goto out;
1587			}
1588			suspendall_flag = 1;
1589			nd = nd->nd_next;
1590		}
1591	}
1592
1593	/*
1594	 * Remove the drive records for the drives that were deleted from
1595	 * each host in the set.  This removes the record and dr_flags.
1596	 */
1597	if (MD_MNSET_DESC(sd)) {
1598		nd = sd->sd_nodelist;
1599		/* All nodes are guaranteed to be ALIVE */
1600		while (nd) {
1601			if (clnt_deldrvs(nd->nd_nodename, sp, dd, ep) == -1)
1602				goto rollback;
1603
1604			RB_TEST(8, "deletedrives", ep)
1605			nd = nd->nd_next;
1606		}
1607	} else {
1608		for (i = 0; i < MD_MAXSIDES; i++) {
1609			/* Skip empty slots */
1610			if (sd->sd_nodes[i][0] == '\0')
1611				continue;
1612
1613			if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep) == -1)
1614				goto rollback;
1615
1616			RB_TEST(8, "deletedrives", ep)
1617		}
1618	}
1619
1620	RB_TEST(9, "deletedrives", ep)
1621
1622	RB_PREEMPT;
1623	rb_level = 4;	/* level 4 */
1624
1625	RB_TEST(10, "deletedrives", ep)
1626
1627	if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
1628		if (rel_own_bydd(sp, dd, TRUE, ep))
1629			goto rollback;
1630	}
1631
1632	/* If we deleted all the drives, then we need to halt the set. */
1633	if (deldrvcnt == 0) {
1634		RB_TEST(11, "deletedrives", ep)
1635
1636		RB_PREEMPT;
1637		rb_level = 5;	/* level 5 */
1638
1639		RB_TEST(12, "deletedrives", ep)
1640
1641		if (clnt_stimeout(mynode(), sp, &defmhiargs, ep) == -1)
1642			goto rollback;
1643
1644		RB_TEST(13, "deletedrives", ep)
1645
1646		RB_PREEMPT;
1647		rb_level = 6;	/* level 6 */
1648
1649		RB_TEST(14, "deletedrives", ep)
1650
1651		/* Halt MN diskset on all nodes by having node withdraw */
1652		if (MD_MNSET_DESC(sd)) {
1653			nd = sd->sd_nodelist;
1654			/* All nodes are guaranteed to be ALIVE */
1655			while (nd) {
1656				/* Only withdraw nodes that are joined */
1657				if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
1658					nd = nd->nd_next;
1659					continue;
1660				}
1661				/*
1662				 * Going to set locally cached node flags to
1663				 * rollback join so in case of error, the
1664				 * rollback code knows which nodes to re-join.
1665				 */
1666				nd->nd_flags |= MD_MN_NODE_RB_JOIN;
1667
1668				/*
1669				 * Be careful in ordering of following steps
1670				 * so that recovery from a panic between
1671				 * the steps is viable.
1672				 * Only reset master info in rpc.metad -
1673				 * don't reset local cached information
1674				 * which will be used to set master information
1675				 * back in case of failure (rollback).
1676				 */
1677				if (clnt_withdrawset(nd->nd_nodename, sp, ep))
1678					goto rollback;
1679				/* Sets withdraw flag on all nodes in list */
1680				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
1681				    sd->sd_nodelist, MD_NR_WITHDRAW,
1682				    NULL, ep)) {
1683					goto rollback;
1684				}
1685				if (clnt_mnsetmaster(nd->nd_nodename, sp,
1686				    "", MD_MN_INVALID_NID, ep)) {
1687					goto rollback;
1688				}
1689				nd = nd->nd_next;
1690			}
1691		} else {
1692			if (halt_set(sp, ep))
1693				goto rollback;
1694		}
1695
1696		RB_TEST(15, "deletedrives", ep)
1697	}
1698
1699	RB_TEST(16, "deletedrives", ep)
1700
1701out:
1702	/*
1703	 * Notify rpc.mdcommd on all nodes of a nodelist change.
1704	 * Send reinit command to mdcommd which forces it to get
1705	 * fresh set description.
1706	 */
1707	if (suspendall_flag) {
1708		/* Send reinit */
1709		nd = sd->sd_nodelist;
1710		/* All nodes are guaranteed to be ALIVE */
1711		while (nd) {
1712			/* Class is ignored for REINIT */
1713			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
1714			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
1715				if (rval == 0)
1716					(void) mdstealerror(ep, &xep);
1717				rval = -1;
1718				mde_perror(ep, dgettext(TEXT_DOMAIN,
1719				    "Unable to reinit rpc.mdcommd.\n"));
1720			}
1721			nd = nd->nd_next;
1722		}
1723	}
1724
1725	/*
1726	 * Just resume all classes so that resume is the same whether
1727	 * just one class was locked or all classes were locked.
1728	 */
1729	if ((suspend1_flag) || (suspendall_flag)) {
1730		/* Send resume */
1731		nd = sd->sd_nodelist;
1732		/* All nodes are guaranteed to be ALIVE */
1733		while (nd) {
1734			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1735			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
1736				if (rval == 0)
1737					(void) mdstealerror(ep, &xep);
1738				rval = -1;
1739				mde_perror(ep, dgettext(TEXT_DOMAIN,
1740				    "Unable to resume rpc.mdcommd.\n"));
1741			}
1742			nd = nd->nd_next;
1743		}
1744		meta_ping_mnset(sp->setno);
1745	}
1746	if (lock_flag) {
1747		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1748		if (MD_MNSET_DESC(sd)) {
1749			nd = sd->sd_nodelist;
1750			/* All nodes are guaranteed to be ALIVE */
1751			while (nd) {
1752				if (clnt_unlock_set(nd->nd_nodename,
1753				    cl_sk, &xep)) {
1754					if (rval == 0)
1755						(void) mdstealerror(ep, &xep);
1756					rval = -1;
1757				}
1758				nd = nd->nd_next;
1759			}
1760		} else {
1761			for (i = 0; i < MD_MAXSIDES; i++) {
1762				/* Skip empty slots */
1763				if (sd->sd_nodes[i][0] == '\0')
1764					continue;
1765
1766				if (clnt_unlock_set(sd->sd_nodes[i],
1767				    cl_sk, &xep)) {
1768					if (rval == 0)
1769						(void) mdstealerror(ep, &xep);
1770					rval = -1;
1771				}
1772			}
1773		}
1774		cl_set_setkey(NULL);
1775	}
1776
1777	metafreedrivedesc(&dd);
1778
1779	if (flush_set_onerr) {
1780		metaflushsetname(sp);
1781		if (!(MD_MNSET_DESC(sd))) {
1782			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1783		}
1784	}
1785
1786	if (MD_MNSET_DESC(sd)) {
1787		/* release signals back to what they were on entry */
1788		if (procsigs(FALSE, &oldsigs, &xep) < 0)
1789			mdclrerror(&xep);
1790	}
1791
1792	return (rval);
1793
1794rollback:
1795	/* all signals already blocked for MN disket */
1796	if (!(MD_MNSET_DESC(sd))) {
1797		/* Make sure we are blocking all signals */
1798		if (procsigs(TRUE, &oldsigs, &xep) < 0)
1799			mdclrerror(&xep);
1800	}
1801
1802	rval = -1;
1803
1804	max_genid = sd->sd_genid;
1805
1806	/* Set the master on all nodes first thing */
1807	if (rb_level > 5) {
1808		if (MD_MNSET_DESC(sd)) {
1809			nd = sd->sd_nodelist;
1810			/* All nodes are guaranteed to be ALIVE */
1811			while (nd) {
1812				if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
1813					continue;
1814				}
1815				/*
1816				 * Set master on all re-joining nodes to be
1817				 * my cached view of master.
1818				 */
1819				if (clnt_mnsetmaster(nd->nd_nodename, sp,
1820				    sd->sd_mn_master_nodenm,
1821				    sd->sd_mn_master_nodeid, &xep)) {
1822					mdclrerror(&xep);
1823				}
1824			}
1825		}
1826	}
1827
1828	/* level 3 */
1829	if (rb_level > 2) {
1830		md_set_record		*sr;
1831		md_mnset_record		*mnsr;
1832		md_drive_record		*dr;
1833		int			sr_drive_cnt;
1834
1835		/*
1836		 * See if we have to re-add the drives specified.
1837		 */
1838		if (MD_MNSET_DESC(sd)) {
1839			nd = sd->sd_nodelist;
1840			/* All nodes are guaranteed to be ALIVE */
1841			while (nd) {
1842				/*
1843				 * Must get current set record from each
1844				 * node to see what else must be done
1845				 * to recover.
1846				 * Record should be for a multi-node diskset.
1847				 */
1848				if (clnt_mngetset(nd->nd_nodename, sp->setname,
1849				    MD_SET_BAD, &mnsr, &xep) == -1) {
1850					mdclrerror(&xep);
1851					nd = nd->nd_next;
1852					continue;
1853				}
1854
1855				/*
1856				 * If all drives are already there, skip
1857				 * to next node.
1858				 */
1859				sr_drive_cnt = 0;
1860				dr = mnsr->sr_drivechain;
1861				while (dr) {
1862					sr_drive_cnt++;
1863					dr = dr->dr_next;
1864				}
1865				if (sr_drive_cnt == current_drv_cnt) {
1866					free_sr((md_set_record *)mnsr);
1867					nd = nd->nd_next;
1868					continue;
1869				}
1870
1871				/* Readd all drives */
1872				if (clnt_adddrvs(nd->nd_nodename, sp, dd,
1873				    mnsr->sr_ctime, mnsr->sr_genid, &xep) == -1)
1874					mdclrerror(&xep);
1875
1876				free_sr((struct md_set_record *)mnsr);
1877				nd = nd->nd_next;
1878			}
1879		} else {
1880			for (i = 0; i < MD_MAXSIDES; i++) {
1881				/* Skip empty slots */
1882				if (sd->sd_nodes[i][0] == '\0')
1883					continue;
1884
1885				/* Record should be for a non-multi-node set */
1886				if (clnt_getset(sd->sd_nodes[i], sp->setname,
1887				    MD_SET_BAD, &sr, &xep) == -1) {
1888					mdclrerror(&xep);
1889					continue;
1890				}
1891
1892				/*
1893				 * Set record structure was allocated from RPC
1894				 * routine getset so this structure is only of
1895				 * size md_set_record even if the MN flag is
1896				 * set.  So, clear the flag so that the free
1897				 * code doesn't attempt to free a structure
1898				 * the size of md_mnset_record.
1899				 */
1900				if (MD_MNSET_REC(sr)) {
1901					sr->sr_flags &= ~MD_SR_MN;
1902					free_sr(sr);
1903					continue;
1904				}
1905
1906				/* Drive already added, skip to next node */
1907				if (sr->sr_drivechain != NULL) {
1908					free_sr(sr);
1909					continue;
1910				}
1911
1912				if (clnt_adddrvs(sd->sd_nodes[i], sp, dd,
1913				    sr->sr_ctime, sr->sr_genid, &xep) == -1)
1914					mdclrerror(&xep);
1915
1916				free_sr(sr);
1917			}
1918		}
1919		max_genid += 2;
1920	}
1921
1922	/*
1923	 * Notify rpc.mdcommd on all nodes of a nodelist change.
1924	 * At this point in time, don't know which nodes are joined
1925	 * to the set.  So, send a reinit command to mdcommd
1926	 * which forces it to get fresh set description.  Then send resume.
1927	 *
1928	 * Later, this code will use rpc.mdcommd messages to reattach disks
1929	 * and then rpc.mdcommd may be suspended again, rest of the nodes
1930	 * joined, rpc.mdcommd reinited and then resumed.
1931	 */
1932	if (suspendall_flag) {
1933		/* Send reinit */
1934		nd = sd->sd_nodelist;
1935		/* All nodes are guaranteed to be ALIVE */
1936		while (nd) {
1937			/* Class is ignored for REINIT */
1938			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
1939			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
1940				mde_perror(&xep, dgettext(TEXT_DOMAIN,
1941				    "Unable to reinit rpc.mdcommd.\n"));
1942				mdclrerror(&xep);
1943			}
1944			nd = nd->nd_next;
1945		}
1946
1947		/* Send resume */
1948		nd = sd->sd_nodelist;
1949		/* All nodes are guaranteed to be ALIVE */
1950		while (nd) {
1951			/*
1952			 * Resume all classes but class 1 so that lock is held
1953			 * against meta* commands.
1954			 * To later resume class1, must issue a class0 resume.
1955			 */
1956			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1957			    sp, MD_MSG_CLASS0,
1958			    MD_MSCF_DONT_RESUME_CLASS1, &xep)) {
1959				mde_perror(&xep, dgettext(TEXT_DOMAIN,
1960				    "Unable to resume rpc.mdcommd.\n"));
1961				mdclrerror(&xep);
1962			}
1963			nd = nd->nd_next;
1964		}
1965		meta_ping_mnset(sp->setno);
1966	}
1967
1968	/* level 2 */
1969	if (rb_level > 1) {
1970		mdnamelist_t	*nlp;
1971		mdname_t	*np;
1972
1973		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
1974			uint_t	rep_slice;
1975
1976			if ((meta_replicaslice(ddp->dd_dnp,
1977			    &rep_slice, &xep) != 0) ||
1978			    ((np = metaslicename(ddp->dd_dnp, rep_slice,
1979				&xep)) == NULL)) {
1980				mdclrerror(&xep);
1981				continue;
1982			}
1983			nlp = NULL;
1984			(void) metanamelist_append(&nlp, np);
1985
1986			if (meta_db_attach(sp, nlp,
1987			    (MDCHK_DRVINSET | MDCHK_SET_LOCKED),
1988			    &sd->sd_ctime, ddp->dd_dbcnt, ddp->dd_dbsize,
1989			    NULL, &xep) == -1)
1990				mdclrerror(&xep);
1991
1992			metafreenamelist(nlp);
1993		}
1994		/* Re-balance */
1995		if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1)
1996			mdclrerror(&xep);
1997	}
1998
1999	/* level 4 */
2000	if (rb_level > 3) {
2001		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
2002			if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
2003				mdclrerror(&xep);
2004		}
2005	}
2006
2007	/* level 5 */
2008	if (rb_level > 4) {
2009		if (clnt_stimeout(mynode(), sp, &mhiargs, &xep) == -1)
2010			mdclrerror(&xep);
2011	}
2012
2013	/*
2014	 * If at least one node needs to be rejoined to MN diskset,
2015	 * then suspend commd again.
2016	 */
2017	if (MD_MNSET_DESC(sd)) {
2018		nd = sd->sd_nodelist;
2019		/* All nodes are guaranteed to be ALIVE */
2020		while (nd) {
2021			if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
2022				nd = nd->nd_next;
2023				continue;
2024			}
2025			break;
2026		}
2027		if (nd) {
2028			/*
2029			 * Found node that will be rejoined so
2030			 * notify rpc.mdcommd on all nodes of a nodelist change.
2031			 * Start by suspending rpc.mdcommd (which drains it of
2032			 * all messages), then change the nodelist followed by
2033			 * a reinit and resume.
2034			 */
2035			nd = sd->sd_nodelist;
2036			/* All nodes are guaranteed to be ALIVE */
2037			while (nd) {
2038				if (clnt_mdcommdctl(nd->nd_nodename,
2039				    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS0,
2040				    MD_MSCF_NO_FLAGS, &xep)) {
2041					mdclrerror(&xep);
2042				}
2043				suspendall_flag_rb = 1;
2044				nd = nd->nd_next;
2045			}
2046		}
2047	}
2048
2049
2050
2051	/* level 6 */
2052	if (rb_level > 5) {
2053		if (MD_MNSET_DESC(sd)) {
2054			int	join_flags = 0;
2055
2056			nd = sd->sd_nodelist;
2057			/* All nodes are guaranteed to be ALIVE */
2058			while (nd) {
2059				/* Only rejoin nodes that were joined before */
2060				if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
2061					nd = nd->nd_next;
2062					continue;
2063				}
2064				/*
2065				 * Rejoin nodes to same state as before -
2066				 * either STALE or non-STALE.
2067				 */
2068				if (stale_bool == TRUE)
2069					join_flags = MNSET_IS_STALE;
2070				if (clnt_joinset(nd->nd_nodename, sp,
2071				    join_flags, &xep))
2072					mdclrerror(&xep);
2073				/* Sets OWN flag on all nodes in list */
2074				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2075				    sd->sd_nodelist, MD_NR_JOIN, NULL, &xep)) {
2076					mdclrerror(&xep);
2077				}
2078				nd = nd->nd_next;
2079			}
2080		} else {
2081			if (setup_db_bydd(sp, dd, TRUE, &xep) == -1)
2082				mdclrerror(&xep);
2083
2084			/* No special flag for traditional diskset */
2085			if (snarf_set(sp, NULL, &xep))
2086				mdclrerror(&xep);
2087		}
2088	}
2089
2090	/* level 1 */
2091	if (rb_level > 0) {
2092		/*
2093		 * Mark the drives as OK.
2094		 */
2095		if (MD_MNSET_DESC(sd)) {
2096			nd = sd->sd_nodelist;
2097			/* All nodes are guaranteed to be ALIVE */
2098			while (nd) {
2099				/*
2100				 * Must be last action before unlock.
2101				 * In case of panic, recovery code checks
2102				 * for MD_DR_OK to know that drive
2103				 * and possible master are fully added back.
2104				 */
2105				if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
2106				    MD_DR_OK, &xep) == -1)
2107					mdclrerror(&xep);
2108				nd = nd->nd_next;
2109			}
2110		} else {
2111			for (i = 0; i < MD_MAXSIDES; i++) {
2112				/* Skip empty slots */
2113				if (sd->sd_nodes[i][0] == '\0')
2114					continue;
2115
2116				if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
2117				    MD_DR_OK, &xep) == -1)
2118					mdclrerror(&xep);
2119
2120			}
2121		}
2122		max_genid += 2;
2123		resync_genid(sp, sd, max_genid, 0, NULL);
2124	}
2125	/*
2126	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2127	 * Send a reinit command to mdcommd which forces it to get
2128	 * fresh set description.
2129	 */
2130	if (suspendall_flag_rb) {
2131		/* Send reinit */
2132		nd = sd->sd_nodelist;
2133		/* All nodes are guaranteed to be ALIVE */
2134		while (nd) {
2135			/* Class is ignored for REINIT */
2136			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2137			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2138				mde_perror(&xep, dgettext(TEXT_DOMAIN,
2139				    "Unable to reinit rpc.mdcommd.\n"));
2140				mdclrerror(&xep);
2141			}
2142			nd = nd->nd_next;
2143		}
2144	}
2145
2146	/*
2147	 * Just resume all classes so that resume is the same whether
2148	 * just one class was locked or all classes were locked.
2149	 */
2150	if ((suspend1_flag) || (suspendall_flag_rb) || (suspendall_flag)) {
2151		/* Send resume */
2152		nd = sd->sd_nodelist;
2153		/* All nodes are guaranteed to be ALIVE */
2154		while (nd) {
2155			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2156			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2157				mde_perror(&xep, dgettext(TEXT_DOMAIN,
2158				    "Unable to resume rpc.mdcommd.\n"));
2159				mdclrerror(&xep);
2160			}
2161			nd = nd->nd_next;
2162		}
2163		meta_ping_mnset(sp->setno);
2164	}
2165
2166
2167	/* level 0 */
2168	cl_sk = cl_get_setkey(sp->setno, sp->setname);
2169	/* Don't test lock flag since guaranteed to be set if in rollback */
2170	if (MD_MNSET_DESC(sd)) {
2171		nd = sd->sd_nodelist;
2172		/* All nodes are guaranteed to be ALIVE */
2173		while (nd) {
2174			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
2175				mdclrerror(&xep);
2176			nd = nd->nd_next;
2177		}
2178	} else {
2179		for (i = 0; i < MD_MAXSIDES; i++) {
2180			/* Skip empty slots */
2181			if (sd->sd_nodes[i][0] == '\0')
2182				continue;
2183
2184			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
2185				mdclrerror(&xep);
2186		}
2187	}
2188	cl_set_setkey(NULL);
2189
2190	/* release signals back to what they were on entry */
2191	if (procsigs(FALSE, &oldsigs, &xep) < 0)
2192		mdclrerror(&xep);
2193
2194	metafreedrivedesc(&dd);
2195
2196	if (flush_set_onerr) {
2197		metaflushsetname(sp);
2198		if (!(MD_MNSET_DESC(sd))) {
2199			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
2200		}
2201	}
2202
2203	return (rval);
2204}
2205