meta_set_hst.c revision 8452:89d32dfdae6e
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
21219019Sgabor
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
26219019Sgabor
/*
 * Just in case we're not in a build environment, make sure that
 * TEXT_DOMAIN gets set to something.
 */
31219019Sgabor#if !defined(TEXT_DOMAIN)
32219019Sgabor#define	TEXT_DOMAIN "SYS_TEST"
33219019Sgabor#endif
34219019Sgabor
/*
 * Metadevice diskset interfaces
 */
38219019Sgabor
39219019Sgabor#include "meta_set_prv.h"
40219019Sgabor#include <meta.h>
41219019Sgabor#include <sys/lvm/md_crc.h>
42219019Sgabor#include <sys/time.h>
43219019Sgabor#include <sdssc.h>
44219019Sgabor
45219019Sgaborstatic int
46219019Sgaboradd_db_sidenms(
47219019Sgabor	mdsetname_t	*sp,
48219019Sgabor	md_error_t	*ep
49219019Sgabor)
50219019Sgabor{
51219019Sgabor	md_replicalist_t	*rlp = NULL;
52219019Sgabor	md_replicalist_t	*rl;
53219019Sgabor	int			rval = 0;
54219019Sgabor
55219019Sgabor	if (metareplicalist(sp, MD_FULLNAME_ONLY, &rlp, ep) < 0)
56219019Sgabor		return (-1);
57219019Sgabor
58219019Sgabor	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
59219019Sgabor		md_replica_t	*r = rl->rl_repp;
60219019Sgabor
61219019Sgabor		/*
62219019Sgabor		 * This is not the first replica being added to the
63219019Sgabor		 * diskset so call with ADDSIDENMS_BCAST.  If this
64219019Sgabor		 * is a traditional diskset, the bcast flag is ignored
65219019Sgabor		 * since traditional disksets don't use the rpc.mdcommd.
66219019Sgabor		 */
67219019Sgabor		if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno,
68219019Sgabor		    DB_ADDSIDENMS_BCAST, ep)) {
69219019Sgabor			rval = -1;
70219019Sgabor			goto out;
71219019Sgabor		}
72219019Sgabor	}
73219019Sgabor
74219019Sgaborout:
75219019Sgabor	metafreereplicalist(rlp);
76219019Sgabor	return (rval);
77219019Sgabor}
78219019Sgabor
79219019Sgaborstatic int
80219019Sgaboradd_drvs_to_hosts(
81219019Sgabor	mdsetname_t	*sp,
82219019Sgabor	int		node_c,
83219019Sgabor	char		**node_v,
84219019Sgabor	md_error_t	*ep
85219019Sgabor)
86219019Sgabor{
87219019Sgabor	int		i;
88219019Sgabor	md_set_desc	*sd;
89219019Sgabor	md_drive_desc	*dd;
90219019Sgabor	md_timeval32_t	now;
91219019Sgabor	ulong_t		genid;
92219019Sgabor
93219019Sgabor	if ((sd = metaget_setdesc(sp, ep)) == NULL)
94219019Sgabor		return (-1);
95219019Sgabor
96219019Sgabor	if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL) {
97219019Sgabor		if (! mdisok(ep))
98219019Sgabor			return (-1);
99219019Sgabor		return (0);
100219019Sgabor	}
101219019Sgabor
102219019Sgabor	now = sd->sd_ctime;
103219019Sgabor	genid = sd->sd_genid - 1;
104219019Sgabor
105219019Sgabor	for (i = 0; i < node_c; i++) {
106219019Sgabor		if (clnt_adddrvs(node_v[i], sp, dd, now, genid, ep) == -1)
107219019Sgabor			return (-1);
108219019Sgabor	}
109219019Sgabor
110219019Sgabor	return (0);
111219019Sgabor}
112219019Sgabor
113219019Sgaborstatic int
114219019Sgaboradd_md_sidenms(mdsetname_t *sp, side_t sideno, side_t otherside, md_error_t *ep)
115219019Sgabor{
116219019Sgabor	mdnm_params_t	nm;
117219019Sgabor	char		*cname, *dname;
118219019Sgabor	side_t		tmp_sideno;
119219019Sgabor	minor_t		mnum;
120219019Sgabor	int		done, i;
121219019Sgabor	int		rval = 0;
122219019Sgabor	md_set_desc	*sd;
123219019Sgabor
124219019Sgabor	(void) memset(&nm, '\0', sizeof (nm));
125219019Sgabor	nm.key   = MD_KEYWILD;
126219019Sgabor
127219019Sgabor	if (!metaislocalset(sp)) {
128219019Sgabor		if ((sd = metaget_setdesc(sp, ep)) == NULL)
129219019Sgabor			return (-1);
130219019Sgabor	}
131219019Sgabor	/* Use rpc.mdcommd to add md side info from all nodes */
132219019Sgabor	if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
133219019Sgabor	    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
134219019Sgabor		md_mn_result_t			*resultp = NULL;
135219019Sgabor		md_mn_msg_meta_md_addside_t	md_as;
136219019Sgabor		int				send_rval;
137219019Sgabor
138219019Sgabor		md_as.msg_sideno = sideno;
139219019Sgabor		md_as.msg_otherside = otherside;
140219019Sgabor		/*
141219019Sgabor		 * If reconfig cycle has been started, this node is stuck in
142219019Sgabor		 * in the return step until this command has completed.  If
143219019Sgabor		 * mdcommd is suspended, ask send_message to fail (instead of
144219019Sgabor		 * retrying) so that metaset can finish allowing the
145219019Sgabor		 * reconfig cycle to proceed.
146219019Sgabor		 */
147219019Sgabor		send_rval = mdmn_send_message(sp->setno,
148219019Sgabor		    MD_MN_MSG_META_MD_ADDSIDE,
149219019Sgabor		    MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT,
150219019Sgabor		    0, (char *)&md_as, sizeof (md_mn_msg_meta_md_addside_t),
151219019Sgabor		    &resultp, ep);
152219019Sgabor		if (send_rval != 0) {
153219019Sgabor			(void) mdstealerror(ep, &(resultp->mmr_ep));
154219019Sgabor			if (resultp)
155219019Sgabor				free_result(resultp);
156219019Sgabor			return (-1);
157219019Sgabor		}
158219019Sgabor		if (resultp)
159219019Sgabor			free_result(resultp);
160219019Sgabor		return (0);
161219019Sgabor	} else {
162219019Sgabor		/*CONSTCOND*/
163219019Sgabor		while (1) {
164219019Sgabor			char	*drvnm = NULL;
165219019Sgabor
166219019Sgabor			nm.mde   = mdnullerror;
167219019Sgabor			nm.setno = sp->setno;
168219019Sgabor			nm.side  = otherside;
169219019Sgabor			if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0)
170219019Sgabor				return (mdstealerror(ep, &nm.mde));
171219019Sgabor
172219019Sgabor			if (nm.key == MD_KEYWILD)
173219019Sgabor				return (0);
174219019Sgabor
175219019Sgabor			/*
176219019Sgabor			 * Okay we have a valid key
177219019Sgabor			 * Let's see if it is hsp or not
178219019Sgabor			 */
179219019Sgabor			nm.devname = (uintptr_t)meta_getnmentbykey(sp->setno,
180219019Sgabor			    otherside, nm.key, &drvnm, NULL, NULL, ep);
181219019Sgabor			if (nm.devname == NULL || drvnm == NULL) {
182219019Sgabor				if (nm.devname)
183219019Sgabor					Free((void *)(uintptr_t)nm.devname);
184219019Sgabor				if (drvnm)
185219019Sgabor					Free((void *)(uintptr_t)drvnm);
186219019Sgabor				return (-1);
187219019Sgabor			}
188219019Sgabor
189219019Sgabor			/*
190219019Sgabor			 * If it is hsp add here
191219019Sgabor			 */
192219019Sgabor			if (strcmp(drvnm, MD_HOTSPARES) == 0) {
193219019Sgabor				if (add_name(sp, sideno, nm.key, MD_HOTSPARES,
194219019Sgabor				    minor(NODEV), (char *)(uintptr_t)nm.devname,
195219019Sgabor				    NULL, NULL, ep) == -1) {
196219019Sgabor					Free((void *)(uintptr_t)nm.devname);
197219019Sgabor					Free((void *)(uintptr_t)drvnm);
198219019Sgabor					return (-1);
199219019Sgabor				} else {
200219019Sgabor					Free((void *)(uintptr_t)nm.devname);
201219019Sgabor					Free((void *)(uintptr_t)drvnm);
202219019Sgabor					continue;
203219019Sgabor				}
204219019Sgabor			}
205219019Sgabor
206219019Sgabor			nm.side = sideno;
207219019Sgabor			if (MD_MNSET_DESC(sd)) {
208219019Sgabor				tmp_sideno = sideno;
209219019Sgabor			} else {
210219019Sgabor				tmp_sideno = sideno - 1;
211219019Sgabor			}
212219019Sgabor
213219019Sgabor			if ((done = meta_getnextside_devinfo(sp,
214219019Sgabor			    (char *)(uintptr_t)nm.devname, &tmp_sideno,
215219019Sgabor			    &cname, &dname, &mnum, ep)) == -1) {
216219019Sgabor				Free((void *)(uintptr_t)nm.devname);
217219019Sgabor				return (-1);
218219019Sgabor			}
219219019Sgabor
220219019Sgabor			assert(done == 1);
221219019Sgabor			Free((void *)(uintptr_t)nm.devname);
222219019Sgabor			Free((void *)(uintptr_t)drvnm);
223219019Sgabor
224219019Sgabor			/*
225219019Sgabor			 * The device reference count can be greater than 1 if
226219019Sgabor			 * more than one softpart is configured on top of the
227219019Sgabor			 * same device.  If this is the case then we want to
228219019Sgabor			 * increment the count to sync up with the other sides.
229219019Sgabor			 */
230219019Sgabor			for (i = 0; i < nm.ref_count; i++) {
231219019Sgabor				if (add_name(sp, sideno, nm.key, dname, mnum,
232219019Sgabor				    cname, NULL, NULL, ep) == -1)
233219019Sgabor					rval = -1;
234219019Sgabor			}
235219019Sgabor
236219019Sgabor			Free(cname);
237219019Sgabor			Free(dname);
238219019Sgabor
239219019Sgabor			if (rval != 0)
240219019Sgabor				return (rval);
241219019Sgabor		}
242219019Sgabor	}
243219019Sgabor
244219019Sgabor	/*NOTREACHED*/
245219019Sgabor}
246219019Sgabor
247219019Sgaborstatic int
248219019Sgaborcheck_setdrvs_againstnode(mdsetname_t *sp, char *node, md_error_t *ep)
249219019Sgabor{
250219019Sgabor	mddrivename_t	*dp;
251219019Sgabor	md_drive_desc	*dd, *ddp;
252219019Sgabor
253219019Sgabor	if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL)
254219019Sgabor		if (! mdisok(ep))
255219019Sgabor			return (-1);
256219019Sgabor
257219019Sgabor	for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
258219019Sgabor		dp = ddp->dd_dnp;
259219019Sgabor
260219019Sgabor		if (checkdrive_onnode(sp, dp, node, ep))
261219019Sgabor			return (-1);
262219019Sgabor	}
263219019Sgabor
264219019Sgabor	return (0);
265219019Sgabor}
266219019Sgabor
267219019Sgaborstatic int
268219019Sgaborcreate_multinode_set_on_hosts(
269219019Sgabor	mdsetname_t	*sp,
270219019Sgabor	int		node_c,		/* Number of new nodes */
271219019Sgabor	char		**node_v,	/* Nodes which are being added */
272219019Sgabor	int		new_set,
273219019Sgabor	md_error_t	*ep
274219019Sgabor)
275219019Sgabor{
276219019Sgabor	int				i;
277219019Sgabor	md_set_desc			*sd;
278219019Sgabor	md_timeval32_t			now;
279219019Sgabor	ulong_t				genid;
280219019Sgabor	int				rval = 0;
281219019Sgabor	md_mnnode_desc			*nd, *ndm = NULL;
282219019Sgabor	md_mnnode_desc			*nd_prev, *nd_curr;
283219019Sgabor	int				nodecnt;
284219019Sgabor	mndiskset_membershiplist_t	*nl, *nl2;
285219019Sgabor
286219019Sgabor	if (!new_set) {
287219019Sgabor		if ((sd = metaget_setdesc(sp, ep)) == NULL)
288219019Sgabor			return (-1);
289219019Sgabor		now = sd->sd_ctime;
290219019Sgabor		genid = sd->sd_genid - 1;
291219019Sgabor		if (sd->sd_drvs)
292219019Sgabor			genid--;
293219019Sgabor	} else {
294219019Sgabor		sd = Zalloc(sizeof (*sd));
295219019Sgabor
296219019Sgabor		if (meta_gettimeofday(&now) == -1) {
297219019Sgabor			(void) mdsyserror(ep, errno,
298219019Sgabor			    dgettext(TEXT_DOMAIN, "meta_gettimeofday()"));
299219019Sgabor			rval = -1;
300219019Sgabor			goto out;
301219019Sgabor		}
302219019Sgabor
303219019Sgabor		/* Put the new entries into the set */
304219019Sgabor		/*
305219019Sgabor		 * Get membershiplist from API routine.  If there's
306219019Sgabor		 * an error, fail to create set and pass back error.
307219019Sgabor		 */
308219019Sgabor		if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
309219019Sgabor			rval = -1;
310219019Sgabor			goto out;
311219019Sgabor		}
312219019Sgabor
313219019Sgabor		/*
314219019Sgabor		 * meta_set_addhosts has already verified that
315219019Sgabor		 * this node list is in the membership list
316219019Sgabor		 * so set ALIVE flag.
317219019Sgabor		 * Since this is a new set, all hosts being
318219019Sgabor		 * added are new to the set, so also set ADD flag.
319219019Sgabor		 */
320219019Sgabor		for (i = 0; i < node_c; i++) {
321219019Sgabor			nd = Zalloc(sizeof (*nd));
322219019Sgabor			(void) strcpy(nd->nd_nodename, node_v[i]);
323219019Sgabor			nd->nd_ctime = now;
324219019Sgabor			nd->nd_flags = (MD_MN_NODE_ALIVE |
325219019Sgabor			    MD_MN_NODE_ADD);
326219019Sgabor			nl2 = nl;
327219019Sgabor			while (nl2) {
328219019Sgabor				if (strcmp(nl2->msl_node_name,
329219019Sgabor				    node_v[i]) == 0) {
330219019Sgabor					nd->nd_nodeid = nl2->msl_node_id;
331219019Sgabor					(void) strcpy(nd->nd_priv_ic,
332219019Sgabor					    nl2->msl_node_addr);
333219019Sgabor					break;
334219019Sgabor				}
335219019Sgabor				nl2 = nl2->next;
336219019Sgabor			}
337219019Sgabor
338219019Sgabor			/*
339219019Sgabor			 * Nodelist must be kept in ascending
340219019Sgabor			 * nodeid order.
341219019Sgabor			 */
342219019Sgabor			if (sd->sd_nodelist == NULL) {
343219019Sgabor				/* Nothing in list, just add it */
344219019Sgabor				sd->sd_nodelist = nd;
345219019Sgabor			} else if (nd->nd_nodeid < sd->sd_nodelist->nd_nodeid) {
346219019Sgabor				/* Add to head of list */
347219019Sgabor				nd->nd_next = sd->sd_nodelist;
348219019Sgabor				sd->sd_nodelist = nd;
349219019Sgabor			} else {
350219019Sgabor				nd_curr = sd->sd_nodelist->nd_next;
351219019Sgabor				nd_prev = sd->sd_nodelist;
352219019Sgabor				/* Search for place ot add it */
353219019Sgabor				while (nd_curr) {
354219019Sgabor					if (nd->nd_nodeid <
355219019Sgabor					    nd_curr->nd_nodeid) {
356219019Sgabor						/* Add before nd_curr */
357219019Sgabor						nd->nd_next = nd_curr;
358219019Sgabor						nd_prev->nd_next = nd;
359219019Sgabor						break;
360219019Sgabor					}
361219019Sgabor					nd_prev = nd_curr;
362219019Sgabor					nd_curr = nd_curr->nd_next;
363				}
364				/* Add to end of list */
365				if (nd_curr == NULL) {
366					nd_prev->nd_next = nd;
367				}
368
369			}
370			/* Set master to be first node added */
371			if (ndm == NULL)
372				ndm = nd;
373		}
374
375		meta_free_nodelist(nl);
376		/*
377		 * Creating mnset for first time.
378		 * Set master to be invalid until first drive is
379		 * in set.
380		 */
381		(void) strcpy(sd->sd_mn_master_nodenm, "");
382		sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
383		sd->sd_mn_masternode = ndm;
384		sd->sd_ctime = now;
385		genid = sd->sd_genid = 0;
386	}
387
388	/* Create the set where needed */
389	for (i = 0; i < node_c; i++) {
390		/*
391		 * Create the set on each new node.  If the set already
392		 * exists, then the node list being created on each new node
393		 * is the current node list from before the new nodes
394		 * were added.  If the set doesn't exist, then the node
395		 * list being created on each new node is the entire
396		 * new node list.
397		 */
398		if (clnt_mncreateset(node_v[i], sp, sd->sd_nodelist,
399		    now, genid, sd->sd_mn_master_nodenm,
400		    sd->sd_mn_master_nodeid, ep) == -1) {
401			rval = -1;
402			break;
403		}
404	}
405
406out:
407	if (new_set) {
408		nd = sd->sd_nodelist;
409		while (nd) {
410			sd->sd_nodelist = nd->nd_next;
411			Free(nd);
412			nd = sd->sd_nodelist;
413		}
414		Free(sd);
415	}
416
417	if (rval != 0 || new_set)
418		return (rval);
419
420	/*
421	 * Add the drive records to the new sets
422	 * and names for the new sides.
423	 */
424	return (add_drvs_to_hosts(sp, node_c, node_v, ep));
425}
426
427
428static int
429create_traditional_set_on_hosts(
430	mdsetname_t	*sp,
431	int		node_c,		/* Number of new nodes */
432	char		**node_v,	/* Nodes which are being added */
433	int		new_set,
434	md_error_t	*ep
435)
436{
437	int		i;
438	md_set_desc	*sd;
439	md_timeval32_t	now;
440	ulong_t		genid;
441	int		rval = 0;
442
443	if (!new_set) {
444
445		if ((sd = metaget_setdesc(sp, ep)) == NULL)
446			return (-1);
447		now = sd->sd_ctime;
448
449		genid = sd->sd_genid;
450
451		if (sd->sd_drvs)
452			genid--;
453	} else {
454		if (node_c > MD_MAXSIDES)
455			return (mddserror(ep, MDE_DS_SIDENUMNOTAVAIL,
456			    sp->setno, NULL, NULL, sp->setname));
457
458		sd = Zalloc(sizeof (*sd));
459
460		/* Put the new entries into the set */
461		for (i = 0; i < node_c; i++) {
462			(void) strcpy(sd->sd_nodes[i], node_v[i]);
463		}
464
465		if (meta_gettimeofday(&now) == -1) {
466			(void) mdsyserror(ep, errno, "meta_gettimeofday()");
467			rval = -1;
468			goto out;
469		}
470
471		sd->sd_ctime = now;
472		genid = sd->sd_genid = 0;
473	}
474
475	/* Create the set where needed */
476	for (i = 0; i < node_c; i++) {
477		/*
478		 * Create the set on each new host
479		 */
480		if (clnt_createset(node_v[i], sp, sd->sd_nodes, now, genid,
481		    ep) == -1) {
482			rval = -1;
483			break;
484		}
485	}
486
487out:
488	if (new_set)
489		Free(sd);
490
491	if (rval != 0 || new_set)
492		return (rval);
493
494	/*
495	 * Add the drive records to the new sets
496	 * and names for the new sides.
497	 */
498	return (add_drvs_to_hosts(sp, node_c, node_v, ep));
499}
500
501static int
502create_set_on_hosts(
503	mdsetname_t	*sp,
504	int		multi_node,	/* Multi_node diskset or not? */
505	int		node_c,		/* Number of new nodes */
506	char		**node_v,	/* Nodes which are being added */
507	int		new_set,
508	md_error_t	*ep
509)
510{
511	if (multi_node)
512		return (create_multinode_set_on_hosts(sp, node_c, node_v,
513		    new_set, ep));
514	else
515		return (create_traditional_set_on_hosts(sp, node_c, node_v,
516		    new_set, ep));
517}
518
519static int
520create_set(
521	mdsetname_t	*sp,
522	int		multi_node,	/* Multi-node diskset or not? */
523	int		node_c,
524	char		**node_v,
525	int		auto_take,
526	md_error_t	*ep
527)
528{
529	int		i;
530	int		rval = 0;
531	set_t		max_sets;
532	set_t		setno;
533	int		bool;
534	uint_t		sr_flags;
535	sigset_t	oldsigs;
536	md_setkey_t	*cl_sk;
537	int		rb_level = 0;
538	md_error_t	xep = mdnullerror;
539	rval_e		sdssc_rval;
540	int		lock_flag = 0;
541	int		sig_flag = 0;
542
543	if ((max_sets = get_max_sets(ep)) == 0)
544		return (-1);
545
546	/* We must be a member of the set we are creating */
547	if (! strinlst(mynode(), node_c, node_v))
548		return (mddserror(ep, MDE_DS_SELFNOTIN,
549		    sp->setno, mynode(), NULL, sp->setname));
550
551	/*
552	 * If auto_take then we must be the only member of the set
553	 * that we are creating.
554	 */
555	if (auto_take && node_c > 1)
556		return (mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL,
557		    sp->setname));
558
559	/*
560	 * If we're part of SC3.0 we'll already have allocated the
561	 * set number so we can skip the allocation algorithm used.
562	 * Set number is unique across traditional and MN disksets.
563	 */
564	if ((sdssc_rval = sdssc_get_index(sp->setname, &setno))
565	    == SDSSC_NOT_BOUND) {
566
567		for (i = 0; i < node_c; i++) {
568			int	has_set;
569
570			/* Skip my node */
571			if (strcmp(mynode(), node_v[i]) == 0)
572				continue;
573
574			/*
575			 * Make sure this set name is not used on the
576			 * other hosts
577			 */
578			has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep);
579			if (has_set < 0) {
580				if (! mdiserror(ep, MDE_NO_SET)) {
581					rval = -1;
582					goto out;
583				}
584				mdclrerror(ep);
585				continue;
586			}
587
588			if (has_set) {
589				(void) mddserror(ep, MDE_DS_NODEHASSET,
590				    sp->setno, node_v[i], NULL, sp->setname);
591				rval = -1;
592				goto out;
593			}
594		}
595
596		for (setno = 1; setno < max_sets; setno++) {
597			for (i = 0; i < node_c; i++) {
598				if (clnt_setnumbusy(node_v[i], setno,
599				    &bool, ep) == -1) {
600					rval = -1;
601					goto out;
602				}
603
604				if (bool == TRUE)
605					break;
606			}
607			if (i == node_c)
608				break;
609		}
610	} else if (sdssc_rval != SDSSC_OKAY) {
611		(void) mddserror(ep, MDE_DS_SETNUMNOTAVAIL, MD_SET_BAD, NULL,
612		    NULL, sp->setname);
613		rval = -1;
614		goto out;
615	}
616
617	if (setno == max_sets) {
618		(void) mddserror(ep, MDE_DS_SETNUMNOTAVAIL, MD_SET_BAD, NULL,
619		    NULL, sp->setname);
620		rval = -1;
621		goto out;
622	}
623
624	sp->setno = setno;
625
626	/*
627	 * Lock the set on current set members.
628	 * Set locking done much earlier for MN diskset than for traditional
629	 * diskset since lock_set is used to protect against
630	 * other meta* commands running on the other nodes.
631	 * Don't issue mdcommd SUSPEND command since there is nothing
632	 * to suspend since there currently is no set.
633	 */
634	if (multi_node) {
635		/* Make sure we are blocking all signals */
636		if (procsigs(TRUE, &oldsigs, &xep) < 0)
637			mdclrerror(&xep);
638		sig_flag = 1;
639
640		/* Lock the set on new set members */
641		for (i = 0; i < node_c; i++) {
642			if (clnt_lock_set(node_v[i], sp, ep)) {
643				rval = -1;
644				goto out;
645			}
646			lock_flag = 1;
647		}
648		/* Now have the diskset locked, verify set number is still ok */
649		for (i = 0; i < node_c; i++) {
650			if (clnt_setnumbusy(node_v[i], setno,
651			    &bool, ep) == -1) {
652				rval = -1;
653				goto out;
654			}
655		}
656	}
657
658
659	if (meta_set_checkname(sp->setname, ep)) {
660		rval = -1;
661		goto out;
662	}
663
664	for (i = 0; i < node_c; i++) {
665		if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1) {
666			rval = -1;
667			goto out;
668		}
669		if (bool == FALSE) {
670			(void) mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno,
671			    node_v[i], NULL, sp->setname);
672			rval = -1;
673			goto out;
674		}
675	}
676
677	/* END CHECK CODE */
678
679	/* Lock the set on new set members */
680	if (!multi_node) {
681		md_rb_sig_handling_on();
682		sig_flag = 1;
683		for (i = 0; i < node_c; i++) {
684			if (clnt_lock_set(node_v[i], sp, ep)) {
685				rval = -1;
686				goto out;
687			}
688			lock_flag = 1;
689		}
690	}
691
692	RB_TEST(1, "create_set", ep)
693
694	RB_PREEMPT;
695	rb_level = 1;	/* level 1 */
696
697	RB_TEST(2, "create_set", ep)
698
699	if ((rval = create_set_on_hosts(sp, multi_node, node_c, node_v,
700	    1, ep)) == -1)
701		goto rollback;
702
703	RB_TEST(3, "create_set", ep)
704
705	if (auto_take)
706		sr_flags = MD_SR_OK | MD_SR_AUTO_TAKE;
707	else
708		sr_flags = MD_SR_OK;
709
710	/*
711	 * Mark the set record MD_SR_OK
712	 */
713	for (i = 0; i < node_c; i++)
714		if (clnt_upd_sr_flags(node_v[i], sp, sr_flags, ep))
715			goto rollback;
716
717	rb_level = 2;	/* level 2 */
718
719	/*
720	 * For MN diskset:
721	 * On each added node, set the node record for that node
722	 * to OK.  Then set all node records for the newly added
723	 * nodes on all nodes to ok.
724	 *
725	 * By setting a node's own node record to ok first, even if
726	 * the node adding the hosts panics, the rest of the nodes can
727	 * determine the same node list during the choosing of the master
728	 * during reconfig.  So, only nodes considered for mastership
729	 * are nodes that have both MD_MN_NODE_OK and MD_SR_OK set
730	 * on that node's rpc.metad.  If all nodes have MD_SR_OK set,
731	 * but no node has its own MD_MN_NODE_OK set, then the set will
732	 * be removed during reconfig since a panic occurred during the
733	 * creation of the initial diskset.
734	 */
735
736	if (multi_node) {
737		md_mnnode_desc	*nd, *saved_nd_next;
738		md_set_desc	*sd;
739
740		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
741			goto rollback;
742		}
743
744		for (i = 0; i < node_c; i++) {
745			nd = sd->sd_nodelist;
746			/* All nodes are guaranteed to be ALIVE */
747			while (nd) {
748				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
749					break;
750				nd = nd->nd_next;
751			}
752			/* Something wrong, will pick this up in next loop */
753			if (nd == NULL)
754				continue;
755
756			/* Only changing my local cache of node list */
757			saved_nd_next = nd->nd_next;
758			nd->nd_next = NULL;
759
760			/* Set node record for added host to ok on that host */
761			if (clnt_upd_nr_flags(node_v[i], sp,
762			    nd, MD_NR_OK, NULL, ep)) {
763				nd->nd_next = saved_nd_next;
764				goto rollback;
765			}
766			nd->nd_next = saved_nd_next;
767		}
768
769		/* Now set all node records on all nodes to be ok */
770		nd = sd->sd_nodelist;
771		/* All nodes are guaranteed to be ALIVE */
772		while (nd) {
773			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
774			    sd->sd_nodelist, MD_NR_OK, NULL, ep)) {
775				goto rollback;
776			}
777			nd = nd->nd_next;
778		}
779	}
780
781	RB_TEST(4, "create_set", ep)
782
783out:
784	if ((rval == 0) && multi_node) {
785		/*
786		 * Set successfully created.
787		 * Notify rpc.mdcommd on all nodes of a nodelist change.
788		 * Send reinit command to mdcommd which forces it to get
789		 * fresh set description.  Then send resume.
790		 * Resume on class 0 will resume all classes.
791		 */
792		for (i = 0; i < node_c; i++) {
793			/* Class is ignored for REINIT */
794			if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT,
795			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
796				if (rval == 0)
797					(void) mdstealerror(ep, &xep);
798				rval = -1;
799				mde_perror(ep, dgettext(TEXT_DOMAIN,
800				    "Unable to reinit rpc.mdcommd.\n"));
801			}
802		}
803		for (i = 0; i < node_c; i++) {
804			if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME,
805			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
806				if (rval == 0)
807					(void) mdstealerror(ep, &xep);
808				rval = -1;
809				mde_perror(ep, dgettext(TEXT_DOMAIN,
810				    "Unable to resume rpc.mdcommd.\n"));
811			}
812		}
813		meta_ping_mnset(sp->setno);
814	}
815	if (lock_flag) {
816		cl_sk = cl_get_setkey(sp->setno, sp->setname);
817		for (i = 0; i < node_c; i++) {
818			if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
819				if (rval == 0)
820					(void) mdstealerror(ep, &xep);
821				rval = -1;
822			}
823		}
824		cl_set_setkey(NULL);
825	}
826
827	if (sig_flag) {
828		if (multi_node) {
829			/* release signals back to what they were on entry */
830			if (procsigs(FALSE, &oldsigs, &xep) < 0)
831				mdclrerror(&xep);
832		} else {
833			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
834		}
835	}
836
837	return (rval);
838
839rollback:
840	/* all signals already blocked for MN disket */
841	if (!multi_node) {
842		/* Make sure we are blocking all signals */
843		if (procsigs(TRUE, &oldsigs, &xep) < 0)
844			mdclrerror(&xep);
845	}
846
847	rval = -1;
848
849	/*
850	 * For MN diskset:
851	 * On each added node (which is now each node to be deleted),
852	 * set the node record for that node to DEL.  Then set all
853	 * node records for the newly added (soon to be deleted) nodes
854	 * on all nodes to ok.
855	 *
856	 * By setting a node's own node record to DEL first, even if
857	 * the node doing the rollback panics, the rest of the nodes can
858	 * determine the same node list during the choosing of the master
859	 * during reconfig.
860	 */
861
862	/* level 3 */
863	if ((rb_level > 1) && (multi_node)) {
864		md_mnnode_desc	*nd, *saved_nd_next;
865		md_set_desc	*sd;
866
867		if ((sd = metaget_setdesc(sp, &xep)) == NULL) {
868			mdclrerror(&xep);
869		}
870
871		for (i = 0; i < node_c; i++) {
872			nd = sd->sd_nodelist;
873			/* All nodes are guaranteed to be ALIVE */
874			while (nd) {
875				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
876					break;
877				nd = nd->nd_next;
878			}
879			/* Something wrong, will pick this up in next loop */
880			if (nd == NULL)
881				continue;
882
883			/* Only changing my local cache of node list */
884			saved_nd_next = nd->nd_next;
885			nd->nd_next = NULL;
886
887			/* Set node record for added host to DEL on that host */
888			if (clnt_upd_nr_flags(node_v[i], sp,
889			    nd, MD_NR_DEL, NULL, &xep)) {
890				nd->nd_next = saved_nd_next;
891				mdclrerror(&xep);
892			}
893			nd->nd_next = saved_nd_next;
894		}
895
896		/* Now set all node records on all nodes to be DEL */
897		nd = sd->sd_nodelist;
898		/* All nodes are guaranteed to be ALIVE */
899		while (nd) {
900			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
901			    sd->sd_nodelist, MD_NR_DEL, NULL, &xep)) {
902				mdclrerror(&xep);
903			}
904			nd = nd->nd_next;
905		}
906
907		/* Mark set record on all hosts to be DELETED */
908		for (i = 0; i < node_c; i++) {
909			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, &xep)) {
910				mdclrerror(&xep);
911			}
912		}
913	}
914	/* level 1 */
915	if (rb_level > 0) {
916		for (i = 0; i < node_c; i++) {
917			if (clnt_delset(node_v[i], sp, &xep) == -1)
918				mdclrerror(&xep);
919		}
920	}
921
922	/* level 0 */
923	/* Don't test lock flag since guaranteed to be set if in rollback */
924	cl_sk = cl_get_setkey(sp->setno, sp->setname);
925	for (i = 0; i < node_c; i++) {
926		if (clnt_unlock_set(node_v[i], cl_sk, &xep))
927			mdclrerror(&xep);
928	}
929	cl_set_setkey(NULL);
930
931	/* release signals back to what they were on entry */
932	if (procsigs(FALSE, &oldsigs, &xep) < 0)
933		mdclrerror(&xep);
934
935	if ((sig_flag) && (!multi_node))
936		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
937
938	return (rval);
939}
940
941static int
942del_db_sidenms(
943	mdsetname_t	*sp,
944	side_t		sideno,
945	md_error_t	*ep
946)
947{
948	md_replicalist_t	*rlp = NULL;
949	md_replicalist_t	*rl;
950	int			rval = 0;
951
952	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0)
953		return (-1);
954
955	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
956		md_replica_t	*r = rl->rl_repp;
957
958		if (meta_db_delsidenm(sp, sideno, r->r_namep, r->r_blkno, ep)) {
959			rval = -1;
960			goto out;
961		}
962	}
963
964out:
965	metafreereplicalist(rlp);
966	return (rval);
967}
968
969static int
970del_drvs_from_hosts(
971	mdsetname_t	*sp,
972	md_set_desc	*sd,
973	md_drive_desc	*dd,
974	int		node_c,
975	char		**node_v,
976	int		oha,
977	md_error_t	*ep
978)
979{
980	int 		i;
981	md_mnnode_desc	*nd;
982
983	for (i = 0; i < node_c; i++) {
984		if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
985			/*
986			 * During OHA mode, don't issue RPCs to
987			 * non-alive nodes since there is no reason to
988			 * wait for RPC timeouts.
989			 */
990			nd = sd->sd_nodelist;
991			while (nd) {
992				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
993					break;
994				nd = nd->nd_next;
995			}
996			if (nd == NULL) {
997				return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
998				    sp->setno, nd->nd_nodename,
999				    NULL, sp->setname));
1000			}
1001
1002			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1003				continue;
1004			}
1005			if (clnt_deldrvs(node_v[i], sp, dd, ep)) {
1006				return (-1);
1007			}
1008		} else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1009			/*
1010			 * All nodes should be alive in non-oha mode.
1011			 */
1012			if (clnt_deldrvs(node_v[i], sp, dd, ep)) {
1013				return (-1);
1014			}
1015		} else {
1016			/*
1017			 * For traditional diskset, issue the RPC and
1018			 * ignore RPC failure if in OHA mode.
1019			 */
1020			if (clnt_deldrvs(node_v[i], sp, dd, ep)) {
1021				if (oha == TRUE && mdanyrpcerror(ep)) {
1022					mdclrerror(ep);
1023					continue;
1024				}
1025				return (-1);
1026			}
1027		}
1028	}
1029
1030	return (0);
1031}
1032
1033static int
1034del_host_noset(
1035	mdsetname_t	*sp,
1036	char		**anode,
1037	md_error_t	*ep
1038)
1039{
1040	int		rval = 0;
1041	md_setkey_t	*cl_sk;
1042	md_drive_desc	*dd;
1043	md_error_t	xep = mdnullerror;
1044	md_set_desc	*sd;
1045
1046	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1047		return (-1);
1048
1049	/* Make sure we own the set */
1050	if (meta_check_ownership(sp, ep) != 0)
1051		return (-1);
1052
1053	/* Lock the set on our side */
1054	if (clnt_lock_set(mynode(), sp, ep)) {
1055		rval = -1;
1056		goto out;
1057	}
1058
1059	if (clnt_delhosts(mynode(), sp, 1, anode, ep)) {
1060		rval = -1;
1061		goto out;
1062	}
1063
1064	if (!MD_MNSET_DESC(sd)) {
1065		if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
1066		    ep)) == NULL) {
1067			if (! mdisok(ep)) {
1068				rval = -1;
1069				goto out;
1070			}
1071		}
1072
1073		/* If we have drives */
1074		if (dd != NULL) {
1075			if (clnt_del_drv_sidenms(mynode(), sp, ep)) {
1076				rval = -1;
1077				goto out;
1078			}
1079		}
1080	}
1081
1082out:
1083	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1084	if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1085		if (rval == 0)
1086			(void) mdstealerror(ep, &xep);
1087		rval = -1;
1088	}
1089	cl_set_setkey(NULL);
1090
1091	metaflushsetname(sp);
1092
1093	return (rval);
1094}
1095
1096static int
1097del_md_sidenms(mdsetname_t *sp, side_t sideno, md_error_t *ep)
1098{
1099	mdnm_params_t		nm;
1100	md_set_desc		*sd;
1101	int			i;
1102
1103	if (!metaislocalset(sp)) {
1104		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1105			return (-1);
1106	}
1107	/* Use rpc.mdcommd to add md side info from all nodes */
1108	if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1109	    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1110		md_mn_result_t			*resultp = NULL;
1111		md_mn_msg_meta_md_delside_t	md_ds;
1112		int				send_rval;
1113
1114		md_ds.msg_sideno = sideno;
1115		/*
1116		 * If reconfig cycle has been started, this node is stuck in
1117		 * in the return step until this command has completed.  If
1118		 * mdcommd is suspended, ask send_message to fail (instead of
1119		 * retrying) so that metaset can finish allowing the
1120		 * reconfig cycle to proceed.
1121		 */
1122		send_rval = mdmn_send_message(sp->setno,
1123		    MD_MN_MSG_META_MD_DELSIDE,
1124		    MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT,
1125		    0, (char *)&md_ds, sizeof (md_mn_msg_meta_md_delside_t),
1126		    &resultp, ep);
1127		if (send_rval != 0) {
1128			(void) mdstealerror(ep, &(resultp->mmr_ep));
1129			if (resultp)
1130				free_result(resultp);
1131			return (-1);
1132		}
1133		if (resultp)
1134			free_result(resultp);
1135	} else {
1136		(void) memset(&nm, '\0', sizeof (nm));
1137		nm.key   = MD_KEYWILD;
1138
1139		/*CONSTCOND*/
1140		while (1) {
1141			nm.mde   = mdnullerror;
1142			nm.setno = sp->setno;
1143			nm.side  = MD_SIDEWILD;
1144			if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0)
1145				return (mdstealerror(ep, &nm.mde));
1146
1147			if (nm.key == MD_KEYWILD)
1148				return (0);
1149
1150			/*
1151			 * The device reference count can be greater than 1 if
1152			 * more than one softpart is configured on top of the
1153			 * same device.  If this is the case then we want to
1154			 * decrement the count to zero so the entry can be
1155			 * actually removed.
1156			 */
1157			for (i = 0; i < nm.ref_count; i++) {
1158				if (del_name(sp, sideno, nm.key, ep) == -1)
1159					return (-1);
1160			}
1161		}
1162	}
1163	return (0);
1164}
1165
1166static void
1167recreate_set(
1168	mdsetname_t		*sp,
1169	md_set_desc		*sd
1170)
1171{
1172	int			i;
1173	int			has_set;
1174	md_error_t		xep = mdnullerror;
1175	md_mnnode_desc		*nd;
1176
1177	if (MD_MNSET_DESC(sd)) {
1178		nd = sd->sd_nodelist;
1179		while (nd) {
1180			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1181				nd = nd->nd_next;
1182				continue;
1183			}
1184			has_set = nodehasset(sp, nd->nd_nodename,
1185			    NHS_NST_EQ, &xep);
1186
1187			if (has_set >= 0) {
1188				nd = nd->nd_next;
1189				continue;
1190			}
1191
1192			mdclrerror(&xep);
1193
1194			if (clnt_mncreateset(nd->nd_nodename, sp,
1195			    sd->sd_nodelist,
1196			    sd->sd_ctime, sd->sd_genid,
1197			    sd->sd_mn_master_nodenm,
1198			    sd->sd_mn_master_nodeid, &xep) == -1)
1199				mdclrerror(&xep);
1200			nd = nd->nd_next;
1201		}
1202	} else {
1203		for (i = 0; i < MD_MAXSIDES; i++) {
1204			/* Skip empty slots */
1205			if (sd->sd_nodes[i][0] == '\0')
1206				continue;
1207
1208			has_set = nodehasset(sp, sd->sd_nodes[i],
1209			    NHS_NST_EQ, &xep);
1210
1211			if (has_set >= 0)
1212				continue;
1213
1214			mdclrerror(&xep);
1215
1216			if (clnt_createset(sd->sd_nodes[i], sp, sd->sd_nodes,
1217			    sd->sd_ctime, sd->sd_genid, &xep) == -1)
1218				mdclrerror(&xep);
1219		}
1220	}
1221}
1222
1223/*
1224 * If a MN diskset, set is already locked on all nodes via clnt_lock_set.
1225 */
1226static int
1227del_set_nodrives(
1228	mdsetname_t		*sp,
1229	int			node_c,
1230	char			**node_v,
1231	int			oha,
1232	md_error_t		*ep
1233)
1234{
1235	md_set_desc		*sd;
1236	int			i;
1237	sigset_t		oldsigs;
1238	md_setkey_t		*cl_sk;
1239	int			rb_level = 0;
1240	ulong_t			max_genid = 0;
1241	int			rval = 0;
1242	md_error_t		xep = mdnullerror;
1243	md_mnnode_desc		*nd;
1244	int			delete_end = 1;
1245
1246	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1247		return (-1);
1248
1249	if (MD_MNSET_DESC(sd)) {
1250		/* Make sure we are blocking all signals */
1251		if (procsigs(TRUE, &oldsigs, &xep) < 0)
1252			mdclrerror(&xep);
1253	} else {
1254		md_rb_sig_handling_on();
1255	}
1256
1257	/*
1258	 * Lock the set on current set members for traditional disksets.
1259	 */
1260	if (!(MD_MNSET_DESC(sd))) {
1261		for (i = 0; i < node_c; i++) {
1262			/*
1263			 * For traditional diskset, issue the RPC and
1264			 * ignore RPC failure if in OHA mode.
1265			 */
1266			if (clnt_lock_set(node_v[i], sp, ep)) {
1267				if (oha == TRUE && mdanyrpcerror(ep)) {
1268					mdclrerror(ep);
1269					continue;
1270				}
1271				rval = -1;
1272				goto out;
1273			}
1274		}
1275	}
1276
1277
1278	RB_TEST(1, "deletehosts", ep)
1279
1280	RB_PREEMPT;
1281	rb_level = 1;	/* level 1 */
1282
1283	RB_TEST(2, "deletehosts", ep)
1284
1285	/*
1286	 * Mark the set record MD_SR_DEL
1287	 */
1288	for (i = 0; i < node_c; i++) {
1289
1290		RB_TEST(3, "deletehosts", ep)
1291
1292		if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1293			/*
1294			 * During OHA mode, don't issue RPCs to
1295			 * non-alive nodes since there is no reason to
1296			 * wait for RPC timeouts.
1297			 */
1298			nd = sd->sd_nodelist;
1299			while (nd) {
1300				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
1301					break;
1302				nd = nd->nd_next;
1303			}
1304			if (nd == NULL) {
1305				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1306				    sp->setno, nd->nd_nodename,
1307				    NULL, sp->setname);
1308				goto rollback;
1309			}
1310
1311			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1312				continue;
1313			}
1314
1315			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
1316				goto rollback;
1317			}
1318		} else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1319			/*
1320			 * All nodes should be alive in non-oha mode.
1321			 */
1322			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
1323				goto rollback;
1324			}
1325		} else {
1326			/*
1327			 * For traditional diskset, issue the RPC and
1328			 * ignore RPC failure if in OHA mode.
1329			 */
1330			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
1331				if (oha == TRUE && mdanyrpcerror(ep)) {
1332					mdclrerror(ep);
1333					continue;
1334				}
1335				goto rollback;
1336			}
1337		}
1338
1339		RB_TEST(4, "deletehosts", ep)
1340	}
1341
1342	RB_TEST(5, "deletehosts", ep)
1343
1344	RB_PREEMPT;
1345	rb_level = 2;	/* level 2 */
1346
1347	RB_TEST(6, "deletehosts", ep)
1348
1349	if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR)
1350		if (metad_isautotakebyname(sp->setname))
1351			delete_end = 0;
1352		else
1353			goto rollback;
1354
1355	/* The set is OK to delete, make it so. */
1356	for (i = 0; i < node_c; i++) {
1357
1358		RB_TEST(7, "deletehosts", ep)
1359
1360		if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1361			/*
1362			 * During OHA mode, don't issue RPCs to
1363			 * non-alive nodes since there is no reason to
1364			 * wait for RPC timeouts.
1365			 */
1366			nd = sd->sd_nodelist;
1367			while (nd) {
1368				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
1369					break;
1370				nd = nd->nd_next;
1371			}
1372			if (nd == NULL) {
1373				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1374				    sp->setno, nd->nd_nodename,
1375				    NULL, sp->setname);
1376				goto rollback;
1377			}
1378
1379			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1380				continue;
1381			}
1382
1383			if (clnt_delset(node_v[i], sp, ep) == -1) {
1384				goto rollback;
1385			}
1386		} else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1387			/*
1388			 * All nodes should be alive in non-oha mode.
1389			 */
1390			if (clnt_delset(node_v[i], sp, ep) == -1) {
1391				goto rollback;
1392			}
1393		} else {
1394			/*
1395			 * For traditional diskset, issue the RPC and
1396			 * ignore RPC failure if in OHA mode.
1397			 */
1398			if (clnt_delset(node_v[i], sp, ep) == -1) {
1399				if (oha == TRUE && mdanyrpcerror(ep)) {
1400					mdclrerror(ep);
1401					continue;
1402				}
1403				goto rollback;
1404			}
1405		}
1406
1407		RB_TEST(8, "deletehosts", ep)
1408	}
1409
1410	RB_TEST(9, "deletehosts", ep)
1411
1412out:
1413	/*
1414	 * Unlock the set on current set members
1415	 * for traditional disksets.
1416	 */
1417	if (!(MD_MNSET_DESC(sd))) {
1418		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1419		for (i = 0; i < node_c; i++) {
1420			/*
1421			 * For traditional diskset, issue the RPC and
1422			 * ignore RPC failure if in OHA mode.
1423			 */
1424			if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
1425				if (oha == TRUE && mdanyrpcerror(&xep)) {
1426					mdclrerror(&xep);
1427					continue;
1428				}
1429				if (rval == 0)
1430					(void) mdstealerror(ep, &xep);
1431				rval = -1;
1432			}
1433		}
1434		cl_set_setkey(NULL);
1435	}
1436
1437	/*
1438	 * A MN diskset has the clnt_locks held by meta_set_deletehosts so
1439	 * don't flush that data until meta_set_deletehosts has finished
1440	 * with it.  meta_set_deletehosts will handle the flush of the
1441	 * setname.
1442	 */
1443	if (!(MD_MNSET_DESC(sd))) {
1444		metaflushsetname(sp);
1445	}
1446
1447	if (delete_end &&
1448	    sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
1449		rval = -1;
1450
1451	if (MD_MNSET_DESC(sd)) {
1452		/* release signals back to what they were on entry */
1453		if (procsigs(FALSE, &oldsigs, &xep) < 0)
1454			mdclrerror(&xep);
1455	} else {
1456		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1457	}
1458
1459	return (rval);
1460
1461rollback:
1462	/* all signals already blocked for MN disket */
1463	if (!(MD_MNSET_DESC(sd))) {
1464		/* Make sure we are blocking all signals */
1465		if (procsigs(TRUE, &oldsigs, &xep) < 0)
1466			mdclrerror(&xep);
1467	}
1468
1469	rval = -1;
1470
1471	max_genid = sd->sd_genid;
1472
1473	/* level 2 */
1474	if (rb_level > 1) {
1475		recreate_set(sp, sd);
1476		max_genid++;
1477
1478		if (delete_end)
1479			(void) sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
1480	}
1481
1482	/* level 1 */
1483	if (rb_level > 0) {
1484		max_genid++;
1485		resync_genid(sp, sd, max_genid, node_c, node_v);
1486	}
1487
1488	/* level 0 */
1489	/*
1490	 * Unlock the set on current set members
1491	 * for traditional disksets.
1492	 */
1493	if (!(MD_MNSET_DESC(sd))) {
1494		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1495		for (i = 0; i < node_c; i++) {
1496			/*
1497			 * For traditional diskset, issue the RPC and
1498			 * ignore RPC failure if in OHA mode.
1499			 */
1500			if (clnt_unlock_set(node_v[i], cl_sk, &xep))
1501				mdclrerror(&xep);
1502		}
1503		cl_set_setkey(NULL);
1504	}
1505
1506	/* release signals back to what they were on entry */
1507	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1508		mdclrerror(&xep);
1509
1510	/*
1511	 * A MN diskset has the clnt_locks held by meta_set_deletehosts so
1512	 * don't flush that data until meta_set_deletehosts has finished
1513	 * with it.  meta_set_deletehosts will handle the flush of the
1514	 * setname.
1515	 */
1516	if (!(MD_MNSET_DESC(sd))) {
1517		metaflushsetname(sp);
1518		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1519	}
1520
1521	return (rval);
1522}
1523
1524/*
1525 * On entry:
1526 *   procsigs already called for MN diskset.
1527 *   md_rb_sig_handling already called for traditional diskset.
1528 */
1529static int
1530del_set_on_hosts(
1531	mdsetname_t		*sp,
1532	md_set_desc		*sd,
1533	md_drive_desc		*dd,
1534	int			node_c,		/* Number of nodes */
1535	char			**node_v,	/* Nodes being deleted */
1536	int			oha,
1537	md_error_t		*ep
1538)
1539{
1540	int			i;
1541	int			j;
1542	side_t			sideno;
1543	md_replicalist_t	*rlp = NULL;
1544	sigset_t		oldsigs;
1545	md_setkey_t		*cl_sk;
1546	ulong_t			max_genid = 0;
1547	int			rb_level = 1;	/* This is a special case */
1548	md_error_t		xep = mdnullerror;
1549	md_mnnode_desc		*nd;
1550
1551	RB_PREEMPT;
1552
1553	RB_TEST(7, "deletehosts", ep)
1554
1555	if (dd != NULL) {
1556		/*
1557		 * May need this to re-add sidenames on roll back.
1558		 */
1559		if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
1560		    ep) < 0)
1561			goto rollback;
1562
1563		RB_TEST(8, "deletehosts", ep)
1564
1565		RB_PREEMPT;
1566		rb_level = 2;	/* level 2 */
1567
1568		RB_TEST(9, "deletehosts", ep)
1569
1570		if (del_drvs_from_hosts(sp, sd, dd, node_c, node_v, oha, ep))
1571			goto rollback;
1572
1573		RB_TEST(10, "deletehosts", ep)
1574
1575		RB_PREEMPT;
1576		rb_level = 3;	/* level 3 */
1577
1578		RB_TEST(11, "deletehosts", ep)
1579
1580		/*
1581		 * Delete the db replica sides
1582		 * This is done before the next loop, so that
1583		 * the db does not get unloaded before we are finished
1584		 * deleting the sides.
1585		 */
1586		if (MD_MNSET_DESC(sd)) {
1587			nd = sd->sd_nodelist;
1588			while (nd) {
1589				/* Skip hosts not being deleted */
1590				if (! strinlst(nd->nd_nodename, node_c,
1591				    node_v)) {
1592					nd = nd->nd_next;
1593					continue;
1594				}
1595
1596				if (del_db_sidenms(sp, nd->nd_nodeid, ep))
1597					goto rollback;
1598
1599				RB_TEST(12, "deletehosts", ep)
1600				nd = nd->nd_next;
1601			}
1602		} else {
1603			for (sideno = 0; sideno < MD_MAXSIDES; sideno++) {
1604				/* Skip empty slots */
1605				if (sd->sd_nodes[sideno][0] == '\0')
1606					continue;
1607
1608				/* Skip hosts not being deleted */
1609				if (! strinlst(sd->sd_nodes[sideno], node_c,
1610				    node_v))
1611					continue;
1612
1613				if (del_db_sidenms(sp, sideno, ep))
1614					goto rollback;
1615
1616				RB_TEST(12, "deletehosts", ep)
1617			}
1618		}
1619
1620		RB_TEST(13, "deletehosts", ep)
1621
1622		RB_PREEMPT;
1623		rb_level = 4;	/* level 4 */
1624
1625		RB_TEST(14, "deletehosts", ep)
1626
1627		/* Delete the names from the namespace */
1628		if (MD_MNSET_DESC(sd)) {
1629			nd = sd->sd_nodelist;
1630			while (nd) {
1631				/* Skip hosts not being deleted */
1632				if (! strinlst(nd->nd_nodename, node_c,
1633				    node_v)) {
1634					nd = nd->nd_next;
1635					continue;
1636				}
1637
1638				if (del_md_sidenms(sp, nd->nd_nodeid, ep))
1639					goto rollback;
1640
1641				RB_TEST(15, "deletehosts", ep)
1642				nd = nd->nd_next;
1643			}
1644		} else {
1645			for (sideno = 0; sideno < MD_MAXSIDES; sideno++) {
1646				/* Skip empty slots */
1647				if (sd->sd_nodes[sideno][0] == '\0')
1648					continue;
1649
1650				/* Skip hosts not being deleted */
1651				if (! strinlst(sd->sd_nodes[sideno], node_c,
1652				    node_v))
1653					continue;
1654
1655				if (del_md_sidenms(sp, sideno, ep))
1656					goto rollback;
1657
1658				RB_TEST(15, "deletehosts", ep)
1659			}
1660		}
1661	}
1662
1663	RB_TEST(16, "deletehosts", ep)
1664
1665	RB_PREEMPT;
1666	rb_level = 5;	/* level 6 */
1667
1668	RB_TEST(17, "deletehosts", ep)
1669
1670	for (i = 0; i < node_c; i++) {
1671		if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1672			/*
1673			 * During OHA mode, don't issue RPCs to
1674			 * non-alive nodes since there is no reason to
1675			 * wait for RPC timeouts.
1676			 */
1677			nd = sd->sd_nodelist;
1678			while (nd) {
1679				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
1680					break;
1681				nd = nd->nd_next;
1682			}
1683			if (nd == NULL) {
1684				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1685				    sp->setno, nd->nd_nodename,
1686				    NULL, sp->setname);
1687				goto rollback;
1688			}
1689
1690			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1691				continue;
1692			}
1693
1694			if (clnt_delset(node_v[i], sp, ep) == -1) {
1695				goto rollback;
1696			}
1697		} else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1698			/*
1699			 * All nodes should be alive in non-oha mode.
1700			 */
1701			if (clnt_delset(node_v[i], sp, ep) == -1) {
1702				goto rollback;
1703			}
1704		} else {
1705			/*
1706			 * For traditional diskset, issue the RPC and
1707			 * ignore RPC failure if in OHA mode.
1708			 */
1709			if (clnt_delset(node_v[i], sp, ep) == -1) {
1710				if (oha == TRUE && mdanyrpcerror(ep)) {
1711					mdclrerror(ep);
1712					continue;
1713				}
1714				goto rollback;
1715			}
1716		}
1717
1718		RB_TEST(18, "deletehosts", ep)
1719	}
1720
1721	metafreereplicalist(rlp);
1722
1723	if (MD_MNSET_DESC(sd)) {
1724		/* release signals back to what they were on entry */
1725		if (procsigs(FALSE, &oldsigs, &xep) < 0)
1726			mdclrerror(&xep);
1727	} else {
1728		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1729	}
1730
1731	return (0);
1732
1733rollback:
1734	/* all signals already blocked for MN disket */
1735	if (!(MD_MNSET_DESC(sd))) {
1736		/* Make sure we are blocking all signals */
1737		if (procsigs(TRUE, &oldsigs, &xep) < 0)
1738			mdclrerror(&xep);
1739	}
1740
1741	max_genid = sd->sd_genid;
1742
1743	/* level 5 */
1744	if (rb_level > 4) {
1745		recreate_set(sp, sd);
1746		max_genid++;
1747	}
1748
1749	/* level 2 */
1750	if (rb_level > 1 && dd != NULL) {
1751		/*
1752		 * See if we have to re-add the drives specified.
1753		 */
1754		for (i = 0; i < node_c; i++) {
1755			md_set_record	*sr;
1756
1757			if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1758				/*
1759				 * During OHA mode, don't issue RPCs to
1760				 * non-alive nodes since there is no reason to
1761				 * wait for RPC timeouts.
1762				 */
1763				nd = sd->sd_nodelist;
1764				while (nd) {
1765					if (strcmp(nd->nd_nodename, node_v[i])
1766					    == 0)
1767						break;
1768					nd = nd->nd_next;
1769				}
1770				if (nd == NULL)
1771					continue;
1772
1773				if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
1774					continue;
1775			}
1776
1777			/* Don't care if set record is MN or not */
1778			if (clnt_getset(node_v[i], sp->setname,
1779			    MD_SET_BAD, &sr, &xep) == -1) {
1780				mdclrerror(&xep);
1781				continue;
1782			}
1783
1784			/* Drive already added, skip to next node */
1785			if (sr->sr_drivechain != NULL) {
1786				/*
1787				 * Set record structure was allocated from RPC
1788				 * routine getset so this structure is only of
1789				 * size md_set_record even if the MN flag is
1790				 * set.  So, clear the flag so that the free
1791				 * code doesn't attempt to free a structure
1792				 * the size of md_mnset_record.
1793				 */
1794				sr->sr_flags &= ~MD_SR_MN;
1795				free_sr(sr);
1796				continue;
1797			}
1798
1799			if (clnt_adddrvs(node_v[i], sp, dd,
1800			    sr->sr_ctime, sr->sr_genid, &xep) == -1)
1801				mdclrerror(&xep);
1802
1803			if (clnt_upd_dr_flags(node_v[i], sp, dd,
1804			    MD_DR_OK, &xep) == -1)
1805				mdclrerror(&xep);
1806
1807			/*
1808			 * Set record structure was allocated from RPC routine
1809			 * getset so this structure is only of size
1810			 * md_set_record even if the MN flag is set.  So,
1811			 * clear the flag so that the free code doesn't
1812			 * attempt to free a structure the size of
1813			 * md_mnset_record.
1814			 */
1815			sr->sr_flags &= ~MD_SR_MN;
1816			free_sr(sr);
1817		}
1818		max_genid += 3;
1819	}
1820
1821	/* level 3 */
1822	if (rb_level > 2 && dd != NULL) {
1823		md_replicalist_t	*rl;
1824
1825		for (rl = rlp; rl != NULL; rl = rl->rl_next) {
1826			md_replica_t	*r = rl->rl_repp;
1827
1828			/*
1829			 * This is not the first replica being added to the
1830			 * diskset so call with ADDSIDENMS_BCAST.  If this
1831			 * is a traditional diskset, the bcast flag is ignored
1832			 * since traditional disksets don't use the rpc.mdcommd.
1833			 */
1834			if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno,
1835			    DB_ADDSIDENMS_BCAST, &xep))
1836				mdclrerror(&xep);
1837		}
1838	}
1839
1840	/* level 4 */
1841	if (rb_level > 3 && dd != NULL) {
1842		int	nodeid_addsides = 0;
1843		/*
1844		 * Add the device names for the new sides into the namespace,
1845		 * on all hosts not being deleted.
1846		 */
1847		if (MD_MNSET_DESC(sd)) {
1848			nd = sd->sd_nodelist;
1849			while (nd) {
1850				/* Find a node that is not being deleted */
1851				if (! strinlst(nd->nd_nodename, node_c,
1852				    node_v)) {
1853					nodeid_addsides = nd->nd_nodeid;
1854					break;
1855				}
1856				nd = nd->nd_next;
1857			}
1858		} else {
1859			for (j = 0; j < MD_MAXSIDES; j++) {
1860				/* Skip empty slots */
1861				if (sd->sd_nodes[j][0] == '\0')
1862					continue;
1863
1864				/* Find a node that is not being deleted */
1865				if (! strinlst(sd->sd_nodes[j], node_c,
1866				    node_v))
1867					break;
1868			}
1869			nodeid_addsides = j;
1870		}
1871
1872		if (MD_MNSET_DESC(sd)) {
1873			nd = sd->sd_nodelist;
1874			while (nd) {
1875				/* Skip nodes not being deleted */
1876				if (!strinlst(nd->nd_nodename, node_c,
1877				    node_v)) {
1878					nd = nd->nd_next;
1879					continue;
1880				}
1881
1882				/* this side was just created, add the names */
1883				if (add_md_sidenms(sp, nd->nd_nodeid,
1884				    nodeid_addsides, &xep))
1885					mdclrerror(&xep);
1886				nd = nd->nd_next;
1887			}
1888		} else {
1889			for (i = 0; i < MD_MAXSIDES; i++) {
1890				/* Skip empty slots */
1891				if (sd->sd_nodes[i][0] == '\0')
1892					continue;
1893
1894				/* Skip nodes not being deleted */
1895				if (!strinlst(sd->sd_nodes[i], node_c, node_v))
1896					continue;
1897
1898				/* this side was just created, add the names */
1899				if (add_md_sidenms(sp, i, nodeid_addsides,
1900				    &xep))
1901					mdclrerror(&xep);
1902			}
1903		}
1904	}
1905
1906	/* level 1 */
1907	if (rb_level > 0) {
1908		max_genid++;
1909		resync_genid(sp, sd, max_genid, node_c, node_v);
1910	}
1911
1912	/* level 0 */
1913	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1914	if (MD_MNSET_DESC(sd)) {
1915		nd = sd->sd_nodelist;
1916		while (nd) {
1917			if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
1918				continue;
1919			/* To balance lock/unlock; can send to dead node */
1920			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
1921				mdclrerror(&xep);
1922			nd = nd->nd_next;
1923		}
1924	} else {
1925		for (i = 0; i < MD_MAXSIDES; i++) {
1926			/* Skip empty slots */
1927			if (sd->sd_nodes[i][0] == '\0')
1928				continue;
1929
1930			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
1931				mdclrerror(&xep);
1932		}
1933	}
1934	cl_set_setkey(NULL);
1935
1936	/* release signals back to what they were on entry */
1937	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1938		mdclrerror(&xep);
1939
1940	metafreereplicalist(rlp);
1941
1942	if (!(MD_MNSET_DESC(sd))) {
1943		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1944	}
1945
1946	return (-1);
1947}
1948
1949static int
1950make_sideno_sidenm(
1951	mdsetname_t	*sp,
1952	mddrivename_t	*dnp,
1953	side_t		sideno,
1954	md_error_t	*ep
1955)
1956{
1957	mdsidenames_t	*sn, **sn_next;
1958	md_set_desc	*sd;
1959	mdname_t	*np;
1960	uint_t		rep_slice;
1961	int		err = 0;
1962
1963	assert(dnp->side_names_key != MD_KEYWILD);
1964
1965	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1966		return (-1);
1967
1968	/* find the end of the link list */
1969	for (sn = dnp->side_names; sn->next != NULL; sn = sn->next)
1970		;
1971	sn_next = &sn->next;
1972
1973	if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
1974		return (-1);
1975
1976	if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
1977		return (-1);
1978
1979	sn = Zalloc(sizeof (*sn));
1980	sn->sideno = sideno;
1981
1982	if (MD_MNSET_DESC(sd)) {
1983		/*
1984		 * For MO diskset the sideno is not an index into
1985		 * the array of nodes.  Hence getside_devinfo is
1986		 * used instead of meta_getnextside_devinfo.
1987		 */
1988		if (meta_getside_devinfo(sp, np->bname, sideno, &sn->cname,
1989		    &sn->dname, &sn->mnum, ep) == -1)
1990			err = -1;
1991	} else {
1992		/* decrement sideno, to look like the previous sideno */
1993		sideno--;
1994		if (meta_getnextside_devinfo(sp, np->bname, &sideno,
1995		    &sn->cname, &sn->dname, &sn->mnum, ep) == -1)
1996			err = -1;
1997	}
1998
1999	if (err) {
2000		Free(sn);
2001		return (err);
2002	}
2003	assert(sn->sideno == sideno);
2004
2005	/* Add to the end of the linked list */
2006	*sn_next = sn;
2007	return (0);
2008}
2009
2010static int
2011validate_nodes(
2012	mdsetname_t	*sp,
2013	int		node_c,
2014	char		**node_v,
2015	md_error_t	*ep
2016)
2017{
2018	char		*hostname;
2019	int		i;
2020
2021
2022	for (i = 0; i < node_c; i++) {
2023		if (strlen(node_v[i]) > (size_t)MD_MAX_NODENAME)
2024			return (mddserror(ep, MDE_DS_NODENAMETOOLONG,
2025			    sp->setno, node_v[i], NULL, sp->setname));
2026		if (clnt_hostname(node_v[i], &hostname, ep))
2027			return (-1);
2028		if (strcmp(node_v[i], hostname) != 0) {
2029			Free(hostname);
2030			return (mddserror(ep, MDE_DS_NOTNODENAME, sp->setno,
2031			    node_v[i], NULL, sp->setname));
2032		}
2033		Free(hostname);
2034	}
2035	return (0);
2036}
2037
2038/*
2039 * Exported Entry Points
2040 */
2041
2042/*
2043 * Check the given disk set name for syntactic correctness.
2044 */
2045int
2046meta_set_checkname(char *setname, md_error_t *ep)
2047{
2048	char	*cp;
2049
2050	if (strlen(setname) > (size_t)MD_MAX_SETNAME)
2051		return (mddserror(ep, MDE_DS_SETNAMETOOLONG,
2052		    MD_SET_BAD, NULL, NULL, setname));
2053
2054	for (cp = setname; *cp; cp++)
2055		if (!isprint(*cp) || strchr(INVALID_IN_NAMES, *cp) != NULL)
2056			return (mddserror(ep, MDE_DS_INVALIDSETNAME,
2057			    MD_SET_BAD, NULL, NULL, setname));
2058	return (0);
2059}
2060
2061/*
2062 * Add host(s) to the multi-node diskset provided in sp.
2063 * 	- create set if non-existent.
2064 */
2065static int
2066meta_multinode_set_addhosts(
2067	mdsetname_t	*sp,
2068	int		multi_node,
2069	int		node_c,
2070	char		**node_v,
2071	int		auto_take,
2072	md_error_t	*ep
2073)
2074{
2075	md_set_desc			*sd;
2076	md_drive_desc			*dd, *p;
2077	int				rval = 0;
2078	int				bool;
2079	int				nodeindex;
2080	int 				i;
2081	int				has_set;
2082	sigset_t			oldsigs;
2083	md_setkey_t			*cl_sk;
2084	int				rb_level = 0;
2085	md_error_t			xep = mdnullerror;
2086	md_mnnode_desc			*nd, *nd_curr, *nd_prev;
2087	md_timeval32_t			now;
2088	int				nodecnt;
2089	mndiskset_membershiplist_t	*nl, *nl2;
2090	int				suspendall_flag = 0;
2091	int				suspend1_flag = 0;
2092	int				lock_flag = 0;
2093	int				stale_flag = 0;
2094	md_mnnode_desc			*saved_nd_next;
2095	int				remote_sets_created = 0;
2096
2097	/*
2098	 * Check membershiplist first.  If there's
2099	 * an error, fail to create set and pass back error.
2100	 */
2101	if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
2102		return (-1);
2103	}
2104	/* Verify that all nodes are in member list */
2105	for (i = 0; i < node_c; i++) {
2106		/*
2107		 * If node in list isn't a member of the membership,
2108		 * just return error.
2109		 */
2110		if (meta_is_member(node_v[i], NULL, nl) == 0) {
2111			meta_free_nodelist(nl);
2112			return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
2113			    sp->setno, node_v[i], NULL, sp->setname));
2114		}
2115	}
2116	/*
2117	 * Node list is needed later, but there is a lot of error
2118	 * checking and possible failures between here and there, so
2119	 * just re-get the list later if there are no errors.
2120	 */
2121	meta_free_nodelist(nl);
2122	nl = NULL;
2123
2124	/*
2125	 * Verify that list of nodes being added contains no
2126	 * duplicates.
2127	 */
2128	if (nodesuniq(sp, node_c, node_v, ep))
2129		return (-1);
2130
2131	/*
2132	 * Verify that each node being added thinks that its nodename
2133	 * is the same as the nodename given.
2134	 */
2135	if (validate_nodes(sp, node_c, node_v, ep))
2136		return (-1);
2137
2138	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2139		if (! mdiserror(ep, MDE_NO_SET))
2140			return (-1);
2141		mdclrerror(ep);
2142		return (create_set(sp, multi_node, node_c, node_v, auto_take,
2143		    ep));
2144	} else {
2145		/*
2146		 * If this node and another node were both attempting to
2147		 * create the same setname at the same time, and the other
2148		 * node has just created the set on this node then sd would
2149		 * be non-NULL, but sp->setno would be null (setno is filled
2150		 * in by the create_set). If this is true, then fail since
2151		 * the other node has already won this race.
2152		 */
2153		if (sp->setno == NULL) {
2154			return (mddserror(ep, MDE_DS_NODEINSET,
2155			    NULL, mynode(), NULL, sp->setname));
2156		}
2157	}
2158
2159	/* The auto_take behavior is inconsistent with multiple hosts. */
2160	if (auto_take || sd->sd_flags & MD_SR_AUTO_TAKE) {
2161		(void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL,
2162		    sp->setname);
2163		return (-1);
2164	}
2165
2166	/*
2167	 * We already have the set.
2168	 */
2169
2170	/* Make sure we own the set */
2171	if (meta_check_ownership(sp, ep) != 0)
2172		return (-1);
2173
2174	/*
2175	 * The drive and node records are stored in the local mddbs of each
2176	 * node in the diskset.  Each node's rpc.metad daemon reads in the set,
2177	 * drive and node records from that node's local mddb and caches them
2178	 * internally. Any process needing diskset information contacts its
2179	 * local rpc.metad to get this information.  Since each node in the
2180	 * diskset is independently reading the set information from its local
2181	 * mddb, the set, drive and node records in the local mddbs must stay
2182	 * in-sync, so that all nodes have a consistent view of the diskset.
2183	 *
2184	 * For a multinode diskset, explicitly verify that all nodes in the
2185	 * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
2186	 * fail this operation since all nodes must be ALIVE in order to add
2187	 * the new node record to their local mddb.  If a panic of this node
2188	 * leaves the local mddbs set, node and drive records out-of-sync, the
2189	 * reconfig cycle will fix the local mddbs and force them back into
2190	 * synchronization.
2191	 */
2192	nd = sd->sd_nodelist;
2193	while (nd) {
2194		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2195			return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
2196			    sp->setno, nd->nd_nodename, NULL,
2197			    sp->setname));
2198		}
2199		nd = nd->nd_next;
2200	}
2201
2202	/*
2203	 * Check if node is already in set.
2204	 */
2205	for (i = 0; i < node_c; i++) {
2206		/* Is node already in set? */
2207		nd = sd->sd_nodelist;
2208		while (nd) {
2209			if (strcmp(nd->nd_nodename, node_v[i]) == 0)
2210				break;
2211			nd = nd->nd_next;
2212		}
2213		if (nd) {
2214			return (mddserror(ep, MDE_DS_NODEINSET,
2215			    sp->setno, node_v[i], NULL,
2216			    sp->setname));
2217		}
2218	}
2219
2220	/*
2221	 * Lock the set on current set members.
2222	 * Set locking done much earlier for MN diskset than for traditional
2223	 * diskset since lock_set and SUSPEND are used to protect against
2224	 * other meta* commands running on the other nodes.
2225	 */
2226	/* Make sure we are blocking all signals */
2227	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2228		mdclrerror(&xep);
2229
2230	nd = sd->sd_nodelist;
2231	/* All nodes are guaranteed to be ALIVE */
2232	while (nd) {
2233		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2234			rval = -1;
2235			goto out;
2236		}
2237		lock_flag = 1;
2238		nd = nd->nd_next;
2239	}
2240	/*
2241	 * Lock out other meta* commands by suspending
2242	 * class 1 messages across the diskset.
2243	 */
2244	nd = sd->sd_nodelist;
2245	/* Send suspend to nodes in nodelist before addhosts call */
2246	/* All nodes are guaranteed to be ALIVE */
2247	while (nd) {
2248		if (clnt_mdcommdctl(nd->nd_nodename,
2249		    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
2250		    MD_MSCF_NO_FLAGS, ep)) {
2251			rval = -1;
2252			goto out;
2253		}
2254		suspend1_flag = 1;
2255		nd = nd->nd_next;
2256	}
2257
2258	/* Lock the set on new set members */
2259	for (i = 0; i < node_c; i++) {
2260		/* Already verified to be alive */
2261		if (clnt_lock_set(node_v[i], sp, ep)) {
2262			rval = -1;
2263			goto out;
2264		}
2265		lock_flag = 1;
2266	}
2267
2268	/*
2269	 * Perform the required checks for new hosts
2270	 */
2271	for (i = 0; i < node_c; i++) {
2272		/* Make sure this set name is not used on the other hosts */
2273		has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep);
2274		if (has_set < 0) {
2275			if (! mdiserror(ep, MDE_NO_SET)) {
2276				rval = -1;
2277				goto out;
2278			}
2279			/* Keep on truck'n */
2280			mdclrerror(ep);
2281		} else if (has_set) {
2282			(void) mddserror(ep, MDE_DS_NODEHASSET, sp->setno,
2283			    node_v[i], NULL, sp->setname);
2284			rval = -1;
2285			goto out;
2286		}
2287
2288		if (clnt_setnumbusy(node_v[i], sp->setno, &bool, ep) == -1) {
2289			rval = -1;
2290			goto out;
2291		}
2292
2293		if (bool == TRUE) {
2294			(void) mddserror(ep, MDE_DS_SETNUMBUSY, sp->setno,
2295			    node_v[i], NULL, sp->setname);
2296			rval = -1;
2297			goto out;
2298		}
2299
2300		if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1) {
2301			rval = -1;
2302			goto out;
2303		}
2304
2305		if (bool == FALSE) {
2306			(void) mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno,
2307			    node_v[i], NULL, sp->setname);
2308			rval = -1;
2309			goto out;
2310		}
2311
2312		if (check_setdrvs_againstnode(sp, node_v[i], ep)) {
2313			rval = -1;
2314			goto out;
2315		}
2316	}
2317
2318	/* Get drive descriptors for the set */
2319	if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL) {
2320		if (! mdisok(ep)) {
2321			rval = -1;
2322			goto out;
2323		}
2324	}
2325
2326	/* END CHECK CODE */
2327
2328	RB_TEST(1, "addhosts", ep)
2329
2330	RB_PREEMPT;
2331	rb_level = 1;	/* level 1 */
2332
2333	RB_TEST(2, "addhosts", ep)
2334
2335	/*
2336	 * Create the set where needed
2337	 */
2338	if (create_set_on_hosts(sp, multi_node, node_c, node_v, 0, ep)) {
2339		goto rollback;
2340	}
2341
2342	/*
2343	 * Send suspend to rpc.mdcommd on nodes where a set has been
2344	 * created since rpc.mdcommd must now be running on the remote nodes.
2345	 */
2346	remote_sets_created = 1;
2347	for (i = 0; i < node_c; i++) {
2348		/*
2349		 * Lock out other meta* commands by suspending
2350		 * class 1 messages across the diskset.
2351		 */
2352		if (clnt_mdcommdctl(node_v[i],
2353		    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
2354		    MD_MSCF_NO_FLAGS, ep)) {
2355			rval = -1;
2356			goto rollback;
2357		}
2358	}
2359
2360	/*
2361	 * Merge the new entries into the set with the existing sides.
2362	 * Get membershiplist from API routine.  If there's
2363	 * an error, fail to create set and pass back error.
2364	 */
2365	if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
2366		goto rollback;
2367	}
2368	if (meta_gettimeofday(&now) == -1) {
2369		meta_free_nodelist(nl);
2370		(void) mdsyserror(ep, errno,
2371		    dgettext(TEXT_DOMAIN, "meta_gettimeofday()"));
2372		goto rollback;
2373	}
2374	for (nodeindex = 0; nodeindex < node_c; nodeindex++) {
2375		nd = Zalloc(sizeof (*nd));
2376		(void) strcpy(nd->nd_nodename, node_v[nodeindex]);
2377		nd->nd_ctime = now;
2378		nl2 = nl;
2379		while (nl2) {
2380			if (strcmp(nl2->msl_node_name,
2381			    node_v[nodeindex]) == 0) {
2382				nd->nd_nodeid = nl2->msl_node_id;
2383				(void) strcpy(nd->nd_priv_ic,
2384				    nl2->msl_node_addr);
2385				break;
2386			}
2387			nl2 = nl2->next;
2388		}
2389
2390		/*
2391		 * Nodelist must be kept in ascending nodeid order.
2392		 */
2393		if (sd->sd_nodelist == NULL) {
2394			/* Nothing in list, just add it */
2395			sd->sd_nodelist = nd;
2396		} else if (nd->nd_nodeid <
2397		    sd->sd_nodelist->nd_nodeid) {
2398			/* Add to head of list */
2399			nd->nd_next = sd->sd_nodelist;
2400			sd->sd_nodelist = nd;
2401		} else {
2402			nd_curr = sd->sd_nodelist->nd_next;
2403			nd_prev = sd->sd_nodelist;
2404			/* Search for place to add it */
2405			while (nd_curr) {
2406				if (nd->nd_nodeid < nd_curr->nd_nodeid) {
2407					/* Add before nd_curr */
2408					nd->nd_next = nd_curr;
2409					nd_prev->nd_next = nd;
2410					break;
2411				}
2412				nd_prev = nd_curr;
2413				nd_curr = nd_curr->nd_next;
2414			}
2415			/* Add to end of list */
2416			if (nd_curr == NULL) {
2417				nd_prev->nd_next = nd;
2418			}
2419
2420		}
2421		/* Node already verified to be in membership */
2422		nd->nd_flags |= MD_MN_NODE_ALIVE;
2423	}
2424	meta_free_nodelist(nl);
2425
2426	/* If we have drives */
2427	if (dd != NULL) {
2428		/*
2429		 * For all the hosts being added, create a sidename structure
2430		 */
2431		nd = sd->sd_nodelist;
2432		while (nd) {
2433			/* Skip nodes not being added */
2434			if (!strinlst(nd->nd_nodename, node_c, node_v)) {
2435				nd = nd->nd_next;
2436				continue;
2437			}
2438			for (p = dd; p != NULL; p = p->dd_next) {
2439				if (make_sideno_sidenm(sp, p->dd_dnp,
2440				    nd->nd_nodeid, ep) != 0)
2441					goto rollback;
2442			}
2443			nd = nd->nd_next;
2444		}
2445
2446		RB_PREEMPT;
2447		rb_level = 2;   /* level 2 */
2448
2449		RB_TEST(4, "addhosts", ep)
2450
2451		/*
2452		 * Add the new sidename for each drive to all the hosts
2453		 *
2454		 * If a multi-node diskset, each host only stores
2455		 * the side information for itself.  So, only send
2456		 * side information to the new hosts where each host
2457		 * will add the appropriate side information to its
2458		 * local mddb.
2459		 */
2460		nd = sd->sd_nodelist;
2461		while (nd) {
2462			/* Skip nodes not being added */
2463			if (!strinlst(nd->nd_nodename, node_c,
2464			    node_v)) {
2465				nd = nd->nd_next;
2466				continue;
2467			}
2468
2469			/* Add side info to new hosts */
2470			if (clnt_add_drv_sidenms(nd->nd_nodename,
2471			    mynode(), sp, sd, node_c, node_v, ep))
2472				goto rollback;
2473
2474			nd = nd->nd_next;
2475		}
2476
2477		RB_TEST(5, "addhosts", ep)
2478
2479		RB_PREEMPT;
2480		rb_level = 3;	/* level 3 */
2481
2482		RB_TEST(6, "addhosts", ep)
2483
2484		/*
2485		 * Add the device names for the new sides into the namespace
2486		 * for all hosts being added.  This is adding the side
2487		 * names to the diskset's mddb so add sidenames for all
2488		 * of the new hosts.
2489		 */
2490		nd = sd->sd_nodelist;
2491		while (nd) {
2492			/* Skip nodes not being added */
2493			if (!strinlst(nd->nd_nodename, node_c, node_v)) {
2494				nd = nd->nd_next;
2495				continue;
2496			}
2497
2498			/* this side was just created, add the names */
2499			if (add_md_sidenms(sp, nd->nd_nodeid,
2500			    MD_SIDEWILD, ep))
2501				goto rollback;
2502
2503			nd = nd->nd_next;
2504		}
2505
2506		RB_TEST(7, "addhosts", ep)
2507
2508		RB_PREEMPT;
2509		rb_level = 4;   /* level 4 */
2510
2511		RB_TEST(8, "addhosts", ep)
2512
2513		if (add_db_sidenms(sp, ep))
2514			goto rollback;
2515
2516	} else {
2517		RB_PREEMPT;
2518		rb_level = 4;
2519	}
2520
2521	RB_TEST(9, "addhosts", ep)
2522
2523	RB_PREEMPT;
2524	rb_level = 5;	/* level 5 */
2525
2526	RB_TEST(10, "addhosts", ep)
2527
2528	if (dd != NULL) {
2529		/*
2530		 * Notify rpc.mdcommd on all nodes of a nodelist change.
2531		 * Start by suspending rpc.mdcommd (which drains it of all
2532		 * messages), then change the nodelist followed by a reinit
2533		 * and resume.
2534		 */
2535		nd = sd->sd_nodelist;
2536		/* Send suspend_all to nodes in nodelist (existing + new) */
2537		/* All nodes are guaranteed to be ALIVE */
2538		while (nd) {
2539			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2540			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2541				rval = -1;
2542				goto rollback;
2543			}
2544			suspendall_flag = 1;
2545			nd = nd->nd_next;
2546		}
2547	}
2548
2549	/* Add the node(s) to each host that is currently in the set */
2550	nd = sd->sd_nodelist;
2551	/* All nodes are guaranteed to be ALIVE */
2552	while (nd) {
2553		if (clnt_addhosts(nd->nd_nodename, sp, node_c, node_v, ep)) {
2554			goto rollback;
2555		}
2556		nd = nd->nd_next;
2557	}
2558
2559	RB_TEST(11, "addhosts", ep)
2560
2561	if (dd != NULL) {
2562		/*
2563		 * Mark the drives MD_DR_OK.
2564		 */
2565		nd = sd->sd_nodelist;
2566		/* All nodes are guaranteed to be ALIVE */
2567		while (nd) {
2568			if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
2569			    MD_DR_OK, ep) == -1)
2570				goto rollback;
2571			nd = nd->nd_next;
2572		}
2573	}
2574
2575	RB_TEST(12, "addhosts", ep)
2576
2577	RB_PREEMPT;
2578	rb_level = 6;   /* level 6 */
2579
2580	RB_TEST(13, "addhosts", ep)
2581
2582
2583	/* Add the mediator information to all hosts in the set. */
2584	nd = sd->sd_nodelist;
2585	/* All nodes are guaranteed to be ALIVE */
2586	while (nd) {
2587		if (clnt_updmeds(nd->nd_nodename, sp, &sd->sd_med, ep))
2588			goto rollback;
2589		nd = nd->nd_next;
2590	}
2591
2592	RB_TEST(14, "addhosts", ep)
2593
2594	/*
2595	 * If a MN diskset and there are drives in the set,
2596	 * set the master on the new nodes and
2597	 * automatically join the new nodes into the set.
2598	 */
2599	if (dd != NULL) {
2600		mddb_config_t   c;
2601		/*
2602		 * Is current set STALE?
2603		 */
2604		(void) memset(&c, 0, sizeof (c));
2605		c.c_id = 0;
2606		c.c_setno = sp->setno;
2607		if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
2608			(void) mdstealerror(ep, &c.c_mde);
2609			rval = -1;
2610			goto out;
2611		}
2612		if (c.c_flags & MDDB_C_STALE) {
2613			stale_flag = MNSET_IS_STALE;
2614		}
2615
2616		/* Set master on newly added nodes */
2617		for (i = 0; i < node_c; i++) {
2618			if (clnt_mnsetmaster(node_v[i], sp,
2619			    sd->sd_mn_master_nodenm,
2620			    sd->sd_mn_master_nodeid, ep)) {
2621				goto rollback;
2622			}
2623		}
2624		/* Join newly added nodes to diskset and set OWN flag */
2625		for (i = 0; i < node_c; i++) {
2626			if (clnt_joinset(node_v[i], sp, stale_flag, ep))
2627				goto rollback;
2628			nd = sd->sd_nodelist;
2629			while (nd) {
2630				if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
2631					nd->nd_flags |= MD_MN_NODE_OWN;
2632					/*
2633					 * Also set ADD flag since this flag
2634					 * is already set in rpc.metad - it's
2635					 * just not in the local copy.
2636					 * Could flush local cache and call
2637					 * metaget_setdesc, but this just
2638					 * adds time.  Since this node knows
2639					 * the state of the node flags in
2640					 * rpc.metad, just set the ADD
2641					 * flag and save time.
2642					 */
2643					nd->nd_flags |= MD_MN_NODE_ADD;
2644					break;
2645				}
2646				nd = nd->nd_next;
2647			}
2648		}
2649
2650		/* Send new node flag list to all Owner nodes */
2651		nd = sd->sd_nodelist;
2652		while (nd) {
2653			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2654				nd = nd->nd_next;
2655				continue;
2656			}
2657			/*
2658			 * Will effectively set OWN flag in records kept
2659			 * cached in rpc.metad.  The ADD flag would have
2660			 * already been set by the call to clnt_addhosts.
2661			 */
2662			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2663			    sd->sd_nodelist, MD_NR_SET, NULL, ep)) {
2664				goto rollback;
2665			}
2666			nd = nd->nd_next;
2667		}
2668	}
2669
2670	/*
2671	 * Mark the set record MD_SR_OK
2672	 */
2673	nd = sd->sd_nodelist;
2674	/* All nodes are guaranteed to be ALIVE */
2675	while (nd) {
2676		if (clnt_upd_sr_flags(nd->nd_nodename, sp, MD_SR_OK,
2677		    ep)) {
2678			goto rollback;
2679		}
2680		nd = nd->nd_next;
2681	}
2682
2683	/*
2684	 * For MN diskset:
2685	 * On each newly added node, set the node record for that node
2686	 * to OK.  Then set all node records for the newly added
2687	 * nodes on all nodes to ok.
2688	 *
2689	 * By setting a node's own node record to ok first, even if
2690	 * the node adding the hosts panics, the rest of the nodes can
2691	 * determine the same node list during the choosing of the master
2692	 * during reconfig.  So, only nodes considered for mastership
2693	 * are nodes that have both MD_MN_NODE_OK and MD_SR_OK set
2694	 * on that node's rpc.metad.  If all nodes have MD_SR_OK set,
2695	 * but no node has its own MD_MN_NODE_OK set, then the set will
2696	 * be removed during reconfig since a panic occurred during the
2697	 * creation of the initial diskset.
2698	 */
2699
2700	for (i = 0; i < node_c; i++) {
2701		nd = sd->sd_nodelist;
2702		/* All nodes are guaranteed to be ALIVE */
2703		while (nd) {
2704			if (strcmp(nd->nd_nodename, node_v[i]) == 0)
2705				break;
2706			nd = nd->nd_next;
2707		}
2708		/* Something wrong, will pick this up in next loop */
2709		if (nd == NULL)
2710			continue;
2711
2712		/* Only changing my local cache of node list */
2713		saved_nd_next = nd->nd_next;
2714		nd->nd_next = NULL;
2715
2716		/* Set node record for added host to ok on that host */
2717		if (clnt_upd_nr_flags(node_v[i], sp,
2718		    nd, MD_NR_OK, NULL, ep)) {
2719			nd->nd_next = saved_nd_next;
2720			goto rollback;
2721		}
2722		nd->nd_next = saved_nd_next;
2723	}
2724
2725	/* Now set all node records on all nodes to be ok */
2726	nd = sd->sd_nodelist;
2727	/* All nodes are guaranteed to be ALIVE */
2728	while (nd) {
2729		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2730		    sd->sd_nodelist, MD_NR_OK, NULL, ep)) {
2731			goto rollback;
2732		}
2733		nd = nd->nd_next;
2734	}
2735
2736	RB_TEST(15, "addhosts", ep)
2737out:
2738	/*
2739	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2740	 * Send reinit command to mdcommd which forces it to get
2741	 * fresh set description.  Then send resume.
2742	 * Resume on class 0 will resume all classes, so can skip
2743	 * doing an explicit resume of class1 (ignore suspend1_flag).
2744	 */
2745	if (suspendall_flag) {
2746		/*
2747		 * Don't know if nodelist contains the nodes being added
2748		 * or not, so do reinit to nodes not being added (by skipping
2749		 * any nodes in the nodelist being added) and then do
2750		 * reinit to nodes being added if remote_sets_created is 1.
2751		 */
2752		nd = sd->sd_nodelist;
2753		/* All nodes are guaranteed to be ALIVE */
2754		while (nd) {
2755			/* Skip nodes being added - handled later */
2756			if (strinlst(nd->nd_nodename, node_c, node_v)) {
2757				nd = nd->nd_next;
2758				continue;
2759			}
2760			/* Class is ignored for REINIT */
2761			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2762			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2763				if (rval == 0)
2764					(void) mdstealerror(ep, &xep);
2765				rval = -1;
2766				mde_perror(ep, dgettext(TEXT_DOMAIN,
2767				    "Unable to reinit rpc.mdcommd.\n"));
2768			}
2769			nd = nd->nd_next;
2770		}
2771		/*
2772		 * Send reinit to added nodes that had a set created since
2773		 * rpc.mdcommd is running on the nodes with a set.
2774		 */
2775		if (remote_sets_created == 1) {
2776			for (i = 0; i < node_c; i++) {
2777				if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT,
2778				    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2779					if (rval == 0)
2780						(void) mdstealerror(ep, &xep);
2781					rval = -1;
2782					mde_perror(ep, dgettext(TEXT_DOMAIN,
2783					    "Unable to reinit rpc.mdcommd.\n"));
2784				}
2785			}
2786		}
2787	}
2788	if ((suspend1_flag) || (suspendall_flag)) {
2789		/*
2790		 * Unlock diskset by resuming messages across the diskset.
2791		 * Just resume all classes so that resume is the same whether
2792		 * just one class was locked or all classes were locked.
2793		 *
2794		 * Don't know if nodelist contains the nodes being added
2795		 * or not, so do resume_all to nodes not being added (by
2796		 * skipping any nodes in the nodelist being added) and then do
2797		 * resume_all to nodes being added if remote_sets_created is 1.
2798		 */
2799		nd = sd->sd_nodelist;
2800		/* All nodes are guaranteed to be ALIVE */
2801		while (nd) {
2802			/* Skip nodes being added - handled later */
2803			if (strinlst(nd->nd_nodename, node_c, node_v)) {
2804				nd = nd->nd_next;
2805				continue;
2806			}
2807			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2808			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2809				if (rval == 0)
2810					(void) mdstealerror(ep, &xep);
2811				rval = -1;
2812				mde_perror(ep, dgettext(TEXT_DOMAIN,
2813				    "Unable to resume rpc.mdcommd.\n"));
2814			}
2815			nd = nd->nd_next;
2816		}
2817		/*
2818		 * Send resume to added nodes that had a set created since
2819		 * rpc.mdcommd is running on the nodes with a set.
2820		 */
2821		if (remote_sets_created == 1) {
2822			for (i = 0; i < node_c; i++) {
2823				/* Already verified to be alive */
2824				if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME,
2825				    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS,
2826				    &xep)) {
2827					if (rval == 0)
2828						(void) mdstealerror(ep, &xep);
2829					rval = -1;
2830					mde_perror(ep, dgettext(TEXT_DOMAIN,
2831					    "Unable to resume rpc.mdcommd.\n"));
2832				}
2833			}
2834		}
2835		meta_ping_mnset(sp->setno);
2836		/*
2837		 * Start a resync thread on the newly added nodes
2838		 * if set is not stale. Also start a thread to update the
2839		 * abr state of all soft partitions
2840		 */
2841		if (stale_flag != MNSET_IS_STALE) {
2842			for (i = 0; i < node_c; i++) {
2843				if (clnt_mn_mirror_resync_all(node_v[i],
2844				    sp->setno, &xep)) {
2845					if (rval == 0)
2846						(void) mdstealerror(ep, &xep);
2847					rval = -1;
2848					mde_perror(ep, dgettext(TEXT_DOMAIN,
2849					    "Unable to start resync "
2850					    "thread.\n"));
2851				}
2852				if (clnt_mn_sp_update_abr(node_v[i],
2853				    sp->setno, &xep)) {
2854					if (rval == 0)
2855						(void) mdstealerror(ep, &xep);
2856					rval = -1;
2857					mde_perror(ep, dgettext(TEXT_DOMAIN,
2858					    "Unable to start sp update "
2859					    "thread.\n"));
2860				}
2861			}
2862		}
2863	}
2864	cl_sk = cl_get_setkey(sp->setno, sp->setname);
2865	/*
2866	 * Don't know if nodelist contains the nodes being added
2867	 * or not, so do clnt_unlock_set to nodes not being added (by
2868	 * skipping any nodes in the nodelist being added) and then do
2869	 * clnt_unlock_set to nodes being added.
2870	 */
2871	if (lock_flag) {
2872		nd = sd->sd_nodelist;
2873		/* All nodes are guaranteed to be ALIVE */
2874		while (nd) {
2875			/* Skip hosts we get in the next loop */
2876			if (strinlst(nd->nd_nodename, node_c, node_v)) {
2877				nd = nd->nd_next;
2878				continue;
2879			}
2880			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2881				if (rval == 0)
2882					(void) mdstealerror(ep, &xep);
2883				rval = -1;
2884			}
2885			nd = nd->nd_next;
2886		}
2887		for (i = 0; i < node_c; i++) {
2888			/* Already verified to be alive */
2889			if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
2890				if (rval == 0)
2891					(void) mdstealerror(ep, &xep);
2892				rval = -1;
2893			}
2894		}
2895	}
2896	cl_set_setkey(NULL);
2897
2898	metaflushsetname(sp);
2899
2900	/* release signals back to what they were on entry */
2901	if (procsigs(FALSE, &oldsigs, &xep) < 0)
2902		mdclrerror(&xep);
2903
2904	return (rval);
2905
2906rollback:
2907	rval = -1;
2908
2909	/* level 6 */
2910	if (rb_level > 5) {
2911		/*
2912		 * For each node being deleted, set DEL flag and
2913		 * reset OK flag on that node first.
2914		 * Until a node has turned off its own
2915		 * rpc.metad's NODE_OK flag, that node could be
2916		 * considered for master during a reconfig.
2917		 */
2918		for (i = 0; i < node_c; i++) {
2919			nd = sd->sd_nodelist;
2920			/* All nodes are guaranteed to be ALIVE */
2921			while (nd) {
2922				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
2923					break;
2924				nd = nd->nd_next;
2925			}
2926			/* Something wrong, handle this in next loop */
2927			if (nd == NULL)
2928				continue;
2929
2930			/* Only changing my local cache of node list */
2931			saved_nd_next = nd->nd_next;
2932			nd->nd_next = NULL;
2933
2934			/* Set flags for del host to DEL on that host */
2935			if (clnt_upd_nr_flags(node_v[i], sp,
2936			    nd, MD_NR_DEL, NULL, &xep)) {
2937				mdclrerror(&xep);
2938			}
2939			nd->nd_next = saved_nd_next;
2940		}
2941
2942		for (i = 0; i < node_c; i++) {
2943			if (dd != NULL) {
2944				/* Reset master on newly added node */
2945				if (clnt_mnsetmaster(node_v[i], sp, "",
2946				    MD_MN_INVALID_NID, &xep))
2947					mdclrerror(&xep);
2948				/* Withdraw set on newly added node */
2949				if (clnt_withdrawset(node_v[i], sp, &xep))
2950					mdclrerror(&xep);
2951			}
2952			/*
2953			 * Turn off owner flag in nodes to be deleted
2954			 * if there are drives in the set.
2955			 * Also, turn off NODE_OK and turn on NODE_DEL
2956			 * for nodes to be deleted.
2957			 * These flags are used to set the node
2958			 * record flags in all nodes in the set.
2959			 */
2960			nd = sd->sd_nodelist;
2961			while (nd) {
2962				if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
2963					if (dd != NULL) {
2964						nd->nd_flags &= ~MD_MN_NODE_OWN;
2965					}
2966					nd->nd_flags |= MD_MN_NODE_DEL;
2967					nd->nd_flags &= ~MD_MN_NODE_OK;
2968					break;
2969				}
2970				nd = nd->nd_next;
2971			}
2972		}
2973
2974		/*
2975		 * Now, reset owner and set delete flags for the deleted
2976		 * nodes on all nodes.
2977		 */
2978		nd = sd->sd_nodelist;
2979		while (nd) {
2980			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2981			    sd->sd_nodelist, MD_NR_SET, NULL, &xep)) {
2982				mdclrerror(&xep);
2983			}
2984			nd = nd->nd_next;
2985		}
2986
2987		/*
2988		 * On each node being deleted, set the set record
2989		 * to be in DEL state.
2990		 */
2991		for (i = 0; i < node_c; i++) {
2992			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, &xep)) {
2993				mdclrerror(&xep);
2994			}
2995		}
2996	}
2997
2998	/* level 5 */
2999	if (rb_level > 4) {
3000		nd = sd->sd_nodelist;
3001		/* All nodes are guaranteed to be ALIVE */
3002		while (nd) {
3003			if (clnt_delhosts(nd->nd_nodename, sp, node_c,
3004			    node_v, &xep) == -1)
3005				mdclrerror(&xep);
3006			nd = nd->nd_next;
3007		}
3008	}
3009
3010	/*
3011	 * Notify rpc.mdcommd on all nodes of a nodelist change.
3012	 * Send reinit command to mdcommd which forces it to get
3013	 * fresh set description.  Then send resume.
3014	 * Nodelist contains all nodes (existing + added).
3015	 */
3016	if (suspendall_flag) {
3017		/* Send reinit */
3018		nd = sd->sd_nodelist;
3019		/* All nodes are guaranteed to be ALIVE */
3020		/* Send reinit to nodes in nodelist before addhosts call */
3021		while (nd) {
3022			/*
3023			 * Skip nodes being added if remote sets were not
3024			 * created since rpc.mdcommd may not be running
3025			 * on the remote nodes.
3026			 */
3027			if ((remote_sets_created == 0) &&
3028			    (strinlst(nd->nd_nodename, node_c, node_v))) {
3029				nd = nd->nd_next;
3030				continue;
3031			}
3032			/* Class is ignored for REINIT */
3033			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
3034			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
3035				mde_perror(&xep, dgettext(TEXT_DOMAIN,
3036				    "Unable to reinit rpc.mdcommd.\n"));
3037				mdclrerror(&xep);
3038			}
3039			nd = nd->nd_next;
3040		}
3041
3042		/* Send resume */
3043		nd = sd->sd_nodelist;
3044		/* All nodes are guaranteed to be ALIVE */
3045		while (nd) {
3046			/*
3047			 * Skip nodes being added if remote sets were not
3048			 * created since rpc.mdcommd may not be running
3049			 * on the remote nodes.
3050			 */
3051			if ((remote_sets_created == 0) &&
3052			    (strinlst(nd->nd_nodename, node_c, node_v))) {
3053				nd = nd->nd_next;
3054				continue;
3055			}
3056			/*
3057			 * Resume all classes but class 1 so that lock is held
3058			 * against meta* commands.
3059			 * Send resume_all_but_1 to nodes in nodelist
3060			 * before addhosts call.
3061			 */
3062			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
3063			    sp, MD_MSG_CLASS0, MD_MSCF_DONT_RESUME_CLASS1,
3064			    &xep)) {
3065				mde_perror(&xep, dgettext(TEXT_DOMAIN,
3066				    "Unable to resume rpc.mdcommd.\n"));
3067				mdclrerror(&xep);
3068			}
3069			nd = nd->nd_next;
3070		}
3071		meta_ping_mnset(sp->setno);
3072	}
3073
3074	/* level 4 */
3075	/* Nodelist may or may not contain nodes being added. */
3076	if (rb_level > 3 && dd != NULL) {
3077		nd = sd->sd_nodelist;
3078		while (nd) {
3079			/* Skip nodes not being added */
3080			if (!strinlst(nd->nd_nodename, node_c, node_v)) {
3081				nd = nd->nd_next;
3082				continue;
3083			}
3084
3085			if (del_db_sidenms(sp, nd->nd_nodeid, &xep))
3086				mdclrerror(&xep);
3087			nd = nd->nd_next;
3088		}
3089	}
3090
3091	/* level 3 */
3092	/* Nodelist may or may not contain nodes being added. */
3093	if (rb_level > 2 && dd != NULL) {
3094		nd = sd->sd_nodelist;
3095		while (nd) {
3096			/* Skip nodes not being added */
3097			if (!strinlst(nd->nd_nodename, node_c, node_v)) {
3098				nd = nd->nd_next;
3099				continue;
3100			}
3101
3102			if (del_md_sidenms(sp, nd->nd_nodeid, &xep))
3103				mdclrerror(&xep);
3104			nd = nd->nd_next;
3105		}
3106	}
3107
3108	/* level 1 */
3109	if (rb_level > 0) {
3110		if (dd != NULL) {
3111			/* delete the drive records */
3112			for (i = 0; i < node_c; i++) {
3113				if (clnt_deldrvs(node_v[i], sp, dd, &xep) == -1)
3114					mdclrerror(&xep);
3115			}
3116		}
3117
3118		/* delete the set record */
3119		for (i = 0; i < node_c; i++) {
3120			if (clnt_delset(node_v[i], sp, &xep) == -1)
3121				mdclrerror(&xep);
3122		}
3123	}
3124
3125	/* level 0 */
3126	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3127	/* Don't test lock flag since guaranteed to be set if in rollback */
3128	/* Nodelist may or may not contain nodes being added. */
3129	/*
3130	 * Unlock diskset by resuming messages across the diskset.
3131	 * Just resume all classes so that resume is the same whether
3132	 * just one class was locked or all classes were locked.
3133	 */
3134	if ((suspend1_flag) || (suspendall_flag)) {
3135		/* All nodes are guaranteed to be ALIVE */
3136		nd = sd->sd_nodelist;
3137		while (nd) {
3138			/*
3139			 * Skip nodes being added since remote sets
3140			 * were either created and then deleted or
3141			 * were never created.  Either way - rpc.mdcommd
3142			 * may not be running on the remote node.
3143			 */
3144			if (strinlst(nd->nd_nodename, node_c, node_v)) {
3145				nd = nd->nd_next;
3146				continue;
3147			}
3148			if (clnt_mdcommdctl(nd->nd_nodename,
3149			    COMMDCTL_RESUME, sp, MD_MSG_CLASS0,
3150			    MD_MSCF_NO_FLAGS, &xep)) {
3151				mde_perror(&xep, dgettext(TEXT_DOMAIN,
3152				    "Unable to resume rpc.mdcommd.\n"));
3153				mdclrerror(&xep);
3154			}
3155			nd = nd->nd_next;
3156		}
3157		meta_ping_mnset(sp->setno);
3158	}
3159	nd = sd->sd_nodelist;
3160	/* All nodes are guaranteed to be ALIVE */
3161	while (nd) {
3162		/* Skip hosts we get in the next loop */
3163		if (strinlst(nd->nd_nodename, node_c, node_v)) {
3164			nd = nd->nd_next;
3165			continue;
3166		}
3167
3168		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
3169			mdclrerror(&xep);
3170		nd = nd->nd_next;
3171	}
3172
3173	for (i = 0; i < node_c; i++)
3174		if (clnt_unlock_set(node_v[i], cl_sk, &xep))
3175			mdclrerror(&xep);
3176	cl_set_setkey(NULL);
3177
3178	/* release signals back to what they were on entry */
3179	if (procsigs(FALSE, &oldsigs, &xep) < 0)
3180		mdclrerror(&xep);
3181
3182	metaflushsetname(sp);
3183
3184	return (rval);
3185}
3186
3187/*
3188 * Add host(s) to the traditional diskset provided in sp.
3189 *	- create set if non-existent.
3190 */
3191static int
3192meta_traditional_set_addhosts(
3193	mdsetname_t	*sp,
3194	int		multi_node,
3195	int		node_c,
3196	char		**node_v,
3197	int		auto_take,
3198	md_error_t	*ep
3199)
3200{
3201	md_set_desc	*sd;
3202	md_drive_desc	*dd, *p;
3203	med_rec_t	medr;
3204	med_rec_t	rb_medr;
3205	int		rval = 0;
3206	int		bool;
3207	int		nodeindex;
3208	int 		i;
3209	int		has_set;
3210	int		numsides;
3211	sigset_t	oldsigs;
3212	md_setkey_t	*cl_sk;
3213	int		rb_level = 0;
3214	md_error_t	xep = mdnullerror;
3215	int		max_meds;
3216
3217	if (nodesuniq(sp, node_c, node_v, ep))
3218		return (-1);
3219
3220	if (validate_nodes(sp, node_c, node_v, ep))
3221		return (-1);
3222
3223	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
3224		if (! mdiserror(ep, MDE_NO_SET))
3225			return (-1);
3226		mdclrerror(ep);
3227		return (create_set(sp, multi_node, node_c, node_v, auto_take,
3228		    ep));
3229	}
3230
3231	/* The auto_take behavior is inconsistent with multiple hosts. */
3232	if (auto_take || sd->sd_flags & MD_SR_AUTO_TAKE) {
3233		(void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL,
3234		    sp->setname);
3235		return (-1);
3236	}
3237
3238	/*
3239	 * We already have the set.
3240	 */
3241
3242	/* Make sure we own the set */
3243	if (meta_check_ownership(sp, ep) != 0)
3244		return (-1);
3245
3246	/*
3247	 * Perform the required checks for new hosts
3248	 */
3249	for (i = 0; i < node_c; i++) {
3250		if (getnodeside(node_v[i], sd) != MD_SIDEWILD)
3251			return (mddserror(ep, MDE_DS_NODEINSET, sp->setno,
3252			    node_v[i], NULL, sp->setname));
3253
3254		/* Make sure this set name is not used on the other hosts */
3255		has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep);
3256		if (has_set < 0) {
3257			if (! mdiserror(ep, MDE_NO_SET))
3258				return (-1);
3259			/* Keep on truck'n */
3260			mdclrerror(ep);
3261		} else if (has_set)
3262			return (mddserror(ep, MDE_DS_NODEHASSET, sp->setno,
3263			    node_v[i], NULL, sp->setname));
3264
3265		if (clnt_setnumbusy(node_v[i], sp->setno, &bool, ep) == -1)
3266			return (-1);
3267
3268		if (bool == TRUE)
3269			return (mddserror(ep, MDE_DS_SETNUMBUSY, sp->setno,
3270			    node_v[i], NULL, sp->setname));
3271
3272		if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1)
3273			return (-1);
3274
3275		if (bool == FALSE)
3276			return (mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno,
3277			    node_v[i], NULL, sp->setname));
3278
3279		if (check_setdrvs_againstnode(sp, node_v[i], ep))
3280			return (-1);
3281	}
3282
3283	/* Count the number of occupied slots */
3284	numsides = 0;
3285	for (i = 0; i < MD_MAXSIDES; i++) {
3286		/* Count occupied slots */
3287		if (sd->sd_nodes[i][0] != '\0')
3288			numsides++;
3289	}
3290
3291	/* Make sure we have space to add the new sides */
3292	if ((numsides + node_c) > MD_MAXSIDES) {
3293		(void) mddserror(ep, MDE_DS_SIDENUMNOTAVAIL, sp->setno, NULL,
3294		    NULL, sp->setname);
3295		return (-1);
3296	}
3297
3298	/* Get drive descriptors for the set */
3299	if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL)
3300		if (! mdisok(ep))
3301			return (-1);
3302
3303	/* Setup the mediator record roll-back structure */
3304	(void) memset(&rb_medr, '\0', sizeof (med_rec_t));
3305	rb_medr.med_rec_mag = MED_REC_MAGIC;
3306	rb_medr.med_rec_rev = MED_REC_REV;
3307	rb_medr.med_rec_fl  = 0;
3308	rb_medr.med_rec_sn  = sp->setno;
3309	(void) strcpy(rb_medr.med_rec_snm, sp->setname);
3310	for (i = 0; i < MD_MAXSIDES; i++)
3311		(void) strcpy(rb_medr.med_rec_nodes[i], sd->sd_nodes[i]);
3312	rb_medr.med_rec_meds = sd->sd_med;	/* structure assigment */
3313	(void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
3314	rb_medr.med_rec_foff = 0;
3315	crcgen(&rb_medr, &rb_medr.med_rec_cks, sizeof (med_rec_t), NULL);
3316
3317	if ((max_meds = get_max_meds(ep)) == 0)
3318		return (-1);
3319
3320	/* END CHECK CODE */
3321
3322	md_rb_sig_handling_on();
3323
3324	/* Lock the set on current set members */
3325	for (i = 0; i < MD_MAXSIDES; i++) {
3326		/* Skip empty slots */
3327		if (sd->sd_nodes[i][0] == '\0')
3328			continue;
3329
3330		if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
3331			rval = -1;
3332			goto out;
3333		}
3334	}
3335
3336	/* Lock the set on new set members */
3337	for (i = 0; i < node_c; i++) {
3338		if (clnt_lock_set(node_v[i], sp, ep)) {
3339			rval = -1;
3340			goto out;
3341		}
3342	}
3343
3344	RB_TEST(1, "addhosts", ep)
3345
3346	RB_PREEMPT;
3347	rb_level = 1;	/* level 1 */
3348
3349	RB_TEST(2, "addhosts", ep)
3350
3351	/*
3352	 * Add the new hosts to the existing set record on the existing hosts
3353	 */
3354	for (i = 0; i < MD_MAXSIDES; i++) {
3355		/* skip empty slots */
3356		if (sd->sd_nodes[i][0] == '\0')
3357			continue;
3358
3359		if (clnt_addhosts(sd->sd_nodes[i], sp, node_c, node_v, ep))
3360			goto rollback;
3361	}
3362
3363	RB_PREEMPT;
3364	rb_level = 2;	/* level 2 */
3365
3366	RB_TEST(3, "addhosts", ep);
3367
3368	/* Merge the new entries into the set with the existing sides */
3369	nodeindex = 0;
3370	for (i = 0; i < MD_MAXSIDES; i++) {
3371		/* Skip full slots */
3372		if (sd->sd_nodes[i][0] != '\0')
3373			continue;
3374
3375		(void) strcpy(sd->sd_nodes[i], node_v[nodeindex++]);
3376		if (nodeindex == node_c)
3377			break;
3378	}
3379
3380	/* If we have drives */
3381	if (dd != NULL) {
3382		/*
3383		 * For all the hosts being added, create a sidename structure
3384		 */
3385		for (i = 0; i < MD_MAXSIDES; i++) {
3386			/* Skip empty slots */
3387			if (sd->sd_nodes[i][0] == '\0')
3388				continue;
3389
3390			/* Skip nodes not being added */
3391			if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3392				continue;
3393
3394			for (p = dd; p != NULL; p = p->dd_next) {
3395				if (make_sideno_sidenm(sp, p->dd_dnp, i,
3396				    ep) != 0)
3397					goto rollback;
3398			}
3399		}
3400
3401		/*
3402		 * Add the new sidename for each drive to the existing hosts
3403		 */
3404		for (i = 0; i < MD_MAXSIDES; i++) {
3405			/* Skip empty slots */
3406			if (sd->sd_nodes[i][0] == '\0')
3407				continue;
3408
3409			/* Skip nodes being added */
3410			if (strinlst(sd->sd_nodes[i], node_c, node_v))
3411				continue;
3412
3413			if (clnt_add_drv_sidenms(sd->sd_nodes[i], mynode(), sp,
3414			    sd, node_c, node_v, ep)) {
3415				goto rollback;
3416			}
3417		}
3418
3419		RB_TEST(4, "addhosts", ep)
3420
3421		RB_PREEMPT;
3422		rb_level = 3;	/* level 3 */
3423
3424		RB_TEST(5, "addhosts", ep)
3425
3426		if (add_db_sidenms(sp, ep)) {
3427			goto rollback;
3428		}
3429
3430	} else {
3431		RB_PREEMPT;
3432		rb_level = 3;
3433	}
3434
3435	RB_TEST(6, "addhosts", ep)
3436
3437	RB_PREEMPT;
3438	rb_level = 4;	/* level 4 */
3439
3440	RB_TEST(7, "addhosts", ep)
3441
3442
3443	/* create the set on the new nodes, this adds the drives as well */
3444	if (create_set_on_hosts(sp, multi_node, node_c, node_v, 0, ep)) {
3445		goto rollback;
3446	}
3447
3448	RB_TEST(8, "addhosts", ep)
3449
3450	RB_PREEMPT;
3451	rb_level = 5;	/* level 5 */
3452
3453	RB_TEST(9, "addhosts", ep)
3454
3455	if (dd != NULL) {
3456
3457		/*
3458		 * Add the device entries for the new sides into the namespace.
3459		 */
3460		for (i = 0; i < MD_MAXSIDES; i++) {
3461			/* Skip empty slots */
3462			if (sd->sd_nodes[i][0] == '\0')
3463				continue;
3464
3465			/* Skip nodes not being added */
3466			if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3467				continue;
3468
3469			if (add_md_sidenms(sp, i, MD_SIDEWILD, ep))
3470				goto rollback;
3471		}
3472	}
3473
3474	RB_TEST(10, "addhosts", ep)
3475
3476	RB_PREEMPT;
3477	rb_level = 6;	/* level 6 */
3478
3479	RB_TEST(11, "addhosts", ep);
3480
3481	if (dd != NULL) {
3482		/*
3483		 * Mark the drives MD_DR_OK.
3484		 */
3485		for (i = 0; i < MD_MAXSIDES; i++) {
3486			/* Skip empty slots */
3487			if (sd->sd_nodes[i][0] == '\0')
3488				continue;
3489
3490			if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
3491			    MD_DR_OK, ep) == -1) {
3492				goto rollback;
3493			}
3494		}
3495	}
3496
3497	RB_TEST(12, "addhosts", ep)
3498
3499	/* Bring the mediator record up to date with the set record */
3500	medr = rb_medr;				/* structure assignment */
3501	for (i = 0; i < MD_MAXSIDES; i++)
3502		(void) strcpy(medr.med_rec_nodes[i], sd->sd_nodes[i]);
3503	crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
3504
3505	/* Inform the mediator hosts of the new node list */
3506	for (i = 0; i < max_meds; i++) {
3507		if (sd->sd_med.n_lst[i].a_cnt == 0)
3508			continue;
3509
3510		if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
3511			goto rollback;
3512	}
3513
3514	/* Add the mediator information to all hosts in the set */
3515	for (i = 0; i < MD_MAXSIDES; i++) {
3516		/* Skip empty slots */
3517		if (sd->sd_nodes[i][0] == '\0')
3518			continue;
3519
3520		if (clnt_updmeds(sd->sd_nodes[i], sp, &sd->sd_med, ep))
3521			goto rollback;
3522	}
3523
3524	RB_TEST(13, "addhosts", ep)
3525
3526	/*
3527	 * Mark the set record MD_SR_OK
3528	 */
3529	for (i = 0; i < MD_MAXSIDES; i++) {
3530		/* Skip empty slots */
3531		if (sd->sd_nodes[i][0] == '\0')
3532			continue;
3533
3534		if (clnt_upd_sr_flags(sd->sd_nodes[i], sp, MD_SR_OK, ep))
3535			goto rollback;
3536	}
3537
3538	RB_TEST(14, "addhosts", ep)
3539
3540out:
3541	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3542	for (i = 0; i < MD_MAXSIDES; i++) {
3543		/* Skip empty slots */
3544		if (sd->sd_nodes[i][0] == '\0')
3545			continue;
3546
3547		/* Skip hosts we get in the next loop */
3548		if (strinlst(sd->sd_nodes[i], node_c, node_v))
3549			continue;
3550
3551		if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
3552			if (rval == 0)
3553				(void) mdstealerror(ep, &xep);
3554			rval = -1;
3555		}
3556	}
3557
3558	if (rval == 0) {
3559		for (i = 0; i < node_c; i++)
3560			if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
3561				if (rval == 0)
3562					(void) mdstealerror(ep, &xep);
3563				rval = -1;
3564			}
3565	}
3566	cl_set_setkey(NULL);
3567
3568	metaflushsetname(sp);
3569
3570	md_rb_sig_handling_off(md_got_sig(), md_which_sig());
3571
3572	return (rval);
3573
3574rollback:
3575	/* Make sure we are blocking all signals */
3576	if (procsigs(TRUE, &oldsigs, &xep) < 0)
3577		mdclrerror(&xep);
3578
3579	rval = -1;
3580
3581	/* level 6 */
3582	if (rb_level > 5) {
3583		for (i = 0; i < max_meds; i++) {
3584			if (sd->sd_med.n_lst[i].a_cnt == 0)
3585				continue;
3586
3587			if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
3588			    &rb_medr, &xep))
3589				mdclrerror(&xep);
3590		}
3591		if (dd != NULL) {
3592			for (i = 0; i < MD_MAXSIDES; i++) {
3593				/* Skip empty slots */
3594				if (sd->sd_nodes[i][0] == '\0')
3595					continue;
3596
3597				/* Skip nodes not being added */
3598				if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3599					continue;
3600
3601				if (del_md_sidenms(sp, i, &xep))
3602					mdclrerror(&xep);
3603			}
3604		}
3605	}
3606
3607	/* level 5 */
3608	if (rb_level > 4) {
3609		if (dd != NULL) {
3610			/* delete the drive records */
3611			for (i = 0; i < node_c; i++) {
3612				if (clnt_deldrvs(node_v[i], sp, dd, &xep) == -1)
3613					mdclrerror(&xep);
3614			}
3615		}
3616		/* delete the set record on the 'new' hosts */
3617		for (i = 0; i < node_c; i++) {
3618			if (clnt_delset(node_v[i], sp, &xep) == -1)
3619				mdclrerror(&xep);
3620		}
3621	}
3622
3623	/* level 4 */
3624	if (rb_level > 3 && dd != NULL) {
3625		for (i = 0; i < MD_MAXSIDES; i++) {
3626			/* Skip empty slots */
3627			if (sd->sd_nodes[i][0] == '\0')
3628				continue;
3629
3630			/* Skip nodes not being added */
3631			if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3632				continue;
3633
3634			if (del_db_sidenms(sp, i, &xep))
3635				mdclrerror(&xep);
3636		}
3637	}
3638
3639	/* level 3 */
3640	if (rb_level > 2 && dd != NULL) {
3641		for (i = 0; i < MD_MAXSIDES; i++) {
3642			/* Skip empty slots */
3643			if (sd->sd_nodes[i][0] == '\0')
3644				continue;
3645
3646			/* Skip nodes not being added */
3647			if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3648				continue;
3649
3650			if (clnt_del_drv_sidenms(sd->sd_nodes[i], sp,
3651			    &xep) == -1)
3652				mdclrerror(&xep);
3653		}
3654	}
3655
3656	/* level 2 */
3657	if (rb_level > 1) {
3658		for (i = 0; i < MD_MAXSIDES; i++) {
3659			/* Skip empty slots */
3660			if (sd->sd_nodes[i][0] == '\0')
3661				continue;
3662
3663			if (clnt_delhosts(sd->sd_nodes[i], sp, node_c, node_v,
3664			    &xep) == -1)
3665				mdclrerror(&xep);
3666		}
3667	}
3668
3669	/* level 1 */
3670	if (rb_level > 0) {
3671		cl_sk = cl_get_setkey(sp->setno, sp->setname);
3672		for (i = 0; i < MD_MAXSIDES; i++) {
3673			/* Skip empty slots */
3674			if (sd->sd_nodes[i][0] == '\0')
3675				continue;
3676
3677			/* Skip hosts we get in the next loop */
3678			if (strinlst(sd->sd_nodes[i], node_c, node_v))
3679				continue;
3680
3681			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
3682				mdclrerror(&xep);
3683		}
3684
3685		for (i = 0; i < node_c; i++)
3686			if (clnt_unlock_set(node_v[i], cl_sk, &xep))
3687				mdclrerror(&xep);
3688		cl_set_setkey(NULL);
3689	}
3690
3691	/* release signals back to what they were on entry */
3692	if (procsigs(FALSE, &oldsigs, &xep) < 0)
3693		mdclrerror(&xep);
3694
3695	metaflushsetname(sp);
3696
3697	md_rb_sig_handling_off(md_got_sig(), md_which_sig());
3698
3699	return (rval);
3700}
3701
3702/*
3703 * Add host(s) to the diskset provided in sp.
3704 * 	- create set if non-existent.
3705 */
3706int
3707meta_set_addhosts(
3708	mdsetname_t	*sp,
3709	int		multi_node,
3710	int		node_c,
3711	char		**node_v,
3712	int		auto_take,
3713	md_error_t	*ep
3714)
3715{
3716	if (multi_node)
3717		return (meta_multinode_set_addhosts(sp, multi_node, node_c,
3718		    node_v, auto_take, ep));
3719	else
3720		return (meta_traditional_set_addhosts(sp, multi_node, node_c,
3721		    node_v, auto_take, ep));
3722}
3723
3724/*
3725 * Delete host(s) from the diskset provided in sp.
3726 * 	- destroy set if last host in set is removed.
3727 */
3728int
3729meta_set_deletehosts(
3730	mdsetname_t		*sp,
3731	int			node_c,
3732	char			**node_v,
3733	int			forceflg,
3734	md_error_t		*ep
3735)
3736{
3737	md_set_desc		*sd;
3738	md_drive_desc		*dd;
3739	med_rec_t		medr;
3740	med_rec_t		rb_medr;
3741	int			i, j;
3742	int			has_set;
3743	int			numsides = 0;
3744	int			oha = FALSE;
3745	sigset_t		oldsigs;
3746	mhd_mhiargs_t		mhiargs;
3747	md_replicalist_t	*rlp = NULL;
3748	md_setkey_t		*cl_sk;
3749	ulong_t			max_genid = 0;
3750	int			rval = 0;
3751	int			rb_level = 0;
3752	int			max_meds = 0;
3753	md_error_t		xep = mdnullerror;
3754	md_mnnode_desc		*nd;
3755	md_mnnode_record	*nr;
3756	int			delete_master = 0;
3757	int			suspendall_flag = 0, suspendall_flag_rb = 0;
3758	int			suspend1_flag = 0;
3759	int			lock_flag = 0;
3760	int			stale_flag = 0;
3761	int			*node_id_list = NULL;
3762	int			remote_sets_deleted = 0;
3763
3764	if ((sd = metaget_setdesc(sp, ep)) == NULL)
3765		return (-1);
3766
3767	/*
3768	 * Verify that list of nodes being deleted contains no
3769	 * duplicates.
3770	 */
3771	if (nodesuniq(sp, node_c, node_v, ep))
3772		return (-1);
3773
3774	/* Make sure we own the set */
3775	if (meta_check_ownership(sp, ep) != 0)
3776		return (-1);
3777
3778	/*
3779	 * The drive and node records are stored in the local mddbs of each
3780	 * node in the diskset.  Each node's rpc.metad daemon reads in the set,
3781	 * drive and node records from that node's local mddb and caches them
3782	 * internally. Any process needing diskset information contacts its
3783	 * local rpc.metad to get this information.  Since each node in the
3784	 * diskset is independently reading the set information from its local
3785	 * mddb, the set, drive and node records in the local mddbs must stay
3786	 * in-sync, so that all nodes have a consistent view of the diskset.
3787	 *
3788	 * For a multinode diskset, explicitly verify that all nodes in the
3789	 * diskset are ALIVE (i.e. are in the API membership list) if the
3790	 * forceflag is FALSE.  (The case of forceflag being TRUE is handled
3791	 * in OHA check above.)
	 * in the OHA check below.)
3793	 * If forceflag is FALSE and a node in the diskset is not in
3794	 * the membership list, then fail this operation since all nodes must
3795	 * be ALIVE in order to delete the node record from their local mddb.
3796	 * If a panic of this node leaves the local mddbs set, node and drive
3797	 * records out-of-sync, the reconfig cycle will fix the local mddbs
3798	 * and force them back into synchronization.
3799	 */
3800	if ((forceflg == FALSE) && (MD_MNSET_DESC(sd))) {
3801		nd = sd->sd_nodelist;
3802		while (nd) {
3803			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3804				return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
3805				    sp->setno, nd->nd_nodename,
3806				    NULL, sp->setname));
3807			}
3808			nd = nd->nd_next;
3809		}
3810	}
3811
3812
3813	/*
3814	 * Lock the set on current set members.
3815	 * Set locking done much earlier for MN diskset than for traditional
3816	 * diskset since lock_set and SUSPEND are used to protect against
3817	 * other meta* commands running on the other nodes.
3818	 */
3819	if (MD_MNSET_DESC(sd)) {
3820		/* Make sure we are blocking all signals */
3821		if (procsigs(TRUE, &oldsigs, &xep) < 0)
3822			mdclrerror(&xep);
3823
3824		nd = sd->sd_nodelist;
3825		while (nd) {
3826			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3827				nd = nd->nd_next;
3828				continue;
3829			}
3830
3831			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
3832				rval = -1;
3833				goto out2;
3834			}
3835			lock_flag = 1;
3836			nd = nd->nd_next;
3837		}
3838		/*
3839		 * Lock out other meta* commands by suspending
3840		 * class 1 messages across the diskset.
3841		 */
3842		nd = sd->sd_nodelist;
3843		while (nd) {
3844			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3845				nd = nd->nd_next;
3846				continue;
3847			}
3848			if (clnt_mdcommdctl(nd->nd_nodename,
3849			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
3850			    MD_MSCF_NO_FLAGS, ep)) {
3851				rval = -1;
3852				goto out2;
3853			}
3854			suspend1_flag = 1;
3855			nd = nd->nd_next;
3856		}
3857	}
3858
3859	for (i = 0; i < node_c; i++)
3860		if (getnodeside(node_v[i], sd) == MD_SIDEWILD) {
3861			(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
3862			    node_v[i], NULL, sp->setname);
3863			rval = -1;
3864			goto out2;
3865		}
3866
3867	/*
3868	 * Count the number of nodes currently in the set.
3869	 */
3870	if (MD_MNSET_DESC(sd)) {
3871		nd = sd->sd_nodelist;
3872		while (nd) {
3873			numsides++;
3874			nd = nd->nd_next;
3875		}
3876	} else {
3877		for (i = 0; i < MD_MAXSIDES; i++)
3878			/* Count full slots */
3879			if (sd->sd_nodes[i][0] != '\0')
3880				numsides++;
3881	}
3882
3883	/*
3884	 * OHA mode == -f -h <hostname>
3885	 * OHA is One Host Administration that occurs when the forceflag (-f)
3886	 * is set and at least one host in the diskset isn't responding
3887	 * to RPC requests.
3888	 *
3889	 * When in OHA mode, a node cannot delete itself from a diskset.
3890	 * When in OHA mode, a node can delete a list of nodes from a diskset
3891	 * even if some of the nodes in the diskset are unresponsive.
3892	 *
3893	 * For multinode diskset, only allow OHA mode when the nodes that
3894	 * aren't responding in the diskset are not in the membership list
3895	 * (i.e. nodes that aren't responding are not marked ALIVE).
3896	 * Nodes that aren't in the membership list will be rejoining
3897	 * the diskset through a reconfig cycle and the local mddb set
3898	 * and node records can be reconciled during the reconfig cycle.
3899	 *
3900	 * If a node isn't responding, but is still in the membership list,
3901	 * fail the request since the node may not be responding because
3902	 * rpc.metad died and is restarting.  In this case, no reconfig
3903	 * cycle will be started, so there's no way to recover if
3904	 * the host delete operation was allowed.
3905	 *
3906	 * NOTE: if nodes that weren't in the membership when the OHA host
3907	 * delete occurred are now the only nodes in membership list,
3908	 * those nodes will see the old view of the diskset.  As soon as
3909	 * a node re-enters the cluster that was present in the cluster
3910	 * during the host deletion, the diskset will reflect the host
3911	 * deletion on all nodes presently in the cluster.
3912	 */
3913	if (forceflg == TRUE) {
3914		if (MD_MNSET_DESC(sd)) {
3915			nd = sd->sd_nodelist;
3916			while (nd) {
3917				/*
3918				 * If a node isn't ALIVE (in member list),
3919				 * then allow a force-able delete in OHA mode.
3920				 */
3921				if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3922					oha = TRUE;
3923					break;
3924				}
3925				/*
3926				 * Don't test for clnt_nullproc since already
3927				 * tested the RPC connections by clnt_lock_set.
3928				 */
3929				nd = nd->nd_next;
3930			}
3931		} else {
3932			for (i = 0; i < MD_MAXSIDES; i++) {
3933				/* Skip empty slots */
3934				if (sd->sd_nodes[i][0] == '\0')
3935					continue;
3936
3937				if (clnt_nullproc(sd->sd_nodes[i], ep) == -1) {
3938					/*
3939					 * If we timeout to at least one
3940					 * client, then we can allow OHA mode,
3941					 * otherwise, we are in normal mode.
3942					 */
3943					if (mdanyrpcerror(ep)) {
3944						mdclrerror(ep);
3945						if (strinlst(sd->sd_nodes[i],
3946						    node_c, node_v)) {
3947							oha = TRUE;
3948							break;
3949						}
3950					}
3951				}
3952			}
3953		}
3954	}
3955
3956	/*
3957	 * Don't allow this for MN diskset since meta_set_destroy of 1 node
3958	 * does NOT remove this node's node record from the other node's set
3959	 * records in their local mddb.  This leaves a MN diskset in a very
3960	 * messed up state.
3961	 */
3962	if (!(MD_MNSET_DESC(sd))) {
3963		/* Destroy set */
3964		if (forceflg == TRUE && node_c == 1 &&
3965		    strcmp(mynode(), node_v[0]) == 0) {
3966			/* Can return since !MN diskset so nothing to unlock */
3967			return (meta_set_destroy(sp, TRUE, ep));
3968		}
3969	}
3970
3971
3972	/*
3973	 * In multinode diskset, can only delete self if this
3974	 * is the last node in the set or if all nodes in
3975	 * the set are being deleted.  The traditional diskset code
3976	 * allows a node to delete itself (when there are other nodes
3977	 * in the diskset) when using the force flag, but that code
3978	 * path doesn't have the node remove itself from
3979	 * the set node list on the other nodes.  Since this isn't
3980	 * satisfactory for the multinode diskset, just don't
3981	 * allow this operation.
3982	 */
3983	if (MD_MNSET_DESC(sd) && (numsides > 1) && (node_c != numsides) &&
3984	    strinlst(mynode(), node_c, node_v)) {
3985		(void) mddserror(ep, MDE_DS_MNCANTDELSELF, sp->setno,
3986		    mynode(), NULL, sp->setname);
3987		rval = -1;
3988		goto out2;
3989	}
3990
3991	/*
3992	 * In multinode diskset, don't allow deletion of master node unless
3993	 * this is the only node left or unless all nodes are being
3994	 * deleted since there is no way to switch
3995	 * master ownership (unless via a cluster reconfig cycle).
3996	 */
3997	delete_master = strinlst(sd->sd_mn_master_nodenm, node_c, node_v);
3998	if (MD_MNSET_DESC(sd) && (numsides > 1) && (node_c != numsides) &&
3999	    delete_master) {
4000		(void) mddserror(ep, MDE_DS_CANTDELMASTER, sp->setno,
4001		    sd->sd_mn_master_nodenm, NULL, sp->setname);
4002		rval = -1;
4003		goto out2;
4004	}
4005
4006
4007	/* Deleting self w/o forceflg */
4008	if (forceflg == FALSE && numsides > 1 &&
4009	    strinlst(mynode(), node_c, node_v)) {
4010		(void) mddserror(ep, MDE_DS_CANTDELSELF, sp->setno,
4011		    mynode(), NULL, sp->setname);
4012		rval = -1;
4013		goto out2;
4014	}
4015
4016	/*
4017	 * Setup the mediator record roll-back structure for a trad diskset.
4018	 *
4019	 * For a MN diskset, the deletion of a host in the diskset
4020	 * does not cause an update of the mediator record.  If the
4021	 * host deletion will cause the diskset to be removed (this is
4022	 * the last host being removed or all hosts are being removed)
4023	 * then the mediator record must have already been removed by the
4024	 * user or this delete host operation will fail (a check for
4025	 * this is done later in this routine).
4026	 */
4027	if (!(MD_MNSET_DESC(sd))) {
4028		(void) memset(&rb_medr, '\0', sizeof (med_rec_t));
4029		rb_medr.med_rec_mag = MED_REC_MAGIC;
4030		rb_medr.med_rec_rev = MED_REC_REV;
4031		rb_medr.med_rec_fl = 0;
4032		rb_medr.med_rec_sn  = sp->setno;
4033		(void) strcpy(rb_medr.med_rec_snm, sp->setname);
4034		for (i = 0; i < MD_MAXSIDES; i++)
4035			(void) strcpy(rb_medr.med_rec_nodes[i],
4036			    sd->sd_nodes[i]);
4037		rb_medr.med_rec_meds = sd->sd_med;  /* structure assigment */
4038		(void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
4039		rb_medr.med_rec_foff = 0;
4040		crcgen(&rb_medr, &rb_medr.med_rec_cks,
4041		    sizeof (med_rec_t), NULL);
4042
4043		/* Bring the mediator record up to date with the set record */
4044		medr = rb_medr;			/* structure assignment */
4045
4046		if ((max_meds = get_max_meds(ep)) == 0) {
4047			rval = -1;
4048			goto out2;
4049		}
4050	}
4051
4052	/*
4053	 * For traditional diskset:
4054	 * Check to see if all the hosts we are trying to delete the set from
4055	 * have a set "setname" that is the same as ours, i.e. - same name,
4056	 * same time stamp, same genid.  We only do this if forceflg is not
4057	 * specified or we are in OHA mode.
4058	 */
4059	if (!(MD_MNSET_DESC(sd)) && (forceflg == FALSE || oha == TRUE)) {
4060		int	fix_node_v = FALSE;
4061		int	j;
4062
4063		for (i = 0; i < node_c; i++) {
4064			/* We skip this side */
4065			if (strcmp(mynode(), node_v[i]) == 0)
4066				continue;
4067
4068			has_set = nodehasset(sp, node_v[i], NHS_NSTG_EQ, ep);
4069
4070			if (has_set < 0) {
4071				char	 *anode[1];
4072
4073				/*
4074				 * Can't talk to the host only allowed in OHA
4075				 * mode.
4076				 */
4077				if (oha == TRUE && mdanyrpcerror(ep)) {
4078					mdclrerror(ep);
4079					continue;
4080				}
4081
4082				/*
4083				 * We got an error we do not, or are not,
4084				 * prepared to handle.
4085				 */
4086				if (! mdiserror(ep, MDE_NO_SET) &&
4087				    ! mdismddberror(ep, MDE_DB_NODB)) {
4088					rval = -1;
4089					goto out2;
4090				}
4091				mdclrerror(ep);
4092
4093				/*
4094				 * If we got here: both hosts are up; a host in
4095				 * our set record does not have the set. So we
4096				 * delete the host from our set and invalidate
4097				 * the node.
4098				 */
4099				anode[0] = Strdup(node_v[i]);
4100
4101				rval = del_host_noset(sp, anode, ep);
4102
4103				/*
4104				 * If we delete a host, make sure the mediator
4105				 * hosts are made aware of this.
4106				 */
4107				for (j = 0; j < MD_MAXSIDES; j++) {
4108					if (strcmp(medr.med_rec_nodes[j],
4109					    node_v[i]) != 0)
4110						continue;
4111					(void) memset(&medr.med_rec_nodes[j],
4112					    '\0', sizeof (md_node_nm_t));
4113				}
4114				crcgen(&medr, &medr.med_rec_cks,
4115				    sizeof (med_rec_t), NULL);
4116
4117				rb_medr = medr;		/* struct assignment */
4118
4119				Free(anode[0]);
4120
4121				if (rval == -1)
4122					goto out2;
4123
4124				node_v[i][0] = '\0';
4125				fix_node_v = TRUE;
4126				continue;
4127			}
4128
4129			/*
4130			 * If we can talk to the host, and they do not have the
4131			 * exact set, then we disallow the operation.
4132			 */
4133			if (has_set == FALSE) {
4134				(void) mddserror(ep, MDE_DS_NODENOSET,
4135				    sp->setno, node_v[i], NULL, sp->setname);
4136				rval = -1;
4137				goto out2;
4138			}
4139		}
4140
4141		/*
4142		 * Here we prune the node_v's that were invalidated above.
4143		 */
4144		if (fix_node_v == TRUE) {
4145			i = 0;
4146			while (i < node_c) {
4147				if (node_v[i][0] == '\0') {
4148					for (j = i; (j + 1) < node_c; j++)
4149						node_v[j] = node_v[j + 1];
4150					node_c--;
4151				}
4152				i++;
4153			}
4154			/*
4155			 * If we are left with no nodes, then we have
			 * completed the operation.
4157			 */
4158			if (node_c == 0) {
4159				/*
4160				 * Inform the mediator hosts of the new node
4161				 * list
4162				 */
4163				for (i = 0; i < max_meds; i++) {
4164					if (sd->sd_med.n_lst[i].a_cnt == 0)
4165						continue;
4166
4167					if (clnt_med_upd_rec(
4168					    &sd->sd_med.n_lst[i], sp, &medr,
4169					    ep))
4170						mdclrerror(ep);
4171				}
4172				rval = 0;
4173				goto out2;
4174			}
4175		}
4176	}
4177
4178	/*
4179	 * For multinode diskset:
4180	 * If forceflag is FALSE then check to see if all the hosts we
4181	 * are trying to delete the set from have a set "setname" that
4182	 * is the same as ours, i.e. - same name, same time stamp, same genid.
4183	 * If forceflag is TRUE, then we don't care if the hosts being
4184	 * deleted have the same set information or not since user is forcing
4185	 * those hosts to be deleted.
4186	 */
4187	if ((MD_MNSET_DESC(sd)) && (forceflg == FALSE)) {
4188		for (i = 0; i < node_c; i++) {
4189			/* We skip this node since comparing against it */
4190			if (strcmp(mynode(), node_v[i]) == 0)
4191				continue;
4192
4193			has_set = nodehasset(sp, node_v[i], NHS_NSTG_EQ, ep);
4194
4195			if (has_set < 0) {
4196				rval = -1;
4197				goto out2;
4198			}
4199
4200			/*
4201			 * If we can talk to the host, and they do not have the
4202			 * exact set, then we disallow the operation.
4203			 */
4204			if (has_set == FALSE) {
4205				(void) mddserror(ep, MDE_DS_NODENOSET,
4206				    sp->setno, node_v[i], NULL, sp->setname);
4207				rval = -1;
4208				goto out2;
4209			}
4210		}
4211	}
4212
4213	/*
4214	 * For traditional diskset:
4215	 * Can't allow user to delete their node (without deleting all nodes)
4216	 * out of a set in OHA mode, would leave a real mess.
4217	 * This action was already failed above for a MN diskset.
4218	 */
4219	if (!(MD_MNSET_DESC(sd)) && (oha == TRUE) &&
4220	    strinlst(mynode(), node_c, node_v)) {
4221		/* Can directly return since !MN diskset; nothing to unlock */
4222		return (mddserror(ep, MDE_DS_OHACANTDELSELF, sp->setno,
4223		    mynode(), NULL, sp->setname));
4224	}
4225
4226
4227	/* Get the drive descriptors for this set */
4228	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4229	    ep)) == NULL) {
4230		if (! mdisok(ep)) {
4231			rval = -1;
4232			goto out2;
4233		}
4234	}
4235
4236	/*
4237	 * We have been asked to delete all the hosts in the set, i.e. - delete
4238	 * the whole set.
4239	 */
4240	if (node_c == numsides) {
4241		/*
4242		 * This is only a valid operation if all drives have been
4243		 * removed first.
4244		 */
4245
4246		if (dd != NULL) {
4247			(void) mddserror(ep, MDE_DS_HASDRIVES, sp->setno,
4248			    NULL, NULL, sp->setname);
4249			rval = -1;
4250			goto out2;
4251		}
4252
4253		/*
4254		 * If a mediator is currently associated with this set,
4255		 * fail the deletion of the last host(s).
4256		 */
4257		if (sd->sd_med.n_cnt != 0) {
4258			(void) mddserror(ep, MDE_DS_HASMED, sp->setno,
4259			    NULL, NULL, sp->setname);
4260			rval = -1;
4261			goto out2;
4262		}
4263
4264		if (! mdisok(ep)) {
4265			rval = -1;
4266			goto out2;
4267		}
4268
4269		rval = del_set_nodrives(sp, node_c, node_v, oha, ep);
4270		remote_sets_deleted = 1;
4271		goto out2;
4272	}
4273
4274	/*
4275	 * Get timeout values in case we need to roll back
4276	 */
4277	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
4278	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) != 0) {
4279		rval = -1;
4280		goto out2;
4281	}
4282
4283	if (dd != NULL) {
4284		/*
4285		 * We need this around for re-adding DB side names later.
4286		 */
4287		if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) {
4288			rval = -1;
4289			goto out2;
4290		}
4291
4292		/*
4293		 * Alloc nodeid list if drives are present in diskset.
4294		 * nodeid list is used to reset mirror owners if the
4295		 * owner is a deleted node.
4296		 */
4297		if (MD_MNSET_DESC(sd)) {
4298			node_id_list = Zalloc(sizeof (int) * node_c);
4299		}
4300	}
4301
4302	/* Lock the set on current set members */
4303	if (!(MD_MNSET_DESC(sd))) {
4304		md_rb_sig_handling_on();
4305		for (i = 0; i < MD_MAXSIDES; i++) {
4306			/* Skip empty slots */
4307			if (sd->sd_nodes[i][0] == '\0')
4308				continue;
4309
4310			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
4311				if (oha == TRUE && mdanyrpcerror(ep)) {
4312					mdclrerror(ep);
4313					continue;
4314				}
4315				rval = -1;
4316				goto out2;
4317			}
4318			lock_flag = 1;
4319		}
4320	}
4321
4322	RB_TEST(1, "deletehosts", ep)
4323
4324	RB_PREEMPT;
4325	rb_level = 1;	/* level 1 */
4326
4327	RB_TEST(2, "deletehosts", ep)
4328
4329	if (MD_MNSET_DESC(sd)) {
4330		md_mnnode_desc		*saved_nd_next;
4331		mddb_config_t		c;
4332
4333		if (dd != NULL) {
4334			/*
4335			 * Notify rpc.mdcommd on all nodes of a nodelist change.
4336			 * Start by suspending rpc.mdcommd (which drains it of
4337			 * all messages), then change the nodelist followed
4338			 * by a reinit and resume.
4339			 */
4340			nd = sd->sd_nodelist;
4341			while (nd) {
4342				if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4343					nd = nd->nd_next;
4344					continue;
4345				}
4346				if (clnt_mdcommdctl(nd->nd_nodename,
4347				    COMMDCTL_SUSPEND, sp,
4348				    MD_MSG_CLASS0,
4349				    MD_MSCF_NO_FLAGS, ep)) {
4350					rval = -1;
4351					goto out2;
4352				}
4353				suspendall_flag = 1;
4354				nd = nd->nd_next;
4355			}
4356			/*
4357			 * Is current set STALE?
4358			 * Need to know this if delete host fails and node
4359			 * is re-joined to diskset.
4360			 */
4361			(void) memset(&c, 0, sizeof (c));
4362			c.c_id = 0;
4363			c.c_setno = sp->setno;
4364			if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
4365				(void) mdstealerror(ep, &c.c_mde);
4366				rval = -1;
4367				goto out2;
4368			}
4369			if (c.c_flags & MDDB_C_STALE) {
4370				stale_flag = MNSET_IS_STALE;
4371			}
4372		}
4373
4374		/*
4375		 * For each node being deleted, set DEL flag and
4376		 * reset OK flag on that node first.
4377		 * Until a node has turned off its own
4378		 * rpc.metad's NODE_OK flag, that node could be
4379		 * considered for master during a reconfig.
4380		 */
4381		for (i = 0; i < node_c; i++) {
4382			/*
4383			 * During OHA mode, don't issue RPCs to
4384			 * non-alive nodes since there is no reason to
4385			 * wait for RPC timeouts.
4386			 */
4387			nd = sd->sd_nodelist;
4388			while (nd) {
4389				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
4390					break;
4391				nd = nd->nd_next;
4392			}
4393			/* Something wrong, handle this in next loop */
4394			if (nd == NULL)
4395				continue;
4396
4397			/* If node_id_list is alloc'd, fill in for later use */
4398			if (node_id_list)
4399				node_id_list[i] = nd->nd_nodeid;
4400
4401			/* All nodes are guaranteed to be ALIVE unless OHA */
4402			if ((oha == TRUE) &&
4403			    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4404				continue;
4405			}
4406
4407			/* Only changing my local cache of node list */
4408			saved_nd_next = nd->nd_next;
4409			nd->nd_next = NULL;
4410
4411			/* Set flags for del host to DEL on that host */
4412			if (clnt_upd_nr_flags(node_v[i], sp,
4413			    nd, MD_NR_DEL, NULL, ep)) {
4414				nd->nd_next = saved_nd_next;
4415				goto rollback;
4416			}
4417			nd->nd_next = saved_nd_next;
4418		}
4419		for (i = 0; i < node_c; i++) {
4420			/*
4421			 * Turn off owner flag in nodes to be deleted
4422			 * if this node has been joined.
4423			 * Also, turn off NODE_OK and turn on NODE_DEL
4424			 * for nodes to be deleted.
4425			 * These flags are used to set the node
4426			 * record flags in all nodes in the set.
4427			 * Only withdraw nodes that are joined.
4428			 */
4429			nd = sd->sd_nodelist;
4430			while (nd) {
4431				/*
4432				 * Don't communicate with non-ALIVE node if
4433				 * in OHA - but set flags in master list so
4434				 * alive nodes are updated correctly.
4435				 */
4436				if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
4437					if ((oha == TRUE) && (!(nd->nd_flags &
4438					    MD_MN_NODE_ALIVE))) {
4439						nd->nd_flags |= MD_MN_NODE_DEL;
4440						nd->nd_flags &= ~MD_MN_NODE_OK;
4441						nd = nd->nd_next;
4442						continue;
4443					}
4444					if (nd->nd_flags & MD_MN_NODE_OWN) {
4445						/*
4446						 * Going to set locally cached
4447						 * node flags to rollback join
4448						 * so in case of error, the
4449						 * rollback code knows which
4450						 * nodes to re-join.  rpc.metad
4451						 * ignores the RB_JOIN flag.
4452						 */
4453						nd->nd_flags |=
4454						    MD_MN_NODE_RB_JOIN;
4455						nd->nd_flags &= ~MD_MN_NODE_OWN;
4456
4457						/*
4458						 * Be careful in ordering of
4459						 * following steps so that
4460						 * recovery from a panic
4461						 * between the steps is viable.
4462						 * Only reset master info in
4463						 * rpc.metad - don't reset
4464						 * local cached info which will
4465						 * be used to set master info
4466						 * back if failure (rollback).
4467						 */
4468						if (clnt_withdrawset(
4469						    nd->nd_nodename, sp, ep))
4470							goto rollback;
4471
4472						/*
4473						 * Reset master on deleted node
4474						 */
4475						if (clnt_mnsetmaster(node_v[i],
4476						    sp, "", MD_MN_INVALID_NID,
4477						    ep))
4478							goto rollback;
4479					}
4480
4481					nd->nd_flags |= MD_MN_NODE_DEL;
4482					nd->nd_flags &= ~MD_MN_NODE_OK;
4483				}
4484				nd = nd->nd_next;
4485			}
4486		}
4487
4488		/*
4489		 * Now, reset owner and set delete flags for the
4490		 * deleted nodes on all nodes.
4491		 */
4492		nd = sd->sd_nodelist;
4493		while (nd) {
4494			/* Skip non-ALIVE node if in OHA */
4495			if ((oha == TRUE) &&
4496			    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4497				nd = nd->nd_next;
4498				continue;
4499			}
4500			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
4501			    sd->sd_nodelist, MD_NR_SET, NULL, ep)) {
4502				goto rollback;
4503			}
4504			nd = nd->nd_next;
4505		}
4506		/*
4507		 * Notify rpc.mdcommd on all nodes of a nodelist change.
4508		 * Send reinit command to mdcommd which forces it to get
4509		 * fresh set description.
4510		 */
4511		if (suspendall_flag) {
4512			/* Send reinit */
4513			nd = sd->sd_nodelist;
4514			while (nd) {
4515				if ((oha == TRUE) &&
4516				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4517					nd = nd->nd_next;
4518					continue;
4519				}
4520				/* Class is ignored for REINIT */
4521				if (clnt_mdcommdctl(nd->nd_nodename,
4522				    COMMDCTL_REINIT, sp, NULL,
4523				    MD_MSCF_NO_FLAGS, ep)) {
4524					mde_perror(ep, dgettext(TEXT_DOMAIN,
4525					    "Unable to reinit rpc.mdcommd.\n"));
4526					goto rollback;
4527				}
4528				nd = nd->nd_next;
4529			}
4530			/* Send resume */
4531			nd = sd->sd_nodelist;
4532			while (nd) {
4533				if ((oha == TRUE) &&
4534				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4535					nd = nd->nd_next;
4536					continue;
4537				}
4538				if (clnt_mdcommdctl(nd->nd_nodename,
4539				    COMMDCTL_RESUME, sp, MD_MSG_CLASS0,
4540				    MD_MSCF_DONT_RESUME_CLASS1, ep)) {
4541					mde_perror(ep, dgettext(TEXT_DOMAIN,
4542					    "Unable to resume rpc.mdcommd.\n"));
4543					goto rollback;
4544				}
4545				nd = nd->nd_next;
4546			}
4547			meta_ping_mnset(sp->setno);
4548		}
4549	}
4550
4551
4552	/*
4553	 * Mark the set record MD_SR_DEL on the hosts we are deleting
4554	 * If a MN diskset and OHA mode, don't issue RPC to nodes that
4555	 * are not ALIVE.
4556	 * If a MN diskset and not in OHA mode, then all nodes must respond
4557	 * to RPC (be alive) or this routine will return failure.
4558	 * If a traditional diskset, ignore all RPC failures if in OHA mode.
4559	 */
4560	for (i = 0; i < node_c; i++) {
4561
4562		RB_TEST(3, "deletehosts", ep)
4563
4564		if ((MD_MNSET_DESC(sd)) && (oha == TRUE)) {
4565			/*
4566			 * During OHA mode, don't issue RPCs to
4567			 * non-alive nodes since there is no reason to
4568			 * wait for RPC timeouts.
4569			 */
4570			nd = sd->sd_nodelist;
4571			while (nd) {
4572				if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
4573					break;
4574				}
4575				nd = nd->nd_next;
4576			}
4577			if (nd == NULL) {
4578				(void) mddserror(ep, MDE_DS_NODENOTINSET,
4579				    sp->setno, node_v[i], NULL, sp->setname);
4580				goto rollback;
4581			} else if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4582				/* Skip non-ALIVE node if in OHA mode */
4583				continue;
4584			} else {
4585				if (clnt_upd_sr_flags(node_v[i], sp,
4586				    MD_SR_DEL, ep)) {
4587					goto rollback;
4588				}
4589			}
4590		} else if ((MD_MNSET_DESC(sd)) && (oha == FALSE)) {
4591			/*
4592			 * All nodes should be alive in non-oha mode.
4593			 */
4594			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
4595				goto rollback;
4596			}
4597		} else {
4598			/*
4599			 * For traditional diskset, issue the RPC and
4600			 * ignore RPC failure if in OHA mode.
4601			 */
4602			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
4603				if (oha == TRUE && mdanyrpcerror(ep)) {
4604					mdclrerror(ep);
4605					continue;
4606				}
4607				goto rollback;
4608			}
4609		}
4610
4611		RB_TEST(4, "deletehosts", ep)
4612	}
4613
4614	RB_TEST(5, "deletehosts", ep)
4615
4616	RB_PREEMPT;
4617	rb_level = 2;	/* level 2 */
4618
4619	RB_TEST(6, "deletehosts", ep)
4620
4621	/* Delete the set on the hosts we are deleting */
4622	if (del_set_on_hosts(sp, sd, dd, node_c, node_v, oha, ep)) {
4623		if (node_id_list)
4624			Free(node_id_list);
4625		/*
4626		 * Failure during del_set_on_hosts would have recreated
4627		 * the diskset on the remote hosts, but for multi-owner
4628		 * disksets need to set node flags properly and REINIT and
4629		 * RESUME rpc.mdcommd, so just let the rollback code
4630		 * do this.
4631		 */
4632		if (MD_MNSET_DESC(sd))
4633			goto rollback;
4634		return (-1);
4635	}
4636	remote_sets_deleted = 1;
4637
4638	RB_TEST(19, "deletehosts", ep)
4639
4640	RB_PREEMPT;
4641	rb_level = 3;	/* level 3 */
4642
4643	RB_TEST(20, "deletehosts", ep)
4644
4645	/* Delete the host from sets on hosts not being deleted */
4646	if (MD_MNSET_DESC(sd)) {
4647		nd = sd->sd_nodelist;
4648		/* All nodes are guaranteed to be ALIVE unless in oha mode */
4649		while (nd) {
4650			/*
4651			 * During OHA mode, don't issue RPCs to
4652			 * non-alive nodes since there is no reason to
4653			 * wait for RPC timeouts.
4654			 */
4655			if ((oha == TRUE) &&
4656			    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4657				nd = nd->nd_next;
4658				continue;
4659			}
4660
4661			/* Skip nodes being deleted */
4662			if (strinlst(nd->nd_nodename, node_c, node_v)) {
4663				nd = nd->nd_next;
4664				continue;
4665			}
4666			if (clnt_delhosts(nd->nd_nodename, sp, node_c, node_v,
4667			    ep) == -1) {
4668				goto rollback;
4669			}
4670
4671			RB_TEST(21, "deletehosts", ep)
4672			nd = nd->nd_next;
4673		}
4674	} else {
4675		for (i = 0; i < MD_MAXSIDES; i++) {
4676			/* Skip empty slots */
4677			if (sd->sd_nodes[i][0] == '\0')
4678				continue;
4679
4680			/* Skip nodes being deleted */
4681			if (strinlst(sd->sd_nodes[i], node_c, node_v))
4682				continue;
4683
4684			if (clnt_delhosts(sd->sd_nodes[i], sp, node_c, node_v,
4685			    ep) == -1) {
4686				if (oha == TRUE && mdanyrpcerror(ep)) {
4687					mdclrerror(ep);
4688					continue;
4689				}
4690				goto rollback;
4691			}
4692
4693			RB_TEST(21, "deletehosts", ep)
4694		}
4695	}
4696
4697	/* We have drives */
4698	if (dd != NULL) {
4699		RB_TEST(22, "deletehosts", ep)
4700
4701		RB_PREEMPT;
4702		rb_level = 4;	/* level 4 */
4703
4704		RB_TEST(23, "deletehosts", ep)
4705
4706		/*
4707		 * Delete the old sidename for each drive on all the hosts.
4708		 * If a multi-node diskset, each host only stores
4709		 * the side information for itself.  So, a multi-node
4710		 * diskset doesn't delete the old sidename for
4711		 * an old host.
4712		 *
4713		 * If a MN diskset, reset owners of mirrors that are
4714		 * owned by the deleted nodes.
4715		 */
4716		if (!(MD_MNSET_DESC(sd))) {
4717			for (i = 0; i < MD_MAXSIDES; i++) {
4718				/* Skip empty slots */
4719				if (sd->sd_nodes[i][0] == '\0')
4720					continue;
4721
4722				/* Skip nodes being deleted */
4723				if (strinlst(sd->sd_nodes[i], node_c, node_v))
4724					continue;
4725
4726				if (clnt_del_drv_sidenms(sd->sd_nodes[i], sp,
4727				    ep)) {
4728					if (oha == TRUE && mdanyrpcerror(ep)) {
4729						mdclrerror(ep);
4730						continue;
4731					}
4732					metaflushsetname(sp);
4733					goto rollback;
4734				}
4735
4736				RB_TEST(24, "deletehosts", ep)
4737			}
4738		} else {
4739			nd = sd->sd_nodelist;
4740			/* All nodes guaranteed ALIVE unless in oha mode */
4741			while (nd) {
4742				/*
4743				 * If mirror owner was set to a deleted node,
4744				 * then each existing node resets mirror owner
4745				 * to NULL.
4746				 *
4747				 * During OHA mode, don't issue RPCs to
4748				 * non-alive nodes since there is no reason to
4749				 * wait for RPC timeouts.
4750				 */
4751				if ((oha == TRUE) &&
4752				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4753					nd = nd->nd_next;
4754					continue;
4755				}
4756
4757				/* Skip nodes being deleted */
4758				if (strinlst(nd->nd_nodename, node_c, node_v)) {
4759					nd = nd->nd_next;
4760					continue;
4761				}
4762
4763				/*
4764				 * If mirror owner is a deleted node, reset
4765				 * mirror owners to NULL.  If an error occurs,
4766				 * print a warning and continue.  Don't fail
4767				 * metaset because of mirror owner reset
4768				 * problem since next node to grab mirror
4769				 * will resolve this issue.  Before next node
4770				 * grabs mirrors, metaset will show the deleted
4771				 * node as owner which is why an attempt to
4772				 * reset the mirror owner is made.
4773				 */
4774				if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
4775				    node_c, &node_id_list[0], &xep) == -1) {
4776					mde_perror(&xep, dgettext(TEXT_DOMAIN,
4777					    "Unable to reset mirror owner on"
4778					    " node %s\n"), nd->nd_nodename);
4779					mdclrerror(&xep);
4780				}
4781
4782				RB_TEST(21, "deletehosts", ep)
4783				nd = nd->nd_next;
4784			}
4785		}
4786	}
4787
4788	RB_TEST(25, "deletehosts", ep)
4789
4790	RB_PREEMPT;
4791	rb_level = 4;	/* level 4 */
4792
4793	RB_TEST(26, "deletehosts", ep)
4794
4795	/*
4796	 * Bring the mediator record up to date with the set record for
4797	 * traditional diskset.
4798	 */
4799	if (!(MD_MNSET_DESC(sd))) {
4800		medr = rb_medr;			/* structure assignment */
4801		for (i = 0; i < MD_MAXSIDES; i++) {
4802			if (strinlst(sd->sd_nodes[i], node_c, node_v))
4803				(void) memset(&medr.med_rec_nodes[i],
4804				    '\0', sizeof (md_node_nm_t));
4805			else
4806				(void) strcpy(medr.med_rec_nodes[i],
4807				    sd->sd_nodes[i]);
4808		}
4809		crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
4810
4811		/* Inform the mediator hosts of the new node list */
4812		for (i = 0; i < max_meds; i++) {
4813			if (sd->sd_med.n_lst[i].a_cnt == 0)
4814				continue;
4815
4816			if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
4817			    &medr, ep)) {
4818				if (oha == TRUE && mdanyrpcerror(ep)) {
4819					mdclrerror(ep);
4820					continue;
4821				}
4822				goto rollback;
4823			}
4824		}
4825	}
4826
4827	RB_TEST(27, "deletehosts", ep)
4828
4829	/*
4830	 * For traditional diskset:
4831	 * We are deleting ourselves out of the set and we have drives to
4832	 * consider; so we need to halt the set, release the drives and
4833	 * reset the timeout.  **** THIS IS A ONE WAY TICKET, NO ROLL BACK
4834	 * IS POSSIBLE AS SOON AS THE HALT SET COMPLETES, SO THIS IS DONE
4835	 * WITH ALL SIGNALS BLOCKED AND LAST ****
4836	 *
4837	 * This situation cannot occur in a MN diskset since a node can't
4838	 * delete itself unless all nodes are being deleted and a diskset
4839	 * cannot contain any drives if all nodes are being deleted.
4840	 * So, don't even test for this if a MN diskset.
4841	 */
4842	if (!(MD_MNSET_DESC(sd)) && (dd != NULL) &&
4843	    strinlst(mynode(), node_c, node_v)) {
4844		/* Make sure we are blocking all signals */
4845		if (procsigs(TRUE, &oldsigs, ep) < 0) {
4846			rval = -1;
4847			goto out1;
4848		}
4849
4850		if (halt_set(sp, ep)) {
4851			rval = -1;
4852			goto out1;
4853		}
4854
4855		if (rel_own_bydd(sp, dd, FALSE, ep))
4856			rval = -1;
4857
4858out1:
4859		/* release signals back to what they were on entry */
4860		if (procsigs(FALSE, &oldsigs, &xep) < 0) {
4861			if (rval == 0)
4862				(void) mdstealerror(ep, &xep);
4863			rval = -1;
4864		}
4865	}
4866
4867out2:
4868	/*
4869	 * Unlock diskset by resuming messages across the diskset.
4870	 * Just resume all classes so that resume is the same whether
4871	 * just one class was locked or all classes were locked.
4872	 */
4873	if ((suspend1_flag) || (suspendall_flag)) {
4874		/* Send resume */
4875		nd = sd->sd_nodelist;
4876		while (nd) {
4877			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4878				nd = nd->nd_next;
4879				continue;
4880			}
4881			/*
4882			 * Skip nodes being deleted if remote set
4883			 * was deleted since rpc.mdcommd may no longer
4884			 * be running on remote node.
4885			 */
4886			if ((remote_sets_deleted == 1) &&
4887			    (strinlst(nd->nd_nodename, node_c, node_v))) {
4888				nd = nd->nd_next;
4889				continue;
4890			}
4891			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
4892			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
4893				if (rval == 0)
4894					(void) mdstealerror(ep, &xep);
4895				rval = -1;
4896				mde_perror(ep, dgettext(TEXT_DOMAIN,
4897				    "Unable to resume rpc.mdcommd.\n"));
4898			}
4899			nd = nd->nd_next;
4900		}
4901		meta_ping_mnset(sp->setno);
4902	}
4903
4904	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4905	if (lock_flag) {
4906		if (MD_MNSET_DESC(sd)) {
4907			nd = sd->sd_nodelist;
4908			while (nd) {
4909				/*
4910				 * During OHA mode, don't issue RPCs to
4911				 * non-alive nodes since there is no reason to
4912				 * wait for RPC timeouts.
4913				 */
4914				if ((oha == TRUE) &&
4915				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4916					nd = nd->nd_next;
4917					continue;
4918				}
4919				if (clnt_unlock_set(nd->nd_nodename,
4920				    cl_sk, &xep)) {
4921					if (rval == 0)
4922						(void) mdstealerror(ep, &xep);
4923					rval = -1;
4924				}
4925				nd = nd->nd_next;
4926			}
4927		} else {
4928			for (i = 0; i < MD_MAXSIDES; i++) {
4929				/* Skip empty slots */
4930				if (sd->sd_nodes[i][0] == '\0')
4931					continue;
4932
4933				if (clnt_unlock_set(sd->sd_nodes[i],
4934				    cl_sk, &xep)) {
4935					if (oha == TRUE &&
4936					    mdanyrpcerror(&xep)) {
4937						mdclrerror(&xep);
4938						continue;
4939					}
4940					if (rval == 0)
4941						(void) mdstealerror(ep, &xep);
4942					rval = -1;
4943				}
4944			}
4945		}
4946	}
4947	cl_set_setkey(NULL);
4948
4949out3:
4950	metafreereplicalist(rlp);
4951	if (node_id_list)
4952		Free(node_id_list);
4953
4954	metaflushsetname(sp);
4955
4956	if (MD_MNSET_DESC(sd)) {
4957		/* release signals back to what they were on entry */
4958		if (procsigs(FALSE, &oldsigs, &xep) < 0)
4959			mdclrerror(&xep);
4960	} else {
4961		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
4962	}
4963
4964
4965	return (rval);
4966
4967rollback:
4968	/* all signals already blocked for MN diskset */
4969	if (!(MD_MNSET_DESC(sd))) {
4970		if (procsigs(TRUE, &oldsigs, &xep) < 0)
4971			mdclrerror(&xep);
4972	}
4973
4974	rval = -1;
4975
4976	max_genid = sd->sd_genid;
4977
4978
4979	/*
4980	 * Send reinit command to rpc.mdcommd which forces it to get
4981	 * fresh set description and resume all classes but class 0.
4982	 * Don't send any commands to rpc.mdcommd if set on that node
4983	 * has been removed.
4984	 */
4985	if (suspendall_flag) {
4986		/* Send reinit */
4987		nd = sd->sd_nodelist;
4988		while (nd) {
4989			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4990				nd = nd->nd_next;
4991				continue;
4992			}
4993			/*
4994			 * If the remote set was deleted, rpc.mdcommd
4995			 * may no longer be running so send nothing to it.
4996			 */
4997			if ((remote_sets_deleted == 1) &&
4998			    (strinlst(nd->nd_nodename, node_c, node_v))) {
4999				nd = nd->nd_next;
5000				continue;
5001			}
5002			/* Class is ignored for REINIT */
5003			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
5004			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
5005				mde_perror(&xep, dgettext(TEXT_DOMAIN,
5006				    "Unable to reinit rpc.mdcommd.\n"));
5007				mdclrerror(&xep);
5008			}
5009			nd = nd->nd_next;
5010		}
5011		/* Send resume */
5012		nd = sd->sd_nodelist;
5013		while (nd) {
5014			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5015				nd = nd->nd_next;
5016				continue;
5017			}
5018			/*
5019			 * If the remote set was deleted, rpc.mdcommd
5020			 * may no longer be running so send nothing to it.
5021			 */
5022			if ((remote_sets_deleted == 1) &&
5023			    (strinlst(nd->nd_nodename, node_c, node_v))) {
5024				nd = nd->nd_next;
5025				continue;
5026			}
5027			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
5028			    sp, MD_MSG_CLASS0, MD_MSCF_DONT_RESUME_CLASS1,
5029			    &xep)) {
5030				mde_perror(&xep, dgettext(TEXT_DOMAIN,
5031				    "Unable to resume rpc.mdcommd.\n"));
5032				mdclrerror(&xep);
5033			}
5034			nd = nd->nd_next;
5035		}
5036		meta_ping_mnset(sp->setno);
5037	}
5038
5039	/* level 2 */
5040	if (rb_level > 1) {
5041		md_set_record		*sr;
5042		md_replicalist_t	*rl;
5043
5044		recreate_set(sp, sd);
5045
5046		/*
5047		 * Lock out other meta* commands on nodes with the newly
5048		 * re-created sets by suspending class 1 messages
5049		 * across the diskset.
5050		 */
5051		nd = sd->sd_nodelist;
5052		while (nd) {
5053			/* Skip nodes not being deleted */
5054			if (!(strinlst(nd->nd_nodename, node_c, node_v))) {
5055				nd = nd->nd_next;
5056				continue;
5057			}
5058			/* Suspend commd on nodes with re-created sets */
5059			if (clnt_mdcommdctl(nd->nd_nodename,
5060			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
5061			    MD_MSCF_NO_FLAGS, &xep)) {
5062				mde_perror(&xep, dgettext(TEXT_DOMAIN,
5063				    "Unable to suspend rpc.mdcommd.\n"));
5064				mdclrerror(&xep);
5065			}
5066			nd = nd->nd_next;
5067		}
5068
5069		max_genid++;
5070
5071		/*
5072		 * See if we have to re-add the drives specified.
5073		 */
5074		for (i = 0; i < node_c; i++) {
5075			if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
5076				/*
5077				 * During OHA mode, don't issue RPCs to
5078				 * non-alive nodes since there is no reason to
5079				 * wait for RPC timeouts.
5080				 */
5081				nd = sd->sd_nodelist;
5082				while (nd) {
5083					if (strcmp(nd->nd_nodename, node_v[i])
5084					    == 0) {
5085						break;
5086					}
5087					nd = nd->nd_next;
5088				}
5089				if (nd == 0)
5090					continue;
5091				if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
5092					continue;
5093			}
5094
5095			/* Don't care if set record is MN or not */
5096			if (clnt_getset(node_v[i], sp->setname, MD_SET_BAD, &sr,
5097			    &xep) == -1) {
5098				mdclrerror(&xep);
5099				continue;
5100			}
5101
5102			/* Drive already added, skip to next node */
5103			if (sr->sr_drivechain != NULL) {
5104				/*
5105				 * Set record structure was allocated from RPC
5106				 * routine getset so this structure is only of
5107				 * size md_set_record even if the MN flag is
5108				 * set.  So, clear the flag so that the free
5109				 * code doesn't attempt to free a structure
5110				 * the size of md_mnset_record.
5111				 */
5112				sr->sr_flags &= ~MD_SR_MN;
5113				free_sr(sr);
5114				continue;
5115			}
5116
5117			if (clnt_adddrvs(node_v[i], sp, dd, sr->sr_ctime,
5118			    sr->sr_genid, &xep) == -1)
5119				mdclrerror(&xep);
5120
5121			if (clnt_upd_dr_flags(node_v[i], sp, dd, MD_DR_OK,
5122			    &xep) == -1)
5123				mdclrerror(&xep);
5124
5125			/*
5126			 * Set record structure was allocated from RPC routine
5127			 * getset so this structure is only of size
5128			 * md_set_record even if the MN flag is set.  So,
5129			 * clear the flag so that the free code doesn't
5130			 * attempt to free a structure the size of
5131			 * md_mnset_record.
5132			 */
5133			sr->sr_flags &= ~MD_SR_MN;
5134			free_sr(sr);
5135		}
5136		max_genid += 3;
5137
5138		for (rl = rlp; rl != NULL; rl = rl->rl_next) {
5139			md_replica_t	*r = rl->rl_repp;
5140			/*
5141			 * This is not the first replica being added to the
5142			 * diskset so call with ADDSIDENMS_BCAST.  If this
5143			 * is a traditional diskset, the bcast flag is ignored
5144			 * since traditional disksets don't use the rpc.mdcommd.
5145			 */
5146			if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno,
5147			    DB_ADDSIDENMS_BCAST, &xep))
5148				mdclrerror(&xep);
5149		}
5150
5151		/*
5152		 * Add the device names for the new sides into the namespace,
5153		 * on all hosts not being deleted.
5154		 */
5155		if (MD_MNSET_DESC(sd)) {
5156			nd = sd->sd_nodelist;
5157			while (nd) {
5158				/* Find a node that is not being deleted */
5159				if (!strinlst(nd->nd_nodename, node_c,
5160				    node_v)) {
5161					j = nd->nd_nodeid;
5162					break;
5163				}
5164				nd = nd->nd_next;
5165			}
5166		} else {
5167			for (j = 0; j < MD_MAXSIDES; j++) {
5168				/* Skip empty slots */
5169				if (sd->sd_nodes[j][0] == '\0')
5170					continue;
5171
5172				/* Find a node that is not being deleted */
5173				if (!strinlst(sd->sd_nodes[j], node_c, node_v))
5174					break;
5175			}
5176		}
5177
5178		if (MD_MNSET_DESC(sd)) {
5179			nd = sd->sd_nodelist;
5180			while (nd) {
5181				/* Skip nodes not being deleted */
5182				if (!strinlst(nd->nd_nodename, node_c,
5183				    node_v)) {
5184					nd = nd->nd_next;
5185					continue;
5186				}
5187
5188				/* this side was just created, add the names */
5189				if (add_md_sidenms(sp, nd->nd_nodeid, j, &xep))
5190					mdclrerror(&xep);
5191				nd = nd->nd_next;
5192			}
5193		} else {
5194			for (i = 0; i < MD_MAXSIDES; i++) {
5195				/* Skip empty slots */
5196				if (sd->sd_nodes[i][0] == '\0')
5197					continue;
5198
5199				/* Skip nodes not being deleted */
5200				if (!strinlst(sd->sd_nodes[i], node_c, node_v))
5201					continue;
5202
5203				/* this side was just created, add the names */
5204				if (add_md_sidenms(sp, i, j, &xep))
5205					mdclrerror(&xep);
5206			}
5207		}
5208	}
5209
5210	/* level 4 */
5211	if (rb_level > 3 && dd != NULL) {
5212		/*
5213		 * Add the new sidename for each drive to all the hosts
5214		 * Multi-node disksets only store the sidename for
5215		 * that host, so there is nothing to re-add.
5216		 */
5217		if (!(MD_MNSET_DESC(sd))) {
5218			for (j = 0; j < MD_MAXSIDES; j++) {
5219				/* Skip empty slots */
5220				if (sd->sd_nodes[j][0] == '\0')
5221					continue;
5222
5223				/* Skip nodes not being deleted */
5224				if (!strinlst(sd->sd_nodes[j], node_c, node_v))
5225					break;
5226			}
5227			for (i = 0; i < MD_MAXSIDES; i++) {
5228				/* Skip empty slots */
5229				if (sd->sd_nodes[i][0] == '\0')
5230					continue;
5231
5232				if (clnt_add_drv_sidenms(sd->sd_nodes[i],
5233				    sd->sd_nodes[j], sp, sd, node_c, node_v,
5234				    &xep))
5235					mdclrerror(&xep);
5236			}
5237		}
5238
5239	}
5240
5241	/* level 5 */
5242	if ((rb_level > 4) && (!(MD_MNSET_DESC(sd)))) {
5243		/* rollback the mediator record */
5244		for (i = 0; i < max_meds; i++) {
5245			if (sd->sd_med.n_lst[i].a_cnt == 0)
5246				continue;
5247
5248			if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
5249			    &rb_medr, &xep))
5250				mdclrerror(&xep);
5251		}
5252	}
5253
5254	/* level 3 */
5255	if (rb_level > 2) {
5256		md_set_record		*sr;
5257		md_mnset_record		*mnsr;
5258
5259		if (MD_MNSET_DESC(sd)) {
5260			nd = sd->sd_nodelist;
5261			/*
5262			 * During OHA mode, don't issue RPCs to
5263			 * non-alive nodes since there is no reason to
5264			 * wait for RPC timeouts.
5265			 */
5266			while (nd) {
5267				if ((oha == TRUE) &&
5268				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5269					nd = nd->nd_next;
5270					continue;
5271				}
5272				/* Record should be for a multi-node diskset */
5273				if (clnt_mngetset(nd->nd_nodename, sp->setname,
5274				    MD_SET_BAD, &mnsr, &xep) == -1) {
5275					mdclrerror(&xep);
5276					nd = nd->nd_next;
5277					continue;
5278				}
5279
5280				has_set = 1;
5281
5282				nr = mnsr->sr_nodechain;
5283				while (nr) {
5284					if (nd->nd_nodeid == nr->nr_nodeid) {
5285						break;
5286					}
5287					nr = nr->nr_next;
5288				}
5289				if (nr == NULL)
5290					has_set = 0;
5291
5292				free_sr((struct md_set_record *)mnsr);
5293				if (has_set) {
5294					nd = nd->nd_next;
5295					continue;
5296				}
5297
5298				if (clnt_addhosts(nd->nd_nodename, sp, node_c,
5299				    node_v, &xep) == -1)
5300					mdclrerror(&xep);
5301
5302				nd = nd->nd_next;
5303			}
5304		} else {
5305			for (i = 0; i < MD_MAXSIDES; i++) {
5306				/* Skip empty slots */
5307				if (sd->sd_nodes[i][0] == '\0')
5308					continue;
5309
5310				/* Record should be for a non-multi-node set */
5311				if (clnt_getset(sd->sd_nodes[i], sp->setname,
5312				    MD_SET_BAD, &sr, &xep) == -1) {
5313					mdclrerror(&xep);
5314					continue;
5315				}
5316
5317				/*
5318				 * Set record structure was allocated from RPC
5319				 * routine getset so this structure is only of
5320				 * size md_set_record even if the MN flag is
5321				 * set.  So, clear the flag so that the free
5322				 * code doesn't attempt to free a structure
5323				 * the size of md_mnset_record.
5324				 */
5325				if (MD_MNSET_REC(sr)) {
5326					sr->sr_flags &= ~MD_SR_MN;
5327					free_sr(sr);
5328					continue;
5329				}
5330
5331				has_set = 1;
5332				for (j = 0; j < MD_MAXSIDES; j++) {
5333					/* Skip empty slots */
5334					if (sd->sd_nodes[j][0] == '\0')
5335						continue;
5336
5337					if (sr->sr_nodes[j][0] == '\0') {
5338						has_set = 0;
5339						break;
5340					}
5341				}
5342
5343				free_sr(sr);
5344				if (has_set)
5345					continue;
5346
5347				if (clnt_addhosts(sd->sd_nodes[i], sp, node_c,
5348				    node_v, &xep) == -1)
5349					mdclrerror(&xep);
5350			}
5351		}
5352		max_genid++;
5353	}
5354
5355	/* level 1 */
5356	if (rb_level > 0) {
5357		max_genid++;
5358		/* Sets MD_SR_OK on given nodes. */
5359		resync_genid(sp, sd, max_genid, node_c, node_v);
5360
5361		/*
5362		 * For MN diskset:
5363		 * On each newly re-added node, set the node record for that
5364		 * node to OK.  Then set all node records for the newly added
5365		 * nodes on all nodes to ok.
5366		 *
5367		 * By setting a node's own node record to ok first, even if
5368		 * the node re-adding the hosts panics, the rest of the nodes
5369		 * can determine the same node list during the choosing of the
5370		 * master during reconfig.  So, only nodes considered for
5371		 * mastership are nodes that have both MD_MN_NODE_OK and
5372		 * MD_SR_OK set on that node's rpc.metad.  If all nodes have
5373		 * MD_SR_OK set, but no node has its own MD_MN_NODE_OK set,
5374		 * then the set will be removed during reconfig since a panic
5375		 * occurred during the re-creation of the deletion of
5376		 * the initial diskset.
5377		 */
5378		if (MD_MNSET_DESC(sd)) {
5379			md_mnnode_desc	*saved_nd_next;
5380			if (dd != NULL) {
5381				/*
5382				 * Notify rpc.mdcommd on all nodes of a
5383				 * nodelist change.  Start by suspending
5384				 * rpc.mdcommd (which drains it of all
5385				 * messages), then change the nodelist
5386				 * followed by a reinit and resume.
5387				 */
5388				nd = sd->sd_nodelist;
5389				while (nd) {
5390					if (!(nd->nd_flags &
5391					    MD_MN_NODE_ALIVE)) {
5392						nd = nd->nd_next;
5393						continue;
5394					}
5395					if (clnt_mdcommdctl(nd->nd_nodename,
5396					    COMMDCTL_SUSPEND, sp,
5397					    MD_MSG_CLASS0,
5398					    MD_MSCF_NO_FLAGS, &xep)) {
5399						mde_perror(&xep,
5400						    dgettext(TEXT_DOMAIN,
5401						    "Unable to suspend "
5402						    "rpc.mdcommd.\n"));
5403						mdclrerror(&xep);
5404					}
5405					suspendall_flag_rb = 1;
5406					nd = nd->nd_next;
5407				}
5408			}
5409			for (i = 0; i < node_c; i++) {
5410				/*
5411				 * During OHA mode, don't issue RPCs to
5412				 * non-alive nodes since there is no reason to
5413				 * wait for RPC timeouts.
5414				 */
5415				nd = sd->sd_nodelist;
5416				while (nd) {
5417					if (strcmp(nd->nd_nodename, node_v[i])
5418					    == 0)
5419						break;
5420					nd = nd->nd_next;
5421				}
5422				/* Something wrong, finish this in next loop */
5423				if (nd == NULL)
5424					continue;
5425
5426				if ((oha == TRUE) &&
5427				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5428					continue;
5429				}
5430
5431				if (dd != NULL) {
5432					/* Set master on re-joining node. */
5433					if (clnt_mnsetmaster(node_v[i], sp,
5434					    sd->sd_mn_master_nodenm,
5435					    sd->sd_mn_master_nodeid, &xep)) {
5436						mdclrerror(&xep);
5437					}
5438
5439					/*
5440					 * Re-join set to same state as
5441					 * before - stale or non-stale.
5442					 */
5443					if (clnt_joinset(node_v[i], sp,
5444					    stale_flag, &xep)) {
5445						mdclrerror(&xep);
5446					}
5447				}
5448
5449				/* Only changing my local cache of node list */
5450				saved_nd_next = nd->nd_next;
5451				nd->nd_next = NULL;
5452
5453				/* Set record for host to ok on that host */
5454				if (clnt_upd_nr_flags(node_v[i], sp,
5455				    nd, MD_NR_OK, NULL, &xep)) {
5456					mdclrerror(&xep);
5457				}
5458				nd->nd_next = saved_nd_next;
5459			}
5460
5461			/* Now set all node records on all nodes to be ok */
5462			nd = sd->sd_nodelist;
5463			while (nd) {
5464				/*
5465				 * During OHA mode, don't issue RPCs to
5466				 * non-alive nodes since there is no reason to
5467				 * wait for RPC timeouts.
5468				 */
5469				if ((oha == TRUE) &&
5470				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5471					nd = nd->nd_next;
5472					continue;
5473				}
5474				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5475				    sd->sd_nodelist, MD_NR_OK, NULL, &xep)) {
5476					mdclrerror(&xep);
5477				}
5478				nd = nd->nd_next;
5479			}
5480		}
5481	}
5482
5483	/*
5484	 * Notify rpc.mdcommd on all nodes of a nodelist change.
5485	 * Send reinit command to mdcommd which forces it to get
5486	 * fresh set description.
5487	 */
5488	if (suspendall_flag_rb) {
5489		/* Send reinit */
5490		nd = sd->sd_nodelist;
5491		while (nd) {
5492			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5493				nd = nd->nd_next;
5494				continue;
5495			}
5496
5497			/* Class is ignored for REINIT */
5498			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
5499			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
5500				mde_perror(&xep, dgettext(TEXT_DOMAIN,
5501				    "Unable to reinit rpc.mdcommd.\n"));
5502				mdclrerror(&xep);
5503			}
5504			nd = nd->nd_next;
5505		}
5506	}
5507
5508	/*
5509	 * Unlock diskset by resuming messages across the diskset.
5510	 * Just resume all classes so that resume is the same whether
5511	 * just one class was locked or all classes were locked.
5512	 */
5513	if ((suspend1_flag) || (suspendall_flag) || (suspendall_flag_rb)) {
5514		/* Send resume */
5515		nd = sd->sd_nodelist;
5516		while (nd) {
5517			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5518				nd = nd->nd_next;
5519				continue;
5520			}
5521			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
5522			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
5523				mde_perror(&xep, dgettext(TEXT_DOMAIN,
5524				    "Unable to resume rpc.mdcommd.\n"));
5525			}
5526			nd = nd->nd_next;
5527		}
5528		meta_ping_mnset(sp->setno);
5529	}
5530
5531	/*
5532	 * Start a resync thread on the re-added nodes
5533	 * if set is not stale. Also start a thread to update the
5534	 * abr state of all soft partitions
5535	 */
5536	if (stale_flag != MNSET_IS_STALE) {
5537		for (i = 0; i < node_c; i++) {
5538			/*
5539			 * During OHA mode, don't issue RPCs to
5540			 * non-alive nodes since there is no reason to
5541			 * wait for RPC timeouts.
5542			 */
5543			nd = sd->sd_nodelist;
5544			while (nd) {
5545				if (strcmp(nd->nd_nodename, node_v[i])
5546				    == 0)
5547					break;
5548				nd = nd->nd_next;
5549			}
5550			if (nd == NULL)
5551				continue;
5552
5553			if ((oha == TRUE) &&
5554			    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5555				continue;
5556			}
5557
5558			if (dd != 0) {
5559				if (clnt_mn_mirror_resync_all(node_v[i],
5560				    sp->setno, &xep)) {
5561					mde_perror(ep, dgettext(TEXT_DOMAIN,
5562					    "Unable to start resync "
5563					    "thread.\n"));
5564				}
5565				if (clnt_mn_sp_update_abr(node_v[i],
5566				    sp->setno, &xep)) {
5567					mde_perror(ep, dgettext(TEXT_DOMAIN,
5568					    "Unable to start sp update "
5569					    "thread.\n"));
5570				}
5571			}
5572		}
5573	}
5574
5575	/* level 0 */
5576	cl_sk = cl_get_setkey(sp->setno, sp->setname);
5577	/* Don't test lock flag since guaranteed to be set if in rollback */
5578	if (MD_MNSET_DESC(sd)) {
5579		nd = sd->sd_nodelist;
5580		while (nd) {
5581			/*
5582			 * During OHA mode, don't issue RPCs to
5583			 * non-alive nodes since there is no reason to
5584			 * wait for RPC timeouts.
5585			 */
5586			if ((oha == TRUE) &&
5587			    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5588				nd = nd->nd_next;
5589				continue;
5590			}
5591			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
5592				mdclrerror(&xep);
5593			nd = nd->nd_next;
5594		}
5595	} else {
5596		for (i = 0; i < MD_MAXSIDES; i++) {
5597			/* Skip empty slots */
5598			if (sd->sd_nodes[i][0] == '\0')
5599				continue;
5600
5601			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
5602				mdclrerror(&xep);
5603		}
5604	}
5605	cl_set_setkey(NULL);
5606
5607	/* release signals back to what they were on entry */
5608	if (procsigs(FALSE, &oldsigs, &xep) < 0)
5609		mdclrerror(&xep);
5610
5611	metafreereplicalist(rlp);
5612	if (node_id_list)
5613		Free(node_id_list);
5614
5615	metaflushsetname(sp);
5616
5617	if (!(MD_MNSET_DESC(sd))) {
5618		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
5619	}
5620
5621	return (rval);
5622}
5623
5624int
5625meta_set_auto_take(
5626	mdsetname_t	*sp,
5627	int		take_val,
5628	md_error_t	*ep
5629)
5630{
5631	int		i;
5632	md_set_desc	*sd;
5633	int		rval = 0;
5634	md_setkey_t	*cl_sk;
5635	md_error_t	xep = mdnullerror;
5636	char		*hostname;
5637	md_drive_desc	*dd;
5638
5639	if ((sd = metaget_setdesc(sp, ep)) == NULL)
5640		return (-1);
5641
5642	/* Make sure we own the set */
5643	if (meta_check_ownership(sp, ep) != 0)
5644		return (-1);
5645
5646	hostname = mynode();
5647
5648	/* Lock the set on our side */
5649	if (clnt_lock_set(hostname, sp, ep)) {
5650		rval = -1;
5651		goto out;
5652	}
5653
5654	if (take_val) {
5655		/* enable auto_take but only if it is not already set */
5656		if (! (sd->sd_flags & MD_SR_AUTO_TAKE)) {
5657			/* verify that we're the only host in the set */
5658			for (i = 0; i < MD_MAXSIDES; i++) {
5659				if (sd->sd_nodes[i] == NULL ||
5660				    sd->sd_nodes[i][0] == '\0')
5661					continue;
5662
5663				if (strcmp(sd->sd_nodes[i], hostname) != 0) {
5664					(void) mddserror(ep, MDE_DS_SINGLEHOST,
5665					    sp->setno, NULL, NULL, sp->setname);
5666					rval = -1;
5667					goto out;
5668				}
5669			}
5670
5671			if (clnt_enable_sr_flags(hostname, sp,
5672			    MD_SR_AUTO_TAKE, ep))
5673				rval = -1;
5674
5675			/* Disable SCSI reservations */
5676			if (sd->sd_flags & MD_SR_MB_DEVID)
5677				dd = metaget_drivedesc(sp, MD_BASICNAME_OK |
5678				    PRINT_FAST, &xep);
5679			else
5680				dd = metaget_drivedesc(sp, MD_BASICNAME_OK,
5681				    &xep);
5682
5683			if (! mdisok(&xep))
5684				mdclrerror(&xep);
5685
5686			if (dd != NULL) {
5687				if (rel_own_bydd(sp, dd, TRUE, &xep))
5688					mdclrerror(&xep);
5689			}
5690		}
5691
5692	} else {
5693		/* disable auto_take, if set, or error */
5694		if (sd->sd_flags & MD_SR_AUTO_TAKE) {
5695			if (clnt_disable_sr_flags(hostname, sp,
5696			    MD_SR_AUTO_TAKE, ep))
5697				rval = -1;
5698
5699			/* Enable SCSI reservations */
5700			if (sd->sd_flags & MD_SR_MB_DEVID)
5701				dd = metaget_drivedesc(sp, MD_BASICNAME_OK |
5702				    PRINT_FAST, &xep);
5703			else
5704				dd = metaget_drivedesc(sp, MD_BASICNAME_OK,
5705				    &xep);
5706
5707			if (! mdisok(&xep))
5708				mdclrerror(&xep);
5709
5710			if (dd != NULL) {
5711				mhd_mhiargs_t	mhiargs = defmhiargs;
5712
5713				if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
5714					mdclrerror(&xep);
5715			}
5716		} else {
5717			(void) mddserror(ep, MDE_DS_AUTONOTSET, sp->setno,
5718			    NULL, NULL, sp->setname);
5719			rval = -1;
5720		}
5721	}
5722
5723out:
5724	cl_sk = cl_get_setkey(sp->setno, sp->setname);
5725	if (clnt_unlock_set(hostname, cl_sk, &xep)) {
5726		if (rval == 0)
5727			(void) mdstealerror(ep, &xep);
5728		rval = -1;
5729	}
5730	cl_set_setkey(NULL);
5731
5732	return (rval);
5733}
5734