metaclust.c revision 8452:89d32dfdae6e
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <meta.h>
28#include <sdssc.h>
29#include <signal.h>
30#include <syslog.h>
31#include <sys/types.h>
32#include <sys/wait.h>
33#include <sys/lvm/md_mirror.h>
34#include <metad.h>
35
/* Highest interface version this binary supports; default for -V. */
#define	MY_VERSION		"1.0"
/* Largest verbosity level accepted by the -d option. */
#define	MAX_DEBUG_LEVEL		5

/*
 * Action bits for the mode argument of reset_state().  They may be
 * or'ed together.
 */
#define	RESET_OWNER		0x0001	/* drop owners that left membership */
#define	CHOOSE_OWNER		0x0002	/* (re)establish a mirror owner */
#define	RESET_ABR		0x0004	/* open/close device to reset ABR */
#define	UPDATE_ABR		0x0008	/* copy ABR state from the master */
#define	GET_MIRROR_STATE	0x0010	/* issue MD_MN_GET_MIRROR_STATE */

/*
 * Per-set information bits.
 * NOTE(review): presumably kept in main()'s set_info[] array; the code
 * that uses them is not visible here - confirm.
 */
#define	SET_INFO_NO_WR	0x0002
#define	SET_INFO_MN	0x0004
47
/*
 * Every reconfiguration step metaclust understands.  MC_UNK is the value
 * the global stepnum keeps when the step name given on the command line
 * is not found in step_table.
 */
typedef enum stpnum {
	MC_UNK		= 0,
	MC_START	= 1,
	MC_STOP		= 2,
	MC_ABORT	= 3,
	MC_RETURN	= 4,
	MC_STEP1	= 5,
	MC_STEP2	= 6,
	MC_STEP3	= 7,
	MC_STEP4	= 8
} stepnum_t;
62
63/*
64 * Structure for step_name -> step_number mapping
65 */
66struct step_t {
67	char		*step_nam;
68	stepnum_t	step_num;
69};
70
71/*
72 * Step name to step number mapping table
73 * This table MUST be sorted alphabetically in ascending order of step name
74 */
75static struct step_t step_table[] = {
76	{ "abort",	MC_ABORT },
77	{ "return",	MC_RETURN },
78	{ "start",	MC_START },
79	{ "step1",	MC_STEP1 },
80	{ "step2",	MC_STEP2 },
81	{ "step3",	MC_STEP3 },
82	{ "step4",	MC_STEP4 },
83	{ "stop",	MC_STOP }
84};
85
86/*
87 * If support for a different version is added, the new version number should
88 * be appended to the version_table below. This list will be searched to
89 * determine if a version requested via the -V option is supported or not.
90 */
91static char *version_table[] = {
92	MY_VERSION
93};
94
95uint_t	timeout = 0;			/* disable timeout by default */
96char	*version = MY_VERSION;		/* use latest version by default */
97int	stepnum = MC_UNK;		/* reconfiguration step number */
98pid_t	c_pid;				/* child process id */
99
100/*
101 * Binary search comparison routine
102 */
103static int
104mc_compare(const void *stp1, const void *stp2)
105{
106	return (strcmp((const char *)stp1,
107	    ((const struct step_t *)stp2)->step_nam));
108}
109
110/*
111 * Timeout expiry alarm signal handler
112 */
113/*ARGSUSED*/
114static void
115sigalarmhandler(int sig)
116{
117	int	i, n, ret, stat_loc = 0;
118	FILE	*pgcore;
119	char	corecmd[256];
120
121	n = sizeof (step_table) / sizeof (step_table[0]);
122	for (i = 0; i < n; i++) {
123		if (stepnum == step_table[i].step_num)
124			break;
125	}
126
127	assert(i != n);
128
129	meta_mc_log(MC_LOG1, gettext("Timeout expired in %s: %s"),
130	    step_table[i].step_nam,
131	    meta_print_hrtime(gethrtime() - start_time));
132
133	/*
134	 * See what the child was actually doing when the timeout expired.
135	 * A core-dump of this would be _really_ good, so let's just
136	 * try a 'gcore -g c_pid' and hope
137	 */
138
139	(void) memset(corecmd, 0, sizeof (corecmd));
140	(void) snprintf(corecmd, sizeof (corecmd),
141	    "/bin/gcore -g %d >/dev/null 2>&1", (int)c_pid);
142
143	pgcore = popen(corecmd, "r");
144
145	if (pgcore == NULL) {
146		meta_mc_log(MC_LOG1, gettext("Could not grab core for pid %s"),
147		    c_pid);
148	} else {
149		(void) pclose(pgcore);
150	}
151
152	if ((ret = kill(c_pid, SIGKILL)) == 0) {
153		/*
154		 * The child will wait forever until the status is retrieved
155		 * so get it now. Keep retrying if the call is interrupted.
156		 *
157		 * The possible results are,
158		 *
159		 *	- child killed successfully
160		 *	- signal sent but child not killed
161		 *	- waitpid failed/interrupted
162		 */
163		sleep(2);
164		while ((ret = waitpid(c_pid, &stat_loc, WNOHANG)) < 0) {
165			if (errno != EINTR) {
166				break;
167			}
168		}
169		if ((ret == c_pid) || (errno == ECHILD)) {
170			ret = 0;
171		} else {
172			ret = 1;
173		}
174	} else if (errno == ESRCH) {
175		/*
176		 * If the kill did not catch the child then it means the child
177		 * exited immediately after the timeout occured.
178		 */
179		ret = 0;
180	}
181
182	/*
183	 * make sure not to exit with 205 for any steps other than step1-step4.
184	 * Suncluster reconfiguration can't handle it otherwise.
185	 */
186	switch (stepnum) {
187	case MC_STEP1:
188	case MC_STEP2:
189	case MC_STEP3:
190	case MC_STEP4:
191		/*
192		 * If the child was killed successfully return 205 for a
193		 * new reconfig cycle otherwise send 1 to panic the node.
194		 */
195		if (ret != 0) {
196			md_eprintf(gettext("Could not kill child\n"));
197			exit(1);
198		} else {
199			exit(205);
200		}
201		break;
202	case MC_START:
203	case MC_STOP:
204	case MC_ABORT:
205	case MC_RETURN:
206	default:
207		exit(1);
208		break;
209	}
210}
211
212/*
213 * Attempt to load local set.
214 * Returns:
215 *	pointer to mdsetname_t for local set (local_sp) is successful.
216 *	0 if failure
217 *		if there are no local set mddbs, no error message is printed.
218 *		Otherwise, error message is printed so that user
219 *		can determine why the local set didn't start.
220 */
221mdsetname_t *
222load_local_set(md_error_t *ep)
223{
224	mdsetname_t	*local_sp = NULL;
225
226	/* Does local set exist? If not, give no error */
227	if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) {
228		return (0);
229	}
230
231	/*
232	 * snarf local set
233	 * If fails with MDE_DB_NODB, then just return 1 printing
234	 * no failure.
235	 * Otherwise, print error message, and return 1.
236	 */
237	if (meta_setup_db_locations(ep) != 0) {
238		if (!(mdismddberror(ep, MDE_DB_NODB)))
239			mde_perror(ep, "");
240		return (0);
241	}
242
243	/* local set loaded successfully */
244	return (local_sp);
245}
246
247/*
248 * Purpose:	Compose a full path name for a metadevice
249 *
250 * On entry:	sp	- setname pointer
251 *		mnum	- minor number of metadevice
252 *		pathname - pointer to array to return path string
253 *		pathlen	- max length of pathname array
254 */
255static int
256compose_path(mdsetname_t *sp, int mnum, char *pathname, int pathlen)
257{
258	int	rtn;
259	mdname_t	*np;
260	md_error_t	status = mdnullerror;
261
262	if (MD_MIN2SET(mnum) != sp->setno) {
263		md_eprintf(gettext("minor number 0x%x invalid for set %d\n"),
264		    mnum, sp->setno);
265		return (-1);
266	}
267
268	if ((np = metamnumname(&sp, mnum, 0, &status)) == NULL) {
269		return (-1);
270	}
271
272	rtn = snprintf(pathname, pathlen, "%s", np->rname);
273
274	if ((pathname[0] == '\0') || (rtn >= pathlen)) {
275		md_eprintf(gettext(
276		    "Could not create path for device %s\n"),
277		    get_mdname(sp, mnum));
278		return (-1);
279	}
280	return (0);
281}
282
283/*
284 * Purpose:	Walk through all the devices specified for the given set
285 *		and do the action specified in mode
286 */
287static int
288reset_state(uint_t mode, mdsetname_t *sp, char *drivername, md_error_t *ep)
289{
290	mdnamelist_t			*devnlp = NULL;
291	mdnamelist_t			*p;
292	mdname_t			*devnp = NULL;
293	md_set_mmown_params_t		ownpar_p;
294	md_set_mmown_params_t		*ownpar = &ownpar_p;
295	md_unit_t			*mm;
296	int				mirror_dev = 0;
297	mndiskset_membershiplist_t	*nl;
298	int				cnt;
299	int				has_parent;
300	md_mn_get_mir_state_t		mir_state_p;
301	md_mn_get_mir_state_t		*mir_state = &mir_state_p;
302
303	/*
304	 * if we are choosing or resetting the owners then make sure
305	 * we are only doing it for mirror devices
306	 */
307	mirror_dev = (strcmp(MD_MIRROR, drivername) == 0);
308	if ((mode & (RESET_OWNER | CHOOSE_OWNER)) && !mirror_dev) {
309		return (-1);
310	}
311
312	/* get a list of all the metadevices for current set */
313	if (mirror_dev && meta_get_mirror_names(sp, &devnlp, 0, ep) < 0) {
314		mde_perror(ep, gettext("Could not get mirrors for set %s"),
315		    sp->setname);
316		return (-1);
317	} else if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
318		mde_perror(ep, gettext(
319		    "Could not get soft partitions for set %s"), sp->setname);
320		return (-1);
321	}
322
323	/* If resetting the owner, get the known membership list */
324	if (mode & RESET_OWNER) {
325		if (meta_read_nodelist(&cnt, &nl, ep)) {
326			mde_perror(ep, "Could not get nodelist");
327			return (-1);
328		}
329	}
330
331	/* for each metadevice */
332	for (p = devnlp; (p != NULL); p = p->next) {
333		devnp = p->namep;
334
335		/*
336		 * Get the current setting for mirror ABR state and all of the
337		 * submirror state and flags from the master node. We only
338		 * perform this when going through a 'start' cycle.
339		 */
340		if ((mode & GET_MIRROR_STATE) && mirror_dev) {
341			char	*miscname;
342
343			/*
344			 * Ensure that we ignore soft-parts that are returned
345			 * from the meta_get_mirror_names() call
346			 */
347			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
348				goto out;
349			if (strcmp(miscname, MD_MIRROR) != 0)
350				continue;
351
352			mir_state->mnum = meta_getminor(devnp->dev);
353			MD_SETDRIVERNAME(mir_state, MD_MIRROR, sp->setno);
354			meta_mc_log(MC_LOG4, gettext("Getting mirror state"
355			    " for %s: %s"), get_mdname(sp, mir_state->mnum),
356			    meta_print_hrtime(gethrtime() - start_time));
357
358			if (metaioctl(MD_MN_GET_MIRROR_STATE, mir_state, ep,
359			    "MD_MN_GET_MIRROR_STATE") != 0) {
360				mde_perror(ep, gettext("Unable to get "
361				    "mirror state for %s"),
362				    get_mdname(sp, mir_state->mnum));
363				goto out;
364			} else {
365				continue;
366			}
367		}
368
369		/* check if this is a top level metadevice */
370		if ((mm = meta_get_mdunit(sp, devnp, ep)) == NULL)
371			goto out;
372		if (MD_HAS_PARENT(MD_PARENT(mm))) {
373			has_parent = 1;
374		} else {
375			has_parent = 0;
376		}
377		Free(mm);
378
379		if (mode & (RESET_OWNER | CHOOSE_OWNER)) {
380			char	*miscname;
381
382			/*
383			 * we can only do these for mirrors so make sure we
384			 * really have a mirror device and not a softpartition
385			 * imitating one. meta_get_mirror_names seems to think
386			 * softparts on top of a mirror are mirrors!
387			 */
388			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
389				goto out;
390			if (strcmp(miscname, MD_MIRROR) != 0)
391				continue;
392
393			(void) memset(ownpar, 0, sizeof (*ownpar));
394			ownpar->d.mnum = meta_getminor(devnp->dev);
395			MD_SETDRIVERNAME(ownpar, MD_MIRROR, sp->setno);
396
397			meta_mc_log(MC_LOG4, gettext("Setting owner "
398			    "for %s: %s"), get_mdname(sp, ownpar->d.mnum),
399			    meta_print_hrtime(gethrtime() - start_time));
400
401			/* get the current owner id */
402			if (metaioctl(MD_MN_GET_MM_OWNER, ownpar, ep,
403			    "MD_MN_GET_MM_OWNER") != 0) {
404				mde_perror(ep, gettext("Unable to get "
405				    "mirror owner for %s"),
406				    get_mdname(sp, ownpar->d.mnum));
407				goto out;
408			}
409		}
410
411		if (mode & RESET_OWNER) {
412			if (ownpar->d.owner == MD_MN_MIRROR_UNOWNED) {
413				mdclrerror(ep);
414				continue;
415			}
416
417			/*
418			 * reset owner only if the current owner is
419			 * not in the membership list
420			 * Also kill the resync thread so that when the resync
421			 * is started, it will perform an optimized resync
422			 * for any resync regions that were dirty when the
423			 * current owner left the membership.
424			 */
425			if (meta_is_member(NULL, ownpar->d.owner, nl) != 1) {
426				if (meta_mn_change_owner(&ownpar,
427				    sp->setno, ownpar->d.mnum,
428				    MD_MN_MIRROR_UNOWNED,
429				    MD_MN_MM_ALLOW_CHANGE) == -1) {
430					md_eprintf(gettext(
431					    "Unable to reset mirror owner "
432					    "for %s\n"),
433					    get_mdname(sp, ownpar->d.mnum));
434					goto out;
435				}
436				if (meta_mirror_resync(sp, devnp, 0, ep,
437				    MD_RESYNC_KILL_NO_WAIT) != 0) {
438					md_eprintf(gettext(
439					    "Unable to kill resync for"
440					    " %s\n"),
441					    get_mdname(sp, ownpar->d.mnum));
442					goto out;
443				}
444			}
445		}
446
447		if (mode & CHOOSE_OWNER) {
448			/*
449			 * only orphaned resyncs will have no owner.
450			 * if that is the case choose a new owner. Otherwise
451			 * re-establish the existing owner. This covers the
452			 * case where a node that owned the mirror
453			 * reboots/panics and comes back into the cluster before
454			 * the reconfig cycle has completed. In this case the
455			 * other cluster nodes will have the mirror owner marked
456			 * as the rebooted node while it has the owner marked
457			 * as 'None'. We have to reestablish the ownership so
458			 * that the subsequent resync can continue.
459			 */
460			if (meta_mn_change_owner(&ownpar, sp->setno,
461			    ownpar->d.mnum, ownpar->d.owner,
462			    MD_MN_MM_CHOOSE_OWNER) == -1) {
463				md_eprintf(gettext("Unable to choose "
464				    "mirror owner for %s\n"),
465				    get_mdname(sp, ownpar->d.mnum));
466				goto out;
467			}
468		}
469
470		/*
471		 * For RESET_ABR and UPDATE_ABR - only handle top
472		 * level metadevices.
473		 */
474		if (has_parent)
475			continue;
476
477		if (mode & RESET_ABR) {
478			/*
479			 * Reset the ABR (application based recovery)
480			 * value on all nodes. We are dealing with
481			 * the possibility that we have ABR set but the
482			 * only node that had the device open with ABR has
483			 * left the cluster. We simply open and close the
484			 * device and if this is the last close in the
485			 * cluster, ABR will be cleared on all nodes.
486			 */
487			char		*miscname;
488			char		name[MAXPATHLEN];
489			int		mnum, fd;
490
491			name[0] = '\0';
492			mnum = meta_getminor(devnp->dev);
493
494			/*
495			 * Ensure that we don't include soft-parts in the
496			 * mirror-only call to RESET_ABR. meta_get_mirror_names
497			 * returns a bogus list that includes all soft-parts
498			 * built on mirrors.
499			 */
500			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
501				goto out;
502			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
503				continue;
504
505			meta_mc_log(MC_LOG4, gettext("Re-setting ABR state "
506			    "for %s: %s"), get_mdname(sp, mnum),
507			    meta_print_hrtime(gethrtime() - start_time));
508
509			/* compose the absolute device path and open it */
510			if (compose_path(sp, mnum, &name[0],
511			    sizeof (name)) != 0)
512				goto out;
513			if ((fd = open(name, O_RDWR, 0)) < 0) {
514				md_perror(gettext("Could not open device %s"),
515				    name);
516				continue;
517			}
518
519			(void) close(fd);
520		}
521
522		if (mode & UPDATE_ABR) {
523			/*
524			 * Update the ABR value on this node. We obtain the
525			 * current ABR state from the master node.
526			 */
527
528			char		*miscname;
529			char		name[MAXPATHLEN];
530			int		mnum, fd;
531			volcap_t	vc;
532			uint_t		tstate;
533
534			name[0] = '\0';
535			mnum = meta_getminor(devnp->dev);
536
537			/*
538			 * Ensure that we don't include soft-parts in the
539			 * mirror-only call to UPDATE_ABR. meta_get_mirror_names
540			 * returns a bogus list that includes all soft-parts
541			 * built on mirrors.
542			 */
543			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
544				goto out;
545			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
546				continue;
547
548			/* Get tstate from Master */
549			if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep)
550			    != 0)
551				continue;
552			/* If not set on the master, nothing to do */
553			if (!(tstate & MD_ABR_CAP))
554				continue;
555
556			meta_mc_log(MC_LOG4, gettext("Updating ABR state "
557			    "for %s: %s"), get_mdname(sp, mnum),
558			    meta_print_hrtime(gethrtime() - start_time));
559
560			/* compose the absolute device path and open it */
561			if (compose_path(sp, mnum, &name[0],
562			    sizeof (name)) != 0)
563				goto out;
564			if ((fd = open(name, O_RDWR, 0)) < 0) {
565				md_perror(gettext("Could not open device %s"),
566				    name);
567				continue;
568			}
569
570			/* set ABR state */
571			vc.vc_info = 0;
572			vc.vc_set = 0;
573			if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
574				/*
575				 * Ignore if device does not support this
576				 * ioctl
577				 */
578				if ((errno != ENOTTY) && (errno != ENOTSUP)) {
579					md_perror(gettext("Could not get "
580					    "ABR/DMR state for device %s"),
581					    name);
582				}
583				(void) close(fd);
584				continue;
585			}
586			if (!(vc.vc_info & (DKV_ABR_CAP | DKV_DMR_CAP))) {
587				(void) close(fd);
588				continue;
589			}
590
591			vc.vc_set = DKV_ABR_CAP;
592			if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
593				md_perror(gettext(
594				    "Could not set ABR state for "
595				    "device %s"), name);
596				(void) close(fd);
597				goto out;
598			} else {
599				md_eprintf(gettext(
600				    "Setting ABR state on device %s\n"), name);
601			}
602
603			(void) close(fd);
604		}
605	}
606
607	/* cleanup */
608	if (mode & RESET_OWNER) {
609		meta_free_nodelist(nl);
610	}
611	metafreenamelist(devnlp);
612	return (0);
613
614out:
615	/* cleanup */
616	if (mode & RESET_OWNER) {
617		meta_free_nodelist(nl);
618	}
619	metafreenamelist(devnlp);
620	return (-1);
621}
622
623/*
624 * Print usage message
625 */
626static void
627usage(mdsetname_t *sp, int eval)
628{
629	(void) fprintf(stderr, gettext("usage:"
630	    "\t%s [-V version] [-t timeout] [-d level] start localnodeid\n"
631	    "\t%s [-V version] [-t timeout] [-d level] step nodelist...\n"
632	    "\t%s [-V version] [-t timeout] [-d level] abort | stop\n"
633	    "\t%s [-V | -? | -h]\n"),
634	    myname, myname, myname, myname);
635	if (!eval) {
636		fprintf(stderr, gettext("\n"
637		    "\tValid debug (-d) levels are 1-%d for increasing "
638		    "verbosity.\n\tDefault is -d 3.\n\n"
639		    "\tValid step values are: return | step1 | step2 | "
640		    "step3 | step4\n\n"
641		    "\tNodelist is a space-separated list of node id's\n\n"),
642		    MAX_DEBUG_LEVEL);
643	}
644	md_exit(sp, eval);
645}
646
647/*
648 * Input:	Input takes a config step name followed by a list of
649 *		possible node id's.
650 *
651 * Returns:	  0 - Success
652 *		  1 - Fail
653 *			Node will be removed from cluster membership
654 *			by forcing node to panic.
655 *		205 - Unsuccessful. Start another reconfig cycle.
656 *			Problem was encountered that could be fixed by
657 *			running another reconfig cycle.
658 *			Problem could be a result of a failure to read
659 *			the nodelist file or that all work could not be
660 *			accomplished in a reconfig step in the amount of
661 *			time given so another reconfig cycle is needed in
662 *			order to finish the current step.
663 */
664int
665main(int argc, char **argv)
666{
667	mdsetname_t		*sp = NULL;
668	md_error_t		status = mdnullerror;
669	md_error_t		*ep = &status;
670	set_t			max_sets, setno;
671	int			c, clust = 0;
672	struct sigaction	nsa, osa;
673	struct step_t		*step_ptr;
674	mdsetname_t		*local_sp = NULL;
675	md_drive_desc		*dd;
676	int			rval = 0;
677	md_set_desc		*sd;
678	mddb_block_parm_t	mbp;
679	uint_t			debug = 3; /* log upto MC_LOG3 by default */
680	int			version_table_size;
681	mddb_setflags_config_t	sf;
682	int			ret_val;
683	mddb_config_t		cfg;
684	int			set_info[MD_MAXSETS];
685	long			commd_timeout = 0;
686
687	/*
688	 * Get the locale set up before calling any other routines
689	 * with messages to ouput.  Just in case we're not in a build
690	 * environment, make sure that TEXT_DOMAIN gets set to
691	 * something.
692	 */
693#if !defined(TEXT_DOMAIN)
694#define	TEXT_DOMAIN "SYS_TEST"
695#endif
696	(void) setlocale(LC_ALL, "");
697	(void) textdomain(TEXT_DOMAIN);
698
699	if ((clust = sdssc_bind_library()) == SDSSC_ERROR) {
700		md_eprintf(gettext("Interface error with libsds_sc.so\n"));
701		exit(1);
702	}
703
704	if (md_init(argc, argv, 1, 1, ep) != 0 || meta_check_root(ep) != 0) {
705		mde_perror(ep, "");
706		md_exit(sp, 1);
707	}
708
709	/*
710	 * open log and enable libmeta logging. Do it here explicitly
711	 * rather than letting md_init() do it because we are not really
712	 * a daemon and that is what md_init() opens the log as.
713	 */
714	openlog("metaclust", LOG_CONS, LOG_USER);
715
716	version_table_size = sizeof (version_table) / sizeof (version_table[0]);
717
718	optind = 1;
719	opterr = 0;
720	while ((c = getopt(argc, argv, "hd:V:t:?")) != -1) {
721		switch (c) {
722		case 'h':
723			usage(sp, 0);
724			break;
725
726		case 'd':
727			if (sscanf(optarg, "%u", &debug) != 1) {
728				md_eprintf(gettext("Invalid debug level\n"));
729				md_exit(sp, 1);
730			} else if ((debug < 1) || (debug > MAX_DEBUG_LEVEL)) {
731				debug = min(max(debug, 1), MAX_DEBUG_LEVEL);
732				md_eprintf(gettext("Debug level must be "
733				    "between 1 and %d inclusive.\n"),
734				    MAX_DEBUG_LEVEL);
735				md_eprintf(gettext("Debug level set to %d.\n"),
736				    debug);
737			}
738			break;
739
740		case 'V':
741			version = Strdup(optarg);
742			break;
743
744		case 't':
745			if (sscanf(optarg, "%u", &timeout) != 1) {
746				md_eprintf(gettext("Invalid timeout value\n"));
747				md_exit(sp, 1);
748			}
749			break;
750
751		case '?':
752			if (optopt == '?') {
753				usage(sp, 0);
754			} else if (optopt == 'V') {
755				int	i;
756
757				fprintf(stdout, gettext(
758				    "%s: Versions Supported:"), myname);
759				for (i = 0; i < version_table_size; i++) {
760					fprintf(stdout, " %s",
761					    version_table[i]);
762				}
763				fprintf(stdout, "\n");
764				md_exit(sp, 0);
765			}
766			/*FALLTHROUGH*/
767
768		default:
769			usage(sp, 1);
770			break;
771		}
772	}
773
774	/* initialise the debug level and start time */
775	setup_mc_log(debug);
776
777	/*
778	 * check that the version specified (if any) is supported.
779	 */
780	if (version != NULL) {
781		int	i, found = 0;
782
783		for (i = 0; i < version_table_size; i++) {
784			if (strcmp(version, version_table[i]) == 0) {
785				found = 1;
786				break;
787			}
788		}
789		if (!found) {
790			md_eprintf(gettext("Version %s not supported\n"),
791			    version);
792			md_exit(sp, 1);
793		}
794	}
795
796	argc -= optind;
797	argv += optind;
798
799	/* parse arguments */
800	if (argc <= 0) {
801		usage(sp, 1);
802	}
803
804	/* convert the step name to the corresponding number */
805	step_ptr = bsearch(argv[0], step_table, (sizeof (step_table) /
806	    sizeof (step_table[0])), sizeof (step_table[0]), mc_compare);
807	if (step_ptr != NULL) {
808		stepnum = step_ptr->step_num;
809	}
810
811	--argc;
812	++argv;
813
814	/* set timeout alarm signal, a value of 0 will disable timeout */
815	if (timeout > 0) {
816		int	stat_loc = 0;
817		commd_timeout = (long)(timeout * .75);
818
819		c_pid = fork();
820
821		if (c_pid == (pid_t)-1) {
822			md_perror(gettext("Unable to fork"));
823			md_exit(sp, 1);
824		} else if (c_pid) {
825			/* parent */
826			nsa.sa_flags = 0;
827			if (sigfillset(&nsa.sa_mask) < 0) {
828				md_perror(gettext("Unable to set signal mask"));
829				md_exit(sp, 1);
830			}
831
832			nsa.sa_handler = sigalarmhandler;
833			if (sigaction(SIGALRM, &nsa, &osa) == -1) {
834				md_perror(gettext("Unable to set alarm "
835				    "handler"));
836				md_exit(sp, 1);
837			}
838
839			(void) alarm(timeout);
840
841			/*
842			 * wait for child to exit or timeout to expire.
843			 * keep retrying if the call is interrupted
844			 */
845			while ((ret_val = waitpid(c_pid, &stat_loc, 0)) < 0) {
846				if (errno != EINTR) {
847					break;
848				}
849			}
850			if (ret_val == c_pid) {
851				/* exit with the childs exit value */
852				exit(WEXITSTATUS(stat_loc));
853			} else if (errno == ECHILD) {
854				md_exit(sp, 0);
855			} else {
856				perror(myname);
857				md_exit(sp, 1);
858			}
859		}
860	}
861
862	/*
863	 * If a timeout value is given, everything from this point onwards is
864	 * executed in the child process.
865	 */
866
867	switch (stepnum) {
868	case MC_START:
869		/*
870		 * Start Step
871		 *
872		 * - Suspend all rpc.mdcommd messages
873		 */
874
875		/* expect the local node id to be given only */
876		if (argc != 1)
877			usage(sp, 1);
878
879		meta_mc_log(MC_LOG2, gettext("Starting Start step: %s"),
880		    meta_print_hrtime(0));
881
882		/*
883		 * Does local set exist? If not, exit with 0
884		 * since there's no reason to have this node panic if
885		 * the local set cannot be started.
886		 */
887		if ((local_sp = load_local_set(ep)) == NULL) {
888			md_exit(local_sp, 0);
889		}
890
891		if ((max_sets = get_max_sets(ep)) == 0) {
892			mde_perror(ep, "");
893			md_exit(sp, 1);
894		}
895
896		/* start walking through all possible disksets */
897		for (setno = 1; setno < max_sets; setno++) {
898			if ((sp = metasetnosetname(setno, ep)) == NULL) {
899				if (mdiserror(ep, MDE_NO_SET)) {
900					/* No set for this setno - continue */
901					mdclrerror(ep);
902					continue;
903				} else {
904					mde_perror(ep, gettext("Unable to "
905					    "get set %d information"), setno);
906					md_exit(sp, 1);
907				}
908			}
909
910			/* only check multi-node disksets */
911			if (!meta_is_mn_set(sp, ep)) {
912				mdclrerror(ep);
913				continue;
914			}
915
916			meta_mc_log(MC_LOG3, gettext("Start - block parse "
917			    "messages for set %s: %s"), sp->setname,
918			    meta_print_hrtime(gethrtime() - start_time));
919
920			/*
921			 * Mddb parse messages are sent amongst the nodes
922			 * in a diskset whenever the locator block or
923			 * locator names structure has been changed.
924			 * A locator block change could occur as a result
925			 * of a disk failure during the reconfig cycle,
926			 * so block the mddb parse messages while the
927			 * rpc.mdcommd is suspended during the reconfig cycle.
928			 */
929			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
930				(void) memset(&mbp, 0, sizeof (mbp));
931				mbp.c_setno = setno;
932				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
933				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
934				    &mbp.c_mde, NULL)) {
935					mdstealerror(ep, &mbp.c_mde);
936					mde_perror(ep, gettext("Could not "
937					    "block set %s"), sp->setname);
938					md_exit(sp, 1);
939				}
940			}
941
942			/* suspend commd and spin waiting for drain */
943			while ((ret_val = mdmn_suspend(setno,
944			    MD_COMM_ALL_CLASSES, commd_timeout)) ==
945			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
946				sleep(1);
947			}
948
949			if (ret_val) {
950				md_eprintf(gettext("Could not suspend "
951				    "rpc.mdcommd for set %s\n"), sp->setname);
952				md_exit(sp, 1);
953			}
954
955			/*
956			 * Set start step flag for set. This is set to indicate
957			 * that this node entered the reconfig cycle through
958			 * the start step.  This is used during the reconfig
959			 * cycle to determine whether the node had entered
960			 * through the start step or the return step.
961			 */
962			(void) memset(&sf, 0, sizeof (sf));
963			sf.sf_setno = sp->setno;
964			sf.sf_setflags = MD_SET_MN_START_RC;
965			sf.sf_flags = MDDB_NM_SET;
966			/* Use magic to help protect ioctl against attack. */
967			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
968			if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
969			    &sf.sf_mde, NULL)) {
970				mdstealerror(ep, &sf.sf_mde);
971				mde_perror(ep, gettext("Could not set "
972				    "start_step flag for set %s"), sp->setname);
973				md_exit(sp, 1);
974			}
975
976		}
977
978		meta_mc_log(MC_LOG2, gettext("Start step completed: %s"),
979		    meta_print_hrtime(gethrtime() - start_time));
980
981		break;
982
983	case MC_STOP:
984		/*
985		 * Stop Step
986		 *
987		 * - ???
988		 */
989
990		/* don't expect any more arguments to follow the step name */
991		if (argc != 0)
992			usage(sp, 1);
993
994		break;
995
996	case MC_ABORT:
997		/*
998		 * Abort Step
999		 *
1000		 * - Abort rpc.mdcommd
1001		 */
1002
1003		/* don't expect any more arguments to follow the step name */
1004		if (argc != 0)
1005			usage(sp, 1);
1006
1007		meta_mc_log(MC_LOG2, gettext("Starting Abort step: %s"),
1008		    meta_print_hrtime(0));
1009
1010		/*
1011		 * Does local set exist? If not, exit with 0
1012		 * since there's no reason to have this node panic if
1013		 * the local set cannot be started.
1014		 */
1015		if ((local_sp = load_local_set(ep)) == NULL) {
1016			md_exit(local_sp, 0);
1017		}
1018
1019		/*
1020		 * abort the rpc.mdcommd.  The abort is only issued on this node
1021		 * meaning that the abort reconfig step is called on this
1022		 * node before a panic while the rest of the cluster will
1023		 * undergo a reconfig cycle.
1024		 * There is no time relation between this node running a
1025		 * reconfig abort and the the rest of the cluster
1026		 * running a reconfig cycle meaning that this node may
1027		 * panic before, during or after the cluster has run
1028		 * a reconfig cycle.
1029		 */
1030		mdmn_abort();
1031
1032		meta_mc_log(MC_LOG2, gettext("Abort step completed: %s"),
1033		    meta_print_hrtime(gethrtime() - start_time));
1034
1035		break;
1036
1037	case MC_RETURN:
1038		/*
1039		 * Return Step
1040		 *
1041		 * - Grab local set lock, issue rpc.mdcommd DRAIN ALL
1042		 *   and release local set lock.  Grabbing the local set
1043		 *   lock allows any active metaset/metadb commands to
1044		 *   terminate gracefully and will keep a metaset/metadb
1045		 *   command from starting until the DRAIN ALL is issued.
1046		 *   The metaset/metadb commands can issue
1047		 *   DRAIN ALL/RESUME ALL commands to rpc.mdcommd,
1048		 *   so the return step must not issue the DRAIN ALL command
1049		 *   until metaset/metadb have finished or metaset may issue
1050		 *   a RESUME ALL after this return reconfig step has issued
1051		 *   the DRAIN ALL command.
1052		 *   After this reconfig step has issued the DRAIN_ALL and
1053		 *   released the local set lock, metaset/metadb will fail
1054		 *   when attempting to contact the rpc.mdcommd and will
1055		 *   terminate without making any configuration changes.
1056		 *   The DRAIN ALL command will keep all other meta* commands
1057		 *   from running during the reconfig cycle (these commands
1058		 *   will wait until the rpc.mdcommd is resumed) since the
1059		 *   reconfig cycle may be changing the diskset configuration.
1060		 */
1061
1062		/* expect the nodelist to follow the step name */
1063		if (argc < 1)
1064			usage(sp, 1);
1065
1066		meta_mc_log(MC_LOG2, gettext("Starting Return step: %s"),
1067		    meta_print_hrtime(0));
1068
1069		/*
1070		 * Does local set exist? If not, exit with 0
1071		 * since there's no reason to have this node panic if
1072		 * the local set cannot be started.
1073		 */
1074		if ((local_sp = load_local_set(ep)) == NULL) {
1075			md_exit(local_sp, 0);
1076		}
1077
1078		/*
1079		 * Suspend any mirror resyncs that are in progress. This
1080		 * stops unnecessary timeouts.
1081		 */
1082		meta_mirror_resync_block_all();
1083
1084		if (meta_lock(local_sp, TRUE, ep) != 0) {
1085			mde_perror(ep, "");
1086			md_exit(local_sp, 1);
1087		}
1088
1089		/*
1090		 * All metaset and metadb commands on this node have now
1091		 * terminated gracefully.  Now, issue a drain all to
1092		 * the rpc.mdcommd.  Any meta command issued after the
1093		 * drain all will either spin sending the command to the
1094		 * master until after the reconfig cycle has finished OR
1095		 * will terminate gracefully (metaset/metadb).
1096		 */
1097		if ((max_sets = get_max_sets(ep)) == 0) {
1098			mde_perror(ep, "");
1099			md_exit(sp, 1);
1100		}
1101
1102		/* start walking through all possible disksets */
1103		for (setno = 1; setno < max_sets; setno++) {
1104			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1105				if (mdiserror(ep, MDE_NO_SET)) {
1106					/* No set for this setno - continue */
1107					mdclrerror(ep);
1108					continue;
1109				} else {
1110					mde_perror(ep, gettext("Unable to "
1111					    "get set %d information"), setno);
1112					md_exit(sp, 1);
1113				}
1114			}
1115
1116			/* only check multi-node disksets */
1117			if (!meta_is_mn_set(sp, ep)) {
1118				mdclrerror(ep);
1119				continue;
1120			}
1121
1122			meta_mc_log(MC_LOG3, gettext("Return - block parse "
1123			    "messages for set %s: %s"), sp->setname,
1124			    meta_print_hrtime(gethrtime() - start_time));
1125
1126			/*
1127			 * Mddb parse messages are sent amongst the nodes
1128			 * in a diskset whenever the locator block or
1129			 * locator names structure has been changed.
1130			 * A locator block change could occur as a result
1131			 * of a disk failure during the reconfig cycle,
1132			 * so block the mddb parse messages while the
1133			 * rpc.mdcommd is suspended during the reconfig cycle.
1134			 */
1135			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
1136				(void) memset(&mbp, 0, sizeof (mbp));
1137				mbp.c_setno = setno;
1138				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
1139				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
1140				    &mbp.c_mde, NULL)) {
1141					mdstealerror(ep, &mbp.c_mde);
1142					mde_perror(ep, gettext("Could not "
1143					    "block set %s"), sp->setname);
1144					md_exit(sp, 1);
1145				}
1146			}
1147
1148			/* suspend commd and spin waiting for drain */
1149			while ((ret_val = mdmn_suspend(setno,
1150			    MD_COMM_ALL_CLASSES, commd_timeout)) ==
1151			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
1152				sleep(1);
1153			}
1154
1155			if (ret_val) {
1156				md_eprintf(gettext("Could not suspend "
1157				    "rpc.mdcommd for set %s\n"), sp->setname);
1158				md_exit(sp, 1);
1159			}
1160		}
1161		/*
1162		 * Resume all I/Os for this node for all MN sets in
1163		 * case master node had suspended I/Os but panic'd
1164		 * before resuming I/Os.  In case of failure, exit
1165		 * with a 1 since unable to resume I/Os on this node.
1166		 */
1167		if (clnt_mn_susp_res_io(mynode(), 0, MN_RES_IO, ep)) {
1168			mde_perror(ep, gettext(
1169			    "Unable to resume I/O on node %s for all sets"),
1170			    mynode());
1171			md_exit(sp, 1);
1172		}
1173
1174
1175		/*
1176		 * Can now unlock local set lock.  New metaset/metadb
1177		 * commands are now held off using drain all.
1178		 */
1179		(void) meta_unlock(local_sp, ep);
1180
1181		meta_mc_log(MC_LOG2, gettext("Return step completed: %s"),
1182		    meta_print_hrtime(gethrtime() - start_time));
1183
1184		break;
1185
1186	case MC_STEP1:
1187		/*
1188		 * Step 1
1189		 *
1190		 * - Populate nodelist file if we are on clustering
1191		 *   and pick a master node for each MN diskset.
1192		 */
1193
1194		/* expect the nodelist to follow the step name */
1195		if (argc < 1)
1196			usage(sp, 1);
1197
1198		meta_mc_log(MC_LOG2, gettext("Starting Step1: %s"),
1199		    meta_print_hrtime(0));
1200
1201		/* Always write nodelist file even if no local set exists */
1202		if (clust == SDSSC_OKAY) {
1203			/* skip to the nodelist args */
1204			if (meta_write_nodelist(argc, argv, ep) != 0) {
1205				mde_perror(ep, gettext(
1206				    "Could not populate nodelist file"));
1207				md_exit(sp, 1);
1208			}
1209		}
1210
1211		/*
1212		 * Does local set exist? If not, exit with 0
1213		 * since there's no reason to have this node panic if
1214		 * the local set cannot be started.
1215		 */
1216		if ((local_sp = load_local_set(ep)) == NULL) {
1217			md_exit(local_sp, 0);
1218		}
1219
1220		/*
1221		 * At this point, all meta* commands are blocked across
1222		 * all disksets since the master rpc.mdcommd has drained or
1223		 * the master node has died.
1224		 * If a metaset or metadb command had been in progress
1225		 * at the start of the reconfig cycle, this command has
1226		 * either completed or it has been terminated due to
1227		 * the death of the master node.
1228		 *
1229		 * This means that it is now ok to remove any
1230		 * outstanding clnt_locks associated with multinode
1231		 * disksets on this node due to a node panic during
1232		 * a metaset operation.  This allows the routines that
1233		 * choose the master to use rpc.metad to determine the
1234		 * master of the diskset.
1235		 */
1236		if (clnt_clr_mnsetlock(mynode(), ep) != 0) {
1237			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
1238			    "clear locks failed %s"),
1239			    meta_print_hrtime(gethrtime() - start_time));
1240			md_exit(local_sp, 1);
1241		}
1242
1243		/*
1244		 * Call reconfig_choose_master to choose a master for
1245		 * each MN diskset, update the nodelist for each diskset
1246		 * given the member information and send a reinit message
1247		 * to rpc.mdcommd to reload the nodelist.
1248		 */
1249		rval = meta_reconfig_choose_master(commd_timeout, ep);
1250		if (rval == 205) {
1251			/*
1252			 * NOTE: Should issue call to reboot remote host that
1253			 * is causing the RPC failure.  Clustering to
1254			 * provide interface in the future.  This should
1255			 * stop a never-ending set of 205 reconfig cycles.
1256			 * Remote host causing failure is stored in
1257			 * ep->host if ep is an RPC error.
1258			 * if (mdanyrpcerror(ep))
1259			 * 	reboot (ep->host);
1260			 */
1261			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
1262			    "choose master failure of 205 %s"),
1263			    meta_print_hrtime(gethrtime() - start_time));
1264			md_exit(local_sp, 205);
1265		} else if (rval != 0) {
1266			meta_mc_log(MC_LOG2, gettext("Step1 failure: "
1267			    "choose master failure %s"),
1268			    meta_print_hrtime(gethrtime() - start_time));
1269			md_exit(local_sp, 1);
1270		}
1271
1272		meta_mc_log(MC_LOG2, gettext("Step1 completed: %s"),
1273		    meta_print_hrtime(gethrtime() - start_time));
1274
1275		md_exit(local_sp, rval);
1276		break;
1277
1278	case MC_STEP2:
1279		/*
1280		 * Step 2
1281		 *
1282		 * In Step 2, each node walks the list of disksets.  If a
1283		 * node is a master of a MN diskset, it synchronizes
1284		 * the local set USER records for that diskset.
1285		 *
1286		 * If disks exist in the diskset and there is a joined
1287		 * (owner) node in the diskset, the master will also:
1288		 *	- synchronize the diskset mddbs to the master
1289		 *	- play the change log
1290		 *
1291		 * The master node will now attempt to join any unjoined
1292		 * nodes that are currently members in the membership list.
1293		 */
1294
1295		/* expect the nodelist to follow the step name */
1296		if (argc < 1)
1297			usage(sp, 1);
1298
1299		meta_mc_log(MC_LOG2, gettext("Starting Step2: %s"),
1300		    meta_print_hrtime(0));
1301
1302		/*
1303		 * Does local set exist? If not, exit with 0
1304		 * since there's no reason to have this node panic if
1305		 * the local set cannot be started.
1306		 */
1307		if ((local_sp = load_local_set(ep)) == NULL) {
1308			md_exit(local_sp, 0);
1309		}
1310
1311		if ((max_sets = get_max_sets(ep)) == 0) {
1312			mde_perror(ep, "");
1313			md_exit(local_sp, 1);
1314		}
1315
1316		/* start walking through all possible disksets */
1317		for (setno = 1; setno < max_sets; setno++) {
1318			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1319				if (mdiserror(ep, MDE_NO_SET)) {
1320					/* No set for this setno - continue */
1321					mdclrerror(ep);
1322					continue;
1323				} else if (mdanyrpcerror(ep)) {
1324					/* Fail on RPC failure to self */
1325					mde_perror(ep, gettext(
1326					    "Unable to get information for "
1327					    "set number %d"), setno);
1328					md_exit(local_sp, 1);
1329				} else {
1330					mde_perror(ep, gettext(
1331					    "Unable to get information for "
1332					    "set number %d"), setno);
1333					mdclrerror(ep);
1334					continue;
1335				}
1336			}
1337
1338			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1339				if (mdanyrpcerror(ep)) {
1340					/* Fail on RPC failure to self */
1341					mde_perror(ep, gettext(
1342					    "Unable to get information for "
1343					    "set number %d"), setno);
1344					md_exit(local_sp, 1);
1345				}
1346				mde_perror(ep, gettext("Unable to get set "
1347				    "%s desc information"), sp->setname);
1348				mdclrerror(ep);
1349				continue;
1350			}
1351
1352			/* Only check MN disksets */
1353			if (!(MD_MNSET_DESC(sd))) {
1354				continue;
1355			}
1356
1357			/* All actions in step 2 are driven by master */
1358			if (!(sd->sd_mn_am_i_master)) {
1359				continue;
1360			}
1361
1362			meta_mc_log(MC_LOG3, gettext("Step2 - begin record "
1363			    "synchronization for set %s: %s"), sp->setname,
1364			    meta_print_hrtime(gethrtime() - start_time));
1365
1366			/*
1367			 * Synchronize the USER records in the local mddbs
1368			 * for hosts that are members.  The USER records
1369			 * contain set, drive and host information.
1370			 */
1371			rval = meta_mnsync_user_records(sp, ep);
1372			if (rval != 0) {
1373				mde_perror(ep, gettext(
1374				    "Synchronization of user records "
1375				    "in set %s failed\n"), sp->setname);
1376				if (rval == 205) {
1377					/*
1378					 * NOTE: Should issue call to reboot
1379					 * remote host that is causing the RPC
1380					 * failure.  Clustering to provide
1381					 * interface in the future.  This
1382					 * should stop a never-ending set of
1383					 * 205 reconfig cycles.
1384					 * Remote host causing failure is
1385					 * stored in ep->host if ep is an
1386					 * RPC error.
1387					 * if (mdanyrpcerror(ep))
1388					 * 	reboot (ep->host);
1389					 */
1390					md_exit(local_sp, 205);
1391				} else {
1392					md_exit(local_sp, 1);
1393				}
1394			}
1395
1396			/* Reget sd since sync_user_recs may have flushed it */
1397			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1398				mde_perror(ep, gettext("Unable to get set "
1399				    "%s desc information"), sp->setname);
1400				md_exit(local_sp, 1);
1401			}
1402
1403			dd = metaget_drivedesc(sp,
1404			    (MD_BASICNAME_OK | PRINT_FAST), ep);
1405			if (! mdisok(ep)) {
1406				mde_perror(ep, gettext("Unable to get set "
1407				    "%s drive information"), sp->setname);
1408				md_exit(local_sp, 1);
1409			}
1410
1411			/*
1412			 * No drives in set, continue to next set.
1413			 */
1414			if (dd == NULL) {
1415				/* Done with this set */
1416				continue;
1417			}
1418
1419			meta_mc_log(MC_LOG3, gettext("Step2 - local set user "
1420			    "records completed for set %s: %s"), sp->setname,
1421			    meta_print_hrtime(gethrtime() - start_time));
1422
1423			/*
1424			 * Synchronize the diskset mddbs for hosts
1425			 * that are members.  This may involve
1426			 * playing the changelog and writing out
1427			 * to the diskset mddbs.
1428			 */
1429			rval = meta_mnsync_diskset_mddbs(sp, ep);
1430			if (rval != 0) {
1431				mde_perror(ep, gettext(
1432				    "Synchronization of diskset mddbs "
1433				    "in set %s failed\n"), sp->setname);
1434				meta_mc_log(MC_LOG3, gettext("Step2 - diskset "
1435				    "mddb synchronization failed for "
1436				    "set %s: %s"), sp->setname,
1437				    meta_print_hrtime(gethrtime() -
1438				    start_time));
1439				if (rval == 205) {
1440					/*
1441					 * NOTE: Should issue call to reboot
1442					 * remote host that is causing the RPC
1443					 * failure.  Clustering to provide
1444					 * interface in the future.  This
1445					 * should stop a never-ending set of
1446					 * 205 reconfig cycles.
1447					 * Remote host causing failure is
1448					 * stored in ep->host if ep is an
1449					 * RPC error.
1450					 * if (mdanyrpcerror(ep))
1451					 * 	reboot (ep->host);
1452					 */
1453					md_exit(local_sp, 205);
1454				} else if (rval == 1) {
1455					continue;
1456				} else {
1457					md_exit(local_sp, 1);
1458				}
1459			}
1460
1461			meta_mc_log(MC_LOG3, gettext("Step2 - diskset mddb "
1462			    "synchronization completed for set %s: %s"),
1463			    sp->setname,
1464			    meta_print_hrtime(gethrtime() - start_time));
1465
1466			/* Join the starting nodes to the diskset */
1467			rval = meta_mnjoin_all(sp, ep);
1468			if (rval != 0) {
1469				mde_perror(ep, gettext(
1470				    "Join of non-owner (starting) nodes "
1471				    "in set %s failed\n"), sp->setname);
1472				meta_mc_log(MC_LOG3, gettext("Step2 - non owner"
1473				    "nodes joined for set %s: %s"),
1474				    sp->setname,
1475				    meta_print_hrtime(gethrtime() -
1476				    start_time));
1477				if (rval == 205) {
1478					/*
1479					 * NOTE: Should issue call to reboot
1480					 * remote host that is causing the RPC
1481					 * failure.  Clustering to provide
1482					 * interface in the future.  This
1483					 * should stop a never-ending set of
1484					 * 205 reconfig cycles.
1485					 * Remote host causing failure is
1486					 * stored in ep->host if ep is an
1487					 * RPC error.
1488					 * if (mdanyrpcerror(ep))
1489					 * 	reboot (ep->host);
1490					 */
1491					md_exit(local_sp, 205);
1492				} else {
1493					md_exit(local_sp, 1);
1494				}
1495			}
1496
1497			meta_mc_log(MC_LOG3, gettext("Step2 - non owner nodes "
1498			    "joined for set %s: %s"), sp->setname,
1499			    meta_print_hrtime(gethrtime() - start_time));
1500
1501		}
1502
1503		meta_mc_log(MC_LOG2, gettext("Step2 completed: %s"),
1504		    meta_print_hrtime(gethrtime() - start_time));
1505
1506		break;
1507
1508	case MC_STEP3:
1509		/*
1510		 * Step 3
1511		 *
1512		 * For all multinode sets do,
1513		 * - Reinitialise rpc.mdcommd
1514		 * - Reset mirror owners to null if the current owner is
1515		 *   no longer in the membership list
1516		 */
1517
1518		/* expect the nodelist to follow the step name */
1519		if (argc < 1)
1520			usage(sp, 1);
1521
1522		meta_mc_log(MC_LOG2, gettext("Starting Step3: %s"),
1523		    meta_print_hrtime(0));
1524
1525		/*
1526		 * Does local set exist? If not, exit with 0
1527		 * since there's no reason to have this node panic if
1528		 * the local set cannot be started.
1529		 */
1530		if ((local_sp = load_local_set(ep)) == NULL) {
1531			md_exit(local_sp, 0);
1532		}
1533
1534		/*
1535		 * walk through all sets on this node which could include:
1536		 *	- MN disksets
1537		 *	- traditional disksets
1538		 *	- non-existent disksets
1539		 * start mirror resync for all MN sets
1540		 */
1541		if ((max_sets = get_max_sets(ep)) == 0) {
1542			mde_perror(ep, "");
1543			md_exit(local_sp, 1);
1544		}
1545
1546		/* start walking through all possible disksets */
1547		for (setno = 1; setno < max_sets; setno++) {
1548			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1549				if (mdiserror(ep, MDE_NO_SET)) {
1550					/* No set for this setno - continue */
1551					mdclrerror(ep);
1552					continue;
1553				} else {
1554					mde_perror(ep, gettext("Unable to "
1555					    "get set %d information"), setno);
1556					md_exit(local_sp, 1);
1557				}
1558			}
1559
1560			/* only check multi-node disksets */
1561			if (!meta_is_mn_set(sp, ep)) {
1562				mdclrerror(ep);
1563				continue;
1564			}
1565
1566			if (meta_lock(sp, TRUE, ep) != 0) {
1567				mde_perror(ep, "");
1568				md_exit(local_sp, 1);
1569			}
1570
1571			/* If this node isn't joined to set, do nothing */
1572			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
1573				if (!mdisok(ep)) {
1574					mde_perror(ep, gettext("Could "
1575					    "not get set %s ownership"),
1576					    sp->setname);
1577					md_exit(sp, 1);
1578				}
1579				mdclrerror(ep);
1580				meta_unlock(sp, ep);
1581				continue;
1582			}
1583
1584			meta_mc_log(MC_LOG3, gettext("Step3 - begin "
1585			    "re-initialising rpc.mdcommd and resetting mirror "
1586			    "owners for set %s: %s"), sp->setname,
1587			    meta_print_hrtime(gethrtime() - start_time));
1588
1589			/* reinitialise rpc.mdcommd with new nodelist */
1590			if (mdmn_reinit_set(setno, commd_timeout)) {
1591				md_eprintf(gettext(
1592				    "Could not re-initialise rpc.mdcommd for "
1593				    "set %s\n"), sp->setname);
1594				md_exit(sp, 1);
1595			}
1596
1597			(void) memset(&cfg, 0, sizeof (cfg));
1598			cfg.c_id = 0;
1599			cfg.c_setno = sp->setno;
1600			if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
1601			    NULL) != 0) {
1602				mdstealerror(ep, &cfg.c_mde);
1603				mde_perror(ep, gettext("Could "
1604				    "not get set %s information"),
1605				    sp->setname);
1606				md_exit(sp, 1);
1607			}
1608
1609			/* Don't do anything else if set is stale */
1610			if (cfg.c_flags & MDDB_C_STALE) {
1611				meta_unlock(sp, ep);
1612				mdclrerror(ep);
1613				continue;
1614			}
1615
1616			/* reset mirror owners */
1617			if (reset_state(RESET_OWNER, sp, MD_MIRROR, ep) == -1) {
1618				md_exit(sp, 1);
1619			}
1620
1621			meta_unlock(sp, ep);
1622
1623			meta_mc_log(MC_LOG3, gettext("Step3 - rpc.mdcommd "
1624			    "re-initialised and mirror owners reset for "
1625			    "set %s: %s"), sp->setname,
1626			    meta_print_hrtime(gethrtime() - start_time));
1627		}
1628
1629		meta_mc_log(MC_LOG2, gettext("Step3 completed: %s"),
1630		    meta_print_hrtime(gethrtime() - start_time));
1631
1632		break;
1633
1634	case MC_STEP4:
1635		/*
1636		 * Step 4
1637		 *
1638		 * For all multinode sets do:
1639		 * - Resume the rpc.mdcommd messages.  Must resume all
1640		 *	sets before issuing I/O to any set since an error
1641		 * 	encountered in a commd suspended set could be
1642		 *	blocked waiting for commd in another set to resume.
1643		 *	(This happens since the daemon queues service
1644		 *	all sets).  An open of a soft partition causes
1645		 *	a read of the watermarks during the open.
1646		 * - If set is non-writable (not an owner or STALE), then
1647		 *	continue to next set.
1648		 *
1649		 * For all multinode sets do,
1650		 * - Reset ABR states for all mirrors, ie clear ABR if not
1651		 *	open on any node.
1652		 * - Reset ABR states for all soft partitions, ie clear ABR if
1653		 *	not open on any node.
1654		 * - For all slave nodes that have entered through the start
1655		 *	step, update the ABR state to that of the master and
1656		 *	get the submirror state from the master
1657		 * - meta_lock set
1658		 * - Resync all mirrors
1659		 * - unlock meta_lock for this set.
1660		 * - Choose a new owner for any orphaned resyncs
1661		 *
1662		 * There is one potential issue here when concurrently
1663		 * resetting and updating the ABR state. If the master has ABR
1664		 * set, but should no longer have because the only node that
1665		 * had the metadevice open and had ABR set has panicked, the
1666		 * master will send a message to all nodes to clear the ABR
1667		 * state. Meanwhile any node that has come through the
1668		 * start step will get tstate from the master and will update
1669		 * ABR if it was set in tstate. So, we appear to have a problem
1670		 * if the following sequence occurs:-
1671		 * - The slave gets tstate with ABR set
1672		 * - The master sends a message to clear ABR
1673		 * - The slave updates ABR with the value it got from tstate.
1674		 * We now have the master with ABR clear and the slave with ABR
1675		 * set. Fortunately, having set ABR, the slave will close the
1676		 * metadevice after setting ABR and as there are no nodes with
1677		 * the device open, the close will send a message to clear ABR
1678		 * on all nodes. So, the nodes will all have ABR unset.
1679		 */
1680
1681		/* expect the nodelist to follow the step name */
1682		if (argc < 1)
1683			usage(sp, 1);
1684
1685		meta_mc_log(MC_LOG2, gettext("Starting Step4: %s"),
1686		    meta_print_hrtime(0));
1687
1688		/*
1689		 * Does local set exist? If not, exit with 0
1690		 * since there's no reason to have this node panic if
1691		 * the local set cannot be started.
1692		 */
1693		if ((local_sp = load_local_set(ep)) == NULL) {
1694			md_exit(local_sp, 0);
1695		}
1696
1697		/*
1698		 * walk through all sets on this node which could include:
1699		 *	- MN disksets
1700		 *	- traditional disksets
1701		 *	- non-existent disksets
1702		 * start mirror resync for all MN sets
1703		 */
1704		if ((max_sets = get_max_sets(ep)) == 0) {
1705			mde_perror(ep, "");
1706			md_exit(local_sp, 1);
1707		}
1708
1709		/* Clear set_info structure */
1710		for (setno = 1; setno < max_sets; setno++) {
1711			set_info[setno] = 0;
1712		}
1713
1714		/* start walking through all possible disksets */
1715		for (setno = 1; setno < max_sets; setno++) {
1716			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1717				if (mdiserror(ep, MDE_NO_SET)) {
1718					/* No set for this setno - continue */
1719					mdclrerror(ep);
1720					continue;
1721				} else {
1722					mde_perror(ep, gettext("Unable to "
1723					    "get set %d information"), setno);
1724					md_exit(local_sp, 1);
1725				}
1726			}
1727
1728			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1729				mde_perror(ep, gettext("Unable to get set "
1730				    "%s desc information"), sp->setname);
1731				mdclrerror(ep);
1732				continue;
1733			}
1734
1735			/* only check multi-node disksets */
1736			if (!meta_is_mn_set(sp, ep)) {
1737				mdclrerror(ep);
1738				continue;
1739			}
1740
1741			set_info[setno] |= SET_INFO_MN;
1742
1743			/*
1744			 * If not an owner (all mddbs failed) or stale
1745			 * (< 50% mddbs operational), then set is
1746			 * non-writable so just resume commd and
1747			 * unblock mddb messages.
1748			 */
1749			mdclrerror(ep);
1750			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
1751				set_info[setno] |= SET_INFO_NO_WR;
1752			}
1753			if (!mdisok(ep)) {
1754				mde_perror(ep, gettext("Could "
1755				    "not get set %s ownership"),
1756				    sp->setname);
1757				md_exit(local_sp, 1);
1758			}
1759			/* Set is owned - is it stale? */
1760			if (!set_info[setno] & SET_INFO_NO_WR) {
1761				(void) memset(&cfg, 0, sizeof (cfg));
1762				cfg.c_id = 0;
1763				cfg.c_setno = sp->setno;
1764				if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
1765				    NULL) != 0) {
1766					mdstealerror(ep, &cfg.c_mde);
1767					mde_perror(ep, gettext("Could "
1768					    "not get set %s information"),
1769					    sp->setname);
1770					md_exit(local_sp, 1);
1771				}
1772				if (cfg.c_flags & MDDB_C_STALE) {
1773					set_info[setno] |= SET_INFO_NO_WR;
1774				}
1775			}
1776
1777			/* resume rpc.mdcommd */
1778			if (mdmn_resume(setno, MD_COMM_ALL_CLASSES, 0,
1779			    commd_timeout)) {
1780				md_eprintf(gettext("Unable to resume "
1781				    "rpc.mdcommd for set %s\n"), sp->setname);
1782				md_exit(local_sp, 1);
1783			}
1784
1785			/* Unblock mddb parse messages */
1786			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
1787				(void) memset(&mbp, 0, sizeof (mbp));
1788				mbp.c_setno = setno;
1789				mbp.c_blk_flags = MDDB_UNBLOCK_PARSE;
1790				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
1791				    &mbp.c_mde, NULL)) {
1792					mdstealerror(ep, &mbp.c_mde);
1793					mde_perror(ep, gettext("Could not "
1794					    "unblock set %s"), sp->setname);
1795					md_exit(local_sp, 1);
1796				}
1797			}
1798			meta_mc_log(MC_LOG3, gettext("Step4 - rpc.mdcommd "
1799			    "resumed and messages unblocked for set %s: %s"),
1800			    sp->setname,
1801			    meta_print_hrtime(gethrtime() - start_time));
1802		}
1803
1804		for (setno = 1; setno < max_sets; setno++) {
1805			int			start_step;
1806
1807			/* Skip traditional disksets. */
1808			if ((set_info[setno] & SET_INFO_MN) == 0)
1809				continue;
1810
1811			/*
1812			 * If already determined that this set is
1813			 * a non-writable set, then just continue
1814			 * to next set since there's nothing else
1815			 * to do for a non-writable set.
1816			 */
1817			if (set_info[setno] & SET_INFO_NO_WR)
1818				continue;
1819
1820			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1821				if (mdiserror(ep, MDE_NO_SET)) {
1822					/* No set for this setno - continue */
1823					mdclrerror(ep);
1824					continue;
1825				} else {
1826					mde_perror(ep, gettext("Unable to "
1827					    "get set %d information"), setno);
1828					md_exit(local_sp, 1);
1829				}
1830			}
1831
1832			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1833				mde_perror(ep, gettext("Unable to get set "
1834				    "%s desc information"), sp->setname);
1835				mdclrerror(ep);
1836				continue;
1837			}
1838
1839			/* See if this node came through the start step */
1840			(void) memset(&sf, 0, sizeof (sf));
1841			sf.sf_setno = sp->setno;
1842			sf.sf_flags = MDDB_NM_GET;
1843			/* Use magic to help protect ioctl against attack. */
1844			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1845			if (metaioctl(MD_MN_GET_SETFLAGS, &sf,
1846			    &sf.sf_mde, NULL)) {
1847				mdstealerror(ep, &sf.sf_mde);
1848				mde_perror(ep, gettext("Could not get "
1849				    "start_step flag for set %s"), sp->setname);
1850				md_exit(local_sp, 1);
1851			}
1852			start_step =
1853			    (sf.sf_setflags & MD_SET_MN_START_RC)? 1: 0;
1854
1855			/*
1856			 * We can now reset the start_step flag for the set
1857			 * if it was already set.
1858			 */
1859			if (start_step) {
1860				(void) memset(&sf, 0, sizeof (sf));
1861					sf.sf_setno = sp->setno;
1862				sf.sf_setflags = MD_SET_MN_START_RC;
1863				sf.sf_flags = MDDB_NM_RESET;
1864				/*
1865				 * Use magic to help protect ioctl
1866				 * against attack.
1867				 */
1868				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1869				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1870				    &sf.sf_mde, NULL)) {
1871					mdstealerror(ep, &sf.sf_mde);
1872					mde_perror(ep,
1873					    gettext("Could not reset "
1874					    "start_step flag for set %s"),
1875					    sp->setname);
1876				}
1877			}
1878
1879			meta_mc_log(MC_LOG3, gettext("Step4 - begin setting "
1880			    "ABR state and restarting io's for "
1881			    "set %s: %s"), sp->setname,
1882			    meta_print_hrtime(gethrtime() - start_time));
1883
1884
1885			/*
1886			 * If we are not the master and we have come through
1887			 * the start step, we must update the ABR states
1888			 * for mirrors and soft partitions. Also the submirror
1889			 * states need to be synchronised so that we see the
1890			 * same status as other previously joined members.
1891			 * This _must_ be done before starting the resync.
1892			 */
1893			if (!(sd->sd_mn_am_i_master) && start_step) {
1894				if (reset_state(GET_MIRROR_STATE, sp, MD_MIRROR,
1895				    ep) == -1) {
1896					md_exit(local_sp, 1);
1897				}
1898				if (reset_state(UPDATE_ABR, sp, MD_SP,
1899				    ep) == -1) {
1900					md_exit(local_sp, 1);
1901				}
1902				/*
1903				 * Mark the fact that we've got the mirror
1904				 * state. This allows the resync thread to
1905				 * determine if _it_ needs to issue this. This
1906				 * can happen if a node is added to a set after
1907				 * a reconfig cycle has completed.
1908				 */
1909				(void) memset(&sf, 0, sizeof (sf));
1910					sf.sf_setno = sp->setno;
1911				sf.sf_setflags = MD_SET_MN_MIR_STATE_RC;
1912				sf.sf_flags = MDDB_NM_SET;
1913				/*
1914				 * Use magic to help protect ioctl
1915				 * against attack.
1916				 */
1917				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1918				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1919				    &sf.sf_mde, NULL)) {
1920					mdstealerror(ep, &sf.sf_mde);
1921					mde_perror(ep,
1922					    gettext("Could not set "
1923					    "submirror state flag for set %s"),
1924					    sp->setname);
1925				}
1926			}
1927
1928			/*
1929			 * All remaining actions are only performed by the
1930			 * master
1931			 */
1932			if (!(sd->sd_mn_am_i_master)) {
1933				if (meta_lock(sp, TRUE, ep) != 0) {
1934					mde_perror(ep, "");
1935					md_exit(local_sp, 1);
1936				}
1937				meta_mirror_resync_unblock(sp);
1938				meta_unlock(sp, ep);
1939				continue;
1940			}
1941
1942			/*
1943			 * If the master came through the start step, this
1944			 * implies that all of the nodes must have done the
1945			 * same and hence there can be no applications
1946			 * running. Hence no need to reset ABR
1947			 */
1948			if (!start_step) {
1949				/* Reset ABR state for mirrors */
1950				if (reset_state(RESET_ABR, sp, MD_MIRROR,
1951				    ep) == -1) {
1952					md_exit(local_sp, 1);
1953				}
1954				/* ...and now the same for soft partitions */
1955				if (reset_state(RESET_ABR, sp, MD_SP,
1956				    ep) == -1) {
1957					md_exit(local_sp, 1);
1958				}
1959			}
1960
1961			/*
1962			 * choose owners for orphaned resyncs and reset
1963			 * non-orphaned resyncs so that an owner node that
1964			 * reboots will restart the resync if needed.
1965			 */
1966			if (reset_state(CHOOSE_OWNER, sp, MD_MIRROR, ep) == -1)
1967				md_exit(local_sp, 1);
1968
1969			/*
1970			 * Must unlock set lock before meta_mirror_resync_all
1971			 * sends a message to run the metasync command
1972			 * which also grabs the meta_lock.
1973			 */
1974			if (meta_lock(sp, TRUE, ep) != 0) {
1975				mde_perror(ep, "");
1976				md_exit(local_sp, 1);
1977			}
1978			meta_mirror_resync_unblock(sp);
1979			meta_unlock(sp, ep);
1980
1981			/* resync all mirrors in set */
1982			if (meta_mirror_resync_all(sp, 0, ep) != 0) {
1983				mde_perror(ep, gettext("Mirror resyncs "
1984				    "failed for set %s"), sp->setname);
1985				md_exit(local_sp, 1);
1986			}
1987
1988			meta_mc_log(MC_LOG3, gettext("Step4 - io's restarted "
1989			    "for set %s: %s"), sp->setname,
1990			    meta_print_hrtime(gethrtime() - start_time));
1991		}
1992
1993		meta_mc_log(MC_LOG2, gettext("Step4 completed: %s"),
1994		    meta_print_hrtime(gethrtime() - start_time));
1995
1996		break;
1997
1998	default:
1999		usage(sp, 1);
2000		break;
2001	}
2002
2003	md_exit(sp, 0);
2004	/* NOTREACHED */
2005	return (0);
2006}
2007