meta_mn_handlers.c revision 8452:89d32dfdae6e
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <stdlib.h>
28#include <unistd.h>
29#include <wait.h>
30#include <sys/time.h>
31#include <syslog.h>
32
33#include <meta.h>
34#include <sys/lvm/mdio.h>
35#include <sys/lvm/md_mddb.h>
36#include <sys/lvm/md_mirror.h>
37
38#define	MAX_N_ARGS 64
39#define	MAX_ARG_LEN 1024
40#define	MAX_SLEEPS 99
41#define	SLEEP_MOD 5
42
43/* we reserve 1024 bytes for stdout and the same for stderr */
44#define	MAX_OUT	1024
45#define	MAX_ERR	1024
46#define	JUNK 128 /* used to flush stdout and stderr */
47
48
49/*ARGSUSED*/
50void
51mdmn_do_cmd(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
52{
53
54	/*
55	 * We are given one string containing all the arguments
56	 * For execvp() we have to regenerate the arguments again
57	 */
58	int	arg;		/* argument that is currently been built */
59	int	index;		/* runs through arg above */
60	int	i;		/* helper for for loop */
61	char	*argv[MAX_N_ARGS]; /* argument array for execvp */
62	char	*cp;		/* runs through the given command line string */
63	char	*command = NULL; /* the command we call locally */
64	int	pout[2];	/* pipe for stdout */
65	int	perr[2];	/* pipe for stderr */
66	pid_t	pid;		/* process id */
67
68	cp	= msg->msg_event_data;
69	arg	= 0;
70	index	= 0;
71
72	/* init the args array alloc the first one and null out the rest */
73	argv[0] = Malloc(MAX_ARG_LEN);
74	for (i = 1; i < MAX_N_ARGS; i++) {
75		argv[i] = NULL;
76	}
77
78	resp->mmr_comm_state	= MDMNE_ACK; /* Ok state */;
79
80	while (*cp != '\0') {
81		if (arg == MAX_N_ARGS) {
82			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
83			    "PANIC: too many arguments specified\n"));
84			resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
85			goto out;
86		}
87		if (index == MAX_ARG_LEN) {
88			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
89			    "PANIC: argument too long\n"));
90			resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
91			goto out;
92		}
93
94		if ((*cp != ' ') && (*cp != '\t')) {
95			/*
96			 * No space or tab: copy char into current
97			 * argv and advance both pointers
98			 */
99
100			argv[arg][index] = *cp;
101			cp++;	/* next char in command line	*/
102			index++;	/* next char in argument	*/
103		} else {
104			/*
105			 * space or tab: terminate current argv,
106			 * advance arg, reset pointer into arg,
107			 * advance pointer in command line
108			 */
109			argv[arg][index] = '\0';
110			arg++; /* next argument */
111			argv[arg] = Malloc(MAX_ARG_LEN);
112			cp++; /* next char in command line */
113			index = 0; /* starts at char 0 */
114		}
115	}
116	/* terminate the last real argument */
117	argv[arg][index] = '\0';
118	/* the last argument is an NULL pointer */
119	argv[++arg] = NULL;
120	if (pipe(pout) < 0)  {
121		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
122		    "PANIC: pipe failed\n"));
123		resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
124		goto out;
125	}
126	if (pipe(perr) < 0) {
127		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
128		    "PANIC: pipe failed\n"));
129		(void) close(pout[0]);
130		(void) close(pout[1]);
131		resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
132		goto out;
133	}
134	command = Strdup(argv[0]);
135	(void) strcat(argv[0], ".rpc_call");
136	pid = fork1();
137	if (pid == (pid_t)-1) {
138		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
139		    "PANIC: fork failed\n"));
140		resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
141		(void) close(pout[0]);
142		(void) close(pout[1]);
143		(void) close(perr[0]);
144		(void) close(perr[1]);
145		goto out;
146	} else  if (pid == (pid_t)0) {
147		/* child */
148		(void) close(0);
149		/* close the reading channels of pout and perr */
150		(void) close(pout[0]);
151		(void) close(perr[0]);
152		/* redirect stdout */
153		if (dup2(pout[1], 1) < 0) {
154			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
155			    "PANIC: dup2 failed\n"));
156			resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
157			return;
158		}
159
160		/* redirect stderr */
161		if (dup2(perr[1], 2) < 0) {
162			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
163			    "PANIC: dup2 failed\n"));
164			resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
165			return;
166		}
167
168		(void) execvp(command, (char *const *)argv);
169		perror("execvp");
170		_exit(1);
171	} else {
172		/* parent process */
173		int stat_loc;
174		char *out, *err; /* for stdout and stderr of child */
175		int i; /* index into the aboves */
176		char junk[JUNK];
177		int out_done = 0;
178		int err_done = 0;
179		int out_read = 0;
180		int err_read = 0;
181		int maxfd;
182		fd_set	rset;
183
184
185		/* close the writing channels of pout and perr */
186		(void) close(pout[1]);
187		(void) close(perr[1]);
188		resp->mmr_out = Malloc(MAX_OUT);
189		resp->mmr_err = Malloc(MAX_ERR);
190		resp->mmr_out_size = MAX_OUT;
191		resp->mmr_err_size = MAX_ERR;
192		out = resp->mmr_out;
193		err = resp->mmr_err;
194		FD_ZERO(&rset);
195		while ((out_done == 0) || (err_done == 0)) {
196			FD_SET(pout[0], &rset);
197			FD_SET(perr[0], &rset);
198			maxfd = max(pout[0], perr[0]) + 1;
199			(void) select(maxfd, &rset, NULL, NULL, NULL);
200
201			/*
202			 * Did the child produce some output to stdout?
203			 * If so, read it until we either reach the end of the
204			 * output or until we read MAX_OUT bytes.
205			 * Whatever comes first.
206			 * In case we already read MAX_OUT bytes we simply
207			 * read away the output into a junk buffer.
208			 * Just to make the child happy
209			 */
210			if (FD_ISSET(pout[0], &rset)) {
211				if (MAX_OUT - out_read - 1 > 0) {
212					i = read(pout[0], out,
213					    MAX_OUT - out_read);
214					out_read += i;
215					out += i;
216				} else {
217					/* buffer full, empty stdout */
218					i = read(pout[0], junk, JUNK);
219				}
220				if (i == 0) {
221					/* stdout is closed by child */
222					out_done++;
223				}
224			}
225			/* same comment as above | sed -e 's/stdout/stderr/' */
226			if (FD_ISSET(perr[0], &rset)) {
227				if (MAX_ERR - err_read - 1 > 0) {
228					i = read(perr[0], err,
229					    MAX_ERR - err_read);
230					err_read += i;
231					err += i;
232				} else {
233					/* buffer full, empty stderr */
234					i = read(perr[0], junk, JUNK);
235				}
236				if (i == 0) {
237					/* stderr is closed by child */
238					err_done++;
239				}
240			}
241		}
242		resp->mmr_out[out_read] = '\0';
243		resp->mmr_err[err_read] = '\0';
244
245		while (waitpid(pid, &stat_loc, 0) < 0) {
246			if (errno != EINTR) {
247				resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
248				break;
249			}
250		}
251		if (errno == 0)
252			resp->mmr_exitval = WEXITSTATUS(stat_loc);
253
254		(void) close(pout[0]);
255		(void) close(perr[0]);
256	}
257out:
258	for (i = 0; i < MAX_N_ARGS; i++) {
259		if (argv[i] != NULL) {
260			free(argv[i]);
261		}
262	}
263	if (command != NULL) {
264		Free(command);
265	}
266}
267
268/*
269 * This is for checking if a metadevice is opened, and for
270 * locking in case it is not and for
271 * unlocking a locked device
272 */
273/*ARGSUSED*/
274void
275mdmn_do_clu(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
276{
277	if (msg->msg_type == MD_MN_MSG_CLU_CHECK) {
278		md_isopen_t	*d;
279		int		ret;
280
281		resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
282		resp->mmr_out_size = 0;
283		resp->mmr_err_size = 0;
284		resp->mmr_out = NULL;
285		resp->mmr_err = NULL;
286		d = (md_isopen_t *)(void *)msg->msg_event_data;
287		ret = metaioctl(MD_IOCISOPEN, d, &(d->mde), NULL);
288		/*
289		 * In case the ioctl succeeded, return the open state of
290		 * the metadevice. Otherwise we return the error the ioctl
291		 * produced. As this is not zero, no attempt is made to
292		 * remove/rename the metadevice later
293		 */
294
295		if (ret == 0) {
296			resp->mmr_exitval = d->isopen;
297		} else {
298			/*
299			 * When doing a metaclear, one node after the other
300			 * does the two steps:
301			 * - check on all nodes if this md is opened.
302			 * - remove the md locally.
303			 * When the 2nd node asks all nodes if the md is
304			 * open it starts with the first node.
305			 * As this already removed the md, the check
306			 * returns MDE_UNIT_NOT_SETUP.
307			 * In order to not keep the 2nd node from proceeding,
308			 * we map this to an Ok.
309			 */
310			if (mdismderror(&(d->mde), MDE_UNIT_NOT_SETUP)) {
311				mdclrerror(&(d->mde));
312				ret = 0;
313			}
314
315			resp->mmr_exitval = ret;
316		}
317	}
318}
319
320/* handler for MD_MN_MSG_REQUIRE_OWNER */
321/*ARGSUSED*/
322void
323mdmn_do_req_owner(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
324{
325	md_set_mmown_params_t	setown;
326	md_mn_req_owner_t	*d;
327	int			ret, n = 0;
328
329	resp->mmr_out_size = 0;
330	resp->mmr_err_size = 0;
331	resp->mmr_out = NULL;
332	resp->mmr_err = NULL;
333	resp->mmr_comm_state = MDMNE_ACK;
334	d = (md_mn_req_owner_t *)(void *)msg->msg_event_data;
335
336	(void) memset(&setown, 0, sizeof (setown));
337	MD_SETDRIVERNAME(&setown, MD_MIRROR, MD_MIN2SET(d->mnum))
338	setown.d.mnum = d->mnum;
339	setown.d.owner = d->owner;
340
341	/* Retry ownership change if we get EAGAIN returned */
342	while ((ret = metaioctl(MD_MN_SET_MM_OWNER, &setown, &setown.mde, NULL))
343	    != 0) {
344		md_sys_error_t	*ip =
345		    &setown.mde.info.md_error_info_t_u.sys_error;
346		if (ip->errnum != EAGAIN) {
347			break;
348		}
349		if (n++ >= 10) {
350			break;
351		}
352		(void) sleep(1);
353	}
354
355	resp->mmr_exitval = ret;
356}
357
358/*
359 * handler for MD_MN_MSG_CHOOSE_OWNER
360 * This is called when a mirror resync has no owner. The master node generates
361 * this message which is not broadcast to the other nodes. The message is
362 * required as the kernel does not have access to the nodelist for the set.
363 */
364/*ARGSUSED*/
365void
366mdmn_do_choose_owner(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
367{
368	md_mn_msg_chowner_t	chownermsg;
369	md_mn_msg_chooseid_t	*d;
370	int			ret = 0;
371	int			nodecnt;
372	int			nodeno;
373	uint_t			nodeid;
374	uint_t			myflags;
375	set_t			setno;
376	mdsetname_t		*sp;
377	md_set_desc		*sd;
378	md_mnnode_desc		*nd;
379	md_error_t		mde = mdnullerror;
380	md_mn_result_t		*resp1 = NULL;
381
382	resp->mmr_out_size = 0;
383	resp->mmr_err_size = 0;
384	resp->mmr_out = NULL;
385	resp->mmr_err = NULL;
386	resp->mmr_comm_state = MDMNE_ACK;
387	d = (md_mn_msg_chooseid_t *)(void *)msg->msg_event_data;
388
389	/*
390	 * The node to be chosen will be the resync count for the set
391	 * modulo the number of live nodes in the set
392	 */
393	setno = MD_MIN2SET(d->msg_chooseid_mnum);
394	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
395		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
396		    "MD_MN_MSG_CHOOSE_OWNER: Invalid setno %d\n"), setno);
397		resp->mmr_exitval = 1;
398		return;
399	}
400	if ((sd = metaget_setdesc(sp, &mde)) == NULL) {
401		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
402		    "MD_MN_MSG_CHOOSE_OWNER: Invalid set pointer\n"));
403		resp->mmr_exitval = 1;
404		return;
405	}
406
407	/* Count the number of live nodes */
408	nodecnt = 0;
409	nd = sd->sd_nodelist;
410	while (nd) {
411		if (nd->nd_flags & MD_MN_NODE_ALIVE)
412			nodecnt++;
413		nd = nd->nd_next;
414	}
415	nodeno = (d->msg_chooseid_rcnt%nodecnt);
416
417	/*
418	 * If we've been called with msg_chooseid_set_node set TRUE then we
419	 * are simply re-setting the owner id to ensure consistency across
420	 * the cluster.
421	 * If the flag is reset (B_FALSE) we are requesting a new owner to be
422	 * determined.
423	 */
424	if (d->msg_chooseid_set_node) {
425		nodeid = d->msg_chooseid_rcnt;
426	} else {
427		/* scan the nodelist looking for the required node */
428		nodecnt = 0;
429		nd = sd->sd_nodelist;
430		while (nd) {
431			if (nd->nd_flags & MD_MN_NODE_ALIVE) {
432				if (nodecnt == nodeno)
433					break;
434				nodecnt++;
435			}
436			nd = nd->nd_next;
437		}
438		nodeid = nd->nd_nodeid;
439	}
440
441	/* Send message to all nodes to make ownership change */
442	chownermsg.msg_chowner_mnum =  d->msg_chooseid_mnum;
443	chownermsg.msg_chowner_nodeid = nodeid;
444	myflags = MD_MSGF_NO_LOG;
445
446	/* inherit some flags from the parent message */
447	myflags |= msg->msg_flags & MD_MSGF_INHERIT_BITS;
448
449	ret = mdmn_send_message(MD_MIN2SET(d->msg_chooseid_mnum),
450	    MD_MN_MSG_CHANGE_OWNER, myflags, 0, (char *)&chownermsg,
451	    sizeof (chownermsg), &resp1, &mde);
452	if (resp1 != NULL)
453		free_result(resp1);
454	resp->mmr_exitval = ret;
455}
456
457/*
458 * Handler for MD_MN_MSG_CHANGE_OWNER
459 * This is called when we are perfoming a resync and wish to change from
460 * no mirror owner to an owner chosen by the master.
461 * This mesage is only relevant for the new owner, the message will be
462 * ignored by all other nodes
463 */
464/*ARGSUSED*/
465void
466mdmn_do_change_owner(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
467{
468	md_set_mmown_params_t	setown;
469	md_mn_msg_chowner_t	*d;
470	int			ret = 0;
471	set_t			setno;
472	mdsetname_t		*sp;
473	md_set_desc		*sd;
474	md_error_t		mde = mdnullerror;
475
476	resp->mmr_out_size = 0;
477	resp->mmr_err_size = 0;
478	resp->mmr_out = NULL;
479	resp->mmr_err = NULL;
480	resp->mmr_comm_state = MDMNE_ACK;
481	d = (md_mn_msg_chowner_t *)(void *)msg->msg_event_data;
482
483	setno = MD_MIN2SET(d->msg_chowner_mnum);
484	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
485		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
486		    "MD_MN_MSG_CHANGE_OWNER: Invalid setno %d\n"), setno);
487		resp->mmr_exitval = 1;
488		return;
489	}
490	if ((sd = metaget_setdesc(sp, &mde)) == NULL) {
491		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
492		    "MD_MN_MSG_CHANGE_OWNER: Invalid set pointer\n"));
493		resp->mmr_exitval = 1;
494		return;
495	}
496
497	if (d->msg_chowner_nodeid == sd->sd_mn_mynode->nd_nodeid) {
498		/*
499		 * If we are the chosen owner, issue ioctl to make the
500		 * ownership change
501		 */
502		(void) memset(&setown, 0, sizeof (md_set_mmown_params_t));
503		setown.d.mnum = d->msg_chowner_mnum;
504		setown.d.owner = d->msg_chowner_nodeid;
505		setown.d.flags = MD_MN_MM_SPAWN_THREAD;
506		MD_SETDRIVERNAME(&setown, MD_MIRROR,
507		    MD_MIN2SET(d->msg_chowner_mnum));
508
509		/*
510		 * Single shot at changing the the owner, if it fails EAGAIN,
511		 * another node must have become the owner while we are in the
512		 * process of making this choice.
513		 */
514
515		ret = metaioctl(MD_MN_SET_MM_OWNER, &setown,
516		    &(setown.mde), NULL);
517		if (ret == EAGAIN)
518			ret = 0;
519	}
520	resp->mmr_exitval = ret;
521}
522
523/* handler for MD_MN_MSG_SUSPEND_WRITES */
524/*ARGSUSED*/
525void
526mdmn_do_susp_write(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
527{
528	/* Suspend writes to a region of a mirror */
529	md_suspend_wr_params_t	suspwr_ioc;
530	md_mn_msg_suspwr_t	*d;
531	int			ret;
532
533	resp->mmr_out_size = 0;
534	resp->mmr_err_size = 0;
535	resp->mmr_out = NULL;
536	resp->mmr_err = NULL;
537	resp->mmr_comm_state = MDMNE_ACK;
538	d = (md_mn_msg_suspwr_t *)(void *)msg->msg_event_data;
539
540	(void) memset(&suspwr_ioc, 0, sizeof (md_suspend_wr_params_t));
541	MD_SETDRIVERNAME(&suspwr_ioc, MD_MIRROR,
542	    MD_MIN2SET(d->msg_suspwr_mnum));
543	suspwr_ioc.mnum = d->msg_suspwr_mnum;
544	ret = metaioctl(MD_MN_SUSPEND_WRITES, &suspwr_ioc,
545	    &(suspwr_ioc.mde), NULL);
546	resp->mmr_exitval = ret;
547}
548
549/*
550 * handler for MD_MN_MSG_STATE_UPDATE_RESWR
551 * This functions update a submirror component state and then resumes writes
552 * to the mirror
553 */
554/*ARGSUSED*/
555void
556mdmn_do_state_upd_reswr(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
557{
558	/* Update the state of the component of a mirror */
559	md_set_state_params_t	setstate_ioc;
560	md_mn_msg_stch_t	*d;
561	int			ret;
562
563	resp->mmr_out_size = 0;
564	resp->mmr_err_size = 0;
565	resp->mmr_out = NULL;
566	resp->mmr_err = NULL;
567	resp->mmr_comm_state = MDMNE_ACK;
568	d = (md_mn_msg_stch_t *)(void *)msg->msg_event_data;
569
570	(void) memset(&setstate_ioc, 0, sizeof (md_set_state_params_t));
571	MD_SETDRIVERNAME(&setstate_ioc, MD_MIRROR,
572	    MD_MIN2SET(d->msg_stch_mnum));
573	setstate_ioc.mnum = d->msg_stch_mnum;
574	setstate_ioc.sm = d->msg_stch_sm;
575	setstate_ioc.comp = d->msg_stch_comp;
576	setstate_ioc.state = d->msg_stch_new_state;
577	setstate_ioc.hs_id = d->msg_stch_hs_id;
578	ret = metaioctl(MD_MN_SET_STATE, &setstate_ioc,
579	    &(setstate_ioc.mde), NULL);
580	resp->mmr_exitval = ret;
581}
582
583/*
584 * submessage generator for MD_MN_MSG_STATE_UPDATE and MD_MN_MSG_STATE_UPDATE2
585 * This generates 2 messages, the first is SUSPEND_WRITES and
586 * depending on the type of the original message the second one is
587 * either STATE_UPDATE_RESWR or STATE_UPDATE_RESWR2 which actually does
588 * the same, but runs on a higher class.
589 */
590int
591mdmn_smgen_state_upd(md_mn_msg_t *msg, md_mn_msg_t *msglist[])
592{
593	md_mn_msg_t		*nmsg;
594	md_mn_msg_stch_t	*d;
595	md_mn_msg_stch_t	*stch_data;
596	md_mn_msg_suspwr_t	*suspwr_data;
597
598	d = (md_mn_msg_stch_t *)(void *)msg->msg_event_data;
599
600	nmsg = Zalloc(sizeof (md_mn_msg_t));
601	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
602
603	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
604	nmsg->msg_setno		= msg->msg_setno;
605	nmsg->msg_type		= MD_MN_MSG_SUSPEND_WRITES;
606	nmsg->msg_event_size	= sizeof (md_mn_msg_suspwr_t);
607	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_suspwr_t));
608	suspwr_data = (md_mn_msg_suspwr_t *)(void *)nmsg->msg_event_data;
609	suspwr_data->msg_suspwr_mnum = d->msg_stch_mnum;
610	msglist[0] = nmsg;
611
612	nmsg = Zalloc(sizeof (md_mn_msg_t));
613	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
614
615	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
616	nmsg->msg_setno		= msg->msg_setno;
617	if (msg->msg_type == MD_MN_MSG_STATE_UPDATE2) {
618		nmsg->msg_type		= MD_MN_MSG_STATE_UPDATE_RESWR2;
619	} else {
620		nmsg->msg_type		= MD_MN_MSG_STATE_UPDATE_RESWR;
621	}
622	nmsg->msg_event_size	= sizeof (md_mn_msg_stch_t);
623	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_stch_t));
624	stch_data = (md_mn_msg_stch_t *)(void *)nmsg->msg_event_data;
625	stch_data->msg_stch_mnum = d->msg_stch_mnum;
626	stch_data->msg_stch_sm = d->msg_stch_sm;
627	stch_data->msg_stch_comp = d->msg_stch_comp;
628	stch_data->msg_stch_new_state = d->msg_stch_new_state;
629	stch_data->msg_stch_hs_id = d->msg_stch_hs_id;
630	msglist[1] = nmsg;
631	return (2); /* Return the number of submessages generated */
632}
633
634/*
635 * handler for MD_MN_MSG_ALLOCATE_HOTSPARE and MD_MN_MSG_ALLOCATE_HOTSPARE2
636 * This sends a message to all nodes requesting them to allocate a hotspare
637 * for the specified component. The component is specified by the mnum of
638 * the mirror, the submirror index and the component index.
639 */
640/*ARGSUSED*/
641void
642mdmn_do_allocate_hotspare(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
643{
644	/* Allocate a hotspare for a mirror component */
645	md_alloc_hotsp_params_t allochsp_ioc;
646	md_mn_msg_allochsp_t    *d;
647	int			ret;
648
649	resp->mmr_out_size = 0;
650	resp->mmr_err_size = 0;
651	resp->mmr_out = NULL;
652	resp->mmr_err = NULL;
653	resp->mmr_comm_state = MDMNE_ACK;
654	d = (md_mn_msg_allochsp_t *)((void *)(msg->msg_event_data));
655
656	(void) memset(&allochsp_ioc, 0,
657	    sizeof (md_alloc_hotsp_params_t));
658	MD_SETDRIVERNAME(&allochsp_ioc, MD_MIRROR,
659	    MD_MIN2SET(d->msg_allochsp_mnum));
660	allochsp_ioc.mnum = d->msg_allochsp_mnum;
661	allochsp_ioc.sm = d->msg_allochsp_sm;
662	allochsp_ioc.comp = d->msg_allochsp_comp;
663	allochsp_ioc.hs_id = d->msg_allochsp_hs_id;
664	ret = metaioctl(MD_MN_ALLOCATE_HOTSPARE, &allochsp_ioc,
665	    &(allochsp_ioc.mde), NULL);
666	resp->mmr_exitval = ret;
667}
668
669/*
670 * handler for MD_MN_MSG_RESYNC_STARTING,MD_MN_MSG_RESYNC_FIRST,
671 * MD_MN_MSG_RESYNC_NEXT, MD_MN_MSG_RESYNC_FINISH, MD_MN_MSG_RESYNC_PHASE_DONE
672 */
673/*ARGSUSED*/
674void
675mdmn_do_resync(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
676{
677	md_mn_msg_resync_t		*d;
678	md_mn_rs_params_t		respar;
679	mddb_setflags_config_t	sf;
680	md_error_t				ep = mdnullerror;
681	mdsetname_t				*sp;
682	int	ret;
683	int	smi;
684	int start_flag = 1;
685	int sleep_count = 0;
686	unsigned int sleep_time = 2;
687
688	resp->mmr_out_size = 0;
689	resp->mmr_err_size = 0;
690	resp->mmr_out = NULL;
691	resp->mmr_err = NULL;
692	resp->mmr_comm_state = MDMNE_ACK;
693	d = (md_mn_msg_resync_t *)((void *)(msg->msg_event_data));
694
695	(void) memset(&respar, 0, sizeof (respar));
696	MD_SETDRIVERNAME(&respar, MD_MIRROR,
697	    MD_MIN2SET(d->msg_resync_mnum))
698	respar.msg_type = (int)msg->msg_type;
699	respar.mnum = d->msg_resync_mnum;
700	respar.rs_type = d->msg_resync_type;
701	respar.rs_start = d->msg_resync_start;
702	respar.rs_size = d->msg_resync_rsize;
703	respar.rs_done = d->msg_resync_done;
704	respar.rs_2_do = d->msg_resync_2_do;
705	respar.rs_originator = d->msg_originator;
706	respar.rs_flags = d->msg_resync_flags;
707
708	for (smi = 0; smi < NMIRROR; smi++) {
709		respar.rs_sm_state[smi] = d->msg_sm_state[smi];
710		respar.rs_sm_flags[smi] = d->msg_sm_flags[smi];
711	}
712
713	/*
714	 * Prior to running the resync thread first check that the start_step
715	 * flag (MD_SET_MN_START_RC) added by metaclust's MC_START step has been
716	 * removed from the set record flags. Ordinarily, this would be removed
717	 * at MC_STEP4 in metaclust - need to ensure this has happened on all
718	 * nodes.
719	 */
720	(void) memset(&sf, 0, sizeof (sf));
721	sf.sf_setno = MD_MIN2SET(d->msg_resync_mnum);
722	sf.sf_flags = MDDB_NM_GET;
723	/* Use magic to help protect ioctl against attack. */
724	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
725	if ((sp = metasetnosetname(sf.sf_setno, &ep)) == NULL) {
726		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
727		    "MDMN_DO_RESYNC: Invalid setno = %d\n"),
728		    sf.sf_setno);
729		(void) mdstealerror(&(resp->mmr_ep), &ep);
730		resp->mmr_exitval = -1;
731		return;
732	}
733
734	/* start_flag always true initially */
735	while (start_flag) {
736		if (metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) != 0) {
737			syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
738			    "MDMN_DO_RESYNC: Could not get start_step "
739			    "flag for set %s - returning\n"),
740			    sp->setname);
741			(void) mdstealerror(&(resp->mmr_ep), &sf.sf_mde);
742			resp->mmr_exitval = -1;
743			return;
744		}
745
746		/* metaioctl returns successfully - is start flag cleared? */
747		if (sf.sf_setflags & MD_SET_MN_START_RC) {
748			start_flag = 1;
749			(void) sleep(sleep_time);
750			sleep_count++;
751			if ((sleep_count == 1) ||
752			    (sleep_count % SLEEP_MOD) == 0) {
753				syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
754				    "MDMN_DO_RESYNC: Waiting for start_step "
755				    "flag for set %s to be cleared\n"),
756				    sp->setname);
757			}
758			if (sleep_count == MAX_SLEEPS) {
759				syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
760				    "MDMN_DO_RESYNC: Could not clear "
761				    "start_step flag for set %s "
762				    "- returning\n"), sp->setname);
763				resp->mmr_exitval = -1;
764				return;
765			}
766		} else {
767			start_flag = 0;
768		}
769	}
770
771	ret = metaioctl(MD_MN_RESYNC, &respar, &respar.mde, NULL);
772	if (ret) {
773		(void) mdstealerror(&(resp->mmr_ep), &respar.mde);
774	}
775	resp->mmr_exitval = ret;
776}
777
778/*
779 * handler for MD_MN_MSG_SETSYNC
780 */
781/*ARGSUSED*/
782void
783mdmn_do_setsync(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
784{
785	md_mn_msg_setsync_t	*d;
786	md_resync_ioctl_t	ri;
787	int			ret;
788
789	resp->mmr_out_size = 0;
790	resp->mmr_err_size = 0;
791	resp->mmr_out = NULL;
792	resp->mmr_err = NULL;
793	resp->mmr_comm_state = MDMNE_ACK;
794	d = (md_mn_msg_setsync_t *)((void *)(msg->msg_event_data));
795
796	(void) memset(&ri, 0, sizeof (ri));
797	MD_SETDRIVERNAME(&ri, MD_MIRROR, MD_MIN2SET(d->setsync_mnum))
798	ri.ri_mnum = d->setsync_mnum;
799	ri.ri_copysize = d->setsync_copysize;
800	ri.ri_flags = d->setsync_flags;
801
802	ret = metaioctl(MD_MN_SETSYNC, &ri, &ri.mde, NULL);
803
804	resp->mmr_exitval = ret;
805}
806
807/*
808 * handler for MD_MN_MSG_SET_CAP. As this handler can deal with both mirrors
809 * and soft partitions, the driver name that is required for the ioctl call
810 * is included in the message.
811 */
812/*ARGSUSED*/
813void
814mdmn_do_set_cap(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
815{
816	md_mn_msg_setcap_t	*d;
817	md_mn_setcap_params_t	setcap_ioc;
818	minor_t			mnum;
819	int			ret;
820
821	resp->mmr_out_size = 0;
822	resp->mmr_err_size = 0;
823	resp->mmr_out = NULL;
824	resp->mmr_err = NULL;
825	resp->mmr_comm_state = MDMNE_ACK;
826	d = (md_mn_msg_setcap_t *)((void *)(msg->msg_event_data));
827	mnum = d->msg_setcap_mnum;
828
829	(void) memset(&setcap_ioc, 0, sizeof (setcap_ioc));
830
831	MD_SETDRIVERNAME(&setcap_ioc, d->msg_setcap_driver, MD_MIN2SET(mnum));
832	setcap_ioc.mnum = mnum;
833	setcap_ioc.sc_set = d->msg_setcap_set;
834
835	ret = metaioctl(MD_MN_SET_CAP, &setcap_ioc, &setcap_ioc.mde, NULL);
836
837	resp->mmr_exitval = ret;
838}
839
840/*
841 * Dummy handler for various CLASS0 messages like
842 * MD_MN_MSG_VERBOSITY / MD_MN_MSG_RESUME / MD_MN_MSG_SUSPEND ...
843 */
844/*ARGSUSED*/
845void
846mdmn_do_dummy(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
847{
848	resp->mmr_out_size = 0;
849	resp->mmr_err_size = 0;
850	resp->mmr_out = NULL;
851	resp->mmr_err = NULL;
852	resp->mmr_exitval = 0;
853	resp->mmr_comm_state = MDMNE_ACK;
854}
855
856/*
857 * Overall description of mdcommd support that keeps all nodes in-sync
858 * with the ondisk diskset mddbs.
859 *
860 * All configuration changes to the mddb - addition/deletion of metadevices
861 * or replicas must use a CLASS1 message to block out these changes.
862 * Changes to the state of existing replicas do not need to block CLASS1
863 * since there is no conflict when just updating the state of a replica.
864 *
865 * Error encountered when master writes to mddbs:
866 *	As the master updates parts of the mddbs, flags are updated describing
867 *	what has been written.  When all locks are dropped (either in
868 *	mddb_setexit or mdioctl), a PARSE message will be generated to all
869 *	nodes with an index list of known good mddbs and the parse flags.
 *	The master node ignores the parse message since it sent it.
871 *	The slave nodes re-read in the changed part of the mddb using the list
872 *	of known good replicas that was passed.
873 *	PARSE message does not block CLASS1.
874 *	The PARSE message must be the highest class message.  Since this
875 *	message could be sent on any ioctl, this PARSE message class must
876 *	be higher than any other class message that could issue an ioctl.
877 *
878 *	Master		Slave1		Slave2
879 * 	Handles_error
880 *	PARSE		PARSE		PARSE
881 *
882 *
883 * Add/Delete mddbs can occur from the following commands:
884 *	metadb -s set_name -a/-d
885 *	metaset -s set_name -a/-d disk
886 *	metaset -s set_name -b
887 *
888 *	The metadb/metaset command is run on the node executing the command
889 *	and sends an ATTACH/DETACH message to the master node blocking CLASS1
890 *	messages on all nodes until this message is finished.  The master
891 *	node generates 3 submessages of BLOCK, SM_ATTACH/SM_DETACH, UNBLOCK.
892 *	The BLOCK message is only run on the master node and will BLOCK
893 *	the PARSE messages from being sent to the nodes.
894 *	The SM_ATTACH/SM_DETACH message is run on all nodes and actually adds or
895 *	removes the replica(s) from the given disk slice.
896 *	The UNBLOCK message is only run on the master node and allows the
897 *	sending of PARSE messages.
898 *
899 *	Master		Slave1		Slave2
900 *			Add mddb cmd
901 *			ATTACH msg to master
902 *	BLOCK
903 *	ATTACH		ATTACH		ATTACH
904 *	UNBLOCK
905 *	PARSE		PARSE		PARSE
906 *	ATTACH msg finished
907 *
908 * Add/Delete host side information from the following commands:
909 *	metaset -s set_name -a/-d -h
910 *
911 *	The metaset command is run on the node executing the command and
912 *	sends a DB_NEWSIDE/DB_DELSIDE message and a MD_NEWSIDE/MD_DELSIDE
913 *	message whenever a host is added to or deleted from the diskset.
914 *
915 *	The side information contains the major name and minor number
916 *	associated with a disk slice from a certain node's perspective
 *	in a (failed) effort to support clustered systems that don't have the
918 *	same device name for a physical device. (The original designers of
919 *	SVM eventually took the shortcut of assuming that all device names
920 *	are the same on all systems, but left the side information in the
921 *	mddb and namespace.)  The side information is used for disk slices
922 *	that contain mddbs and/or are components for metadevices.
923 *
 *	The DB_NEWSIDE/DB_DELSIDE command adds or deletes the side information
925 *	for each mddb for the host being added or deleted.
926 *	The MD_ADDSIDE/MD_DELSIDE command adds or deletes the side information
927 *	for all disk slice components that are in the namespace records for
928 *	the host being added or deleted.
929 *
930 *	The DB_NEWSIDE/DB_DELSIDE message does not change any mddb records
931 *	and only needs to be executed on the master node since the slave
932 *	nodes will be brought up to date by the PARSE message that is
933 *	generated as a result of a change to the mddb.
934 *	The MD_ADDSIDE/MD_DELSIDE message does modify the records in the mddb
935 *	and needs to be run on all nodes.  The message must block class1
936 *	messages so that record changing commands don't interfere.
937 *
938 *	Master		Slave1		Slave2
939 *			Add host
940 *			DB_NEWSIDE msg to master
941 *	DB_NEWSIDE
942 *	PARSE		PARSE		PARSE
943 *	DB_NEWSIDE msg finished
944 *			MD_NEWSIDE msg to master
945 *	MD_NEWSIDE	MD_NEWSIDE	MD_NEWSIDE
946 *	MD_NEWSIDE msg finished
947 *
948 *
949 * Optimized resync record failure:
950 *	When any node sees a failure to write an optimized resync record
951 *	that node notifies the master node of the replica that failed.
952 *	The master node handles the error and updates the rest of the
953 *	nodes using a PARSE message.  The PARSE message also calls
954 *	fixoptrecord on each slave node causing each node to fix up
955 * 	the optimized resync records that are owned by that node (the mirror
956 *	owner code also sets the optimized resync record owner).  The master
957 *	node will fix up all optimized resync records that have no owner or
958 *	are owned by the master node.
959 *
960 *	Master		Slave1		Slave2
961 *					Optimized Record Failure
962 *					OPTRECERR msg to master
963 *	Master handles opt rec failure
964 *	PARSE		PARSE		PARSE
965 *	OPTRECERR msg finished
966 *					Slave rewrites optimized record
967 *
968 */
969
970/*
971 * Handler for MD_MN_MSG_MDDB_PARSE which send parse messages to the
972 * slave nodes in order to keep the incore view of the mddbs the
973 * same on all nodes.
974 *
975 * Since master node generated the mddb parse message, do nothing
976 * if this is the master node.
977 *
978 * If this is a slave node, send the parse message down to the kernel
979 * where this node will re-read in parts of the mddbs.
980 *
981 */
982void
983mdmn_do_mddb_parse(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
984{
985	md_mn_msg_mddb_parse_t	*d;
986	mddb_parse_parm_t	mpp;
987	int			ret = 0;
988	int			i;
989
990	resp->mmr_out_size = 0;
991	resp->mmr_err_size = 0;
992	resp->mmr_out = NULL;
993	resp->mmr_err = NULL;
994	resp->mmr_comm_state = MDMNE_ACK;
995	d = (md_mn_msg_mddb_parse_t *)((void *)(msg->msg_event_data));
996
997	if (flags & MD_MSGF_ON_MASTER)
998		return;
999
1000	(void) memset(&mpp, 0, sizeof (mpp));
1001	mpp.c_setno = msg->msg_setno;
1002	mpp.c_parse_flags = d->msg_parse_flags;
1003	for (i = 0; i < MDDB_NLB; i++) {
1004		mpp.c_lb_flags[i] = d->msg_lb_flags[i];
1005	}
1006	ret = metaioctl(MD_MN_MDDB_PARSE, &mpp, &mpp.c_mde, NULL);
1007	if (ret)
1008		(void) mdstealerror(&(resp->mmr_ep), &mpp.c_mde);
1009
1010	resp->mmr_exitval = ret;
1011}
1012
1013/*
1014 * Handler for MD_MN_MSG_MDDB_BLOCK which blocks the generation
1015 * of parse messages from this node.
1016 *
1017 * This is needed when attaching/detaching mddbs on the master and the
1018 * slave node is unable to handle a parse message until the slave node
1019 * has done the attach/detach of the mddbs.  So, master node will block
1020 * the parse messages, execute the attach/detach on all nodes and
1021 * then unblock the parse messages which causes the parse message to
1022 * be sent to all nodes.
1023 */
1024/*ARGSUSED*/
1025void
1026mdmn_do_mddb_block(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1027{
1028	md_mn_msg_mddb_block_t	*d;
1029	mddb_block_parm_t	mbp;
1030	int			ret;
1031
1032	resp->mmr_out_size = 0;
1033	resp->mmr_err_size = 0;
1034	resp->mmr_out = NULL;
1035	resp->mmr_err = NULL;
1036	resp->mmr_comm_state = MDMNE_ACK;
1037	d = (md_mn_msg_mddb_block_t *)((void *)(msg->msg_event_data));
1038
1039	(void) memset(&mbp, 0, sizeof (mbp));
1040	mbp.c_setno = msg->msg_setno;
1041	mbp.c_blk_flags = d->msg_block_flags;
1042	ret = metaioctl(MD_MN_MDDB_BLOCK, &mbp, &mbp.c_mde, NULL);
1043	if (ret)
1044		(void) mdstealerror(&(resp->mmr_ep), &mbp.c_mde);
1045
1046	resp->mmr_exitval = ret;
1047}
1048
1049/*
1050 * Submessage generator for MD_MN_MSG_META_DB_ATTACH which generates
1051 * a BLOCK message on the master node only, a MD_MN_MSG_SM_MDDB_ATTACH
1052 * message on all nodes and then an UNBLOCK message on the master only.
1053 */
1054int
1055mdmn_smgen_mddb_attach(md_mn_msg_t *msg, md_mn_msg_t *msglist[])
1056{
1057	md_mn_msg_t			*nmsg;
1058	md_mn_msg_meta_db_attach_t	*d;
1059	md_mn_msg_meta_db_attach_t	*attach_d;
1060	md_mn_msg_mddb_block_t		*block_d;
1061
1062	d = (md_mn_msg_meta_db_attach_t *)(void *)msg->msg_event_data;
1063
1064	nmsg = Zalloc(sizeof (md_mn_msg_t));
1065	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1066
1067	nmsg->msg_flags		= (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST);
1068	nmsg->msg_setno		= msg->msg_setno;
1069	nmsg->msg_type		= MD_MN_MSG_MDDB_BLOCK;
1070	nmsg->msg_event_size	= sizeof (md_mn_msg_mddb_block_t);
1071	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_mddb_block_t));
1072	block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data;
1073	block_d->msg_block_flags = MDDB_BLOCK_PARSE;
1074	msglist[0] = nmsg;
1075
1076	nmsg = Zalloc(sizeof (md_mn_msg_t));
1077	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1078
1079	/* Don't log submessages and panic on inconsistent results */
1080	nmsg->msg_flags = MD_MSGF_NO_LOG |
1081	    MD_MSGF_PANIC_WHEN_INCONSISTENT;
1082	nmsg->msg_setno		= msg->msg_setno;
1083	nmsg->msg_type		= MD_MN_MSG_SM_MDDB_ATTACH;
1084	nmsg->msg_event_size	= sizeof (md_mn_msg_meta_db_attach_t);
1085	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_meta_db_attach_t));
1086	attach_d = (md_mn_msg_meta_db_attach_t *)
1087	    (void *)nmsg->msg_event_data;
1088	attach_d->msg_l_dev = d->msg_l_dev;
1089	attach_d->msg_cnt = d->msg_cnt;
1090	attach_d->msg_dbsize = d->msg_dbsize;
1091	(void) strncpy(attach_d->msg_dname, d->msg_dname, 16);
1092	attach_d->msg_splitname = d->msg_splitname;
1093	attach_d->msg_options = d->msg_options;
1094	msglist[1] = nmsg;
1095
1096	nmsg = Zalloc(sizeof (md_mn_msg_t));
1097	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1098
1099	nmsg->msg_flags		= (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST);
1100	nmsg->msg_setno		= msg->msg_setno;
1101	nmsg->msg_type		= MD_MN_MSG_MDDB_BLOCK;
1102	nmsg->msg_event_size	= sizeof (md_mn_msg_mddb_block_t);
1103	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_mddb_block_t));
1104	block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data;
1105	block_d->msg_block_flags = MDDB_UNBLOCK_PARSE;
1106	msglist[2] = nmsg;
1107
1108	return (3); /* Return the number of submessages generated */
1109}
1110
1111/*
1112 * Submessage generator for MD_MN_MSG_META_DB_DETACH which generates
1113 * a BLOCK message on the master node only, a MD_MN_MSG_SM_MDDB_DETACH
1114 * message on all nodes and then an UNBLOCK message on the master only.
1115 */
1116int
1117mdmn_smgen_mddb_detach(md_mn_msg_t *msg, md_mn_msg_t *msglist[])
1118{
1119	md_mn_msg_t			*nmsg;
1120	md_mn_msg_meta_db_detach_t	*d;
1121	md_mn_msg_meta_db_detach_t	*detach_d;
1122	md_mn_msg_mddb_block_t		*block_d;
1123
1124	d = (md_mn_msg_meta_db_detach_t *)(void *)msg->msg_event_data;
1125
1126	nmsg = Zalloc(sizeof (md_mn_msg_t));
1127	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1128
1129	nmsg->msg_flags		= (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST);
1130	nmsg->msg_setno		= msg->msg_setno;
1131	nmsg->msg_type		= MD_MN_MSG_MDDB_BLOCK;
1132	nmsg->msg_event_size	= sizeof (md_mn_msg_mddb_block_t);
1133	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_mddb_block_t));
1134	block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data;
1135	block_d->msg_block_flags = MDDB_BLOCK_PARSE;
1136	msglist[0] = nmsg;
1137
1138	nmsg = Zalloc(sizeof (md_mn_msg_t));
1139	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1140
1141	/* Don't log submessages and panic on inconsistent results */
1142	nmsg->msg_flags = MD_MSGF_NO_LOG |
1143	    MD_MSGF_PANIC_WHEN_INCONSISTENT;
1144	nmsg->msg_setno		= msg->msg_setno;
1145	nmsg->msg_type		= MD_MN_MSG_SM_MDDB_DETACH;
1146	nmsg->msg_event_size	= sizeof (md_mn_msg_meta_db_detach_t);
1147	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_meta_db_detach_t));
1148	detach_d = (md_mn_msg_meta_db_detach_t *)
1149	    (void *)nmsg->msg_event_data;
1150	detach_d->msg_splitname = d->msg_splitname;
1151	msglist[1] = nmsg;
1152
1153	nmsg = Zalloc(sizeof (md_mn_msg_t));
1154	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1155
1156	nmsg->msg_flags		= (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST);
1157	nmsg->msg_setno		= msg->msg_setno;
1158	nmsg->msg_type		= MD_MN_MSG_MDDB_BLOCK;
1159	nmsg->msg_event_size	= sizeof (md_mn_msg_mddb_block_t);
1160	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_mddb_block_t));
1161	block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data;
1162	block_d->msg_block_flags = MDDB_UNBLOCK_PARSE;
1163	msglist[2] = nmsg;
1164
1165	return (3); /* Return the number of submessages generated */
1166}
1167
1168/*
1169 * Handler for MD_MN_MSG_SM_MDDB_ATTACH which is used to attach mddbs.
1170 *
1171 * Used when running:
1172 *	metadb -s set_name -a
1173 * 	metaset -s set_name -a/-d disk
1174 *	metaset -s set_name -b
1175 */
1176/*ARGSUSED*/
1177void
1178mdmn_do_sm_mddb_attach(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1179{
1180	md_mn_msg_meta_db_attach_t	*d;
1181	struct mddb_config		c;
1182	int				i;
1183	int				ret = 0;
1184	md_error_t			ep = mdnullerror;
1185	char				*name, *add_name;
1186	mdname_t			*np;
1187	mdsetname_t			*sp;
1188
1189	resp->mmr_out_size = 0;
1190	resp->mmr_err_size = 0;
1191	resp->mmr_out = NULL;
1192	resp->mmr_err = NULL;
1193	resp->mmr_comm_state = MDMNE_ACK;
1194	d = (md_mn_msg_meta_db_attach_t *)((void *)(msg->msg_event_data));
1195
1196	(void) memset(&c, 0, sizeof (c));
1197	c.c_setno = msg->msg_setno;
1198	c.c_locator.l_dev = meta_cmpldev(d->msg_l_dev);
1199	(void) strncpy(c.c_locator.l_driver, d->msg_dname,
1200	    sizeof (c.c_locator.l_driver));
1201	c.c_devname = d->msg_splitname;
1202	c.c_locator.l_mnum = meta_getminor(d->msg_l_dev);
1203	c.c_multi_node = 1;
1204	if ((sp = metasetnosetname(c.c_setno, &ep)) == NULL) {
1205		(void) mdstealerror(&(resp->mmr_ep), &ep);
1206		resp->mmr_exitval = -1;
1207		return;
1208	}
1209	(void) strcpy(c.c_setname, sp->setname);
1210	c.c_sideno = getmyside(sp, &ep);
1211	if (c.c_sideno == MD_SIDEWILD) {
1212		(void) mdstealerror(&(resp->mmr_ep), &ep);
1213		resp->mmr_exitval = -1;
1214		return;
1215	}
1216
1217	name = splicename(&d->msg_splitname);
1218	np = metaname(&sp, name, LOGICAL_DEVICE, &ep);
1219	Free(name);
1220	if (np == NULL) {
1221		(void) mdstealerror(&(resp->mmr_ep), &ep);
1222		resp->mmr_exitval = -1;
1223		return;
1224	}
1225	/*
1226	 * All nodes in MN diskset must do meta_check_replica
1227	 * since this causes the shared namespace to be
1228	 * populated by the md driver names while checking
1229	 * to see if this device is already in use as a
1230	 * metadevice.
1231	 */
1232	if (meta_check_replica(sp, np, d->msg_options, 0,
1233	    (d->msg_cnt * d->msg_dbsize), &ep)) {
1234		(void) mdstealerror(&(resp->mmr_ep), &ep);
1235		resp->mmr_exitval = -1;
1236		return;
1237	}
1238
1239	for (i = 0; i < d->msg_cnt; i++) {
1240		c.c_locator.l_blkno = i * d->msg_dbsize + 16;
1241		if (setup_med_cfg(sp, &c,
1242		    (d->msg_options & MDCHK_SET_FORCE), &ep)) {
1243			ret = -1;
1244			(void) mdstealerror(&(resp->mmr_ep), &ep);
1245			break;
1246		}
1247		ret = metaioctl(MD_DB_NEWDEV, &c, &c.c_mde, NULL);
1248		/* If newdev was successful, continue with attach */
1249		if (ret == 0) {
1250			if (meta_db_addsidenms(sp, np, c.c_locator.l_blkno,
1251			    DB_ADDSIDENMS_NO_BCAST, &ep)) {
1252				ret = -1;
1253				(void) mdstealerror(&(resp->mmr_ep), &ep);
1254				break;
1255			}
1256		} else {
1257			(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1258			break;
1259		}
1260	}
1261	add_name = splicename(&d->msg_splitname);
1262	if ((np = metaname(&sp, add_name, LOGICAL_DEVICE, &ep)) != NULL) {
1263		meta_invalidate_name(np);
1264	} else {
1265		ret = -1;
1266		(void) mdstealerror(&(resp->mmr_ep), &ep);
1267	}
1268	Free(add_name);
1269
1270	resp->mmr_exitval = ret;
1271}
1272
1273/*
1274 * Handler for MD_MN_MSG_SM_MDDB_DETACH which is used to detach mddbs.
1275 *
1276 * Used when running:
1277 *	metadb -s set_name -d
1278 * 	metaset -s set_name -a/-d disk
1279 *	metaset -s set_name -b
1280 */
1281/*ARGSUSED*/
1282void
1283mdmn_do_sm_mddb_detach(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1284{
1285	md_mn_msg_meta_db_detach_t	*d;
1286	struct mddb_config		c;
1287	int				i;
1288	int				ret = 0;
1289	md_error_t			ep = mdnullerror;
1290	char				*name, *del_name;
1291	mdname_t			*np;
1292	mdsetname_t			*sp;
1293
1294	resp->mmr_out_size = 0;
1295	resp->mmr_err_size = 0;
1296	resp->mmr_out = NULL;
1297	resp->mmr_err = NULL;
1298	resp->mmr_comm_state = MDMNE_ACK;
1299	d = (md_mn_msg_meta_db_detach_t *)((void *)(msg->msg_event_data));
1300
1301	if ((sp = metasetnosetname(msg->msg_setno, &ep)) == NULL) {
1302		(void) mdstealerror(&(resp->mmr_ep), &ep);
1303		resp->mmr_exitval = -1;
1304		return;
1305	}
1306
1307	(void) memset(&c, 0, sizeof (c));
1308	c.c_setno = msg->msg_setno;
1309	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1310		resp->mmr_exitval = -1;
1311		(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1312		return;
1313	}
1314	i = 0;
1315	del_name = splicename(&d->msg_splitname);
1316	while (i < c.c_dbcnt) {
1317		c.c_id = i;
1318		if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1319			ret = -1;
1320			(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1321			break;
1322		}
1323		name = splicename(&c.c_devname);
1324		if (strcmp(name, del_name) != 0) {
1325			Free(name);
1326			i++;
1327			continue;
1328		}
1329		Free(name);
1330		/* Found a match - delete mddb */
1331		if (metaioctl(MD_DB_DELDEV, &c, &c.c_mde, NULL) != 0) {
1332			ret = -1;
1333			(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1334			break;
1335		}
1336		/* Not incrementing "i" intentionally (dbcnt is changed) */
1337	}
1338	if ((np = metaname(&sp, del_name, LOGICAL_DEVICE, &ep)) != NULL) {
1339		meta_invalidate_name(np);
1340	} else {
1341		ret = -1;
1342		(void) mdstealerror(&(resp->mmr_ep), &ep);
1343	}
1344	Free(del_name);
1345
1346	resp->mmr_exitval = ret;
1347}
1348
1349/*
1350 * Handler for MD_MN_MSG_META_DB_NEWSIDE which is used to update the
1351 * side information for each diskset mddb when a new host has been
1352 * added to the diskset.  The side information is the /dev/dsk/ctds name
1353 * that the new node would use to access each mddb.
1354 *
1355 * Since this routine makes no changes to the records in the diskset mddb,
1356 * this routine only needs to be run on the master node.  The master node's
1357 * kernel code will detect that portions of the mddb have changed and
1358 * will send a parse message to all nodes to re-parse parts of the mddb.
1359 *
1360 * Used when running:
1361 * 	metaset -s set_name -a -h new_hostname
1362 */
1363/*ARGSUSED*/
1364void
1365mdmn_do_meta_db_newside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1366{
1367	md_mn_msg_meta_db_newside_t	*d;
1368	struct mddb_config		c;
1369	int				ret = 0;
1370	mdsetname_t			*sp;
1371	md_error_t			ep = mdnullerror;
1372
1373	resp->mmr_out_size = 0;
1374	resp->mmr_err_size = 0;
1375	resp->mmr_out = NULL;
1376	resp->mmr_err = NULL;
1377	resp->mmr_comm_state = MDMNE_ACK;
1378	d = (md_mn_msg_meta_db_newside_t *)((void *)(msg->msg_event_data));
1379
1380	(void) memset(&c, 0, sizeof (c));
1381	c.c_setno = msg->msg_setno;
1382	c.c_locator.l_dev = meta_cmpldev(d->msg_l_dev);
1383	c.c_locator.l_blkno = d->msg_blkno;
1384	(void) strncpy(c.c_locator.l_driver, d->msg_dname,
1385	    sizeof (c.c_locator.l_driver));
1386	c.c_devname = d->msg_splitname;
1387	c.c_locator.l_mnum = d->msg_mnum;
1388	c.c_multi_node = 1;
1389	if ((sp = metasetnosetname(c.c_setno, &ep)) == NULL) {
1390		(void) mdstealerror(&(resp->mmr_ep), &ep);
1391		resp->mmr_exitval = -1;
1392		return;
1393	}
1394	(void) strcpy(c.c_setname, sp->setname);
1395	c.c_sideno = d->msg_sideno;
1396
1397	if ((ret = metaioctl(MD_DB_NEWSIDE, &c, &c.c_mde, NULL)) != 0) {
1398		(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1399	}
1400	resp->mmr_exitval = ret;
1401}
1402
1403/*
1404 * Handler for MD_MN_MSG_META_DB_DELSIDE which is used to remove the
1405 * side information for each diskset mddb when a host has been
1406 * deleted from the diskset.  The side information is the /dev/dsk/ctds name
1407 * that the node would use to access each mddb.
1408 *
1409 * Since this routine makes no changes to the records in the diskset mddb,
1410 * this routine only needs to be run on the master node.  The master node's
1411 * kernel code will detect that portions of the mddb have changed and
1412 * will send a parse message to all nodes to re-parse parts of the mddb.
1413 *
1414 * Used when running:
1415 * 	metaset -s set_name -d -h hostname
1416 */
1417/*ARGSUSED*/
1418void
1419mdmn_do_meta_db_delside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1420{
1421	md_mn_msg_meta_db_delside_t	*d;
1422	mddb_config_t			c;
1423	int				ret = 0;
1424	mdsetname_t			*sp;
1425	md_error_t			ep = mdnullerror;
1426
1427	resp->mmr_out_size = 0;
1428	resp->mmr_err_size = 0;
1429	resp->mmr_out = NULL;
1430	resp->mmr_err = NULL;
1431	resp->mmr_comm_state = MDMNE_ACK;
1432	d = (md_mn_msg_meta_db_delside_t *)((void *)(msg->msg_event_data));
1433
1434	(void) memset(&c, 0, sizeof (c));
1435	c.c_setno = msg->msg_setno;
1436	c.c_locator.l_dev = meta_cmpldev(d->msg_l_dev);
1437	c.c_locator.l_blkno = d->msg_blkno;
1438	c.c_multi_node = 1;
1439	if ((sp = metasetnosetname(c.c_setno, &ep)) == NULL) {
1440		(void) mdstealerror(&(resp->mmr_ep), &ep);
1441		resp->mmr_exitval = -1;
1442		return;
1443	}
1444	(void) strcpy(c.c_setname, sp->setname);
1445	c.c_sideno = d->msg_sideno;
1446
1447	if ((ret = metaioctl(MD_DB_DELSIDE, &c, &c.c_mde, NULL)) != 0) {
1448		(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1449	}
1450	resp->mmr_exitval = ret;
1451}
1452
1453/*
1454 * Handler for MD_MN_MSG_META_MD_ADDSIDE which is used to add the
1455 * side information for each diskset metadevice component (if that
1456 * component is a disk) when a host has been added to the diskset.
1457 * The side information is the /dev/dsk/ctds name that the node would
1458 * use to access the metadevice component.
1459 *
1460 * This routine makes changes to the mddb records and must be run
1461 * on all nodes.
1462 *
1463 * Used when running:
1464 * 	metaset -s set_name -a -h new_hostname
1465 */
1466/*ARGSUSED*/
1467void
1468mdmn_do_meta_md_addside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1469{
1470	md_mn_msg_meta_md_addside_t	*d;
1471	mdnm_params_t			nm;
1472	mdsetname_t			*sp;
1473	char				*cname, *dname;
1474	minor_t				mnum;
1475	int				done, i;
1476	md_error_t			ep = mdnullerror;
1477
1478	resp->mmr_out_size = 0;
1479	resp->mmr_err_size = 0;
1480	resp->mmr_out = NULL;
1481	resp->mmr_err = NULL;
1482	resp->mmr_comm_state = MDMNE_ACK;
1483	d = (md_mn_msg_meta_md_addside_t *)((void *)(msg->msg_event_data));
1484
1485	(void) memset(&nm, 0, sizeof (nm));
1486	if ((sp = metasetnosetname(msg->msg_setno, &ep)) == NULL) {
1487		(void) mdstealerror(&(resp->mmr_ep), &ep);
1488		resp->mmr_exitval = -1;
1489		return;
1490	}
1491	/* While loop continues until IOCNXTKEY_NM gives nm.key of KEYWILD */
1492	/*CONSTCOND*/
1493	while (1) {
1494		char	*drvnm = NULL;
1495
1496		nm.mde = mdnullerror;
1497		nm.setno = msg->msg_setno;
1498		nm.side = d->msg_otherside;
1499		if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0) {
1500			(void) mdstealerror(&(resp->mmr_ep), &nm.mde);
1501			resp->mmr_exitval = -1;
1502			return;
1503		}
1504
1505		/* Normal exit path is to eventually get a KEYWILD */
1506		if (nm.key == MD_KEYWILD) {
1507			resp->mmr_exitval = 0;
1508			return;
1509		}
1510
1511		/*
1512		 * Okay we have a valid key
1513		 * Let's see if it is hsp or not
1514		 */
1515		nm.devname = (uintptr_t)meta_getnmentbykey(msg->msg_setno,
1516		    d->msg_otherside, nm.key, &drvnm, NULL, NULL, &ep);
1517		if (nm.devname == NULL || drvnm == NULL) {
1518			if (nm.devname)
1519				Free((void *)(uintptr_t)nm.devname);
1520			if (drvnm)
1521				Free((void *)(uintptr_t)drvnm);
1522			(void) mdstealerror(&(resp->mmr_ep), &ep);
1523			resp->mmr_exitval = -1;
1524			return;
1525		}
1526
1527		/*
1528		 * If it is hsp add here
1529		 */
1530		if (strcmp(drvnm, MD_HOTSPARES) == 0) {
1531			if (add_name(sp, d->msg_sideno, nm.key, MD_HOTSPARES,
1532			    minor(NODEV), (char *)(uintptr_t)nm.devname,
1533			    NULL, NULL, &ep) == -1) {
1534				Free((void *)(uintptr_t)nm.devname);
1535				Free((void *)(uintptr_t)drvnm);
1536				(void) mdstealerror(&(resp->mmr_ep), &ep);
1537				resp->mmr_exitval = -1;
1538				return;
1539			} else {
1540				Free((void *)(uintptr_t)nm.devname);
1541				Free((void *)(uintptr_t)drvnm);
1542				continue;
1543			}
1544		}
1545
1546		nm.side = d->msg_sideno;
1547		if ((done = meta_getside_devinfo(sp,
1548		    (char *)(uintptr_t)nm.devname,
1549		    d->msg_sideno, &cname, &dname, &mnum, &ep)) == -1) {
1550			(void) mdstealerror(&(resp->mmr_ep), &ep);
1551			Free((void *)(uintptr_t)nm.devname);
1552			resp->mmr_exitval = -1;
1553			return;
1554		}
1555
1556		Free((void *)(uintptr_t)nm.devname);
1557		Free((void *)(uintptr_t)drvnm);
1558
1559		if (done != 1) {
1560			Free(cname);
1561			Free(dname);
1562			resp->mmr_exitval = -1;
1563			return;
1564		}
1565
1566		/*
1567		 * The device reference count can be greater than 1 if
1568		 * more than one softpart is configured on top of the
1569		 * same device.  If this is the case then we want to
1570		 * increment the count to sync up with the other sides.
1571		 */
1572		for (i = 0; i < nm.ref_count; i++) {
1573			if (add_name(sp, d->msg_sideno, nm.key, dname, mnum,
1574			    cname, NULL, NULL, &ep) == -1) {
1575				(void) mdstealerror(&(resp->mmr_ep), &ep);
1576				Free(cname);
1577				Free(dname);
1578				resp->mmr_exitval = -1;
1579				return;
1580			}
1581		}
1582		Free(cname);
1583		Free(dname);
1584	}
1585
1586	/*NOTREACHED*/
1587}
1588/*
1589 * Handler for MD_MN_MSG_META_MD_DELSIDE which is used to delete the
1590 * side information for each diskset metadevice component (if that
1591 * component is a disk) when a host has been removed from the diskset.
1592 * The side information is the /dev/dsk/ctds name that the node would
1593 * use to access the metadevice component.
1594 *
1595 * This routine makes changes to the mddb records and must be run
1596 * on all nodes.
1597 *
1598 * Used when running:
1599 * 	metaset -s set_name -d -h hostname
1600 */
1601/*ARGSUSED*/
1602void
1603mdmn_do_meta_md_delside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1604{
1605	md_mn_msg_meta_md_delside_t	*d;
1606	mdnm_params_t			nm;
1607	mdsetname_t			*sp;
1608	md_error_t			ep = mdnullerror;
1609	int				i;
1610
1611	resp->mmr_out_size = 0;
1612	resp->mmr_err_size = 0;
1613	resp->mmr_out = NULL;
1614	resp->mmr_err = NULL;
1615	resp->mmr_comm_state = MDMNE_ACK;
1616	d = (md_mn_msg_meta_md_delside_t *)((void *)(msg->msg_event_data));
1617
1618	if ((sp = metasetnosetname(msg->msg_setno, &ep)) == NULL) {
1619		(void) mdstealerror(&(resp->mmr_ep), &ep);
1620		resp->mmr_exitval = -1;
1621		return;
1622	}
1623
1624	(void) memset(&nm, 0, sizeof (nm));
1625	nm.key = MD_KEYWILD;
1626	/*CONSTCOND*/
1627	while (1) {
1628		nm.mde = mdnullerror;
1629		nm.setno = msg->msg_setno;
1630		nm.side = MD_SIDEWILD;
1631		if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0) {
1632			(void) mdstealerror(&(resp->mmr_ep), &nm.mde);
1633			resp->mmr_exitval = -1;
1634			return;
1635		}
1636
1637		/* Normal exit path is to eventually get a KEYWILD */
1638		if (nm.key == MD_KEYWILD) {
1639			resp->mmr_exitval = 0;
1640			return;
1641		}
1642
1643		/*
1644		 * The device reference count can be greater than 1 if
1645		 * more than one softpart is configured on top of the
1646		 * same device.  If this is the case then we want to
1647		 * decrement the count to zero so the entry can be
1648		 * actually removed.
1649		 */
1650		for (i = 0; i < nm.ref_count; i++) {
1651			if (del_name(sp, d->msg_sideno, nm.key, &ep) == -1) {
1652				(void) mdstealerror(&(resp->mmr_ep), &ep);
1653				resp->mmr_exitval = -1;
1654				return;
1655			}
1656		}
1657	}
1658
1659	/*NOTREACHED*/
1660}
1661
1662/*
1663 * Handler for MD_MN_MSG_MDDB_OPTRECERR which is used to notify
1664 * the master node that a node has seen an error when attempting to
1665 * write to the optimized resync records that reside on 2 of the diskset
1666 * mddbs.  Master node will mark the failed replica in error and this
1667 * will send a parse message to all nodes to re-read parts of the mddb
1668 * and to fix their optimized resync records based on this information.
1669 */
1670/*ARGSUSED*/
1671void
1672mdmn_do_mddb_optrecerr(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1673{
1674	md_mn_msg_mddb_optrecerr_t	*d;
1675	mddb_optrec_parm_t		mop;
1676	int				ret;
1677	int				i;
1678
1679	resp->mmr_out_size = 0;
1680	resp->mmr_err_size = 0;
1681	resp->mmr_out = NULL;
1682	resp->mmr_err = NULL;
1683	resp->mmr_comm_state = MDMNE_ACK;
1684	d = (md_mn_msg_mddb_optrecerr_t *)((void *)(msg->msg_event_data));
1685
1686	(void) memset(&mop, 0, sizeof (mop));
1687	mop.c_setno = msg->msg_setno;
1688	for (i = 0; i < 2; i++) {
1689		mop.c_recerr[i] = d->msg_recerr[i];
1690	}
1691	ret = metaioctl(MD_MN_MDDB_OPTRECFIX, &mop, &mop.c_mde, NULL);
1692	if (ret)
1693		(void) mdstealerror(&(resp->mmr_ep), &mop.c_mde);
1694
1695	resp->mmr_exitval = ret;
1696}
1697
1698int
1699mdmn_smgen_test6(md_mn_msg_t *msg, md_mn_msg_t **msglist)
1700{
1701	md_mn_msg_t	*nmsg;
1702
1703	nmsg = Zalloc(sizeof (md_mn_msg_t));
1704	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1705
1706	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
1707	nmsg->msg_setno		= msg->msg_setno;
1708	nmsg->msg_type		= MD_MN_MSG_TEST2;
1709	nmsg->msg_event_size	= sizeof ("test2");
1710	nmsg->msg_event_data	= Strdup("test2");
1711	msglist[0] = nmsg;
1712
1713	nmsg = Zalloc(sizeof (md_mn_msg_t));
1714	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1715
1716	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
1717	nmsg->msg_setno		= msg->msg_setno;
1718	nmsg->msg_type		= MD_MN_MSG_TEST2;
1719	nmsg->msg_event_size	= sizeof ("test2");
1720	nmsg->msg_event_data	= Strdup("test2");
1721	msglist[1] = nmsg;
1722
1723	nmsg = Zalloc(sizeof (md_mn_msg_t));
1724	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1725
1726	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
1727	nmsg->msg_setno		= msg->msg_setno;
1728	nmsg->msg_type		= MD_MN_MSG_TEST3;
1729	nmsg->msg_event_size	= sizeof ("test3");
1730	nmsg->msg_event_data	= Strdup("test3");
1731	msglist[2] = nmsg;
1732
1733	nmsg = Zalloc(sizeof (md_mn_msg_t));
1734	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1735
1736	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
1737	nmsg->msg_setno		= msg->msg_setno;
1738	nmsg->msg_type		= MD_MN_MSG_TEST4;
1739	nmsg->msg_event_size	= sizeof ("test4");
1740	nmsg->msg_event_data	= Strdup("test4");
1741	msglist[3] = nmsg;
1742
1743	return (4); /* Return the number of submessages generated */
1744}
1745
1746/*
1747 * This is to send an MD_IOCSET ioctl to all nodes to create a soft
1748 * partition.
1749 */
1750/*ARGSUSED*/
1751void
1752mdmn_do_iocset(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1753{
1754	md_mn_msg_iocset_t	*d;
1755	int			ret;
1756	set_t			setno;
1757	mdsetname_t		*sp;
1758	mdname_t		*np;
1759	md_error_t		mde = mdnullerror;
1760
1761	resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1762	resp->mmr_out_size = 0;
1763	resp->mmr_err_size = 0;
1764	resp->mmr_out = NULL;
1765	resp->mmr_err = NULL;
1766	d = (md_mn_msg_iocset_t *)(void *)msg->msg_event_data;
1767
1768	setno = MD_MIN2SET(d->iocset_params.mnum);
1769	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
1770		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1771		    "MD_MN_MSG_IOCSET: Invalid setno %d\n"), setno);
1772		resp->mmr_exitval = 1;
1773		return;
1774	}
1775
1776	/*
1777	 * Device should be in the namespace already
1778	 */
1779	if ((np = metamnumname(&sp, d->iocset_params.mnum, 1, &mde)) == NULL) {
1780		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1781		    "MD_MN_MSG_IOCSET: Invalid mnum %d\n"),
1782		    d->iocset_params.mnum);
1783		resp->mmr_exitval = 1;
1784		return;
1785	}
1786
1787	/*
1788	 * Create unit structure
1789	 */
1790	d->iocset_params.mdp = (uintptr_t)&d->unit; /* set pointer to unit */
1791	ret = metaioctl(MD_IOCSET, &(d->iocset_params), &mde, np->cname);
1792	resp->mmr_exitval = ret;
1793}
1794
1795/*
1796 * This is to update the status of a softpart
1797 */
1798/*ARGSUSED*/
1799void
1800mdmn_do_sp_setstat(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1801{
1802	md_mn_msg_sp_setstat_t	*d;
1803	int			ret;
1804	set_t			setno;
1805	mdsetname_t		*sp;
1806	minor_t			mnum;
1807	md_error_t		mde = mdnullerror;
1808
1809	resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1810	resp->mmr_out_size = 0;
1811	resp->mmr_err_size = 0;
1812	resp->mmr_out = NULL;
1813	resp->mmr_err = NULL;
1814	d = (md_mn_msg_sp_setstat_t *)(void *)msg->msg_event_data;
1815
1816	mnum = d->sp_setstat_mnum;
1817	setno = MD_MIN2SET(mnum);
1818	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
1819		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1820		    "MD_MN_MSG_IOCSET: Invalid setno %d\n"), setno);
1821		resp->mmr_exitval = 1;
1822		return;
1823	}
1824
1825	ret = meta_sp_setstatus(sp, &mnum, 1, d->sp_setstat_status, &mde);
1826	resp->mmr_exitval = ret;
1827}
1828
1829/*
1830 * This is to add a key to the namespace
1831 */
1832/*ARGSUSED*/
1833void
1834mdmn_do_addkeyname(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1835{
1836	md_mn_msg_addkeyname_t	*d;
1837	int			ret;
1838	set_t			setno;
1839	mdsetname_t		*sp;
1840	md_error_t		mde = mdnullerror;
1841	mdname_t		*compnp;
1842
1843	resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1844	resp->mmr_out_size = 0;
1845	resp->mmr_err_size = 0;
1846	resp->mmr_out = NULL;
1847	resp->mmr_err = NULL;
1848	d = (md_mn_msg_addkeyname_t *)(void *)msg->msg_event_data;
1849
1850	setno = d->addkeyname_setno;
1851	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
1852		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1853		    "MD_MN_ADDKEYNAME: Invalid setno %d\n"), setno);
1854		resp->mmr_exitval = -1;
1855		return;
1856	}
1857
1858	compnp = metaname(&sp, d->addkeyname_name, UNKNOWN, &mde);
1859	if (compnp != NULL) {
1860		ret = add_key_name(sp, compnp, NULL, &mde);
1861		if (ret < 0)
1862			resp->mmr_exitval = -1;
1863		else
1864			resp->mmr_exitval = compnp->key;
1865	} else {
1866		resp->mmr_exitval = -1;
1867	}
1868}
1869
1870/*
1871 * This is to delete a key from the namespace
1872 */
1873/*ARGSUSED*/
1874void
1875mdmn_do_delkeyname(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1876{
1877	md_mn_msg_delkeyname_t	*d;
1878	int			ret;
1879	set_t			setno;
1880	mdsetname_t		*sp;
1881	md_error_t		mde = mdnullerror;
1882	mdname_t		*compnp;
1883
1884	resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1885	resp->mmr_out_size = 0;
1886	resp->mmr_err_size = 0;
1887	resp->mmr_out = NULL;
1888	resp->mmr_err = NULL;
1889	d = (md_mn_msg_delkeyname_t *)(void *)msg->msg_event_data;
1890
1891	setno = d->delkeyname_setno;
1892	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
1893		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1894		    "MD_MN_DELKEYNAME: Invalid setno %d\n"), setno);
1895		resp->mmr_exitval = -1;
1896		return;
1897	}
1898
1899	compnp = metadevname(&sp, d->delkeyname_dev, &mde);
1900	if (compnp != NULL) {
1901		/*
1902		 * Reset the key value for the name. This is required because
1903		 * any previous call of del_key_name for the same component
1904		 * will have resulted in the key value being reset to MD_KEYBAD
1905		 * even though there may still be references to this component.
1906		 */
1907		compnp->key = d->delkeyname_key;
1908		ret = del_key_name(sp, compnp, &mde);
1909		resp->mmr_exitval = ret;
1910	} else {
1911		resp->mmr_exitval = -1;
1912	}
1913}
1914
1915/*
1916 * This is to get the value of tstate from the master node. We use this
1917 * to get the ABR state of a metadevice from the master.
1918 */
1919/*ARGSUSED*/
1920void
1921mdmn_do_get_tstate(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1922{
1923	md_mn_msg_gettstate_t	*d;
1924	int			ret;
1925	uint_t			tstate;
1926	md_error_t		mde = mdnullerror;
1927
1928	resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1929	resp->mmr_out_size = 0;
1930	resp->mmr_err_size = 0;
1931	resp->mmr_out = NULL;
1932	resp->mmr_err = NULL;
1933	d = (md_mn_msg_gettstate_t *)(void *)msg->msg_event_data;
1934
1935	ret = meta_get_tstate(d->gettstate_dev, &tstate, &mde);
1936	if (ret != 0) {
1937		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1938		    "MD_MN_GET_TSTATE: Invalid dev %llx\n"), d->gettstate_dev);
1939		tstate = 0;
1940	}
1941	resp->mmr_exitval = tstate;
1942}
1943
1944/*
1945 * This is to get the mirror ABR state and the state of its submirrors from
1946 * the master node. We need this to ensure consistent output from metastat
1947 * when a new node joins the cluster during a resync. Without this the
1948 * submirror status will be incorrect until the whole resync is complete which
1949 * may take days for very large metadevices.
1950 */
1951/*ARGSUSED*/
1952void
1953mdmn_do_get_mirstate(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1954{
1955	md_mn_msg_mir_state_t		*d;
1956	md_mn_msg_mir_state_res_t	*res;		/* Results */
1957	set_t				setno;
1958	mdsetname_t			*sp;		/* Set name */
1959	mdname_t			*mirnp;		/* Mirror name */
1960	md_error_t			mde = mdnullerror;
1961	mm_unit_t			*mm;		/* Mirror */
1962	int				smi;
1963	uint_t				tstate;
1964
1965	resp->mmr_comm_state = MDMNE_ACK;
1966	resp->mmr_out_size = sizeof (md_mn_msg_mir_state_res_t);
1967	resp->mmr_err_size = 0;
1968	resp->mmr_out = Malloc(resp->mmr_out_size);
1969	resp->mmr_err = NULL;
1970	d = (md_mn_msg_mir_state_t *)(void *)msg->msg_event_data;
1971	res = (md_mn_msg_mir_state_res_t *)(void *)resp->mmr_out;
1972
1973	/* Validate set information from minor number */
1974	setno = MD_MIN2SET(d->mir_state_mnum);
1975	sp = metasetnosetname(setno, &mde);
1976	if (sp == NULL) {
1977		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1978		    "MD_MN_GET_MIRROR_STATE: Invalid set %d\n"), setno);
1979		resp->mmr_exitval = 1;	/* Failure */
1980		Free(resp->mmr_out);
1981		resp->mmr_out_size = 0;
1982		return;
1983	}
1984
1985	/* Construct mirror name from minor number */
1986	mirnp = metamnumname(&sp, d->mir_state_mnum, 0, &mde);
1987	if (mirnp == NULL) {
1988		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1989		    "MD_MN_GET_MIRROR_STATE: Invalid minor %lx\n"),
1990		    d->mir_state_mnum);
1991		resp->mmr_exitval = 2;	/* Failure */
1992		Free(resp->mmr_out);
1993		resp->mmr_out_size = 0;
1994		return;
1995	}
1996
1997	/* Get common mirror structure */
1998	mm = (mm_unit_t *)meta_get_mdunit(sp, mirnp, &mde);
1999	if (mm == NULL) {
2000		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
2001		    "MD_MN_GET_MIRROR_STATE: Invalid mirror minor %x\n"),
2002		    d->mir_state_mnum);
2003		resp->mmr_exitval = 3;	/* Failure */
2004		Free(resp->mmr_out);
2005		resp->mmr_out_size = 0;
2006		return;
2007	}
2008
2009	if (meta_get_tstate(d->mir_state_mnum, &tstate, &mde) != 0) {
2010		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
2011		    "MD_MN_GET_MIRROR_STATE: Invalid minor %lx\n"),
2012		    d->mir_state_mnum);
2013		resp->mmr_exitval = 4;	/* Failure */
2014		Free(resp->mmr_out);
2015		resp->mmr_out_size = 0;
2016		return;
2017	}
2018	/*
2019	 * Fill in the sm_state/sm_flags value in the results structure which
2020	 * gets passed back to the message originator
2021	 */
2022	resp->mmr_exitval = 0;
2023	for (smi = 0; (smi < NMIRROR); smi++) {
2024		mm_submirror_t *mmsp = &mm->un_sm[smi];
2025		res->sm_state[smi] = mmsp->sm_state;
2026		res->sm_flags[smi] = mmsp->sm_flags;
2027	}
2028	/* Returm value of tstate for mirror */
2029	res->mir_tstate = tstate;
2030}
2031
2032/*
2033 * This is to issue an ioctl to call poke_hotspares
2034 */
2035/*ARGSUSED*/
2036void
2037mdmn_do_poke_hotspares(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
2038{
2039
2040	md_mn_poke_hotspares_t	pokehsp;
2041	md_mn_msg_pokehsp_t	*d;
2042
2043	resp->mmr_out_size = 0;
2044	resp->mmr_err_size = 0;
2045	resp->mmr_out = NULL;
2046	resp->mmr_err = NULL;
2047	resp->mmr_comm_state = MDMNE_ACK;
2048	d = (md_mn_msg_pokehsp_t *)(void *)msg->msg_event_data;
2049
2050	(void) memset(&pokehsp, 0, sizeof (pokehsp));
2051	MD_SETDRIVERNAME(&pokehsp, MD_MIRROR, d->pokehsp_setno);
2052
2053	resp->mmr_exitval = metaioctl(MD_MN_POKE_HOTSPARES, &pokehsp,
2054	    &pokehsp.mde, NULL);
2055}
2056
2057/*
2058 * Called to create a softpart during a metarecover operation
2059 */
2060/*ARGSUSED*/
2061void
2062mdmn_do_addmdname(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
2063{
2064	md_mn_msg_addmdname_t	*d;
2065	md_error_t		mde = mdnullerror;
2066	mdsetname_t		*sp;
2067	int			init = 0;
2068	mdkey_t			key;
2069	minor_t			mnum;
2070
2071	resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
2072	resp->mmr_out_size = 0;
2073	resp->mmr_err_size = 0;
2074	resp->mmr_out = NULL;
2075	resp->mmr_err = NULL;
2076	d = (md_mn_msg_addmdname_t *)(void *)msg->msg_event_data;
2077
2078	if ((sp = metasetnosetname(d->addmdname_setno, &mde)) == NULL) {
2079		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
2080		    "MD_MN_MSG_ADDMDNAME: Invalid setno %d\n"),
2081		    d->addmdname_setno);
2082		resp->mmr_exitval = 1;
2083		return;
2084	}
2085
2086	/*
2087	 * If device node does not exist then init it
2088	 */
2089	if (!is_existing_meta_hsp(sp, d->addmdname_name)) {
2090		if ((key = meta_init_make_device(&sp, d->addmdname_name,
2091		    &mde)) <= 0) {
2092			syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
2093			    "MD_MN_MSG_ADDMDNAME: Invalid name %s\n"),
2094			    d->addmdname_name);
2095			resp->mmr_exitval = 1;
2096			return;
2097		}
2098
2099		init = 1;
2100	}
2101
2102	/*
2103	 * We should have it
2104	 */
2105	if (metaname(&sp, d->addmdname_name, META_DEVICE, &mde) == NULL) {
2106
2107		if (init) {
2108			if (meta_getnmentbykey(sp->setno, MD_SIDEWILD,
2109			    key, NULL, &mnum, NULL, &mde) != NULL) {
2110				(void) metaioctl(
2111				    MD_IOCREM_DEV, &mnum, &mde, NULL);
2112			}
2113		(void) del_self_name(sp, key, &mde);
2114		}
2115
2116		resp->mmr_exitval = 1;
2117		return;
2118	}
2119
2120	resp->mmr_exitval = 0;
2121}
2122
2123/*
2124 * This is used to issue a MD_MN_RR_DIRTY ioctl to the mirror.
2125 */
2126/*ARGSUSED*/
2127void
2128mdmn_do_mark_dirty(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
2129{
2130	md_mn_msg_rr_dirty_t	*d;
2131	md_mn_rr_dirty_params_t	rp;
2132	int			ret;
2133
2134	resp->mmr_out_size = 0;
2135	resp->mmr_err_size = 0;
2136	resp->mmr_out = NULL;
2137	resp->mmr_err = NULL;
2138	resp->mmr_comm_state = MDMNE_ACK;
2139	d = (md_mn_msg_rr_dirty_t *)((void *)(msg->msg_event_data));
2140
2141	(void) memset(&rp, 0, sizeof (rp));
2142	MD_SETDRIVERNAME(&rp, MD_MIRROR, MD_MIN2SET(d->rr_mnum))
2143	rp.rr_mnum = d->rr_mnum;
2144	rp.rr_nodeid = d->rr_nodeid;
2145	rp.rr_start = (ushort_t)((d->rr_range >> 16) & 0xffff);
2146	rp.rr_end = (ushort_t)(d->rr_range & 0xffff);
2147
2148	ret = metaioctl(MD_MN_RR_DIRTY, &rp, &rp.mde, NULL);
2149
2150	resp->mmr_exitval = ret;
2151}
2152
2153/*
2154 * This is used to issue a MD_MN_RR_CLEAN ioctl to the mirror.
2155 */
2156/*ARGSUSED*/
2157void
2158mdmn_do_mark_clean(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
2159{
2160	md_mn_msg_rr_clean_t	*d;
2161	md_mn_rr_clean_params_t	*rcp;
2162	int			ret;
2163
2164	resp->mmr_out_size = 0;
2165	resp->mmr_err_size = 0;
2166	resp->mmr_out = NULL;
2167	resp->mmr_err = NULL;
2168	resp->mmr_comm_state = MDMNE_ACK;
2169	d = (md_mn_msg_rr_clean_t *)((void *)(msg->msg_event_data));
2170
2171	rcp = Zalloc(sizeof (struct md_mn_rr_clean_params) +
2172	    MDMN_MSG_RR_CLEAN_DATA_BYTES(d));
2173	MD_SETDRIVERNAME(rcp, MD_MIRROR, MD_MIN2SET(d->rr_mnum))
2174	rcp->rr_mnum = d->rr_mnum;
2175	rcp->rr_nodeid = d->rr_nodeid;
2176	rcp->rr_start_size = d->rr_start_size;
2177	(void) memcpy(MDMN_RR_CLEAN_PARAMS_DATA(rcp), MDMN_MSG_RR_CLEAN_DATA(d),
2178	    MDMN_MSG_RR_CLEAN_DATA_BYTES(d));
2179
2180	ret = metaioctl(MD_MN_RR_CLEAN, rcp, &rcp->mde, NULL);
2181
2182	Free(rcp);
2183
2184	resp->mmr_exitval = ret;
2185}
2186