meta_mn_subr.c revision 8452:89d32dfdae6e
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*
27 * Just in case we're not in a build environment, make sure that
28 * TEXT_DOMAIN gets set to something.
29 */
30#if !defined(TEXT_DOMAIN)
31#define	TEXT_DOMAIN "SYS_TEST"
32#endif
33
34#include <meta.h>
35#include <sdssc.h>
36#include <arpa/inet.h>
37#include <sys/lvm/md_mddb.h>
38
39#define	MAX_LINE_SIZE 1024
40
/*
 * Maximum amount of time, in seconds, to spend waiting for an ownership
 * change to complete.
 */
44static const int OWNER_TIMEOUT = 3;
45
46/*
47 * FUNCTION:	meta_is_mn_set()
48 * INPUT:       sp      - the set name
49 * OUTPUT:	ep	- return error pointer
50 * RETURNS:	int	- 1 if MultiNode set else 0
51 * PURPOSE:	checks if the set is a MultiNode set
52 */
53int
54meta_is_mn_set(
55	mdsetname_t	*sp,
56	md_error_t	*ep
57)
58{
59	md_set_desc	*sd;
60
61	/* Local set cannot be MultiNode */
62	if ((sp == NULL) || (sp->setname == NULL) ||
63	    (strcmp(sp->setname, MD_LOCAL_NAME) == 0))
64		return (0);
65	sd = metaget_setdesc(sp, ep);
66	ASSERT(sd != NULL);
67	if (sd->sd_flags & MD_SR_MN)
68		return (1);
69	return (0);
70}
71
72/*
73 * FUNCTION:	meta_is_mn_name()
74 * INPUT:       spp     - ptr to the set name, if NULL the setname is derived
75 *			  from the metadevice name (eg set/d10 )
76 *		name	- the metadevice/hsp name
77 * OUTPUT:	ep	- return error pointer
78 * RETURNS:	int	- 1 if MultiNode set else 0
79 * PURPOSE:	checks if the metadevice is in a MultiNode set
80 */
81int
82meta_is_mn_name(
83	mdsetname_t	**spp,
84	char		*name,
85	md_error_t	*ep
86)
87{
88	if (*spp == NULL) {
89		char		*cname;
90
91		/*
92		 * if the setname is specified in uname and *spp is
93		 * not set, then it is setup using that set name value.
94		 * If *spp is set and a setname specified in uname and
95		 * the set names don't agree then cname will be
96		 * returned as NULL
97		 */
98		cname = meta_canonicalize_check_set(spp, name, ep);
99		if (cname == NULL) {
100			mdclrerror(ep);
101			return (0);
102		}
103
104		Free(cname);
105	}
106
107	if ((strcmp((*spp)->setname, MD_LOCAL_NAME) != 0) &&
108	    (metaget_setdesc(*spp, ep) != NULL) &&
109	    ((*spp)->setdesc->sd_flags & MD_SR_MN)) {
110		return (1);
111	}
112	return (0);
113}
114
115/*
116 * meta_ping_mnset(set_t setno)
117 * Send a test message for this set in order to make commd do some init stuff
118 * Don't bother changelog.
119 * If set is suspended, fail immediately.
120 */
121void
122meta_ping_mnset(set_t setno)
123{
124	char		*data = "test";
125	md_error_t	mde = mdnullerror;
126	md_mn_result_t	*resp = NULL;
127
128	(void) mdmn_send_message(setno, MD_MN_MSG_TEST2,
129	    MD_MSGF_NO_LOG | MD_MSGF_FAIL_ON_SUSPEND, 0, data,
130	    sizeof (data), &resp, &mde);
131
132	if (resp != (md_mn_result_t *)NULL) {
133		free_result(resp);
134	}
135}
136
137/*
138 *
139 * FUNCTION:	print_stderr
140 * INPUT:	errstr	- the error message returned by the command
141 *		context	- the context string from metainit -a
142 * PURPOSE:	called from meta_mn_send_command to print the error message
143 *		to stderr. When context is NO_CONTEXT_STRING, the errstr string
144 *		is output unchanged. When context is a string, it is the context
145 *		string for the metainit -a command and in this case the errstr
146 *		string has to be parsed to extract the command and node name
147 *		and to send a message to stderr in the format
148 *		command: node: context: error message
149 */
150static void
151print_stderr(
152	char	*errstr,
153	char	*context
154)
155{
156	char	*command;
157	char	*node;
158	char	*message;
159	int	length = strlen(errstr + 1);
160
161	if (context == NO_CONTEXT_STRING) {
162		(void) fprintf(stderr, "%s", errstr);
163	} else {
164		command = Malloc(length);
165		node = Malloc(length);
166		message = Malloc(length);
167		if (sscanf(errstr, "%[^:]: %[^:]: %[^\n]", command, node,
168		    message) == 3) {
169			(void) fprintf(stderr, "%s: %s: %s: %s\n", command,
170			    node, context, message);
171		} else {
172			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
173			    "%s: Invalid format error message"), errstr);
174		}
175		Free(command);
176		Free(node);
177		Free(message);
178	}
179}
180
181/*
182 * FUNCTION:	meta_mn_send_command()
183 * INPUT:	sp	- the set name
184 *		argc	- number of arguments
185 *		argv	- arg list
186 *		flags	- some controlling flags
187 *		initall_context	- context string for metainit -a
188 * OUTPUT:	ep	- return error pointer
189 * RETURNS:	return exitval from mdmn_send_message
190 * PURPOSE:	sends the command to the master node for execution
191 */
192int
193meta_mn_send_command(
194	mdsetname_t	*sp,
195	int		argc,
196	char		*argv[],
197	int		flags,
198	char		*initall_context,
199	md_error_t	*ep
200)
201{
202	int		a;
203	int		err;
204	int		retval;
205	int		send_message_flags = MD_MSGF_DEFAULT_FLAGS;
206	int		send_message_type;
207	char		*cmd;
208	md_mn_result_t	*resp = NULL;
209
210	cmd = Malloc(1024);
211	(void) strlcpy(cmd, argv[0], 1024);
212	for (a = 1; a < argc; a++) {
213		/* don't copy empty arguments */
214		if (*argv[a] == '\0') {
215			continue;
216		}
217		(void) strcat(cmd, " ");
218		(void) strcat(cmd, argv[a]);
219	}
220	/*
221	 * in dryrun mode stop on the first error
222	 * use the CMD_RETRY message type if RETRY_BUSY flag set
223	 */
224	if (flags & MD_DRYRUN)
225		send_message_flags |= MD_MSGF_STOP_ON_ERROR;
226	if (flags & MD_NOLOG)
227		send_message_flags |= MD_MSGF_NO_LOG;
228	if (flags & MD_PANIC_WHEN_INCONSISTENT)
229		send_message_flags |= MD_MSGF_PANIC_WHEN_INCONSISTENT;
230	if (flags & MD_RETRY_BUSY)  {
231		send_message_type = MD_MN_MSG_BC_CMD_RETRY;
232	} else {
233		send_message_type = MD_MN_MSG_BC_CMD;
234	}
235	err = mdmn_send_message(sp->setno, send_message_type,
236	    send_message_flags, 0, cmd, 1024, &resp, ep);
237
238	free(cmd);
239
240	if (err == 0) {
241		/*
242		 * stderr may be turned off by IGNORE_STDERR
243		 * In dryrun we only print stderr if the exit_val is non-zero
244		 */
245		if ((resp->mmr_err_size != 0) &&
246		    ((flags & MD_IGNORE_STDERR) == 0)) {
247			if (((flags & MD_DRYRUN) == 0) ||
248			    (resp->mmr_exitval != 0)) {
249				print_stderr(resp->mmr_err, initall_context);
250			}
251		}
252
253		/*
254		 * If dryrun is set, we don't display stdout,
255		 * because the real run has yet to follow.
256		 */
257		if (((flags & MD_DRYRUN) == 0) && (resp->mmr_out_size != 0)) {
258			(void) printf("%s", resp->mmr_out);
259		}
260		retval = resp->mmr_exitval;
261		free_result(resp);
262		return (retval);
263	}
264	if (resp != NULL) {
265		if (resp->mmr_comm_state == MDMNE_CLASS_BUSY) {
266			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
267			    "rpc.mdcommd currently busy. "
268			    "Retry operation later.\n"));
269		} else if (resp->mmr_comm_state == MDMNE_NOT_JOINED) {
270			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
271			    "Node %s must join the %s multi-owner diskset to "
272			    "issue commands.\n"
273			    "To join, use: metaset -s %s -j\n"),
274			    mynode(), sp->setname, sp->setname);
275		} else if (resp->mmr_comm_state == MDMNE_LOG_FAIL) {
276			mddb_config_t	c;
277
278			(void) memset(&c, 0, sizeof (c));
279			c.c_setno = sp->setno;
280			(void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL);
281			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
282			    "Command not attempted: Unable to log message "
283			    "in set %s\n"), sp->setname);
284			if (c.c_flags & MDDB_C_STALE) {
285				(void) mdmddberror(ep, MDE_DB_STALE,
286				    (minor_t)NODEV64, sp->setno, 0, NULL);
287				mde_perror(ep, "");
288			}
289		} else {
290			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
291			    "Command failed: Commd State %d "
292			    "encountered.\n"), resp->mmr_comm_state);
293		}
294		free_result(resp);
295	} else {
296		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
297		    "Command failed: mdmn_send_message returned %d.\n"),
298		    err);
299	}
300
301
302	return (1);
303}
304
305/*
306 * FUNCTION:	meta_mn_send_suspend_writes()
307 * INPUT:	mnum	- minor num of mirror
308 * OUTPUT:	ep	- return error pointer
309 * RETURNS:	return value from mdmn_send_message()
310 * PURPOSE:	sends message to all nodes to suspend writes to the mirror.
311 */
312int
313meta_mn_send_suspend_writes(
314	minor_t		mnum,
315	md_error_t	*ep
316)
317{
318	int			result;
319	md_mn_msg_suspwr_t	suspwrmsg;
320	md_mn_result_t		*resp = NULL;
321
322	suspwrmsg.msg_suspwr_mnum =  mnum;
323	/*
324	 * This message is never directly issued.
325	 * So we launch it with a suspend override flag.
326	 * If the commd is suspended, and this message comes
327	 * along it must be sent due to replaying a command or similar.
328	 * In that case we don't want this message to be blocked.
329	 * If the commd is not suspended, the flag does no harm.
330	 */
331	result = mdmn_send_message(MD_MIN2SET(mnum),
332	    MD_MN_MSG_SUSPEND_WRITES,
333	    MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0,
334	    (char *)&suspwrmsg, sizeof (suspwrmsg), &resp, ep);
335	if (resp != NULL) {
336		free_result(resp);
337	}
338	return (result);
339}
340
341/*
342 * Parse the multi-node list file
343 *
344 * Return Values:	Zero	 - Success
345 *			Non Zero - Failure
346 *
347 * File content:	The content of the nodelist file should consist of
348 *			triplets of nodeid, nodename and private interconnect
349 *			address seperated by one or more white space.
350 * e.g.
351 *			1 node_a 192.168.111.3
352 *			2 node_b 192.168.111.5
353 *
354 *			Any missing fields will result in an error.
355 */
356int
357meta_read_nodelist(
358	int				*nodecnt,
359	mndiskset_membershiplist_t	**nl,
360	md_error_t			*ep
361)
362{
363	FILE				*fp = NULL;
364	char				line[MAX_LINE_SIZE];
365	char				*buf;
366	uint_t				i;
367	int				sz;
368	mndiskset_membershiplist_t	**tailp = nl;
369
370	/* open file */
371	if ((fp = fopen(META_MNSET_NODELIST, "r")) == NULL) {
372		mndiskset_membershiplist_t	*nlp;
373		struct hostent *hp;
374
375		/* return this node with id of 1 */
376		nlp = *tailp = Zalloc(sizeof (*nlp));
377		tailp = &nlp->next;
378
379		*nodecnt = 1;
380		nlp->msl_node_id = 1;
381		buf = mynode();
382		sz = min(strlen(buf), sizeof (nlp->msl_node_name) - 1);
383		(void) strncpy(nlp->msl_node_name, buf, sz);
384		nlp->msl_node_name[sz] = '\0';
385
386		/* retrieve info about our host */
387		if ((hp = gethostbyname(buf)) == NULL) {
388			return (mdsyserror(ep, EADDRNOTAVAIL, buf));
389		}
390		/* We only do IPv4 addresses, for now */
391		if (hp->h_addrtype != AF_INET) {
392			return (mdsyserror(ep, EPFNOSUPPORT, buf));
393		}
394		/* We take the first address only */
395		if (*hp->h_addr_list) {
396			struct in_addr in;
397
398			(void) memcpy(&in.s_addr, *hp->h_addr_list,
399			    sizeof (struct in_addr));
400			(void) strncpy(nlp->msl_node_addr, inet_ntoa(in),
401			    MD_MAX_NODENAME);
402		} else {
403			return (mdsyserror(ep, EADDRNOTAVAIL, buf));
404		}
405
406		return (0);
407	}
408
409	*nl = NULL;
410	*nodecnt = 0;
411
412	while ((fp != NULL) && ((buf = fgets(line, sizeof (line) - 1, fp)) !=
413	    NULL)) {
414		mndiskset_membershiplist_t	*nlp;
415
416		/* skip leading spaces */
417		while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0)
418			buf++;
419
420		/* skip comments and blank lines */
421		if (*buf == '\0' || *buf == '#')
422			continue;
423
424		/* allocate memory and set tail pointer */
425		nlp = *tailp = Zalloc(sizeof (*nlp));
426		tailp = &nlp->next;
427
428		/* parse node id */
429		nlp->msl_node_id = strtoul(buf, NULL, 0);
430		buf += i;
431
432		/* skip leading spaces */
433		while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0)
434			buf++;
435
436		/* fields missing, return error */
437		if (*buf == '\0' || *buf == '#') {
438			meta_free_nodelist(*nl);
439			*nl = NULL;
440			*nodecnt = 0;
441
442			/* close file and return */
443			if ((fp) && (fclose(fp) != 0))
444				return (mdsyserror(ep, errno,
445				    META_MNSET_NODELIST));
446
447			return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST));
448		}
449
450		/* parse node name */
451		sz = min(i, sizeof (nlp->msl_node_name) - 1);
452		(void) strncpy(nlp->msl_node_name, buf, sz);
453		nlp->msl_node_name[sz] = '\0';
454		buf += i;
455
456		/* skip leading spaces */
457		while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0)
458			buf++;
459
460		/* fields missing, return error */
461		if (*buf == '\0' || *buf == '#') {
462			meta_free_nodelist(*nl);
463			*nl = NULL;
464			*nodecnt = 0;
465
466			/* close file and return */
467			if ((fp) && (fclose(fp) != 0))
468				return (mdsyserror(ep, errno,
469				    META_MNSET_NODELIST));
470
471			return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST));
472		}
473
474		/* parse node address */
475		sz = min(i, sizeof (nlp->msl_node_addr) - 1);
476		(void) strncpy(nlp->msl_node_addr, buf, sz);
477		nlp->msl_node_addr[sz] = '\0';
478
479		++*nodecnt;
480	}
481
482	/* close file */
483	if ((fp) && (fclose(fp) != 0))
484		return (mdsyserror(ep, errno, META_MNSET_NODELIST));
485
486	return (0);
487}
488
489/*
490 * Populate the multi-node list file from a given list of node id's
491 * The nids must have only one node id in each cell. Range of node
492 * id's in the form 1-n are not allowed.
493 *
494 * Return Values:	Zero	 - Success
495 *			Non Zero - Failure
496 */
497int
498meta_write_nodelist(
499	int		nodecnt,
500	char		**nids,
501	md_error_t	*ep
502)
503{
504	FILE		*fp = NULL;
505	char		name[MAX_LINE_SIZE], addr[MAX_LINE_SIZE];
506	uint_t		i, nid;
507	struct in_addr	ipaddr;
508	int		err = 0;
509
510	/* check if we are running on clustering */
511	if ((err = sdssc_bind_library()) != SDSSC_OKAY) {
512		return (mdsyserror(ep, err, META_MNSET_NODELIST));
513	}
514
515	/* open file for writing */
516	if ((fp = fopen(META_MNSET_NODELIST, "w")) == NULL) {
517		return (mdsyserror(ep, errno, META_MNSET_NODELIST));
518	}
519
520	for (i = 0; i < nodecnt; i++) {
521		/* extract the node id */
522		errno = 0;
523		nid = strtoul(nids[i], NULL, 0);
524		if (errno != 0) {
525			if ((fp) && (fclose(fp) != 0))
526				return (mdsyserror(ep, errno,
527				    META_MNSET_NODELIST));
528
529			return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST));
530		}
531
532		/* get node name */
533		(void) snprintf(name, sizeof (name), "%d", nid);
534		sdssc_cm_nid2nm(name);
535
536		/* finally get the private ip address */
537		(void) snprintf(addr, sizeof (addr), "%s", name);
538		if (sdssc_get_priv_ipaddr(addr, &ipaddr) != SDSSC_OKAY) {
539			if ((fp) && (fclose(fp) != 0))
540				return (mdsyserror(ep, errno,
541				    META_MNSET_NODELIST));
542
543			return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST));
544		}
545
546		(void) fprintf(fp, "%d\t%s\t%s\n", nid, name,
547		    inet_ntoa(ipaddr));
548	}
549
550	/* close file */
551	if ((fp) && (fclose(fp) != 0))
552		return (mdsyserror(ep, errno, META_MNSET_NODELIST));
553
554	return (0);
555}
556
557/*
558 * Free node list
559 */
560void
561meta_free_nodelist(
562	mndiskset_membershiplist_t	*nl
563)
564{
565	mndiskset_membershiplist_t	*next = NULL;
566
567	for (/* void */; (nl != NULL); nl = next) {
568		next = nl->next;
569		Free(nl);
570	}
571}
572
573/*
574 * FUNCTION:	meta_mn_send_setsync()
575 * INPUT:	sp	- setname
576 *		mirnp	- mirror name
577 *		size	- buffer size, 0 if none
578 * OUTPUT:	ep	- return error pointer
579 * RETURNS:	return value from meta_mn_send_command()
580 * PURPOSE:  Send a setsync command to all nodes to set resync status
581 */
582
583int
584meta_mn_send_setsync(
585	mdsetname_t		*sp,
586	mdname_t		*mirnp,
587	daddr_t			size,
588	md_error_t		*ep
589)
590{
591	md_mn_msg_setsync_t	setsyncmsg;
592	int			ret;
593	md_mn_result_t		*resp = NULL;
594
595	setsyncmsg.setsync_mnum = meta_getminor(mirnp->dev);
596	setsyncmsg.setsync_copysize = size;
597	setsyncmsg.setsync_flags = 0;
598
599	/*
600	 * We do not log the metasync command as it will have no effect on the
601	 * underlying metadb state. If we have a master change the
602	 * reconfiguration process will issue a new 'metasync' to all affected
603	 * mirrors, so we would actually end up sending the message twice.
604	 * Removing the logging of the message helps reduce the processing
605	 * time required.
606	 */
607	ret = mdmn_send_message(sp->setno, MD_MN_MSG_SETSYNC,
608	    MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0,
609	    (char *)&setsyncmsg, sizeof (setsyncmsg), &resp, ep);
610	if (resp != NULL) {
611		free_result(resp);
612	}
613
614	/*
615	 * Unlike non-MN sets, the metasync command does not actually
616	 * start a resync, it simply updates the state on all of the
617	 * nodes. Therefore, to start a resync we send a resync starting
618	 * message for the metadevice
619	 */
620	if (ret == 0)
621		ret = meta_mn_send_resync_starting(mirnp, ep);
622	return (ret);
623}
624
625/*
626 * FUNCTION:	meta_mn_send_metaclear_command()
627 * INPUT:	sp	- setname
628 *		name	- metadevice name
629 *		options - command options
630 *		pflag	- clear all soft partitions for a given device
631 * OUTPUT:	ep	- return error pointer
632 * RETURNS:	return value from meta_mn_send_command()
633 * PURPOSE:  Send a metaclear command to all nodes with force(-f) and
634 *	     recurse(-r) options set if required. For hotspare pool and
635 *	     metadevices, the metadevice name is of the form setname/dxx or
636 *	     setname/hspxxx so a '-s' argument isn't required. If pflag is set
637 *	     the name refers to a metadevice or component and in the is case
638 *	     a '-s' argument is required to define the set.
639 */
640
641int
642meta_mn_send_metaclear_command(
643	mdsetname_t		*sp,
644	char			*name,
645	mdcmdopts_t		options,
646	int			pflag,
647	md_error_t		*ep
648)
649{
650	int	newargc;
651	char	**newargv;
652	int	ret;
653
654	/*
655	 * Allocate an array large enough to hold all of the possible
656	 * metaclear arguments
657	 */
658	newargv = Calloc(7, sizeof (char *));
659	newargv[0] = "metaclear";
660	newargc = 1;
661	if (pflag) {
662		newargv[newargc] = "-s";
663		newargc++;
664		newargv[newargc] = sp->setname;
665		newargc++;
666	}
667	if (options & MDCMD_FORCE) {
668		newargv[newargc] = "-f";
669		newargc++;
670	}
671	if (options & MDCMD_RECURSE) {
672		newargv[newargc] = "-r";
673		newargc++;
674	}
675	if (pflag) {
676		newargv[newargc] = "-p";
677		newargc++;
678	}
679	newargv[newargc] = name;
680	newargc++;
681
682	ret = meta_mn_send_command(sp, newargc, newargv,
683	    MD_DISP_STDERR, NO_CONTEXT_STRING, ep);
684
685	free(newargv);
686	return (ret);
687}
688
689/*
690 * FUNCTION:	meta_mn_send_resync_starting()
691 * INPUT:	sp	- setname
692 *		mirnp	- mirror name
693 * OUTPUT:	ep	- return error pointer
694 * RETURNS:	return value from mdmn_send_message()
695 * PURPOSE:  Send a resync starting message to all nodes.
696 */
697
698int
699meta_mn_send_resync_starting(
700	mdname_t		*mirnp,
701	md_error_t		*ep
702)
703{
704	int			result;
705	md_mn_msg_resync_t	resyncmsg;
706	md_mn_result_t		*resp = NULL;
707	minor_t			mnum = meta_getminor(mirnp->dev);
708
709	/*
710	 * This message is never directly issued.
711	 * So we launch it with a suspend override flag.
712	 * If the commd is suspended, and this message comes
713	 * along it must be sent due to replaying a command or similar.
714	 * In that case we don't want this message to be blocked.
715	 * If the commd is not suspended, the flag does no harm.
716	 */
717	resyncmsg.msg_resync_mnum =  mnum;
718	result = mdmn_send_message(MD_MIN2SET(mnum),
719	    MD_MN_MSG_RESYNC_STARTING,
720	    MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0,
721	    (char *)&resyncmsg, sizeof (resyncmsg), &resp, ep);
722
723	if (resp != NULL) {
724		free_result(resp);
725	}
726	return (result);
727}
728
729/*
730 * FUNCTION:	meta_mn_change_owner()
731 * INPUT:	opp	- pointer to parameter block
732 *		setno	- set number of mirror metadevice
733 *		mnum	- minor number of mirror metadevice
734 *		owner	- node ID of mirror owner
735 *		flags	- flag field for ioctl
736 * OUTPUT:	opp	- parameter block used to send ioctl
737 * RETURNS:	int	- 0 success, -1 error
738 * PURPOSE:	issue an ioctl to change the ownership of the specified mirror
739 *		to our node ID. We need to be the owner before any watermarks
740 *		are committed to the device otherwise we'll enter a deadly
741 *		embrace when attempting to write the watermark.
742 *		This function can also be used so set the owner on a node to
743 *		NULL. In this case the change is only made on the local node.
744 *		In addition by setting the MD_MN_MM_CHOOSE_OWNER flag, the
745 *		function can also be used to choose a mirror resync owner. This
746 *		function should only be called on the master and it will
747 *		select the owner and request it to become the owner.
748 */
749int
750meta_mn_change_owner(
751	md_set_mmown_params_t 	**opp,	/* Returned parameter block */
752	set_t			setno,	/* Mirror set number */
753	uint_t 			mnum,	/* Minor number */
754	uint_t			owner,	/* Node ID of mirror owner */
755	uint_t			flags	/* Flags */
756)
757{
758	md_set_mmown_params_t	*ownpar = *opp;
759	md_mn_own_status_t	*ownstat = NULL;
760	struct timeval tvs, tve;
761	int			n = 0;
762	int			rval;
763
764	if (ownpar != NULL) {
765		(void) memset(ownpar, 0, sizeof (*ownpar));
766	} else {
767		ownpar = Zalloc(sizeof (*ownpar));
768	}
769	ownstat = Zalloc(sizeof (*ownstat));
770
771	ownpar->d.mnum = mnum;
772	ownpar->d.owner = owner;
773	ownpar->d.flags = flags;
774	MD_SETDRIVERNAME(ownpar, MD_MIRROR, setno);
775	MD_SETDRIVERNAME(ownstat, MD_MIRROR, setno);
776
777	/*
778	 * Attempt to change the ownership to the specified node. We retry this
779	 * up to 10 times if we receive EAGAIN from the metadevice. This only
780	 * happens if the underlying metadevice is busy with outstanding i/o
781	 * that requires ownership change.
782	 */
783	while ((rval = metaioctl(MD_MN_SET_MM_OWNER, ownpar, &ownpar->mde,
784	    NULL)) != 0) {
785		md_sys_error_t	*ip =
786		    &ownpar->mde.info.md_error_info_t_u.sys_error;
787		if (ip->errnum != EAGAIN)
788			break;
789		if (n++ >= 10)
790			break;
791		(void) sleep(1);
792	}
793
794	/*
795	 * There is no need to wait for the ioctl completion if we are setting
796	 * the owner to NULL or requesting the master to choose the owner
797	 */
798	if ((owner == 0) || (flags & MD_MN_MM_CHOOSE_OWNER)) {
799		Free(ownstat);
800		*opp = ownpar;
801		return (0);
802	}
803
804	/*
805	 * Wait for ioctl completion or a timeout to occur. If we
806	 * timeout we fail the i/o request.
807	 */
808	ownstat->mnum = ownpar->d.mnum;
809	(void) gettimeofday(&tvs, NULL);
810
811	while ((rval == 0) && !(ownstat->flags & MD_MN_MM_RESULT)) {
812		while ((rval = metaioctl(MD_MN_MM_OWNER_STATUS, ownstat,
813		    &ownstat->mde, NULL)) != 0) {
814			(void) gettimeofday(&tve, NULL);
815			if ((tve.tv_sec - tvs.tv_sec) > OWNER_TIMEOUT) {
816				rval = -1;
817				break;
818			}
819			(void) sleep(1);
820		}
821	}
822
823	/* we did not not timeout but ioctl failed set rval */
824
825	if (rval == 0) {
826		rval = (ownstat->flags & MD_MN_MM_RES_FAIL) ? -1 : 0;
827	}
828
829	Free(ownstat);
830	*opp = ownpar;
831	return (rval);
832}
833/*
834 * special handling is required when running on a single node
835 * non-SC3.x environment.  This function determines tests
836 * for that case.
837 *
838 * Return values:
839 *	0 - no nodes or joined or in a SC3.x env
840 *	1 - 1 node and not in SC3.x env
841 */
842
843int
844meta_mn_singlenode()
845{
846	md_error_t			xep = mdnullerror;
847	int				nodecnt;
848	int				mnset_single_node = 0;
849	mndiskset_membershiplist_t	*nl;
850
851	/*
852	 * If running on SunCluster, then don't validate MN sets,
853	 * this is done during a reconfig cycle since all nodes must
854	 * take the same action.
855	 *
856	 * Only cleanup in case of a single node situation
857	 * when not running on SunCluster.  This single node
858	 * situation occurs when the nodelist only contains
859	 * this node and the MN setrecords only contain this
860	 * node.
861	 */
862	if (meta_read_nodelist(&nodecnt, &nl, &xep) == -1) {
863		nodecnt = 0;  /* no nodes are alive */
864		nl = NULL;
865		mdclrerror(&xep);
866	} else {
867		/*
868		 * If only 1 node in nodelist and not running
869		 * on SunCluster, set single_node flag.
870		 */
871		if ((nodecnt == 1) &&
872		    (strcmp(nl->msl_node_name, mynode()) == 0) &&
873		    ((sdssc_bind_library()) != SDSSC_OKAY)) {
874			mnset_single_node = 1;
875		}
876		meta_free_nodelist(nl);
877	}
878	return (mnset_single_node);
879}
880
881/*
882 * FUNCTION:	meta_mn_send_get_tstate()
883 * INPUT:	dev	- dev_t of device
884 * OUTPUT:	tstatep - tstate value
885 *		ep	- return error pointer
886 * RETURNS:	return value from mdmn_send_message()
887 * PURPOSE:  Send a message to the master to get ui_tstate for a given device.
888 */
889
890int
891meta_mn_send_get_tstate(
892	md_dev64_t		dev,
893	uint_t			*tstatep,
894	md_error_t		*ep
895)
896{
897	int			result;
898	md_mn_msg_gettstate_t	tstatemsg;
899	md_mn_result_t		*resp = NULL;
900	minor_t			mnum = meta_getminor(dev);
901
902	tstatemsg.gettstate_dev = dev;
903	result = mdmn_send_message(MD_MIN2SET(mnum),
904	    MD_MN_MSG_GET_TSTATE,
905	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0,
906	    (char *)&tstatemsg, sizeof (tstatemsg), &resp, ep);
907
908	if (result == 0)
909		*tstatep = resp->mmr_exitval;
910	else
911		/* If some error occurred set tstate to 0 */
912		*tstatep = 0;
913
914	if (resp != NULL) {
915		free_result(resp);
916	}
917	return (result);
918}
919