xref: /onnv-gate/usr/src/lib/lvm/libmeta/common/meta_mn_subr.c (revision 1623:7bac4a816ebe)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Just in case we're not in a build environment, make sure that
30  * TEXT_DOMAIN gets set to something.
31  */
32 #if !defined(TEXT_DOMAIN)
33 #define	TEXT_DOMAIN "SYS_TEST"
34 #endif
35 
36 #include <meta.h>
37 #include <sdssc.h>
38 #include <arpa/inet.h>
39 #include <sys/lvm/md_mddb.h>
40 
41 #define	MAX_LINE_SIZE 1024
42 
/*
 * Maximum amount of time, in seconds, to spend waiting for an ownership
 * change to complete.
 */
46 static const int OWNER_TIMEOUT = 3;
47 
48 /*
49  * FUNCTION:	meta_is_mn_set()
50  * INPUT:       sp      - the set name
51  * OUTPUT:	ep	- return error pointer
52  * RETURNS:	int	- 1 if MultiNode set else 0
53  * PURPOSE:	checks if the set is a MultiNode set
54  */
55 int
56 meta_is_mn_set(
57 	mdsetname_t	*sp,
58 	md_error_t	*ep
59 )
60 {
61 	md_set_desc	*sd;
62 
63 	/* Local set cannot be MultiNode */
64 	if ((sp == NULL) || (sp->setname == NULL) ||
65 				(strcmp(sp->setname, MD_LOCAL_NAME) == 0))
66 		return (0);
67 	sd = metaget_setdesc(sp, ep);
68 	ASSERT(sd != NULL);
69 	if (sd->sd_flags & MD_SR_MN)
70 		return (1);
71 	return (0);
72 }
73 
74 /*
75  * FUNCTION:	meta_is_mn_name()
76  * INPUT:       spp     - ptr to the set name, if NULL the setname is derived
77  *			  from the metadevice name (eg set/d10 )
78  *		name	- the metadevice/hsp name
79  * OUTPUT:	ep	- return error pointer
80  * RETURNS:	int	- 1 if MultiNode set else 0
81  * PURPOSE:	checks if the metadevice is in a MultiNode set
82  */
83 int
84 meta_is_mn_name(
85 	mdsetname_t	**spp,
86 	char		*name,
87 	md_error_t	*ep
88 )
89 {
90 	if (*spp == NULL) {
91 		char		*cname;
92 
93 		/*
94 		 * if the setname is specified in uname and *spp is
95 		 * not set, then it is setup using that set name value.
96 		 * If *spp is set and a setname specified in uname and
97 		 * the set names don't agree then cname will be
98 		 * returned as NULL
99 		 */
100 		cname = meta_canonicalize_check_set(spp, name, ep);
101 		if (cname == NULL) {
102 			mdclrerror(ep);
103 			return (0);
104 		}
105 
106 		Free(cname);
107 	}
108 
109 	if ((strcmp((*spp)->setname, MD_LOCAL_NAME) != 0) &&
110 	    (metaget_setdesc(*spp, ep) != NULL) &&
111 	    ((*spp)->setdesc->sd_flags & MD_SR_MN)) {
112 		return (1);
113 	}
114 	return (0);
115 }
116 
117 /*
118  * meta_ping_mnset(set_t setno)
119  * Send a test message for this set in order to make commd do some init stuff
120  * Don't bother changelog.
121  * If set is suspended, fail immediately.
122  */
123 void
124 meta_ping_mnset(set_t setno)
125 {
126 	char		*data = "test";
127 	md_error_t	mde = mdnullerror;
128 	md_mn_result_t	*resp = NULL;
129 
130 	(void) mdmn_send_message(setno, MD_MN_MSG_TEST2,
131 	    MD_MSGF_NO_LOG | MD_MSGF_FAIL_ON_SUSPEND, data,
132 	    sizeof (data), &resp, &mde);
133 
134 	if (resp != (md_mn_result_t *)NULL) {
135 		free_result(resp);
136 	}
137 }
138 
139 /*
140  *
141  * FUNCTION:	print_stderr
142  * INPUT:	errstr	- the error message returned by the command
143  *		context	- the context string from metainit -a
144  * PURPOSE:	called from meta_mn_send_command to print the error message
145  *		to stderr. When context is NO_CONTEXT_STRING, the errstr string
146  *		is output unchanged. When context is a string, it is the context
147  *		string for the metainit -a command and in this case the errstr
148  *		string has to be parsed to extract the command and node name
149  *		and to send a message to stderr in the format
150  *		command: node: context: error message
151  */
152 static void
153 print_stderr(
154 	char	*errstr,
155 	char	*context
156 )
157 {
158 	char	*command;
159 	char	*node;
160 	char	*message;
161 	int	length = strlen(errstr + 1);
162 
163 	if (context == NO_CONTEXT_STRING) {
164 		(void) fprintf(stderr, "%s", errstr);
165 	} else {
166 		command = Malloc(length);
167 		node = Malloc(length);
168 		message = Malloc(length);
169 		if (sscanf(errstr, "%[^:]: %[^:]: %[^\n]", command, node,
170 		    message) == 3) {
171 			(void) fprintf(stderr, "%s: %s: %s: %s\n", command,
172 			    node, context, message);
173 		} else {
174 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
175 			    "%s: Invalid format error message"), errstr);
176 		}
177 		Free(command);
178 		Free(node);
179 		Free(message);
180 	}
181 }
182 
183 /*
184  * FUNCTION:	meta_mn_send_command()
185  * INPUT:	sp	- the set name
186  *		argc	- number of arguments
187  *		argv	- arg list
188  *		flags	- some controlling flags
189  *		initall_context	- context string for metainit -a
190  * OUTPUT:	ep	- return error pointer
191  * RETURNS:	return exitval from mdmn_send_message
192  * PURPOSE:	sends the command to the master node for execution
193  */
194 int
195 meta_mn_send_command(
196 	mdsetname_t	*sp,
197 	int		argc,
198 	char		*argv[],
199 	int		flags,
200 	char		*initall_context,
201 	md_error_t	*ep
202 )
203 {
204 	int		a;
205 	int		err;
206 	int		retval;
207 	int		send_message_flags = MD_MSGF_DEFAULT_FLAGS;
208 	int		send_message_type;
209 	char		*cmd;
210 	md_mn_result_t	*resp = NULL;
211 
212 	cmd = Malloc(1024);
213 	(void) strlcpy(cmd, argv[0], 1024);
214 	for (a = 1; a < argc; a++) {
215 		/* don't copy empty arguments */
216 		if (*argv[a] == '\0') {
217 			continue;
218 		}
219 		(void) strcat(cmd, " ");
220 		(void) strcat(cmd, argv[a]);
221 	}
222 	/*
223 	 * in dryrun mode stop on the first error
224 	 * use the CMD_RETRY message type if RETRY_BUSY flag set
225 	 */
226 	if (flags & MD_DRYRUN)
227 		send_message_flags |= MD_MSGF_STOP_ON_ERROR;
228 	if (flags & MD_NOLOG)
229 		send_message_flags |= MD_MSGF_NO_LOG;
230 	if (flags & MD_PANIC_WHEN_INCONSISTENT)
231 		send_message_flags |= MD_MSGF_PANIC_WHEN_INCONSISTENT;
232 	if (flags & MD_RETRY_BUSY)  {
233 		send_message_type = MD_MN_MSG_BC_CMD_RETRY;
234 	} else {
235 		send_message_type = MD_MN_MSG_BC_CMD;
236 	}
237 	err = mdmn_send_message(
238 		sp->setno, send_message_type, send_message_flags,
239 		cmd, 1024, &resp, ep);
240 
241 	free(cmd);
242 
243 	if (err == 0) {
244 		/*
245 		 * stderr may be turned off by IGNORE_STDERR
246 		 * In dryrun we only print stderr if the exit_val is non-zero
247 		 */
248 		if ((resp->mmr_err_size != 0) &&
249 		    ((flags & MD_IGNORE_STDERR) == 0)) {
250 			if (((flags & MD_DRYRUN) == 0) ||
251 			    (resp->mmr_exitval != 0)) {
252 				print_stderr(resp->mmr_err, initall_context);
253 			}
254 		}
255 
256 		/*
257 		 * If dryrun is set, we don't display stdout,
258 		 * because the real run has yet to follow.
259 		 */
260 		if (((flags & MD_DRYRUN) == 0) && (resp->mmr_out_size != 0)) {
261 			(void) printf("%s", resp->mmr_out);
262 		}
263 		retval = resp->mmr_exitval;
264 		free_result(resp);
265 		return (retval);
266 	}
267 	if (resp != NULL) {
268 		if (resp->mmr_comm_state == MDMNE_CLASS_BUSY) {
269 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
270 			    "rpc.mdcommd currently busy. "
271 			    "Retry operation later.\n"));
272 		} else if (resp->mmr_comm_state == MDMNE_NOT_JOINED) {
273 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
274 			    "Node %s must join the %s multi-owner diskset to "
275 			    "issue commands.\n"
276 			    "To join, use: metaset -s %s -j\n"),
277 			    mynode(), sp->setname, sp->setname);
278 		} else if (resp->mmr_comm_state == MDMNE_LOG_FAIL) {
279 			mddb_config_t	c;
280 
281 			(void) memset(&c, 0, sizeof (c));
282 			c.c_setno = sp->setno;
283 			(void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL);
284 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
285 			    "Command not attempted: Unable to log message "
286 			    "in set %s\n"), sp->setname);
287 			if (c.c_flags & MDDB_C_STALE) {
288 			    (void) mdmddberror(ep, MDE_DB_STALE,
289 			    (minor_t)NODEV64, sp->setno, 0, NULL);
290 			    mde_perror(ep, "");
291 			}
292 		} else {
293 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
294 			    "Command failed: Commd State %d "
295 			    "encountered.\n"), resp->mmr_comm_state);
296 		}
297 		free_result(resp);
298 	} else {
299 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
300 		    "Command failed: mdmn_send_message returned %d.\n"),
301 		    err);
302 	}
303 
304 
305 	return (1);
306 }
307 
308 /*
309  * FUNCTION:	meta_mn_send_suspend_writes()
310  * INPUT:	mnum	- minor num of mirror
311  * OUTPUT:	ep	- return error pointer
312  * RETURNS:	return value from mdmn_send_message()
313  * PURPOSE:	sends message to all nodes to suspend writes to the mirror.
314  */
315 int
316 meta_mn_send_suspend_writes(
317 	minor_t		mnum,
318 	md_error_t	*ep
319 )
320 {
321 	int			result;
322 	md_mn_msg_suspwr_t	suspwrmsg;
323 	md_mn_result_t		*resp = NULL;
324 
325 	suspwrmsg.msg_suspwr_mnum =  mnum;
326 	/*
327 	 * This message is never directly issued.
328 	 * So we launch it with a suspend override flag.
329 	 * If the commd is suspended, and this message comes
330 	 * along it must be sent due to replaying a command or similar.
331 	 * In that case we don't want this message to be blocked.
332 	 * If the commd is not suspended, the flag does no harm.
333 	 */
334 	result = mdmn_send_message(MD_MIN2SET(mnum),
335 	    MD_MN_MSG_SUSPEND_WRITES,
336 	    MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND,
337 	    (char *)&suspwrmsg, sizeof (suspwrmsg), &resp, ep);
338 	if (resp != NULL) {
339 		free_result(resp);
340 	}
341 	return (result);
342 }
343 
344 /*
345  * Parse the multi-node list file
346  *
347  * Return Values:	Zero	 - Success
348  *			Non Zero - Failure
349  *
350  * File content:	The content of the nodelist file should consist of
351  *			triplets of nodeid, nodename and private interconnect
352  *			address seperated by one or more white space.
353  * e.g.
354  *			1 node_a 192.168.111.3
355  *			2 node_b 192.168.111.5
356  *
357  *			Any missing fields will result in an error.
358  */
359 int
360 meta_read_nodelist(
361 	int				*nodecnt,
362 	mndiskset_membershiplist_t	**nl,
363 	md_error_t			*ep
364 )
365 {
366 	FILE				*fp = NULL;
367 	char				line[MAX_LINE_SIZE];
368 	char				*buf;
369 	uint_t				i;
370 	int				sz;
371 	mndiskset_membershiplist_t	**tailp = nl;
372 
373 	/* open file */
374 	if ((fp = fopen(META_MNSET_NODELIST, "r")) == NULL) {
375 		mndiskset_membershiplist_t	*nlp;
376 		struct hostent *hp;
377 
378 		/* return this node with id of 1 */
379 		nlp = *tailp = Zalloc(sizeof (*nlp));
380 		tailp = &nlp->next;
381 
382 		*nodecnt = 1;
383 		nlp->msl_node_id = 1;
384 		buf = mynode();
385 		sz = min(strlen(buf), sizeof (nlp->msl_node_name) - 1);
386 		(void) strncpy(nlp->msl_node_name, buf, sz);
387 		nlp->msl_node_name[sz] = '\0';
388 
389 		/* retrieve info about our host */
390 		if ((hp = gethostbyname(buf)) == NULL) {
391 			return (mdsyserror(ep, EADDRNOTAVAIL, buf));
392 		}
393 		/* We only do IPv4 addresses, for now */
394 		if (hp->h_addrtype != AF_INET) {
395 			return (mdsyserror(ep, EPFNOSUPPORT, buf));
396 		}
397 		/* We take the first address only */
398 		if (*hp->h_addr_list) {
399 			struct in_addr in;
400 
401 			(void) memcpy(&in.s_addr, *hp->h_addr_list,
402 			    sizeof (struct in_addr));
403 			(void) strncpy(nlp->msl_node_addr, inet_ntoa(in),
404 			    MD_MAX_NODENAME);
405 		} else {
406 			return (mdsyserror(ep, EADDRNOTAVAIL, buf));
407 		}
408 
409 		return (0);
410 	}
411 
412 	*nl = NULL;
413 	*nodecnt = 0;
414 
415 	while ((fp != NULL) && ((buf = fgets(line, sizeof (line) - 1, fp)) !=
416 	    NULL)) {
417 		mndiskset_membershiplist_t	*nlp;
418 
419 		/* skip leading spaces */
420 		while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0)
421 			buf++;
422 
423 		/* skip comments and blank lines */
424 		if (*buf == '\0' || *buf == '#')
425 			continue;
426 
427 		/* allocate memory and set tail pointer */
428 		nlp = *tailp = Zalloc(sizeof (*nlp));
429 		tailp = &nlp->next;
430 
431 		/* parse node id */
432 		nlp->msl_node_id = strtoul(buf, NULL, 0);
433 		buf += i;
434 
435 		/* skip leading spaces */
436 		while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0)
437 			buf++;
438 
439 		/* fields missing, return error */
440 		if (*buf == '\0' || *buf == '#') {
441 			meta_free_nodelist(*nl);
442 			*nl = NULL;
443 			*nodecnt = 0;
444 
445 			/* close file and return */
446 			if ((fp) && (fclose(fp) != 0))
447 				return (mdsyserror(ep, errno,
448 				    META_MNSET_NODELIST));
449 
450 			return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST));
451 		}
452 
453 		/* parse node name */
454 		sz = min(i, sizeof (nlp->msl_node_name) - 1);
455 		(void) strncpy(nlp->msl_node_name, buf, sz);
456 		nlp->msl_node_name[sz] = '\0';
457 		buf += i;
458 
459 		/* skip leading spaces */
460 		while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0)
461 			buf++;
462 
463 		/* fields missing, return error */
464 		if (*buf == '\0' || *buf == '#') {
465 			meta_free_nodelist(*nl);
466 			*nl = NULL;
467 			*nodecnt = 0;
468 
469 			/* close file and return */
470 			if ((fp) && (fclose(fp) != 0))
471 				return (mdsyserror(ep, errno,
472 				    META_MNSET_NODELIST));
473 
474 			return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST));
475 		}
476 
477 		/* parse node address */
478 		sz = min(i, sizeof (nlp->msl_node_addr) - 1);
479 		(void) strncpy(nlp->msl_node_addr, buf, sz);
480 		nlp->msl_node_addr[sz] = '\0';
481 
482 		++*nodecnt;
483 	}
484 
485 	/* close file */
486 	if ((fp) && (fclose(fp) != 0))
487 		return (mdsyserror(ep, errno, META_MNSET_NODELIST));
488 
489 	return (0);
490 }
491 
492 /*
493  * Populate the multi-node list file from a given list of node id's
494  * The nids must have only one node id in each cell. Range of node
495  * id's in the form 1-n are not allowed.
496  *
497  * Return Values:	Zero	 - Success
498  *			Non Zero - Failure
499  */
500 int
501 meta_write_nodelist(
502 	int		nodecnt,
503 	char		**nids,
504 	md_error_t	*ep
505 )
506 {
507 	FILE		*fp = NULL;
508 	char		name[MAX_LINE_SIZE], addr[MAX_LINE_SIZE];
509 	uint_t		i, nid;
510 	struct in_addr	ipaddr;
511 	int		err = 0;
512 
513 	/* check if we are running on clustering */
514 	if ((err = sdssc_bind_library()) != SDSSC_OKAY) {
515 		return (mdsyserror(ep, err, META_MNSET_NODELIST));
516 	}
517 
518 	/* open file for writing */
519 	if ((fp = fopen(META_MNSET_NODELIST, "w")) == NULL) {
520 		return (mdsyserror(ep, errno, META_MNSET_NODELIST));
521 	}
522 
523 	for (i = 0; i < nodecnt; i++) {
524 		/* extract the node id */
525 		errno = 0;
526 		nid = strtoul(nids[i], NULL, 0);
527 		if (errno != 0) {
528 			if ((fp) && (fclose(fp) != 0))
529 				return (mdsyserror(ep, errno,
530 				    META_MNSET_NODELIST));
531 
532 			return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST));
533 		}
534 
535 		/* get node name */
536 		(void) snprintf(name, sizeof (name), "%d", nid);
537 		sdssc_cm_nid2nm(name);
538 
539 		/* finally get the private ip address */
540 		(void) snprintf(addr, sizeof (addr), "%s", name);
541 		if (sdssc_get_priv_ipaddr(addr, &ipaddr) != SDSSC_OKAY) {
542 			if ((fp) && (fclose(fp) != 0))
543 				return (mdsyserror(ep, errno,
544 				    META_MNSET_NODELIST));
545 
546 			return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST));
547 		}
548 
549 		(void) fprintf(fp, "%d\t%s\t%s\n", nid, name,
550 		    inet_ntoa(ipaddr));
551 	}
552 
553 	/* close file */
554 	if ((fp) && (fclose(fp) != 0))
555 		return (mdsyserror(ep, errno, META_MNSET_NODELIST));
556 
557 	return (0);
558 }
559 
560 /*
561  * Free node list
562  */
563 void
564 meta_free_nodelist(
565 	mndiskset_membershiplist_t	*nl
566 )
567 {
568 	mndiskset_membershiplist_t	*next = NULL;
569 
570 	for (/* void */; (nl != NULL); nl = next) {
571 		next = nl->next;
572 		Free(nl);
573 	}
574 }
575 
576 /*
577  * FUNCTION:	meta_mn_send_setsync()
578  * INPUT:	sp	- setname
579  *		mirnp	- mirror name
580  *		size	- buffer size, 0 if none
581  * OUTPUT:	ep	- return error pointer
582  * RETURNS:	return value from meta_mn_send_command()
583  * PURPOSE:  Send a setsync command to all nodes to set resync status
584  */
585 
586 int
587 meta_mn_send_setsync(
588 	mdsetname_t		*sp,
589 	mdname_t		*mirnp,
590 	daddr_t			size,
591 	md_error_t		*ep
592 )
593 {
594 	md_mn_msg_setsync_t	setsyncmsg;
595 	int			ret;
596 	md_mn_result_t		*resp = NULL;
597 
598 	setsyncmsg.setsync_mnum = meta_getminor(mirnp->dev);
599 	setsyncmsg.setsync_copysize = size;
600 	setsyncmsg.setsync_flags = 0;
601 
602 	/*
603 	 * We do not log the metasync command as it will have no effect on the
604 	 * underlying metadb state. If we have a master change the
605 	 * reconfiguration process will issue a new 'metasync' to all affected
606 	 * mirrors, so we would actually end up sending the message twice.
607 	 * Removing the logging of the message helps reduce the processing
608 	 * time required.
609 	 */
610 	ret = mdmn_send_message(sp->setno, MD_MN_MSG_SETSYNC,
611 	    MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND,
612 	    (char *)&setsyncmsg, sizeof (setsyncmsg), &resp, ep);
613 	if (resp != NULL) {
614 		free_result(resp);
615 	}
616 
617 	/*
618 	 * Unlike non-MN sets, the metasync command does not actually
619 	 * start a resync, it simply updates the state on all of the
620 	 * nodes. Therefore, to start a resync we send a resync starting
621 	 * message for the metadevice
622 	 */
623 	if (ret == 0)
624 		ret = meta_mn_send_resync_starting(mirnp, ep);
625 	return (ret);
626 }
627 
628 /*
629  * FUNCTION:	meta_mn_send_metaclear_command()
630  * INPUT:	sp	- setname
631  *		name	- metadevice name
632  *		options - command options
633  *		pflag	- clear all soft partitions for a given device
634  * OUTPUT:	ep	- return error pointer
635  * RETURNS:	return value from meta_mn_send_command()
636  * PURPOSE:  Send a metaclear command to all nodes with force(-f) and
637  *	     recurse(-r) options set if required. For hotspare pool and
638  *	     metadevices, the metadevice name is of the form setname/dxx or
639  *	     setname/hspxxx so a '-s' argument isn't required. If pflag is set
640  *	     the name refers to a metadevice or component and in the is case
641  *	     a '-s' argument is required to define the set.
642  */
643 
644 int
645 meta_mn_send_metaclear_command(
646 	mdsetname_t		*sp,
647 	char			*name,
648 	mdcmdopts_t		options,
649 	int			pflag,
650 	md_error_t		*ep
651 )
652 {
653 	int	newargc;
654 	char	**newargv;
655 	int	ret;
656 
657 	/*
658 	 * Allocate an array large enough to hold all of the possible
659 	 * metaclear arguments
660 	 */
661 	newargv = Calloc(7, sizeof (char *));
662 	newargv[0] = "metaclear";
663 	newargc = 1;
664 	if (pflag) {
665 		newargv[newargc] = "-s";
666 		newargc++;
667 		newargv[newargc] = sp->setname;
668 		newargc++;
669 	}
670 	if (options & MDCMD_FORCE) {
671 		newargv[newargc] = "-f";
672 		newargc++;
673 	}
674 	if (options & MDCMD_RECURSE) {
675 		newargv[newargc] = "-r";
676 		newargc++;
677 	}
678 	if (pflag) {
679 		newargv[newargc] = "-p";
680 		newargc++;
681 	}
682 	newargv[newargc] = name;
683 	newargc++;
684 
685 	ret = meta_mn_send_command(sp, newargc, newargv,
686 	    MD_DISP_STDERR, NO_CONTEXT_STRING, ep);
687 
688 	free(newargv);
689 	return (ret);
690 }
691 
692 /*
693  * FUNCTION:	meta_mn_send_resync_starting()
694  * INPUT:	sp	- setname
695  *		mirnp	- mirror name
696  * OUTPUT:	ep	- return error pointer
697  * RETURNS:	return value from mdmn_send_message()
698  * PURPOSE:  Send a resync starting message to all nodes.
699  */
700 
701 int
702 meta_mn_send_resync_starting(
703 	mdname_t		*mirnp,
704 	md_error_t		*ep
705 )
706 {
707 	int			result;
708 	md_mn_msg_resync_t	resyncmsg;
709 	md_mn_result_t		*resp = NULL;
710 	minor_t			mnum = meta_getminor(mirnp->dev);
711 
712 	/*
713 	 * This message is never directly issued.
714 	 * So we launch it with a suspend override flag.
715 	 * If the commd is suspended, and this message comes
716 	 * along it must be sent due to replaying a command or similar.
717 	 * In that case we don't want this message to be blocked.
718 	 * If the commd is not suspended, the flag does no harm.
719 	 */
720 	resyncmsg.msg_resync_mnum =  mnum;
721 	result = mdmn_send_message(MD_MIN2SET(mnum),
722 	    MD_MN_MSG_RESYNC_STARTING,
723 	    MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND,
724 	    (char *)&resyncmsg, sizeof (resyncmsg), &resp, ep);
725 
726 	if (resp != NULL) {
727 		free_result(resp);
728 	}
729 	return (result);
730 }
731 
732 /*
733  * FUNCTION:	meta_mn_change_owner()
734  * INPUT:	opp	- pointer to parameter block
735  *		setno	- set number of mirror metadevice
736  *		mnum	- minor number of mirror metadevice
737  *		owner	- node ID of mirror owner
738  *		flags	- flag field for ioctl
739  * OUTPUT:	opp	- parameter block used to send ioctl
740  * RETURNS:	int	- 0 success, -1 error
741  * PURPOSE:	issue an ioctl to change the ownership of the specified mirror
742  *		to our node ID. We need to be the owner before any watermarks
743  *		are committed to the device otherwise we'll enter a deadly
744  *		embrace when attempting to write the watermark.
745  *		This function can also be used so set the owner on a node to
746  *		NULL. In this case the change is only made on the local node.
747  *		In addition by setting the MD_MN_MM_CHOOSE_OWNER flag, the
748  *		function can also be used to choose a mirror resync owner. This
749  *		function should only be called on the master and it will
750  *		select the owner and request it to become the owner.
751  */
752 int
753 meta_mn_change_owner(
754 	md_set_mmown_params_t 	**opp,	/* Returned parameter block */
755 	set_t			setno,	/* Mirror set number */
756 	uint_t 			mnum,	/* Minor number */
757 	uint_t			owner,	/* Node ID of mirror owner */
758 	uint_t			flags	/* Flags */
759 )
760 {
761 	md_set_mmown_params_t	*ownpar = *opp;
762 	md_mn_own_status_t	*ownstat = NULL;
763 	struct timeval tvs, tve;
764 	int			n = 0;
765 	int			rval;
766 
767 	if (ownpar != NULL) {
768 		(void) memset(ownpar, 0, sizeof (*ownpar));
769 	} else {
770 		ownpar = Zalloc(sizeof (*ownpar));
771 	}
772 	ownstat = Zalloc(sizeof (*ownstat));
773 
774 	ownpar->d.mnum = mnum;
775 	ownpar->d.owner = owner;
776 	ownpar->d.flags = flags;
777 	MD_SETDRIVERNAME(ownpar, MD_MIRROR, setno);
778 	MD_SETDRIVERNAME(ownstat, MD_MIRROR, setno);
779 
780 	/*
781 	 * Attempt to change the ownership to the specified node. We retry this
782 	 * up to 10 times if we receive EAGAIN from the metadevice. This only
783 	 * happens if the underlying metadevice is busy with outstanding i/o
784 	 * that requires ownership change.
785 	 */
786 	while ((rval = metaioctl(MD_MN_SET_MM_OWNER, ownpar, &ownpar->mde,
787 	    NULL)) != 0) {
788 		md_sys_error_t	*ip =
789 		    &ownpar->mde.info.md_error_info_t_u.sys_error;
790 		if (ip->errnum != EAGAIN)
791 			break;
792 		if (n++ >= 10)
793 			break;
794 		(void) sleep(1);
795 	}
796 
797 	/*
798 	 * There is no need to wait for the ioctl completion if we are setting
799 	 * the owner to NULL or requesting the master to choose the owner
800 	 */
801 	if ((owner == 0) || (flags & MD_MN_MM_CHOOSE_OWNER)) {
802 		Free(ownstat);
803 		*opp = ownpar;
804 		return (0);
805 	}
806 
807 	/*
808 	 * Wait for ioctl completion or a timeout to occur. If we
809 	 * timeout we fail the i/o request.
810 	 */
811 	ownstat->mnum = ownpar->d.mnum;
812 	(void) gettimeofday(&tvs, NULL);
813 
814 	while ((rval == 0) && !(ownstat->flags & MD_MN_MM_RESULT)) {
815 		while ((rval = metaioctl(MD_MN_MM_OWNER_STATUS, ownstat,
816 		    &ownstat->mde, NULL)) != 0) {
817 			(void) gettimeofday(&tve, NULL);
818 			if ((tve.tv_sec - tvs.tv_sec) > OWNER_TIMEOUT) {
819 				rval = -1;
820 				break;
821 			}
822 			(void) sleep(1);
823 		}
824 	}
825 
826 	/* we did not not timeout but ioctl failed set rval */
827 
828 	if (rval == 0) {
829 		rval = (ownstat->flags & MD_MN_MM_RES_FAIL) ? -1 : 0;
830 	}
831 
832 	Free(ownstat);
833 	*opp = ownpar;
834 	return (rval);
835 }
836 /*
837  * special handling is required when running on a single node
838  * non-SC3.x environment.  This function determines tests
839  * for that case.
840  *
841  * Return values:
842  *	0 - no nodes or joined or in a SC3.x env
843  *	1 - 1 node and not in SC3.x env
844  */
845 
846 int
847 meta_mn_singlenode()
848 {
849 	md_error_t			xep = mdnullerror;
850 	int				nodecnt;
851 	int				mnset_single_node = 0;
852 	mndiskset_membershiplist_t	*nl;
853 
854 	/*
855 	 * If running on SunCluster, then don't validate MN sets,
856 	 * this is done during a reconfig cycle since all nodes must
857 	 * take the same action.
858 	 *
859 	 * Only cleanup in case of a single node situation
860 	 * when not running on SunCluster.  This single node
861 	 * situation occurs when the nodelist only contains
862 	 * this node and the MN setrecords only contain this
863 	 * node.
864 	 */
865 	if (meta_read_nodelist(&nodecnt, &nl, &xep) == -1) {
866 		nodecnt = 0;  /* no nodes are alive */
867 		nl = NULL;
868 		mdclrerror(&xep);
869 	} else {
870 		/*
871 		 * If only 1 node in nodelist and not running
872 		 * on SunCluster, set single_node flag.
873 		 */
874 		if ((nodecnt == 1) &&
875 		    (strcmp(nl->msl_node_name, mynode()) == 0) &&
876 		    ((sdssc_bind_library()) != SDSSC_OKAY)) {
877 			mnset_single_node = 1;
878 		}
879 		meta_free_nodelist(nl);
880 	}
881 	return (mnset_single_node);
882 }
883 
884 /*
885  * FUNCTION:	meta_mn_send_get_tstate()
886  * INPUT:	dev	- dev_t of device
887  * OUTPUT:	tstatep - tstate value
888  *		ep	- return error pointer
889  * RETURNS:	return value from mdmn_send_message()
890  * PURPOSE:  Send a message to the master to get ui_tstate for a given device.
891  */
892 
893 int
894 meta_mn_send_get_tstate(
895 	md_dev64_t		dev,
896 	uint_t			*tstatep,
897 	md_error_t		*ep
898 )
899 {
900 	int			result;
901 	md_mn_msg_gettstate_t	tstatemsg;
902 	md_mn_result_t		*resp = NULL;
903 	minor_t			mnum = meta_getminor(dev);
904 
905 	tstatemsg.gettstate_dev = dev;
906 	result = mdmn_send_message(MD_MIN2SET(mnum),
907 	    MD_MN_MSG_GET_TSTATE,
908 	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST,
909 	    (char *)&tstatemsg, sizeof (tstatemsg), &resp, ep);
910 
911 	if (result == 0)
912 		*tstatep = resp->mmr_exitval;
913 	else
914 		/* If some error occurred set tstate to 0 */
915 		*tstatep = 0;
916 
917 	if (resp != NULL) {
918 		free_result(resp);
919 	}
920 	return (result);
921 }
922