xref: /onnv-gate/usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c (revision 62:5e51ad5d0496)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <stdlib.h>
30 #include <unistd.h>
31 #include <wait.h>
32 #include <sys/time.h>
33 #include <syslog.h>
34 
35 #include <meta.h>
36 #include <sys/lvm/mdio.h>
37 #include <sys/lvm/md_mddb.h>
38 #include <sys/lvm/md_mirror.h>
39 
40 #define	MAX_N_ARGS 64
41 #define	MAX_ARG_LEN 1024
42 
43 /* we reserve 1024 bytes for stdout and the same for stderr */
44 #define	MAX_OUT	1024
45 #define	MAX_ERR	1024
46 #define	JUNK 128 /* used to flush stdout and stderr */
47 
48 
49 /*ARGSUSED*/
50 void
51 mdmn_do_cmd(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
52 {
53 
54 	/*
55 	 * We are given one string containing all the arguments
56 	 * For execvp() we have to regenerate the arguments again
57 	 */
58 	int	arg;		/* argument that is currently been built */
59 	int	index;		/* runs through arg above */
60 	int	i;		/* helper for for loop */
61 	char	*argv[MAX_N_ARGS]; /* argument array for execvp */
62 	char	*cp;		/* runs through the given command line string */
63 	char	*command = NULL; /* the command we call locally */
64 	int	pout[2];	/* pipe for stdout */
65 	int	perr[2];	/* pipe for stderr */
66 	pid_t	pid;		/* process id */
67 
68 	cp	= msg->msg_event_data;
69 	arg	= 0;
70 	index	= 0;
71 
72 	/* init the args array alloc the first one and null out the rest */
73 	argv[0] = Malloc(MAX_ARG_LEN);
74 	for (i = 1; i < MAX_N_ARGS; i++) {
75 		argv[i] = NULL;
76 	}
77 
78 	resp->mmr_comm_state	= MDMNE_ACK; /* Ok state */;
79 
80 	while (*cp != '\0') {
81 		if (arg == MAX_N_ARGS) {
82 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
83 			    "PANIC: too many arguments specified\n"));
84 			resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
85 			goto out;
86 		}
87 		if (index == MAX_ARG_LEN) {
88 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
89 			    "PANIC: argument too long\n"));
90 			resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
91 			goto out;
92 		}
93 
94 		if ((*cp != ' ') && (*cp != '\t')) {
95 			/*
96 			 * No space or tab: copy char into current
97 			 * argv and advance both pointers
98 			 */
99 
100 			argv[arg][index] = *cp;
101 			cp++;	/* next char in command line	*/
102 			index++;	/* next char in argument	*/
103 		} else {
104 			/*
105 			 * space or tab: terminate current argv,
106 			 * advance arg, reset pointer into arg,
107 			 * advance pointer in command line
108 			 */
109 			argv[arg][index] = '\0';
110 			arg++; /* next argument */
111 			argv[arg] = Malloc(MAX_ARG_LEN);
112 			cp++; /* next char in command line */
113 			index = 0; /* starts at char 0 */
114 		}
115 	}
116 	/* terminate the last real argument */
117 	argv[arg][index] = '\0';
118 	/* the last argument is an NULL pointer */
119 	argv[++arg] = NULL;
120 	if (pipe(pout) < 0)  {
121 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
122 		    "PANIC: pipe failed\n"));
123 		resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
124 		goto out;
125 	}
126 	if (pipe(perr) < 0) {
127 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
128 		    "PANIC: pipe failed\n"));
129 		(void) close(pout[0]);
130 		(void) close(pout[1]);
131 		resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
132 		goto out;
133 	}
134 	command = Strdup(argv[0]);
135 	(void) strcat(argv[0], ".rpc_call");
136 	pid = fork1();
137 	if (pid == (pid_t)-1) {
138 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
139 		    "PANIC: fork failed\n"));
140 		resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
141 		(void) close(pout[0]);
142 		(void) close(pout[1]);
143 		(void) close(perr[0]);
144 		(void) close(perr[1]);
145 		goto out;
146 	} else  if (pid == (pid_t)0) {
147 		/* child */
148 		(void) close(0);
149 		/* close the reading channels of pout and perr */
150 		(void) close(pout[0]);
151 		(void) close(perr[0]);
152 		/* redirect stdout */
153 		if (dup2(pout[1], 1) < 0) {
154 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
155 			    "PANIC: dup2 failed\n"));
156 			resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
157 			return;
158 		}
159 
160 		/* redirect stderr */
161 		if (dup2(perr[1], 2) < 0) {
162 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
163 			    "PANIC: dup2 failed\n"));
164 			resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
165 			return;
166 		}
167 
168 		(void) execvp(command, (char *const *)argv);
169 		perror("execvp");
170 		_exit(1);
171 	} else {
172 		/* parent process */
173 		int stat_loc;
174 		char *out, *err; /* for stdout and stderr of child */
175 		int i; /* index into the aboves */
176 		char junk[JUNK];
177 		int out_done = 0;
178 		int err_done = 0;
179 		int out_read = 0;
180 		int err_read = 0;
181 		int maxfd;
182 		fd_set	rset;
183 
184 
185 		/* close the writing channels of pout and perr */
186 		(void) close(pout[1]);
187 		(void) close(perr[1]);
188 		resp->mmr_out = Malloc(MAX_OUT);
189 		resp->mmr_err = Malloc(MAX_ERR);
190 		resp->mmr_out_size = MAX_OUT;
191 		resp->mmr_err_size = MAX_ERR;
192 		out = resp->mmr_out;
193 		err = resp->mmr_err;
194 		FD_ZERO(&rset);
195 		while ((out_done == 0) || (err_done == 0)) {
196 			FD_SET(pout[0], &rset);
197 			FD_SET(perr[0], &rset);
198 			maxfd = max(pout[0], perr[0]) + 1;
199 			(void) select(maxfd, &rset, NULL, NULL, NULL);
200 
201 			/*
202 			 * Did the child produce some output to stdout?
203 			 * If so, read it until we either reach the end of the
204 			 * output or until we read MAX_OUT bytes.
205 			 * Whatever comes first.
206 			 * In case we already read MAX_OUT bytes we simply
207 			 * read away the output into a junk buffer.
208 			 * Just to make the child happy
209 			 */
210 			if (FD_ISSET(pout[0], &rset)) {
211 				if (MAX_OUT - out_read - 1 > 0) {
212 					i = read(pout[0], out,
213 						MAX_OUT - out_read);
214 					out_read += i;
215 					out += i;
216 				} else {
217 					/* buffer full, empty stdout */
218 					i = read(pout[0], junk, JUNK);
219 				}
220 				if (i == 0) {
221 					/* stdout is closed by child */
222 					out_done++;
223 				}
224 			}
225 			/* same comment as above | sed -e 's/stdout/stderr/' */
226 			if (FD_ISSET(perr[0], &rset)) {
227 				if (MAX_ERR - err_read - 1 > 0) {
228 					i = read(perr[0], err,
229 						MAX_ERR - err_read);
230 					err_read += i;
231 					err += i;
232 				} else {
233 					/* buffer full, empty stderr */
234 					i = read(perr[0], junk, JUNK);
235 				}
236 				if (i == 0) {
237 					/* stderr is closed by child */
238 					err_done++;
239 				}
240 			}
241 		}
242 		resp->mmr_out[out_read] = '\0';
243 		resp->mmr_err[err_read] = '\0';
244 
245 		while (waitpid(pid, &stat_loc, 0) < 0) {
246 			if (errno != EINTR) {
247 				resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
248 				break;
249 			}
250 		}
251 		if (errno == 0)
252 			resp->mmr_exitval = WEXITSTATUS(stat_loc);
253 
254 		(void) close(pout[0]);
255 		(void) close(perr[0]);
256 	}
257 out:
258 	for (i = 0; i < MAX_N_ARGS; i++) {
259 		if (argv[i] != NULL) {
260 			free(argv[i]);
261 		}
262 	}
263 	if (command != NULL) {
264 		Free(command);
265 	}
266 }
267 
268 /*
269  * This is for checking if a metadevice is opened, and for
270  * locking in case it is not and for
271  * unlocking a locked device
272  */
273 /*ARGSUSED*/
274 void
275 mdmn_do_clu(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
276 {
277 	if (msg->msg_type == MD_MN_MSG_CLU_CHECK) {
278 		md_isopen_t	*d;
279 		int		ret;
280 
281 		resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
282 		resp->mmr_out_size = 0;
283 		resp->mmr_err_size = 0;
284 		resp->mmr_out = NULL;
285 		resp->mmr_err = NULL;
286 		d = (md_isopen_t *)(void *)msg->msg_event_data;
287 		ret = metaioctl(MD_IOCISOPEN, d, &(d->mde), NULL);
288 		/*
289 		 * In case the ioctl succeeded, return the open state of
290 		 * the metadevice. Otherwise we return the error the ioctl
291 		 * produced. As this is not zero, no attempt is made to
292 		 * remove/rename the metadevice later
293 		 */
294 
295 		if (ret == 0) {
296 			resp->mmr_exitval = d->isopen;
297 		} else {
298 			/*
299 			 * When doing a metaclear, one node after the other
300 			 * does the two steps:
301 			 * - check on all nodes if this md is opened.
302 			 * - remove the md locally.
303 			 * When the 2nd node asks all nodes if the md is
304 			 * open it starts with the first node.
305 			 * As this already removed the md, the check
306 			 * returns MDE_UNIT_NOT_SETUP.
307 			 * In order to not keep the 2nd node from proceeding,
308 			 * we map this to an Ok.
309 			 */
310 			if (mdismderror(&(d->mde), MDE_UNIT_NOT_SETUP)) {
311 				mdclrerror(&(d->mde));
312 				ret = 0;
313 			}
314 
315 			resp->mmr_exitval = ret;
316 		}
317 	}
318 }
319 
320 /* handler for MD_MN_MSG_REQUIRE_OWNER */
321 /*ARGSUSED*/
322 void
323 mdmn_do_req_owner(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
324 {
325 	md_set_mmown_params_t	setown;
326 	md_mn_req_owner_t	*d;
327 	int			ret, n = 0;
328 
329 	resp->mmr_out_size = 0;
330 	resp->mmr_err_size = 0;
331 	resp->mmr_out = NULL;
332 	resp->mmr_err = NULL;
333 	resp->mmr_comm_state = MDMNE_ACK;
334 	d = (md_mn_req_owner_t *)(void *)msg->msg_event_data;
335 
336 	(void) memset(&setown, 0, sizeof (setown));
337 	MD_SETDRIVERNAME(&setown, MD_MIRROR, MD_MIN2SET(d->mnum))
338 	setown.d.mnum = d->mnum;
339 	setown.d.owner = d->owner;
340 
341 	/* Retry ownership change if we get EAGAIN returned */
342 	while ((ret = metaioctl(MD_MN_SET_MM_OWNER, &setown, &setown.mde, NULL))
343 	    != 0) {
344 		md_sys_error_t	*ip =
345 		    &setown.mde.info.md_error_info_t_u.sys_error;
346 		if (ip->errnum != EAGAIN) {
347 			break;
348 		}
349 		if (n++ >= 10) {
350 			break;
351 		}
352 		(void) sleep(1);
353 	}
354 
355 	resp->mmr_exitval = ret;
356 }
357 
358 /*
359  * handler for MD_MN_MSG_CHOOSE_OWNER
360  * This is called when a mirror resync has no owner. The master node generates
361  * this message which is not broadcast to the other nodes. The message is
362  * required as the kernel does not have access to the nodelist for the set.
363  */
364 /*ARGSUSED*/
365 void
366 mdmn_do_choose_owner(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
367 {
368 	md_mn_msg_chowner_t	chownermsg;
369 	md_mn_msg_chooseid_t	*d;
370 	int			ret = 0;
371 	int			nodecnt;
372 	int			nodeno;
373 	uint_t			nodeid;
374 	uint_t			myflags;
375 	set_t			setno;
376 	mdsetname_t		*sp;
377 	md_set_desc		*sd;
378 	md_mnnode_desc		*nd;
379 	md_error_t		mde = mdnullerror;
380 	md_mn_result_t		*resp1 = NULL;
381 
382 	resp->mmr_out_size = 0;
383 	resp->mmr_err_size = 0;
384 	resp->mmr_out = NULL;
385 	resp->mmr_err = NULL;
386 	resp->mmr_comm_state = MDMNE_ACK;
387 	d = (md_mn_msg_chooseid_t *)(void *)msg->msg_event_data;
388 
389 	/*
390 	 * The node to be chosen will be the resync count for the set
391 	 * modulo the number of live nodes in the set
392 	 */
393 	setno = MD_MIN2SET(d->msg_chooseid_mnum);
394 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
395 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
396 		    "MD_MN_MSG_CHOOSE_OWNER: Invalid setno %d\n"), setno);
397 		resp->mmr_exitval = 1;
398 		return;
399 	}
400 	if ((sd = metaget_setdesc(sp, &mde)) == NULL) {
401 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
402 		    "MD_MN_MSG_CHOOSE_OWNER: Invalid set pointer\n"));
403 		resp->mmr_exitval = 1;
404 		return;
405 	}
406 
407 	/* Count the number of live nodes */
408 	nodecnt = 0;
409 	nd = sd->sd_nodelist;
410 	while (nd) {
411 		if (nd->nd_flags & MD_MN_NODE_ALIVE)
412 			nodecnt++;
413 		nd = nd->nd_next;
414 	}
415 	nodeno = (d->msg_chooseid_rcnt%nodecnt);
416 
417 	/*
418 	 * If we've been called with msg_chooseid_set_node set TRUE then we
419 	 * are simply re-setting the owner id to ensure consistency across
420 	 * the cluster.
421 	 * If the flag is reset (B_FALSE) we are requesting a new owner to be
422 	 * determined.
423 	 */
424 	if (d->msg_chooseid_set_node) {
425 		nodeid = d->msg_chooseid_rcnt;
426 	} else {
427 		/* scan the nodelist looking for the required node */
428 		nodecnt = 0;
429 		nd = sd->sd_nodelist;
430 		while (nd) {
431 			if (nd->nd_flags & MD_MN_NODE_ALIVE) {
432 				if (nodecnt == nodeno)
433 					break;
434 				nodecnt++;
435 			}
436 			nd = nd->nd_next;
437 		}
438 		nodeid = nd->nd_nodeid;
439 	}
440 
441 	/* Send message to all nodes to make ownership change */
442 	chownermsg.msg_chowner_mnum =  d->msg_chooseid_mnum;
443 	chownermsg.msg_chowner_nodeid = nodeid;
444 	myflags = MD_MSGF_NO_LOG;
445 
446 	/* inherit some flags from the parent message */
447 	myflags |= msg->msg_flags & MD_MSGF_INHERIT_BITS;
448 
449 	ret = mdmn_send_message(MD_MIN2SET(d->msg_chooseid_mnum),
450 	    MD_MN_MSG_CHANGE_OWNER, myflags, (char *)&chownermsg,
451 	    sizeof (chownermsg), &resp1, &mde);
452 	if (resp1 != NULL)
453 		free_result(resp1);
454 	resp->mmr_exitval = ret;
455 }
456 
457 /*
458  * Handler for MD_MN_MSG_CHANGE_OWNER
459  * This is called when we are perfoming a resync and wish to change from
460  * no mirror owner to an owner chosen by the master.
461  * This mesage is only relevant for the new owner, the message will be
462  * ignored by all other nodes
463  */
464 /*ARGSUSED*/
465 void
466 mdmn_do_change_owner(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
467 {
468 	md_set_mmown_params_t	setown;
469 	md_mn_msg_chowner_t	*d;
470 	int			ret = 0;
471 	set_t			setno;
472 	mdsetname_t		*sp;
473 	md_set_desc		*sd;
474 	md_error_t		mde = mdnullerror;
475 
476 	resp->mmr_out_size = 0;
477 	resp->mmr_err_size = 0;
478 	resp->mmr_out = NULL;
479 	resp->mmr_err = NULL;
480 	resp->mmr_comm_state = MDMNE_ACK;
481 	d = (md_mn_msg_chowner_t *)(void *)msg->msg_event_data;
482 
483 	setno = MD_MIN2SET(d->msg_chowner_mnum);
484 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
485 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
486 		    "MD_MN_MSG_CHANGE_OWNER: Invalid setno %d\n"), setno);
487 		resp->mmr_exitval = 1;
488 		return;
489 	}
490 	if ((sd = metaget_setdesc(sp, &mde)) == NULL) {
491 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
492 		    "MD_MN_MSG_CHANGE_OWNER: Invalid set pointer\n"));
493 		resp->mmr_exitval = 1;
494 		return;
495 	}
496 
497 	if (d->msg_chowner_nodeid == sd->sd_mn_mynode->nd_nodeid) {
498 		/*
499 		 * If we are the chosen owner, issue ioctl to make the
500 		 * ownership change
501 		 */
502 		(void) memset(&setown, 0, sizeof (md_set_mmown_params_t));
503 		setown.d.mnum = d->msg_chowner_mnum;
504 		setown.d.owner = d->msg_chowner_nodeid;
505 		setown.d.flags = MD_MN_MM_SPAWN_THREAD;
506 		MD_SETDRIVERNAME(&setown, MD_MIRROR,
507 		    MD_MIN2SET(d->msg_chowner_mnum));
508 
509 		/*
510 		 * Single shot at changing the the owner, if it fails EAGAIN,
511 		 * another node must have become the owner while we are in the
512 		 * process of making this choice.
513 		 */
514 
515 		ret = metaioctl(MD_MN_SET_MM_OWNER, &setown,
516 		    &(setown.mde), NULL);
517 		if (ret == EAGAIN)
518 			ret = 0;
519 	}
520 	resp->mmr_exitval = ret;
521 }
522 
523 /* handler for MD_MN_MSG_SUSPEND_WRITES */
524 /*ARGSUSED*/
525 void
526 mdmn_do_susp_write(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
527 {
528 	/* Suspend writes to a region of a mirror */
529 	md_suspend_wr_params_t	suspwr_ioc;
530 	md_mn_msg_suspwr_t	*d;
531 	int			ret;
532 
533 	resp->mmr_out_size = 0;
534 	resp->mmr_err_size = 0;
535 	resp->mmr_out = NULL;
536 	resp->mmr_err = NULL;
537 	resp->mmr_comm_state = MDMNE_ACK;
538 	d = (md_mn_msg_suspwr_t *)(void *)msg->msg_event_data;
539 
540 	(void) memset(&suspwr_ioc, 0, sizeof (md_suspend_wr_params_t));
541 	MD_SETDRIVERNAME(&suspwr_ioc, MD_MIRROR,
542 	    MD_MIN2SET(d->msg_suspwr_mnum));
543 	suspwr_ioc.mnum = d->msg_suspwr_mnum;
544 	ret = metaioctl(MD_MN_SUSPEND_WRITES, &suspwr_ioc,
545 	    &(suspwr_ioc.mde), NULL);
546 	resp->mmr_exitval = ret;
547 }
548 
549 /*
550  * handler for MD_MN_MSG_STATE_UPDATE_RESWR
551  * This functions update a submirror component state and then resumes writes
552  * to the mirror
553  */
554 /*ARGSUSED*/
555 void
556 mdmn_do_state_upd_reswr(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
557 {
558 	/* Update the state of the component of a mirror */
559 	md_set_state_params_t	setstate_ioc;
560 	md_mn_msg_stch_t	*d;
561 	int			ret;
562 
563 	resp->mmr_out_size = 0;
564 	resp->mmr_err_size = 0;
565 	resp->mmr_out = NULL;
566 	resp->mmr_err = NULL;
567 	resp->mmr_comm_state = MDMNE_ACK;
568 	d = (md_mn_msg_stch_t *)(void *)msg->msg_event_data;
569 
570 	(void) memset(&setstate_ioc, 0, sizeof (md_set_state_params_t));
571 	MD_SETDRIVERNAME(&setstate_ioc, MD_MIRROR,
572 	    MD_MIN2SET(d->msg_stch_mnum));
573 	setstate_ioc.mnum = d->msg_stch_mnum;
574 	setstate_ioc.sm = d->msg_stch_sm;
575 	setstate_ioc.comp = d->msg_stch_comp;
576 	setstate_ioc.state = d->msg_stch_new_state;
577 	setstate_ioc.hs_id = d->msg_stch_hs_id;
578 	ret = metaioctl(MD_MN_SET_STATE, &setstate_ioc,
579 	    &(setstate_ioc.mde), NULL);
580 	resp->mmr_exitval = ret;
581 }
582 
583 /*
584  * submessage generator for MD_MN_MSG_STATE_UPDATE and MD_MN_MSG_STATE_UPDATE2
585  * This generates 2 messages, the first is SUSPEND_WRITES and
586  * depending on the type of the original message the second one is
587  * either STATE_UPDATE_RESWR or STATE_UPDATE_RESWR2 which actually does
588  * the same, but runs on a higher class.
589  */
590 int
591 mdmn_smgen_state_upd(md_mn_msg_t *msg, md_mn_msg_t *msglist[])
592 {
593 	md_mn_msg_t		*nmsg;
594 	md_mn_msg_stch_t	*d;
595 	md_mn_msg_stch_t	*stch_data;
596 	md_mn_msg_suspwr_t	*suspwr_data;
597 
598 	d = (md_mn_msg_stch_t *)(void *)msg->msg_event_data;
599 
600 	nmsg = Zalloc(sizeof (md_mn_msg_t));
601 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
602 
603 	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
604 	nmsg->msg_setno		= msg->msg_setno;
605 	nmsg->msg_type		= MD_MN_MSG_SUSPEND_WRITES;
606 	nmsg->msg_event_size	= sizeof (md_mn_msg_suspwr_t);
607 	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_suspwr_t));
608 	suspwr_data = (md_mn_msg_suspwr_t *)(void *)nmsg->msg_event_data;
609 	suspwr_data->msg_suspwr_mnum = d->msg_stch_mnum;
610 	msglist[0] = nmsg;
611 
612 	nmsg = Zalloc(sizeof (md_mn_msg_t));
613 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
614 
615 	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
616 	nmsg->msg_setno		= msg->msg_setno;
617 	if (msg->msg_type == MD_MN_MSG_STATE_UPDATE2) {
618 		nmsg->msg_type		= MD_MN_MSG_STATE_UPDATE_RESWR2;
619 	} else {
620 		nmsg->msg_type		= MD_MN_MSG_STATE_UPDATE_RESWR;
621 	}
622 	nmsg->msg_event_size	= sizeof (md_mn_msg_stch_t);
623 	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_stch_t));
624 	stch_data = (md_mn_msg_stch_t *)(void *)nmsg->msg_event_data;
625 	stch_data->msg_stch_mnum = d->msg_stch_mnum;
626 	stch_data->msg_stch_sm = d->msg_stch_sm;
627 	stch_data->msg_stch_comp = d->msg_stch_comp;
628 	stch_data->msg_stch_new_state = d->msg_stch_new_state;
629 	stch_data->msg_stch_hs_id = d->msg_stch_hs_id;
630 	msglist[1] = nmsg;
631 	return (2); /* Return the number of submessages generated */
632 }
633 
634 /*
635  * handler for MD_MN_MSG_ALLOCATE_HOTSPARE and MD_MN_MSG_ALLOCATE_HOTSPARE2
636  * This sends a message to all nodes requesting them to allocate a hotspare
637  * for the specified component. The component is specified by the mnum of
638  * the mirror, the submirror index and the component index.
639  */
640 /*ARGSUSED*/
641 void
642 mdmn_do_allocate_hotspare(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
643 {
644 	/* Allocate a hotspare for a mirror component */
645 	md_alloc_hotsp_params_t allochsp_ioc;
646 	md_mn_msg_allochsp_t    *d;
647 	int			ret;
648 
649 	resp->mmr_out_size = 0;
650 	resp->mmr_err_size = 0;
651 	resp->mmr_out = NULL;
652 	resp->mmr_err = NULL;
653 	resp->mmr_comm_state = MDMNE_ACK;
654 	d = (md_mn_msg_allochsp_t *)((void *)(msg->msg_event_data));
655 
656 	(void) memset(&allochsp_ioc, 0,
657 	sizeof (md_alloc_hotsp_params_t));
658 	MD_SETDRIVERNAME(&allochsp_ioc, MD_MIRROR,
659 	    MD_MIN2SET(d->msg_allochsp_mnum));
660 	allochsp_ioc.mnum = d->msg_allochsp_mnum;
661 	allochsp_ioc.sm = d->msg_allochsp_sm;
662 	allochsp_ioc.comp = d->msg_allochsp_comp;
663 	allochsp_ioc.hs_id = d->msg_allochsp_hs_id;
664 	ret = metaioctl(MD_MN_ALLOCATE_HOTSPARE, &allochsp_ioc,
665 	    &(allochsp_ioc.mde), NULL);
666 	resp->mmr_exitval = ret;
667 }
668 
669 /*
670  * handler for MD_MN_MSG_RESYNC_STARTING,MD_MN_MSG_RESYNC_FIRST,
671  * MD_MN_MSG_RESYNC_NEXT, MD_MN_MSG_RESYNC_FINISH, MD_MN_MSG_RESYNC_PHASE_DONE
672  */
673 /*ARGSUSED*/
674 void
675 mdmn_do_resync(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
676 {
677 	md_mn_msg_resync_t	*d;
678 	md_mn_rs_params_t	respar;
679 	int			ret;
680 	int			smi;
681 
682 	resp->mmr_out_size = 0;
683 	resp->mmr_err_size = 0;
684 	resp->mmr_out = NULL;
685 	resp->mmr_err = NULL;
686 	resp->mmr_comm_state = MDMNE_ACK;
687 	d = (md_mn_msg_resync_t *)((void *)(msg->msg_event_data));
688 
689 	(void) memset(&respar, 0, sizeof (respar));
690 	MD_SETDRIVERNAME(&respar, MD_MIRROR,
691 	    MD_MIN2SET(d->msg_resync_mnum))
692 	respar.msg_type = (int)msg->msg_type;
693 	respar.mnum = d->msg_resync_mnum;
694 	respar.rs_type = d->msg_resync_type;
695 	respar.rs_start = d->msg_resync_start;
696 	respar.rs_size = d->msg_resync_rsize;
697 	respar.rs_done = d->msg_resync_done;
698 	respar.rs_2_do = d->msg_resync_2_do;
699 	respar.rs_originator = d->msg_originator;
700 	respar.rs_flags = d->msg_resync_flags;
701 
702 	for (smi = 0; smi < NMIRROR; smi++) {
703 		respar.rs_sm_state[smi] = d->msg_sm_state[smi];
704 		respar.rs_sm_flags[smi] = d->msg_sm_flags[smi];
705 	}
706 
707 	ret = metaioctl(MD_MN_RESYNC, &respar, &respar.mde, NULL);
708 
709 	resp->mmr_exitval = ret;
710 }
711 
712 /*
713  * handler for MD_MN_MSG_SETSYNC
714  */
715 /*ARGSUSED*/
716 void
717 mdmn_do_setsync(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
718 {
719 	md_mn_msg_setsync_t	*d;
720 	md_resync_ioctl_t	ri;
721 	int			ret;
722 
723 	resp->mmr_out_size = 0;
724 	resp->mmr_err_size = 0;
725 	resp->mmr_out = NULL;
726 	resp->mmr_err = NULL;
727 	resp->mmr_comm_state = MDMNE_ACK;
728 	d = (md_mn_msg_setsync_t *)((void *)(msg->msg_event_data));
729 
730 	(void) memset(&ri, 0, sizeof (ri));
731 	MD_SETDRIVERNAME(&ri, MD_MIRROR, MD_MIN2SET(d->setsync_mnum))
732 	ri.ri_mnum = d->setsync_mnum;
733 	ri.ri_copysize = d->setsync_copysize;
734 	ri.ri_flags = d->setsync_flags;
735 
736 	ret = metaioctl(MD_MN_SETSYNC, &ri, &ri.mde, NULL);
737 
738 	resp->mmr_exitval = ret;
739 }
740 
741 /*
742  * handler for MD_MN_MSG_SET_CAP. As this handler can deal with both mirrors
743  * and soft partitions, the driver name that is required for the ioctl call
744  * is included in the message.
745  */
746 /*ARGSUSED*/
747 void
748 mdmn_do_set_cap(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
749 {
750 	md_mn_msg_setcap_t	*d;
751 	md_mn_setcap_params_t	setcap_ioc;
752 	minor_t			mnum;
753 	int			ret;
754 
755 	resp->mmr_out_size = 0;
756 	resp->mmr_err_size = 0;
757 	resp->mmr_out = NULL;
758 	resp->mmr_err = NULL;
759 	resp->mmr_comm_state = MDMNE_ACK;
760 	d = (md_mn_msg_setcap_t *)((void *)(msg->msg_event_data));
761 	mnum = d->msg_setcap_mnum;
762 
763 	(void) memset(&setcap_ioc, 0, sizeof (setcap_ioc));
764 
765 	MD_SETDRIVERNAME(&setcap_ioc, d->msg_setcap_driver, MD_MIN2SET(mnum));
766 	setcap_ioc.mnum = mnum;
767 	setcap_ioc.sc_set = d->msg_setcap_set;
768 
769 	ret = metaioctl(MD_MN_SET_CAP, &setcap_ioc, &setcap_ioc.mde, NULL);
770 
771 	resp->mmr_exitval = ret;
772 }
773 
774 /*
775  * Dummy handler for various CLASS0 messages like
776  * MD_MN_MSG_VERBOSITY / MD_MN_MSG_RESUME / MD_MN_MSG_SUSPEND ...
777  */
778 /*ARGSUSED*/
779 void
780 mdmn_do_dummy(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
781 {
782 	resp->mmr_out_size = 0;
783 	resp->mmr_err_size = 0;
784 	resp->mmr_out = NULL;
785 	resp->mmr_err = NULL;
786 	resp->mmr_exitval = 0;
787 	resp->mmr_comm_state = MDMNE_ACK;
788 }
789 
790 /*
791  * Overall description of mdcommd support that keeps all nodes in-sync
792  * with the ondisk diskset mddbs.
793  *
794  * All configuration changes to the mddb - addition/deletion of metadevices
795  * or replicas must use a CLASS1 message to block out these changes.
796  * Changes to the state of existing replicas do not need to block CLASS1
797  * since there is no conflict when just updating the state of a replica.
798  *
799  * Error encountered when master writes to mddbs:
800  *	As the master updates parts of the mddbs, flags are updated describing
801  *	what has been written.  When all locks are dropped (either in
802  *	mddb_setexit or mdioctl), a PARSE message will be generated to all
803  *	nodes with an index list of known good mddbs and the parse flags.
804  *	The master node ignores the parse message since it sent it.
805  *	The slave nodes re-read in the changed part of the mddb using the list
806  *	of known good replicas that was passed.
807  *	PARSE message does not block CLASS1.
808  *	The PARSE message must be the highest class message.  Since this
809  *	message could be sent on any ioctl, this PARSE message class must
810  *	be higher than any other class message that could issue an ioctl.
811  *
812  *	Master		Slave1		Slave2
813  * 	Handles_error
814  *	PARSE		PARSE		PARSE
815  *
816  *
817  * Add/Delete mddbs can occur from the following commands:
818  *	metadb -s set_name -a/-d
819  *	metaset -s set_name -a/-d disk
820  *	metaset -s set_name -b
821  *
822  *	The metadb/metaset command is run on the node executing the command
823  *	and sends an ATTACH/DETACH message to the master node blocking CLASS1
824  *	messages on all nodes until this message is finished.  The master
825  *	node generates 3 submessages of BLOCK, SM_ATTACH/SM_DETACH, UNBLOCK.
826  *	The BLOCK message is only run on the master node and will BLOCK
827  *	the PARSE messages from being sent to the nodes.
828  *	The SM_ATTACH/SM_DETACH message is run on all nodes and actually adds or
829  *	removes the replica(s) from the given disk slice.
830  *	The UNBLOCK message is only run on the master node and allows the
831  *	sending of PARSE messages.
832  *
833  *	Master		Slave1		Slave2
834  *			Add mddb cmd
835  *			ATTACH msg to master
836  *	BLOCK
837  *	ATTACH		ATTACH		ATTACH
838  *	UNBLOCK
839  *	PARSE		PARSE		PARSE
840  *	ATTACH msg finished
841  *
842  * Add/Delete host side information from the following commands:
843  *	metaset -s set_name -a/-d -h
844  *
845  *	The metaset command is run on the node executing the command and
846  *	sends a DB_NEWSIDE/DB_DELSIDE message and a MD_NEWSIDE/MD_DELSIDE
847  *	message whenever a host is added to or deleted from the diskset.
848  *
849  *	The side information contains the major name and minor number
850  *	associated with a disk slice from a certain node's perspective
851  *	in a (failed) effort to support clustered systems that don't have the
852  *	same device name for a physical device. (The original designers of
853  *	SVM eventually took the shortcut of assuming that all device names
854  *	are the same on all systems, but left the side information in the
855  *	mddb and namespace.)  The side information is used for disk slices
856  *	that contain mddbs and/or are components for metadevices.
857  *
858  *	The DB_NEWSIDE/DELSIDE command adds or deletes the side information
859  *	for each mddb for the host being added or deleted.
860  *	The MD_ADDSIDE/MD_DELSIDE command adds or deletes the side information
861  *	for all disk slice components that are in the namespace records for
862  *	the host being added or deleted.
863  *
864  *	The DB_NEWSIDE/DB_DELSIDE message does not change any mddb records
865  *	and only needs to be executed on the master node since the slave
866  *	nodes will be brought up to date by the PARSE message that is
867  *	generated as a result of a change to the mddb.
868  *	The MD_ADDSIDE/MD_DELSIDE message does modify the records in the mddb
869  *	and needs to be run on all nodes.  The message must block class1
870  *	messages so that record changing commands don't interfere.
871  *
872  *	Master		Slave1		Slave2
873  *			Add host
874  *			DB_NEWSIDE msg to master
875  *	DB_NEWSIDE
876  *	PARSE		PARSE		PARSE
877  *	DB_NEWSIDE msg finished
878  *			MD_NEWSIDE msg to master
879  *	MD_NEWSIDE	MD_NEWSIDE	MD_NEWSIDE
880  *	MD_NEWSIDE msg finished
881  *
882  *
883  * Optimized resync record failure:
884  *	When any node sees a failure to write an optimized resync record
885  *	that node notifies the master node of the replica that failed.
886  *	The master node handles the error and updates the rest of the
887  *	nodes using a PARSE message.  The PARSE message also calls
888  *	fixoptrecord on each slave node causing each node to fix up
889  * 	the optimized resync records that are owned by that node (the mirror
890  *	owner code also sets the optimized resync record owner).  The master
891  *	node will fix up all optimized resync records that have no owner or
892  *	are owned by the master node.
893  *
894  *	Master		Slave1		Slave2
895  *					Optimized Record Failure
896  *					OPTRECERR msg to master
897  *	Master handles opt rec failure
898  *	PARSE		PARSE		PARSE
899  *	OPTRECERR msg finished
900  *					Slave rewrites optimized record
901  *
902  */
903 
904 /*
905  * Handler for MD_MN_MSG_MDDB_PARSE which send parse messages to the
906  * slave nodes in order to keep the incore view of the mddbs the
907  * same on all nodes.
908  *
909  * Since master node generated the mddb parse message, do nothing
910  * if this is the master node.
911  *
912  * If this is a slave node, send the parse message down to the kernel
913  * where this node will re-read in parts of the mddbs.
914  *
915  */
916 void
917 mdmn_do_mddb_parse(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
918 {
919 	md_mn_msg_mddb_parse_t	*d;
920 	mddb_parse_parm_t	mpp;
921 	int			ret = 0;
922 	int			i;
923 
924 	resp->mmr_out_size = 0;
925 	resp->mmr_err_size = 0;
926 	resp->mmr_out = NULL;
927 	resp->mmr_err = NULL;
928 	resp->mmr_comm_state = MDMNE_ACK;
929 	d = (md_mn_msg_mddb_parse_t *)((void *)(msg->msg_event_data));
930 
931 	if (flags & MD_MSGF_ON_MASTER)
932 		return;
933 
934 	(void) memset(&mpp, 0, sizeof (mpp));
935 	mpp.c_setno = msg->msg_setno;
936 	mpp.c_parse_flags = d->msg_parse_flags;
937 	for (i = 0; i < MDDB_NLB; i++) {
938 		mpp.c_lb_flags[i] = d->msg_lb_flags[i];
939 	}
940 	ret = metaioctl(MD_MN_MDDB_PARSE, &mpp, &mpp.c_mde, NULL);
941 	if (ret)
942 		(void) mdstealerror(&(resp->mmr_ep), &mpp.c_mde);
943 
944 	resp->mmr_exitval = ret;
945 }
946 
947 /*
948  * Handler for MD_MN_MSG_MDDB_BLOCK which blocks the generation
949  * of parse messages from this node.
950  *
951  * This is needed when attaching/detaching mddbs on the master and the
952  * slave node is unable to handle a parse message until the slave node
953  * has done the attach/detach of the mddbs.  So, master node will block
954  * the parse messages, execute the attach/detach on all nodes and
955  * then unblock the parse messages which causes the parse message to
956  * be sent to all nodes.
957  */
958 /*ARGSUSED*/
959 void
960 mdmn_do_mddb_block(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
961 {
962 	md_mn_msg_mddb_block_t	*d;
963 	mddb_block_parm_t	mbp;
964 	int			ret;
965 
966 	resp->mmr_out_size = 0;
967 	resp->mmr_err_size = 0;
968 	resp->mmr_out = NULL;
969 	resp->mmr_err = NULL;
970 	resp->mmr_comm_state = MDMNE_ACK;
971 	d = (md_mn_msg_mddb_block_t *)((void *)(msg->msg_event_data));
972 
973 	(void) memset(&mbp, 0, sizeof (mbp));
974 	mbp.c_setno = msg->msg_setno;
975 	mbp.c_blk_flags = d->msg_block_flags;
976 	ret = metaioctl(MD_MN_MDDB_BLOCK, &mbp, &mbp.c_mde, NULL);
977 	if (ret)
978 		(void) mdstealerror(&(resp->mmr_ep), &mbp.c_mde);
979 
980 	resp->mmr_exitval = ret;
981 }
982 
983 /*
984  * Submessage generator for MD_MN_MSG_META_DB_ATTACH which generates
985  * a BLOCK message on the master node only, a MD_MN_MSG_SM_MDDB_ATTACH
986  * message on all nodes and then an UNBLOCK message on the master only.
987  */
988 int
989 mdmn_smgen_mddb_attach(md_mn_msg_t *msg, md_mn_msg_t *msglist[])
990 {
991 	md_mn_msg_t			*nmsg;
992 	md_mn_msg_meta_db_attach_t	*d;
993 	md_mn_msg_meta_db_attach_t	*attach_d;
994 	md_mn_msg_mddb_block_t		*block_d;
995 
996 	d = (md_mn_msg_meta_db_attach_t *)(void *)msg->msg_event_data;
997 
998 	nmsg = Zalloc(sizeof (md_mn_msg_t));
999 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1000 
1001 	nmsg->msg_flags		= (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST);
1002 	nmsg->msg_setno		= msg->msg_setno;
1003 	nmsg->msg_type		= MD_MN_MSG_MDDB_BLOCK;
1004 	nmsg->msg_event_size	= sizeof (md_mn_msg_mddb_block_t);
1005 	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_mddb_block_t));
1006 	block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data;
1007 	block_d->msg_block_flags = MDDB_BLOCK_PARSE;
1008 	msglist[0] = nmsg;
1009 
1010 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1011 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1012 
1013 	/* Don't log submessages and panic on inconsistent results */
1014 	nmsg->msg_flags		= MD_MSGF_NO_LOG |
1015 				    MD_MSGF_PANIC_WHEN_INCONSISTENT;
1016 	nmsg->msg_setno		= msg->msg_setno;
1017 	nmsg->msg_type		= MD_MN_MSG_SM_MDDB_ATTACH;
1018 	nmsg->msg_event_size	= sizeof (md_mn_msg_meta_db_attach_t);
1019 	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_meta_db_attach_t));
1020 	attach_d = (md_mn_msg_meta_db_attach_t *)
1021 			(void *)nmsg->msg_event_data;
1022 	attach_d->msg_l_dev = d->msg_l_dev;
1023 	attach_d->msg_cnt = d->msg_cnt;
1024 	attach_d->msg_dbsize = d->msg_dbsize;
1025 	(void) strncpy(attach_d->msg_dname, d->msg_dname, 16);
1026 	attach_d->msg_splitname = d->msg_splitname;
1027 	attach_d->msg_options = d->msg_options;
1028 	msglist[1] = nmsg;
1029 
1030 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1031 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1032 
1033 	nmsg->msg_flags		= (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST);
1034 	nmsg->msg_setno		= msg->msg_setno;
1035 	nmsg->msg_type		= MD_MN_MSG_MDDB_BLOCK;
1036 	nmsg->msg_event_size	= sizeof (md_mn_msg_mddb_block_t);
1037 	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_mddb_block_t));
1038 	block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data;
1039 	block_d->msg_block_flags = MDDB_UNBLOCK_PARSE;
1040 	msglist[2] = nmsg;
1041 
1042 	return (3); /* Return the number of submessages generated */
1043 }
1044 
1045 /*
1046  * Submessage generator for MD_MN_MSG_META_DB_DETACH which generates
1047  * a BLOCK message on the master node only, a MD_MN_MSG_SM_MDDB_DETACH
1048  * message on all nodes and then an UNBLOCK message on the master only.
1049  */
1050 int
1051 mdmn_smgen_mddb_detach(md_mn_msg_t *msg, md_mn_msg_t *msglist[])
1052 {
1053 	md_mn_msg_t			*nmsg;
1054 	md_mn_msg_meta_db_detach_t	*d;
1055 	md_mn_msg_meta_db_detach_t	*detach_d;
1056 	md_mn_msg_mddb_block_t		*block_d;
1057 
1058 	d = (md_mn_msg_meta_db_detach_t *)(void *)msg->msg_event_data;
1059 
1060 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1061 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1062 
1063 	nmsg->msg_flags		= (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST);
1064 	nmsg->msg_setno		= msg->msg_setno;
1065 	nmsg->msg_type		= MD_MN_MSG_MDDB_BLOCK;
1066 	nmsg->msg_event_size	= sizeof (md_mn_msg_mddb_block_t);
1067 	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_mddb_block_t));
1068 	block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data;
1069 	block_d->msg_block_flags = MDDB_BLOCK_PARSE;
1070 	msglist[0] = nmsg;
1071 
1072 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1073 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1074 
1075 	/* Don't log submessages and panic on inconsistent results */
1076 	nmsg->msg_flags		= MD_MSGF_NO_LOG |
1077 				    MD_MSGF_PANIC_WHEN_INCONSISTENT;
1078 	nmsg->msg_setno		= msg->msg_setno;
1079 	nmsg->msg_type		= MD_MN_MSG_SM_MDDB_DETACH;
1080 	nmsg->msg_event_size	= sizeof (md_mn_msg_meta_db_detach_t);
1081 	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_meta_db_detach_t));
1082 	detach_d = (md_mn_msg_meta_db_detach_t *)
1083 			(void *)nmsg->msg_event_data;
1084 	detach_d->msg_splitname = d->msg_splitname;
1085 	msglist[1] = nmsg;
1086 
1087 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1088 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1089 
1090 	nmsg->msg_flags		= (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST);
1091 	nmsg->msg_setno		= msg->msg_setno;
1092 	nmsg->msg_type		= MD_MN_MSG_MDDB_BLOCK;
1093 	nmsg->msg_event_size	= sizeof (md_mn_msg_mddb_block_t);
1094 	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_mddb_block_t));
1095 	block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data;
1096 	block_d->msg_block_flags = MDDB_UNBLOCK_PARSE;
1097 	msglist[2] = nmsg;
1098 
1099 	return (3); /* Return the number of submessages generated */
1100 }
1101 
1102 /*
1103  * Handler for MD_MN_MSG_SM_MDDB_ATTACH which is used to attach mddbs.
1104  *
1105  * Used when running:
1106  *	metadb -s set_name -a
1107  * 	metaset -s set_name -a/-d disk
1108  *	metaset -s set_name -b
1109  */
1110 /*ARGSUSED*/
1111 void
1112 mdmn_do_sm_mddb_attach(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1113 {
1114 	md_mn_msg_meta_db_attach_t	*d;
1115 	struct mddb_config		c;
1116 	int				i;
1117 	int				ret = 0;
1118 	md_error_t			ep = mdnullerror;
1119 	char				*name, *add_name;
1120 	mdname_t			*np;
1121 	mdsetname_t			*sp;
1122 
1123 	resp->mmr_out_size = 0;
1124 	resp->mmr_err_size = 0;
1125 	resp->mmr_out = NULL;
1126 	resp->mmr_err = NULL;
1127 	resp->mmr_comm_state = MDMNE_ACK;
1128 	d = (md_mn_msg_meta_db_attach_t *)((void *)(msg->msg_event_data));
1129 
1130 	(void) memset(&c, 0, sizeof (c));
1131 	c.c_setno = msg->msg_setno;
1132 	c.c_locator.l_dev = meta_cmpldev(d->msg_l_dev);
1133 	(void) strncpy(c.c_locator.l_driver, d->msg_dname,
1134 		sizeof (c.c_locator.l_driver));
1135 	c.c_devname = d->msg_splitname;
1136 	c.c_locator.l_mnum = meta_getminor(d->msg_l_dev);
1137 	c.c_multi_node = 1;
1138 	if ((sp = metasetnosetname(c.c_setno, &ep)) == NULL) {
1139 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1140 		resp->mmr_exitval = -1;
1141 		return;
1142 	}
1143 	(void) strcpy(c.c_setname, sp->setname);
1144 	c.c_sideno = getmyside(sp, &ep);
1145 	if (c.c_sideno == MD_SIDEWILD) {
1146 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1147 		resp->mmr_exitval = -1;
1148 		return;
1149 	}
1150 
1151 	name = splicename(&d->msg_splitname);
1152 	if ((np = metaname(&sp, name, &ep)) == NULL) {
1153 		Free(name);
1154 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1155 		resp->mmr_exitval = -1;
1156 		return;
1157 	}
1158 	/*
1159 	 * All nodes in MN diskset must do meta_check_replica
1160 	 * since this causes the shared namespace to be
1161 	 * populated by the md driver names while checking
1162 	 * to see if this device is already in use as a
1163 	 * metadevice.
1164 	 */
1165 	if (meta_check_replica(sp, np, d->msg_options, 0,
1166 	    (d->msg_cnt * d->msg_dbsize), &ep)) {
1167 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1168 		resp->mmr_exitval = -1;
1169 		return;
1170 	}
1171 
1172 	for (i = 0; i < d->msg_cnt; i++) {
1173 		c.c_locator.l_blkno = i * d->msg_dbsize + 16;
1174 		if (setup_med_cfg(sp, &c,
1175 		    (d->msg_options & MDCHK_SET_FORCE), &ep)) {
1176 			ret = -1;
1177 			(void) mdstealerror(&(resp->mmr_ep), &ep);
1178 			break;
1179 		}
1180 		ret = metaioctl(MD_DB_NEWDEV, &c, &c.c_mde, NULL);
1181 		/* If newdev was successful, continue with attach */
1182 		if (ret == 0) {
1183 			if (meta_db_addsidenms(sp, np, c.c_locator.l_blkno,
1184 			    DB_ADDSIDENMS_NO_BCAST, &ep)) {
1185 				ret = -1;
1186 				(void) mdstealerror(&(resp->mmr_ep), &ep);
1187 				break;
1188 			}
1189 		} else {
1190 			(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1191 			break;
1192 		}
1193 	}
1194 	add_name = splicename(&d->msg_splitname);
1195 	if ((np = metaname(&sp, add_name, &ep)) != NULL) {
1196 		meta_invalidate_name(np);
1197 	} else {
1198 		ret = -1;
1199 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1200 	}
1201 	Free(add_name);
1202 
1203 	resp->mmr_exitval = ret;
1204 }
1205 
1206 /*
1207  * Handler for MD_MN_MSG_SM_MDDB_DETACH which is used to detach mddbs.
1208  *
1209  * Used when running:
1210  *	metadb -s set_name -d
1211  * 	metaset -s set_name -a/-d disk
1212  *	metaset -s set_name -b
1213  */
1214 /*ARGSUSED*/
1215 void
1216 mdmn_do_sm_mddb_detach(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1217 {
1218 	md_mn_msg_meta_db_detach_t	*d;
1219 	struct mddb_config		c;
1220 	int				i;
1221 	int				ret = 0;
1222 	md_error_t			ep = mdnullerror;
1223 	char				*name, *del_name;
1224 	mdname_t			*np;
1225 	mdsetname_t			*sp;
1226 
1227 	resp->mmr_out_size = 0;
1228 	resp->mmr_err_size = 0;
1229 	resp->mmr_out = NULL;
1230 	resp->mmr_err = NULL;
1231 	resp->mmr_comm_state = MDMNE_ACK;
1232 	d = (md_mn_msg_meta_db_detach_t *)((void *)(msg->msg_event_data));
1233 
1234 	if ((sp = metasetnosetname(msg->msg_setno, &ep)) == NULL) {
1235 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1236 		resp->mmr_exitval = -1;
1237 		return;
1238 	}
1239 
1240 	(void) memset(&c, 0, sizeof (c));
1241 	c.c_setno = msg->msg_setno;
1242 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1243 		resp->mmr_exitval = -1;
1244 		(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1245 		return;
1246 	}
1247 	i = 0;
1248 	del_name = splicename(&d->msg_splitname);
1249 	while (i < c.c_dbcnt) {
1250 		c.c_id = i;
1251 		if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1252 			ret = -1;
1253 			(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1254 			break;
1255 		}
1256 		name = splicename(&c.c_devname);
1257 		if (strcmp(name, del_name) != 0) {
1258 			Free(name);
1259 			i++;
1260 			continue;
1261 		}
1262 		Free(name);
1263 		/* Found a match - delete mddb */
1264 		if (metaioctl(MD_DB_DELDEV, &c, &c.c_mde, NULL) != 0) {
1265 			ret = -1;
1266 			(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1267 			break;
1268 		}
1269 		/* Not incrementing "i" intentionally (dbcnt is changed) */
1270 	}
1271 	if ((np = metaname(&sp, del_name, &ep)) != NULL) {
1272 		meta_invalidate_name(np);
1273 	} else {
1274 		ret = -1;
1275 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1276 	}
1277 	Free(del_name);
1278 
1279 	resp->mmr_exitval = ret;
1280 }
1281 
1282 /*
1283  * Handler for MD_MN_MSG_META_DB_NEWSIDE which is used to update the
1284  * side information for each diskset mddb when a new host has been
1285  * added to the diskset.  The side information is the /dev/dsk/ctds name
1286  * that the new node would use to access each mddb.
1287  *
1288  * Since this routine makes no changes to the records in the diskset mddb,
1289  * this routine only needs to be run on the master node.  The master node's
1290  * kernel code will detect that portions of the mddb have changed and
1291  * will send a parse message to all nodes to re-parse parts of the mddb.
1292  *
1293  * Used when running:
1294  * 	metaset -s set_name -a -h new_hostname
1295  */
1296 /*ARGSUSED*/
1297 void
1298 mdmn_do_meta_db_newside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1299 {
1300 	md_mn_msg_meta_db_newside_t	*d;
1301 	struct mddb_config		c;
1302 	int				ret = 0;
1303 	mdsetname_t			*sp;
1304 	md_error_t			ep = mdnullerror;
1305 
1306 	resp->mmr_out_size = 0;
1307 	resp->mmr_err_size = 0;
1308 	resp->mmr_out = NULL;
1309 	resp->mmr_err = NULL;
1310 	resp->mmr_comm_state = MDMNE_ACK;
1311 	d = (md_mn_msg_meta_db_newside_t *)((void *)(msg->msg_event_data));
1312 
1313 	(void) memset(&c, 0, sizeof (c));
1314 	c.c_setno = msg->msg_setno;
1315 	c.c_locator.l_dev = meta_cmpldev(d->msg_l_dev);
1316 	c.c_locator.l_blkno = d->msg_blkno;
1317 	(void) strncpy(c.c_locator.l_driver, d->msg_dname,
1318 		sizeof (c.c_locator.l_driver));
1319 	c.c_devname = d->msg_splitname;
1320 	c.c_locator.l_mnum = d->msg_mnum;
1321 	c.c_multi_node = 1;
1322 	if ((sp = metasetnosetname(c.c_setno, &ep)) == NULL) {
1323 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1324 		resp->mmr_exitval = -1;
1325 		return;
1326 	}
1327 	(void) strcpy(c.c_setname, sp->setname);
1328 	c.c_sideno = d->msg_sideno;
1329 
1330 	if ((ret = metaioctl(MD_DB_NEWSIDE, &c, &c.c_mde, NULL)) != 0) {
1331 		(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1332 	}
1333 	resp->mmr_exitval = ret;
1334 }
1335 
1336 /*
1337  * Handler for MD_MN_MSG_META_DB_DELSIDE which is used to remove the
1338  * side information for each diskset mddb when a host has been
1339  * deleted from the diskset.  The side information is the /dev/dsk/ctds name
1340  * that the node would use to access each mddb.
1341  *
1342  * Since this routine makes no changes to the records in the diskset mddb,
1343  * this routine only needs to be run on the master node.  The master node's
1344  * kernel code will detect that portions of the mddb have changed and
1345  * will send a parse message to all nodes to re-parse parts of the mddb.
1346  *
1347  * Used when running:
1348  * 	metaset -s set_name -d -h hostname
1349  */
1350 /*ARGSUSED*/
1351 void
1352 mdmn_do_meta_db_delside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1353 {
1354 	md_mn_msg_meta_db_delside_t	*d;
1355 	mddb_config_t			c;
1356 	int				ret = 0;
1357 	mdsetname_t			*sp;
1358 	md_error_t			ep = mdnullerror;
1359 
1360 	resp->mmr_out_size = 0;
1361 	resp->mmr_err_size = 0;
1362 	resp->mmr_out = NULL;
1363 	resp->mmr_err = NULL;
1364 	resp->mmr_comm_state = MDMNE_ACK;
1365 	d = (md_mn_msg_meta_db_delside_t *)((void *)(msg->msg_event_data));
1366 
1367 	(void) memset(&c, 0, sizeof (c));
1368 	c.c_setno = msg->msg_setno;
1369 	c.c_locator.l_dev = meta_cmpldev(d->msg_l_dev);
1370 	c.c_locator.l_blkno = d->msg_blkno;
1371 	c.c_multi_node = 1;
1372 	if ((sp = metasetnosetname(c.c_setno, &ep)) == NULL) {
1373 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1374 		resp->mmr_exitval = -1;
1375 		return;
1376 	}
1377 	(void) strcpy(c.c_setname, sp->setname);
1378 	c.c_sideno = d->msg_sideno;
1379 
1380 	if ((ret = metaioctl(MD_DB_DELSIDE, &c, &c.c_mde, NULL)) != 0) {
1381 		(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1382 	}
1383 	resp->mmr_exitval = ret;
1384 }
1385 
1386 /*
1387  * Handler for MD_MN_MSG_META_MD_ADDSIDE which is used to add the
1388  * side information for each diskset metadevice component (if that
1389  * component is a disk) when a host has been added to the diskset.
1390  * The side information is the /dev/dsk/ctds name that the node would
1391  * use to access the metadevice component.
1392  *
1393  * This routine makes changes to the mddb records and must be run
1394  * on all nodes.
1395  *
1396  * Used when running:
1397  * 	metaset -s set_name -a -h new_hostname
1398  */
1399 /*ARGSUSED*/
1400 void
1401 mdmn_do_meta_md_addside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1402 {
1403 	md_mn_msg_meta_md_addside_t	*d;
1404 	mdnm_params_t			nm;
1405 	mdsetname_t			*sp;
1406 	char				*cname, *dname;
1407 	minor_t				mnum;
1408 	int				done, i;
1409 	md_error_t			ep = mdnullerror;
1410 
1411 	resp->mmr_out_size = 0;
1412 	resp->mmr_err_size = 0;
1413 	resp->mmr_out = NULL;
1414 	resp->mmr_err = NULL;
1415 	resp->mmr_comm_state = MDMNE_ACK;
1416 	d = (md_mn_msg_meta_md_addside_t *)((void *)(msg->msg_event_data));
1417 
1418 	(void) memset(&nm, 0, sizeof (nm));
1419 	if ((sp = metasetnosetname(msg->msg_setno, &ep)) == NULL) {
1420 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1421 		resp->mmr_exitval = -1;
1422 		return;
1423 	}
1424 	/* While loop continues until IOCNXTKEY_NM gives nm.key of KEYWILD */
1425 	/*CONSTCOND*/
1426 	while (1) {
1427 		nm.mde = mdnullerror;
1428 		nm.setno = msg->msg_setno;
1429 		nm.side = d->msg_otherside;
1430 		if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0) {
1431 			(void) mdstealerror(&(resp->mmr_ep), &nm.mde);
1432 			resp->mmr_exitval = -1;
1433 			return;
1434 		}
1435 
1436 		/* Normal exit path is to eventually get a KEYWILD */
1437 		if (nm.key == MD_KEYWILD) {
1438 			resp->mmr_exitval = 0;
1439 			return;
1440 		}
1441 
1442 		nm.devname = (uintptr_t)meta_getnmbykey(msg->msg_setno,
1443 			d->msg_otherside, nm.key, &ep);
1444 		if (nm.devname == NULL) {
1445 			(void) mdstealerror(&(resp->mmr_ep), &ep);
1446 			resp->mmr_exitval = -1;
1447 			return;
1448 		}
1449 		nm.side = d->msg_sideno;
1450 		if ((done = meta_getside_devinfo(sp,
1451 		    (char *)(uintptr_t)nm.devname,
1452 		    d->msg_sideno, &cname, &dname, &mnum, &ep)) == -1) {
1453 			(void) mdstealerror(&(resp->mmr_ep), &ep);
1454 			Free((void *)(uintptr_t)nm.devname);
1455 			resp->mmr_exitval = -1;
1456 			return;
1457 		}
1458 		Free((void *)(uintptr_t)nm.devname);
1459 		if (done != 1) {
1460 			Free(cname);
1461 			Free(dname);
1462 			resp->mmr_exitval = -1;
1463 			return;
1464 		}
1465 
1466 		/*
1467 		 * The device reference count can be greater than 1 if
1468 		 * more than one softpart is configured on top of the
1469 		 * same device.  If this is the case then we want to
1470 		 * increment the count to sync up with the other sides.
1471 		 */
1472 		for (i = 0; i < nm.ref_count; i++) {
1473 			if (add_name(sp, d->msg_sideno, nm.key, dname, mnum,
1474 			    cname, &ep) == -1) {
1475 				(void) mdstealerror(&(resp->mmr_ep), &ep);
1476 				Free(cname);
1477 				Free(dname);
1478 				resp->mmr_exitval = -1;
1479 				return;
1480 			}
1481 		}
1482 		Free(cname);
1483 		Free(dname);
1484 	}
1485 
1486 	/*NOTREACHED*/
1487 }
1488 /*
1489  * Handler for MD_MN_MSG_META_MD_DELSIDE which is used to delete the
1490  * side information for each diskset metadevice component (if that
1491  * component is a disk) when a host has been removed from the diskset.
1492  * The side information is the /dev/dsk/ctds name that the node would
1493  * use to access the metadevice component.
1494  *
1495  * This routine makes changes to the mddb records and must be run
1496  * on all nodes.
1497  *
1498  * Used when running:
1499  * 	metaset -s set_name -d -h hostname
1500  */
1501 /*ARGSUSED*/
1502 void
1503 mdmn_do_meta_md_delside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1504 {
1505 	md_mn_msg_meta_md_delside_t	*d;
1506 	mdnm_params_t			nm;
1507 	mdsetname_t			*sp;
1508 	md_error_t			ep = mdnullerror;
1509 	int				i;
1510 
1511 	resp->mmr_out_size = 0;
1512 	resp->mmr_err_size = 0;
1513 	resp->mmr_out = NULL;
1514 	resp->mmr_err = NULL;
1515 	resp->mmr_comm_state = MDMNE_ACK;
1516 	d = (md_mn_msg_meta_md_delside_t *)((void *)(msg->msg_event_data));
1517 
1518 	if ((sp = metasetnosetname(msg->msg_setno, &ep)) == NULL) {
1519 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1520 		resp->mmr_exitval = -1;
1521 		return;
1522 	}
1523 
1524 	(void) memset(&nm, 0, sizeof (nm));
1525 	nm.key = MD_KEYWILD;
1526 	/*CONSTCOND*/
1527 	while (1) {
1528 		nm.mde = mdnullerror;
1529 		nm.setno = msg->msg_setno;
1530 		nm.side = MD_SIDEWILD;
1531 		if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0) {
1532 			(void) mdstealerror(&(resp->mmr_ep), &nm.mde);
1533 			resp->mmr_exitval = -1;
1534 			return;
1535 		}
1536 
1537 		/* Normal exit path is to eventually get a KEYWILD */
1538 		if (nm.key == MD_KEYWILD) {
1539 			resp->mmr_exitval = 0;
1540 			return;
1541 		}
1542 
1543 		/*
1544 		 * The device reference count can be greater than 1 if
1545 		 * more than one softpart is configured on top of the
1546 		 * same device.  If this is the case then we want to
1547 		 * decrement the count to zero so the entry can be
1548 		 * actually removed.
1549 		 */
1550 		for (i = 0; i < nm.ref_count; i++) {
1551 			if (del_name(sp, d->msg_sideno, nm.key, &ep) == -1) {
1552 				(void) mdstealerror(&(resp->mmr_ep), &ep);
1553 				resp->mmr_exitval = -1;
1554 				return;
1555 			}
1556 		}
1557 	}
1558 
1559 	/*NOTREACHED*/
1560 }
1561 
1562 /*
1563  * Handler for MD_MN_MSG_MDDB_OPTRECERR which is used to notify
1564  * the master node that a node has seen an error when attempting to
1565  * write to the optimized resync records that reside on 2 of the diskset
1566  * mddbs.  Master node will mark the failed replica in error and this
1567  * will send a parse message to all nodes to re-read parts of the mddb
1568  * and to fix their optimized resync records based on this information.
1569  */
1570 /*ARGSUSED*/
1571 void
1572 mdmn_do_mddb_optrecerr(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1573 {
1574 	md_mn_msg_mddb_optrecerr_t	*d;
1575 	mddb_optrec_parm_t		mop;
1576 	int				ret;
1577 	int				i;
1578 
1579 	resp->mmr_out_size = 0;
1580 	resp->mmr_err_size = 0;
1581 	resp->mmr_out = NULL;
1582 	resp->mmr_err = NULL;
1583 	resp->mmr_comm_state = MDMNE_ACK;
1584 	d = (md_mn_msg_mddb_optrecerr_t *)((void *)(msg->msg_event_data));
1585 
1586 	(void) memset(&mop, 0, sizeof (mop));
1587 	mop.c_setno = msg->msg_setno;
1588 	for (i = 0; i < 2; i++) {
1589 		mop.c_recerr[i] = d->msg_recerr[i];
1590 	}
1591 	ret = metaioctl(MD_MN_MDDB_OPTRECFIX, &mop, &mop.c_mde, NULL);
1592 	if (ret)
1593 		(void) mdstealerror(&(resp->mmr_ep), &mop.c_mde);
1594 
1595 	resp->mmr_exitval = ret;
1596 }
1597 
1598 int
1599 mdmn_smgen_test6(md_mn_msg_t *msg, md_mn_msg_t **msglist)
1600 {
1601 	md_mn_msg_t	*nmsg;
1602 
1603 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1604 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1605 
1606 	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
1607 	nmsg->msg_setno		= msg->msg_setno;
1608 	nmsg->msg_type		= MD_MN_MSG_TEST2;
1609 	nmsg->msg_event_size	= sizeof ("test2");
1610 	nmsg->msg_event_data	= Strdup("test2");
1611 	msglist[0] = nmsg;
1612 
1613 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1614 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1615 
1616 	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
1617 	nmsg->msg_setno		= msg->msg_setno;
1618 	nmsg->msg_type		= MD_MN_MSG_TEST2;
1619 	nmsg->msg_event_size	= sizeof ("test2");
1620 	nmsg->msg_event_data	= Strdup("test2");
1621 	msglist[1] = nmsg;
1622 
1623 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1624 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1625 
1626 	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
1627 	nmsg->msg_setno		= msg->msg_setno;
1628 	nmsg->msg_type		= MD_MN_MSG_TEST3;
1629 	nmsg->msg_event_size	= sizeof ("test3");
1630 	nmsg->msg_event_data	= Strdup("test3");
1631 	msglist[2] = nmsg;
1632 
1633 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1634 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1635 
1636 	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
1637 	nmsg->msg_setno		= msg->msg_setno;
1638 	nmsg->msg_type		= MD_MN_MSG_TEST4;
1639 	nmsg->msg_event_size	= sizeof ("test4");
1640 	nmsg->msg_event_data	= Strdup("test4");
1641 	msglist[3] = nmsg;
1642 
1643 	return (4); /* Return the number of submessages generated */
1644 }
1645 
1646 /*
1647  * This is to send an MD_IOCSET ioctl to all nodes to create a soft
1648  * partition.
1649  */
1650 /*ARGSUSED*/
1651 void
1652 mdmn_do_iocset(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1653 {
1654 	md_mn_msg_iocset_t	*d;
1655 	int			ret;
1656 	set_t			setno;
1657 	mdsetname_t		*sp;
1658 	mdname_t		*np;
1659 	md_error_t		mde = mdnullerror;
1660 
1661 	resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1662 	resp->mmr_out_size = 0;
1663 	resp->mmr_err_size = 0;
1664 	resp->mmr_out = NULL;
1665 	resp->mmr_err = NULL;
1666 	d = (md_mn_msg_iocset_t *)(void *)msg->msg_event_data;
1667 
1668 	setno = MD_MIN2SET(d->iocset_params.mnum);
1669 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
1670 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1671 		    "MD_MN_MSG_IOCSET: Invalid setno %d\n"), setno);
1672 		resp->mmr_exitval = 1;
1673 		return;
1674 	}
1675 
1676 	if ((np = metamnumname(&sp, d->iocset_params.mnum, 1, &mde)) == NULL) {
1677 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1678 		    "MD_MN_MSG_IOCSET: Invalid mnum %d\n"),
1679 		    d->iocset_params.mnum);
1680 		resp->mmr_exitval = 1;
1681 		return;
1682 	}
1683 
1684 	if (meta_init_make_device(&sp, np->cname, &mde) == -1) {
1685 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1686 		    "MD_MN_MSG_IOCSET: Invalid metadevice name %s\n"),
1687 		    np->cname);
1688 		resp->mmr_exitval = 1;
1689 		return;
1690 	}
1691 
1692 	d->iocset_params.mdp = (uintptr_t)&d->unit; /* set pointer to unit */
1693 	ret = metaioctl(MD_IOCSET, &(d->iocset_params), &mde, np->cname);
1694 	resp->mmr_exitval = ret;
1695 }
1696 
1697 /*
1698  * This is to update the status of a softpart
1699  */
1700 /*ARGSUSED*/
1701 void
1702 mdmn_do_sp_setstat(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1703 {
1704 	md_mn_msg_sp_setstat_t	*d;
1705 	int			ret;
1706 	set_t			setno;
1707 	mdsetname_t		*sp;
1708 	minor_t			mnum;
1709 	md_error_t		mde = mdnullerror;
1710 
1711 	resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1712 	resp->mmr_out_size = 0;
1713 	resp->mmr_err_size = 0;
1714 	resp->mmr_out = NULL;
1715 	resp->mmr_err = NULL;
1716 	d = (md_mn_msg_sp_setstat_t *)(void *)msg->msg_event_data;
1717 
1718 	mnum = d->sp_setstat_mnum;
1719 	setno = MD_MIN2SET(mnum);
1720 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
1721 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1722 		    "MD_MN_MSG_IOCSET: Invalid setno %d\n"), setno);
1723 		resp->mmr_exitval = 1;
1724 		return;
1725 	}
1726 
1727 	ret = meta_sp_setstatus(sp, &mnum, 1, d->sp_setstat_status, &mde);
1728 	resp->mmr_exitval = ret;
1729 }
1730 
1731 /*
1732  * This is to add a key to the namespace
1733  */
1734 /*ARGSUSED*/
1735 void
1736 mdmn_do_addkeyname(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1737 {
1738 	md_mn_msg_addkeyname_t	*d;
1739 	int			ret;
1740 	set_t			setno;
1741 	mdsetname_t		*sp;
1742 	md_error_t		mde = mdnullerror;
1743 	mdname_t		*compnp;
1744 
1745 	resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1746 	resp->mmr_out_size = 0;
1747 	resp->mmr_err_size = 0;
1748 	resp->mmr_out = NULL;
1749 	resp->mmr_err = NULL;
1750 	d = (md_mn_msg_addkeyname_t *)(void *)msg->msg_event_data;
1751 
1752 	setno = d->addkeyname_setno;
1753 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
1754 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1755 		    "MD_MN_ADDKEYNAME: Invalid setno %d\n"), setno);
1756 		resp->mmr_exitval = -1;
1757 		return;
1758 	}
1759 
1760 	compnp = metaname(&sp, d->addkeyname_name, &mde);
1761 	if (compnp != NULL) {
1762 		ret = add_key_name(sp, compnp, NULL, &mde);
1763 		if (ret < 0)
1764 			resp->mmr_exitval = -1;
1765 		else
1766 			resp->mmr_exitval = compnp->key;
1767 	} else {
1768 		resp->mmr_exitval = -1;
1769 	}
1770 }
1771 
1772 /*
1773  * This is to delete a key from the namespace
1774  */
1775 /*ARGSUSED*/
1776 void
1777 mdmn_do_delkeyname(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1778 {
1779 	md_mn_msg_delkeyname_t	*d;
1780 	int			ret;
1781 	set_t			setno;
1782 	mdsetname_t		*sp;
1783 	md_error_t		mde = mdnullerror;
1784 	mdname_t		*compnp;
1785 
1786 	resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1787 	resp->mmr_out_size = 0;
1788 	resp->mmr_err_size = 0;
1789 	resp->mmr_out = NULL;
1790 	resp->mmr_err = NULL;
1791 	d = (md_mn_msg_delkeyname_t *)(void *)msg->msg_event_data;
1792 
1793 	setno = d->delkeyname_setno;
1794 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
1795 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1796 		    "MD_MN_DELKEYNAME: Invalid setno %d\n"), setno);
1797 		resp->mmr_exitval = -1;
1798 		return;
1799 	}
1800 
1801 	compnp = metadevname(&sp, d->delkeyname_dev, &mde);
1802 	if (compnp != NULL) {
1803 		/*
1804 		 * Reset the key value for the name. This is required because
1805 		 * any previous call of del_key_name for the same component
1806 		 * will have resulted in the key value being reset to MD_KEYBAD
1807 		 * even though there may still be references to this component.
1808 		 */
1809 		compnp->key = d->delkeyname_key;
1810 		ret = del_key_name(sp, compnp, &mde);
1811 		resp->mmr_exitval = ret;
1812 	} else {
1813 		resp->mmr_exitval = -1;
1814 	}
1815 }
1816 
1817 /*
1818  * This is to get the value of tstate from the master node. We use this
1819  * to get the ABR state of a metadevice from the master.
1820  */
1821 /*ARGSUSED*/
1822 void
1823 mdmn_do_get_tstate(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1824 {
1825 	md_mn_msg_gettstate_t	*d;
1826 	int			ret;
1827 	uint_t			tstate;
1828 	md_error_t		mde = mdnullerror;
1829 
1830 	resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1831 	resp->mmr_out_size = 0;
1832 	resp->mmr_err_size = 0;
1833 	resp->mmr_out = NULL;
1834 	resp->mmr_err = NULL;
1835 	d = (md_mn_msg_gettstate_t *)(void *)msg->msg_event_data;
1836 
1837 	ret = meta_get_tstate(d->gettstate_dev, &tstate, &mde);
1838 	if (ret != 0) {
1839 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1840 		    "MD_MN_GET_TSTATE: Invalid dev %llx\n"), d->gettstate_dev);
1841 		tstate = 0;
1842 	}
1843 	resp->mmr_exitval = tstate;
1844 }
1845 
1846 /*
1847  * This is to get the mirror ABR state and the state of its submirrors from
1848  * the master node. We need this to ensure consistent output from metastat
1849  * when a new node joins the cluster during a resync. Without this the
1850  * submirror status will be incorrect until the whole resync is complete which
1851  * may take days for very large metadevices.
1852  */
1853 /*ARGSUSED*/
1854 void
1855 mdmn_do_get_mirstate(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1856 {
1857 	md_mn_msg_mir_state_t		*d;
1858 	md_mn_msg_mir_state_res_t	*res;		/* Results */
1859 	set_t				setno;
1860 	mdsetname_t			*sp;		/* Set name */
1861 	mdname_t			*mirnp;		/* Mirror name */
1862 	md_error_t			mde = mdnullerror;
1863 	mm_unit_t			*mm;		/* Mirror */
1864 	int				smi;
1865 	uint_t				tstate;
1866 
1867 	resp->mmr_comm_state = MDMNE_ACK;
1868 	resp->mmr_out_size = sizeof (md_mn_msg_mir_state_res_t);
1869 	resp->mmr_err_size = 0;
1870 	resp->mmr_out = Malloc(resp->mmr_out_size);
1871 	resp->mmr_err = NULL;
1872 	d = (md_mn_msg_mir_state_t *)(void *)msg->msg_event_data;
1873 	res = (md_mn_msg_mir_state_res_t *)(void *)resp->mmr_out;
1874 
1875 	/* Validate set information from minor number */
1876 	setno = MD_MIN2SET(d->mir_state_mnum);
1877 	sp = metasetnosetname(setno, &mde);
1878 	if (sp == NULL) {
1879 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1880 		    "MD_MN_GET_MIRROR_STATE: Invalid set %d\n"), setno);
1881 		resp->mmr_exitval = 1;	/* Failure */
1882 		Free(resp->mmr_out);
1883 		resp->mmr_out_size = 0;
1884 		return;
1885 	}
1886 
1887 	/* Construct mirror name from minor number */
1888 	mirnp = metamnumname(&sp, d->mir_state_mnum, 0, &mde);
1889 	if (mirnp == NULL) {
1890 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1891 		    "MD_MN_GET_MIRROR_STATE: Invalid minor %lx\n"),
1892 		    d->mir_state_mnum);
1893 		resp->mmr_exitval = 2;	/* Failure */
1894 		Free(resp->mmr_out);
1895 		resp->mmr_out_size = 0;
1896 		return;
1897 	}
1898 
1899 	/* Get common mirror structure */
1900 	mm = (mm_unit_t *)meta_get_mdunit(sp, mirnp, &mde);
1901 	if (mm == NULL) {
1902 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1903 		    "MD_MN_GET_MIRROR_STATE: Invalid mirror minor %x\n"),
1904 		    d->mir_state_mnum);
1905 		resp->mmr_exitval = 3;	/* Failure */
1906 		Free(resp->mmr_out);
1907 		resp->mmr_out_size = 0;
1908 		return;
1909 	}
1910 
1911 	if (meta_get_tstate(d->mir_state_mnum, &tstate, &mde) != 0) {
1912 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1913 		    "MD_MN_GET_MIRROR_STATE: Invalid minor %lx\n"),
1914 		    d->mir_state_mnum);
1915 		resp->mmr_exitval = 4;	/* Failure */
1916 		Free(resp->mmr_out);
1917 		resp->mmr_out_size = 0;
1918 		return;
1919 	}
1920 	/*
1921 	 * Fill in the sm_state/sm_flags value in the results structure which
1922 	 * gets passed back to the message originator
1923 	 */
1924 	resp->mmr_exitval = 0;
1925 	for (smi = 0; (smi < NMIRROR); smi++) {
1926 		mm_submirror_t *mmsp = &mm->un_sm[smi];
1927 		res->sm_state[smi] = mmsp->sm_state;
1928 		res->sm_flags[smi] = mmsp->sm_flags;
1929 	}
1930 	/* Returm value of tstate for mirror */
1931 	res->mir_tstate = tstate;
1932 }
1933 
1934 /*
1935  * This is to issue an ioctl to call poke_hotspares
1936  */
1937 /*ARGSUSED*/
1938 void
1939 mdmn_do_poke_hotspares(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1940 {
1941 
1942 	md_mn_poke_hotspares_t	pokehsp;
1943 	md_mn_msg_pokehsp_t	*d;
1944 
1945 	resp->mmr_out_size = 0;
1946 	resp->mmr_err_size = 0;
1947 	resp->mmr_out = NULL;
1948 	resp->mmr_err = NULL;
1949 	resp->mmr_comm_state = MDMNE_ACK;
1950 	d = (md_mn_msg_pokehsp_t *)(void *)msg->msg_event_data;
1951 
1952 	(void) memset(&pokehsp, 0, sizeof (pokehsp));
1953 	MD_SETDRIVERNAME(&pokehsp, MD_MIRROR, d->pokehsp_setno);
1954 
1955 	resp->mmr_exitval = metaioctl(MD_MN_POKE_HOTSPARES, &pokehsp,
1956 	    &pokehsp.mde, NULL);
1957 }
1958