xref: /onnv-gate/usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c (revision 1623:7bac4a816ebe)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <stdlib.h>
29 #include <unistd.h>
30 #include <wait.h>
31 #include <sys/time.h>
32 #include <syslog.h>
33 
34 #include <meta.h>
35 #include <sys/lvm/mdio.h>
36 #include <sys/lvm/md_mddb.h>
37 #include <sys/lvm/md_mirror.h>
38 
39 #define	MAX_N_ARGS 64
40 #define	MAX_ARG_LEN 1024
41 
42 /* we reserve 1024 bytes for stdout and the same for stderr */
43 #define	MAX_OUT	1024
44 #define	MAX_ERR	1024
45 #define	JUNK 128 /* used to flush stdout and stderr */
46 
47 
48 /*ARGSUSED*/
49 void
50 mdmn_do_cmd(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
51 {
52 
53 	/*
54 	 * We are given one string containing all the arguments
55 	 * For execvp() we have to regenerate the arguments again
56 	 */
57 	int	arg;		/* argument that is currently been built */
58 	int	index;		/* runs through arg above */
59 	int	i;		/* helper for for loop */
60 	char	*argv[MAX_N_ARGS]; /* argument array for execvp */
61 	char	*cp;		/* runs through the given command line string */
62 	char	*command = NULL; /* the command we call locally */
63 	int	pout[2];	/* pipe for stdout */
64 	int	perr[2];	/* pipe for stderr */
65 	pid_t	pid;		/* process id */
66 
67 	cp	= msg->msg_event_data;
68 	arg	= 0;
69 	index	= 0;
70 
71 	/* init the args array alloc the first one and null out the rest */
72 	argv[0] = Malloc(MAX_ARG_LEN);
73 	for (i = 1; i < MAX_N_ARGS; i++) {
74 		argv[i] = NULL;
75 	}
76 
77 	resp->mmr_comm_state	= MDMNE_ACK; /* Ok state */;
78 
79 	while (*cp != '\0') {
80 		if (arg == MAX_N_ARGS) {
81 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
82 			    "PANIC: too many arguments specified\n"));
83 			resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
84 			goto out;
85 		}
86 		if (index == MAX_ARG_LEN) {
87 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
88 			    "PANIC: argument too long\n"));
89 			resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
90 			goto out;
91 		}
92 
93 		if ((*cp != ' ') && (*cp != '\t')) {
94 			/*
95 			 * No space or tab: copy char into current
96 			 * argv and advance both pointers
97 			 */
98 
99 			argv[arg][index] = *cp;
100 			cp++;	/* next char in command line	*/
101 			index++;	/* next char in argument	*/
102 		} else {
103 			/*
104 			 * space or tab: terminate current argv,
105 			 * advance arg, reset pointer into arg,
106 			 * advance pointer in command line
107 			 */
108 			argv[arg][index] = '\0';
109 			arg++; /* next argument */
110 			argv[arg] = Malloc(MAX_ARG_LEN);
111 			cp++; /* next char in command line */
112 			index = 0; /* starts at char 0 */
113 		}
114 	}
115 	/* terminate the last real argument */
116 	argv[arg][index] = '\0';
117 	/* the last argument is an NULL pointer */
118 	argv[++arg] = NULL;
119 	if (pipe(pout) < 0)  {
120 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
121 		    "PANIC: pipe failed\n"));
122 		resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
123 		goto out;
124 	}
125 	if (pipe(perr) < 0) {
126 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
127 		    "PANIC: pipe failed\n"));
128 		(void) close(pout[0]);
129 		(void) close(pout[1]);
130 		resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
131 		goto out;
132 	}
133 	command = Strdup(argv[0]);
134 	(void) strcat(argv[0], ".rpc_call");
135 	pid = fork1();
136 	if (pid == (pid_t)-1) {
137 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
138 		    "PANIC: fork failed\n"));
139 		resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
140 		(void) close(pout[0]);
141 		(void) close(pout[1]);
142 		(void) close(perr[0]);
143 		(void) close(perr[1]);
144 		goto out;
145 	} else  if (pid == (pid_t)0) {
146 		/* child */
147 		(void) close(0);
148 		/* close the reading channels of pout and perr */
149 		(void) close(pout[0]);
150 		(void) close(perr[0]);
151 		/* redirect stdout */
152 		if (dup2(pout[1], 1) < 0) {
153 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
154 			    "PANIC: dup2 failed\n"));
155 			resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
156 			return;
157 		}
158 
159 		/* redirect stderr */
160 		if (dup2(perr[1], 2) < 0) {
161 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
162 			    "PANIC: dup2 failed\n"));
163 			resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
164 			return;
165 		}
166 
167 		(void) execvp(command, (char *const *)argv);
168 		perror("execvp");
169 		_exit(1);
170 	} else {
171 		/* parent process */
172 		int stat_loc;
173 		char *out, *err; /* for stdout and stderr of child */
174 		int i; /* index into the aboves */
175 		char junk[JUNK];
176 		int out_done = 0;
177 		int err_done = 0;
178 		int out_read = 0;
179 		int err_read = 0;
180 		int maxfd;
181 		fd_set	rset;
182 
183 
184 		/* close the writing channels of pout and perr */
185 		(void) close(pout[1]);
186 		(void) close(perr[1]);
187 		resp->mmr_out = Malloc(MAX_OUT);
188 		resp->mmr_err = Malloc(MAX_ERR);
189 		resp->mmr_out_size = MAX_OUT;
190 		resp->mmr_err_size = MAX_ERR;
191 		out = resp->mmr_out;
192 		err = resp->mmr_err;
193 		FD_ZERO(&rset);
194 		while ((out_done == 0) || (err_done == 0)) {
195 			FD_SET(pout[0], &rset);
196 			FD_SET(perr[0], &rset);
197 			maxfd = max(pout[0], perr[0]) + 1;
198 			(void) select(maxfd, &rset, NULL, NULL, NULL);
199 
200 			/*
201 			 * Did the child produce some output to stdout?
202 			 * If so, read it until we either reach the end of the
203 			 * output or until we read MAX_OUT bytes.
204 			 * Whatever comes first.
205 			 * In case we already read MAX_OUT bytes we simply
206 			 * read away the output into a junk buffer.
207 			 * Just to make the child happy
208 			 */
209 			if (FD_ISSET(pout[0], &rset)) {
210 				if (MAX_OUT - out_read - 1 > 0) {
211 					i = read(pout[0], out,
212 						MAX_OUT - out_read);
213 					out_read += i;
214 					out += i;
215 				} else {
216 					/* buffer full, empty stdout */
217 					i = read(pout[0], junk, JUNK);
218 				}
219 				if (i == 0) {
220 					/* stdout is closed by child */
221 					out_done++;
222 				}
223 			}
224 			/* same comment as above | sed -e 's/stdout/stderr/' */
225 			if (FD_ISSET(perr[0], &rset)) {
226 				if (MAX_ERR - err_read - 1 > 0) {
227 					i = read(perr[0], err,
228 						MAX_ERR - err_read);
229 					err_read += i;
230 					err += i;
231 				} else {
232 					/* buffer full, empty stderr */
233 					i = read(perr[0], junk, JUNK);
234 				}
235 				if (i == 0) {
236 					/* stderr is closed by child */
237 					err_done++;
238 				}
239 			}
240 		}
241 		resp->mmr_out[out_read] = '\0';
242 		resp->mmr_err[err_read] = '\0';
243 
244 		while (waitpid(pid, &stat_loc, 0) < 0) {
245 			if (errno != EINTR) {
246 				resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
247 				break;
248 			}
249 		}
250 		if (errno == 0)
251 			resp->mmr_exitval = WEXITSTATUS(stat_loc);
252 
253 		(void) close(pout[0]);
254 		(void) close(perr[0]);
255 	}
256 out:
257 	for (i = 0; i < MAX_N_ARGS; i++) {
258 		if (argv[i] != NULL) {
259 			free(argv[i]);
260 		}
261 	}
262 	if (command != NULL) {
263 		Free(command);
264 	}
265 }
266 
267 /*
268  * This is for checking if a metadevice is opened, and for
269  * locking in case it is not and for
270  * unlocking a locked device
271  */
272 /*ARGSUSED*/
273 void
274 mdmn_do_clu(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
275 {
276 	if (msg->msg_type == MD_MN_MSG_CLU_CHECK) {
277 		md_isopen_t	*d;
278 		int		ret;
279 
280 		resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
281 		resp->mmr_out_size = 0;
282 		resp->mmr_err_size = 0;
283 		resp->mmr_out = NULL;
284 		resp->mmr_err = NULL;
285 		d = (md_isopen_t *)(void *)msg->msg_event_data;
286 		ret = metaioctl(MD_IOCISOPEN, d, &(d->mde), NULL);
287 		/*
288 		 * In case the ioctl succeeded, return the open state of
289 		 * the metadevice. Otherwise we return the error the ioctl
290 		 * produced. As this is not zero, no attempt is made to
291 		 * remove/rename the metadevice later
292 		 */
293 
294 		if (ret == 0) {
295 			resp->mmr_exitval = d->isopen;
296 		} else {
297 			/*
298 			 * When doing a metaclear, one node after the other
299 			 * does the two steps:
300 			 * - check on all nodes if this md is opened.
301 			 * - remove the md locally.
302 			 * When the 2nd node asks all nodes if the md is
303 			 * open it starts with the first node.
304 			 * As this already removed the md, the check
305 			 * returns MDE_UNIT_NOT_SETUP.
306 			 * In order to not keep the 2nd node from proceeding,
307 			 * we map this to an Ok.
308 			 */
309 			if (mdismderror(&(d->mde), MDE_UNIT_NOT_SETUP)) {
310 				mdclrerror(&(d->mde));
311 				ret = 0;
312 			}
313 
314 			resp->mmr_exitval = ret;
315 		}
316 	}
317 }
318 
319 /* handler for MD_MN_MSG_REQUIRE_OWNER */
320 /*ARGSUSED*/
321 void
322 mdmn_do_req_owner(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
323 {
324 	md_set_mmown_params_t	setown;
325 	md_mn_req_owner_t	*d;
326 	int			ret, n = 0;
327 
328 	resp->mmr_out_size = 0;
329 	resp->mmr_err_size = 0;
330 	resp->mmr_out = NULL;
331 	resp->mmr_err = NULL;
332 	resp->mmr_comm_state = MDMNE_ACK;
333 	d = (md_mn_req_owner_t *)(void *)msg->msg_event_data;
334 
335 	(void) memset(&setown, 0, sizeof (setown));
336 	MD_SETDRIVERNAME(&setown, MD_MIRROR, MD_MIN2SET(d->mnum))
337 	setown.d.mnum = d->mnum;
338 	setown.d.owner = d->owner;
339 
340 	/* Retry ownership change if we get EAGAIN returned */
341 	while ((ret = metaioctl(MD_MN_SET_MM_OWNER, &setown, &setown.mde, NULL))
342 	    != 0) {
343 		md_sys_error_t	*ip =
344 		    &setown.mde.info.md_error_info_t_u.sys_error;
345 		if (ip->errnum != EAGAIN) {
346 			break;
347 		}
348 		if (n++ >= 10) {
349 			break;
350 		}
351 		(void) sleep(1);
352 	}
353 
354 	resp->mmr_exitval = ret;
355 }
356 
357 /*
358  * handler for MD_MN_MSG_CHOOSE_OWNER
359  * This is called when a mirror resync has no owner. The master node generates
360  * this message which is not broadcast to the other nodes. The message is
361  * required as the kernel does not have access to the nodelist for the set.
362  */
363 /*ARGSUSED*/
364 void
365 mdmn_do_choose_owner(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
366 {
367 	md_mn_msg_chowner_t	chownermsg;
368 	md_mn_msg_chooseid_t	*d;
369 	int			ret = 0;
370 	int			nodecnt;
371 	int			nodeno;
372 	uint_t			nodeid;
373 	uint_t			myflags;
374 	set_t			setno;
375 	mdsetname_t		*sp;
376 	md_set_desc		*sd;
377 	md_mnnode_desc		*nd;
378 	md_error_t		mde = mdnullerror;
379 	md_mn_result_t		*resp1 = NULL;
380 
381 	resp->mmr_out_size = 0;
382 	resp->mmr_err_size = 0;
383 	resp->mmr_out = NULL;
384 	resp->mmr_err = NULL;
385 	resp->mmr_comm_state = MDMNE_ACK;
386 	d = (md_mn_msg_chooseid_t *)(void *)msg->msg_event_data;
387 
388 	/*
389 	 * The node to be chosen will be the resync count for the set
390 	 * modulo the number of live nodes in the set
391 	 */
392 	setno = MD_MIN2SET(d->msg_chooseid_mnum);
393 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
394 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
395 		    "MD_MN_MSG_CHOOSE_OWNER: Invalid setno %d\n"), setno);
396 		resp->mmr_exitval = 1;
397 		return;
398 	}
399 	if ((sd = metaget_setdesc(sp, &mde)) == NULL) {
400 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
401 		    "MD_MN_MSG_CHOOSE_OWNER: Invalid set pointer\n"));
402 		resp->mmr_exitval = 1;
403 		return;
404 	}
405 
406 	/* Count the number of live nodes */
407 	nodecnt = 0;
408 	nd = sd->sd_nodelist;
409 	while (nd) {
410 		if (nd->nd_flags & MD_MN_NODE_ALIVE)
411 			nodecnt++;
412 		nd = nd->nd_next;
413 	}
414 	nodeno = (d->msg_chooseid_rcnt%nodecnt);
415 
416 	/*
417 	 * If we've been called with msg_chooseid_set_node set TRUE then we
418 	 * are simply re-setting the owner id to ensure consistency across
419 	 * the cluster.
420 	 * If the flag is reset (B_FALSE) we are requesting a new owner to be
421 	 * determined.
422 	 */
423 	if (d->msg_chooseid_set_node) {
424 		nodeid = d->msg_chooseid_rcnt;
425 	} else {
426 		/* scan the nodelist looking for the required node */
427 		nodecnt = 0;
428 		nd = sd->sd_nodelist;
429 		while (nd) {
430 			if (nd->nd_flags & MD_MN_NODE_ALIVE) {
431 				if (nodecnt == nodeno)
432 					break;
433 				nodecnt++;
434 			}
435 			nd = nd->nd_next;
436 		}
437 		nodeid = nd->nd_nodeid;
438 	}
439 
440 	/* Send message to all nodes to make ownership change */
441 	chownermsg.msg_chowner_mnum =  d->msg_chooseid_mnum;
442 	chownermsg.msg_chowner_nodeid = nodeid;
443 	myflags = MD_MSGF_NO_LOG;
444 
445 	/* inherit some flags from the parent message */
446 	myflags |= msg->msg_flags & MD_MSGF_INHERIT_BITS;
447 
448 	ret = mdmn_send_message(MD_MIN2SET(d->msg_chooseid_mnum),
449 	    MD_MN_MSG_CHANGE_OWNER, myflags, (char *)&chownermsg,
450 	    sizeof (chownermsg), &resp1, &mde);
451 	if (resp1 != NULL)
452 		free_result(resp1);
453 	resp->mmr_exitval = ret;
454 }
455 
456 /*
457  * Handler for MD_MN_MSG_CHANGE_OWNER
458  * This is called when we are perfoming a resync and wish to change from
459  * no mirror owner to an owner chosen by the master.
460  * This mesage is only relevant for the new owner, the message will be
461  * ignored by all other nodes
462  */
463 /*ARGSUSED*/
464 void
465 mdmn_do_change_owner(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
466 {
467 	md_set_mmown_params_t	setown;
468 	md_mn_msg_chowner_t	*d;
469 	int			ret = 0;
470 	set_t			setno;
471 	mdsetname_t		*sp;
472 	md_set_desc		*sd;
473 	md_error_t		mde = mdnullerror;
474 
475 	resp->mmr_out_size = 0;
476 	resp->mmr_err_size = 0;
477 	resp->mmr_out = NULL;
478 	resp->mmr_err = NULL;
479 	resp->mmr_comm_state = MDMNE_ACK;
480 	d = (md_mn_msg_chowner_t *)(void *)msg->msg_event_data;
481 
482 	setno = MD_MIN2SET(d->msg_chowner_mnum);
483 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
484 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
485 		    "MD_MN_MSG_CHANGE_OWNER: Invalid setno %d\n"), setno);
486 		resp->mmr_exitval = 1;
487 		return;
488 	}
489 	if ((sd = metaget_setdesc(sp, &mde)) == NULL) {
490 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
491 		    "MD_MN_MSG_CHANGE_OWNER: Invalid set pointer\n"));
492 		resp->mmr_exitval = 1;
493 		return;
494 	}
495 
496 	if (d->msg_chowner_nodeid == sd->sd_mn_mynode->nd_nodeid) {
497 		/*
498 		 * If we are the chosen owner, issue ioctl to make the
499 		 * ownership change
500 		 */
501 		(void) memset(&setown, 0, sizeof (md_set_mmown_params_t));
502 		setown.d.mnum = d->msg_chowner_mnum;
503 		setown.d.owner = d->msg_chowner_nodeid;
504 		setown.d.flags = MD_MN_MM_SPAWN_THREAD;
505 		MD_SETDRIVERNAME(&setown, MD_MIRROR,
506 		    MD_MIN2SET(d->msg_chowner_mnum));
507 
508 		/*
509 		 * Single shot at changing the the owner, if it fails EAGAIN,
510 		 * another node must have become the owner while we are in the
511 		 * process of making this choice.
512 		 */
513 
514 		ret = metaioctl(MD_MN_SET_MM_OWNER, &setown,
515 		    &(setown.mde), NULL);
516 		if (ret == EAGAIN)
517 			ret = 0;
518 	}
519 	resp->mmr_exitval = ret;
520 }
521 
522 /* handler for MD_MN_MSG_SUSPEND_WRITES */
523 /*ARGSUSED*/
524 void
525 mdmn_do_susp_write(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
526 {
527 	/* Suspend writes to a region of a mirror */
528 	md_suspend_wr_params_t	suspwr_ioc;
529 	md_mn_msg_suspwr_t	*d;
530 	int			ret;
531 
532 	resp->mmr_out_size = 0;
533 	resp->mmr_err_size = 0;
534 	resp->mmr_out = NULL;
535 	resp->mmr_err = NULL;
536 	resp->mmr_comm_state = MDMNE_ACK;
537 	d = (md_mn_msg_suspwr_t *)(void *)msg->msg_event_data;
538 
539 	(void) memset(&suspwr_ioc, 0, sizeof (md_suspend_wr_params_t));
540 	MD_SETDRIVERNAME(&suspwr_ioc, MD_MIRROR,
541 	    MD_MIN2SET(d->msg_suspwr_mnum));
542 	suspwr_ioc.mnum = d->msg_suspwr_mnum;
543 	ret = metaioctl(MD_MN_SUSPEND_WRITES, &suspwr_ioc,
544 	    &(suspwr_ioc.mde), NULL);
545 	resp->mmr_exitval = ret;
546 }
547 
548 /*
549  * handler for MD_MN_MSG_STATE_UPDATE_RESWR
550  * This functions update a submirror component state and then resumes writes
551  * to the mirror
552  */
553 /*ARGSUSED*/
554 void
555 mdmn_do_state_upd_reswr(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
556 {
557 	/* Update the state of the component of a mirror */
558 	md_set_state_params_t	setstate_ioc;
559 	md_mn_msg_stch_t	*d;
560 	int			ret;
561 
562 	resp->mmr_out_size = 0;
563 	resp->mmr_err_size = 0;
564 	resp->mmr_out = NULL;
565 	resp->mmr_err = NULL;
566 	resp->mmr_comm_state = MDMNE_ACK;
567 	d = (md_mn_msg_stch_t *)(void *)msg->msg_event_data;
568 
569 	(void) memset(&setstate_ioc, 0, sizeof (md_set_state_params_t));
570 	MD_SETDRIVERNAME(&setstate_ioc, MD_MIRROR,
571 	    MD_MIN2SET(d->msg_stch_mnum));
572 	setstate_ioc.mnum = d->msg_stch_mnum;
573 	setstate_ioc.sm = d->msg_stch_sm;
574 	setstate_ioc.comp = d->msg_stch_comp;
575 	setstate_ioc.state = d->msg_stch_new_state;
576 	setstate_ioc.hs_id = d->msg_stch_hs_id;
577 	ret = metaioctl(MD_MN_SET_STATE, &setstate_ioc,
578 	    &(setstate_ioc.mde), NULL);
579 	resp->mmr_exitval = ret;
580 }
581 
582 /*
583  * submessage generator for MD_MN_MSG_STATE_UPDATE and MD_MN_MSG_STATE_UPDATE2
584  * This generates 2 messages, the first is SUSPEND_WRITES and
585  * depending on the type of the original message the second one is
586  * either STATE_UPDATE_RESWR or STATE_UPDATE_RESWR2 which actually does
587  * the same, but runs on a higher class.
588  */
589 int
590 mdmn_smgen_state_upd(md_mn_msg_t *msg, md_mn_msg_t *msglist[])
591 {
592 	md_mn_msg_t		*nmsg;
593 	md_mn_msg_stch_t	*d;
594 	md_mn_msg_stch_t	*stch_data;
595 	md_mn_msg_suspwr_t	*suspwr_data;
596 
597 	d = (md_mn_msg_stch_t *)(void *)msg->msg_event_data;
598 
599 	nmsg = Zalloc(sizeof (md_mn_msg_t));
600 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
601 
602 	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
603 	nmsg->msg_setno		= msg->msg_setno;
604 	nmsg->msg_type		= MD_MN_MSG_SUSPEND_WRITES;
605 	nmsg->msg_event_size	= sizeof (md_mn_msg_suspwr_t);
606 	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_suspwr_t));
607 	suspwr_data = (md_mn_msg_suspwr_t *)(void *)nmsg->msg_event_data;
608 	suspwr_data->msg_suspwr_mnum = d->msg_stch_mnum;
609 	msglist[0] = nmsg;
610 
611 	nmsg = Zalloc(sizeof (md_mn_msg_t));
612 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
613 
614 	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
615 	nmsg->msg_setno		= msg->msg_setno;
616 	if (msg->msg_type == MD_MN_MSG_STATE_UPDATE2) {
617 		nmsg->msg_type		= MD_MN_MSG_STATE_UPDATE_RESWR2;
618 	} else {
619 		nmsg->msg_type		= MD_MN_MSG_STATE_UPDATE_RESWR;
620 	}
621 	nmsg->msg_event_size	= sizeof (md_mn_msg_stch_t);
622 	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_stch_t));
623 	stch_data = (md_mn_msg_stch_t *)(void *)nmsg->msg_event_data;
624 	stch_data->msg_stch_mnum = d->msg_stch_mnum;
625 	stch_data->msg_stch_sm = d->msg_stch_sm;
626 	stch_data->msg_stch_comp = d->msg_stch_comp;
627 	stch_data->msg_stch_new_state = d->msg_stch_new_state;
628 	stch_data->msg_stch_hs_id = d->msg_stch_hs_id;
629 	msglist[1] = nmsg;
630 	return (2); /* Return the number of submessages generated */
631 }
632 
633 /*
634  * handler for MD_MN_MSG_ALLOCATE_HOTSPARE and MD_MN_MSG_ALLOCATE_HOTSPARE2
635  * This sends a message to all nodes requesting them to allocate a hotspare
636  * for the specified component. The component is specified by the mnum of
637  * the mirror, the submirror index and the component index.
638  */
639 /*ARGSUSED*/
640 void
641 mdmn_do_allocate_hotspare(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
642 {
643 	/* Allocate a hotspare for a mirror component */
644 	md_alloc_hotsp_params_t allochsp_ioc;
645 	md_mn_msg_allochsp_t    *d;
646 	int			ret;
647 
648 	resp->mmr_out_size = 0;
649 	resp->mmr_err_size = 0;
650 	resp->mmr_out = NULL;
651 	resp->mmr_err = NULL;
652 	resp->mmr_comm_state = MDMNE_ACK;
653 	d = (md_mn_msg_allochsp_t *)((void *)(msg->msg_event_data));
654 
655 	(void) memset(&allochsp_ioc, 0,
656 	sizeof (md_alloc_hotsp_params_t));
657 	MD_SETDRIVERNAME(&allochsp_ioc, MD_MIRROR,
658 	    MD_MIN2SET(d->msg_allochsp_mnum));
659 	allochsp_ioc.mnum = d->msg_allochsp_mnum;
660 	allochsp_ioc.sm = d->msg_allochsp_sm;
661 	allochsp_ioc.comp = d->msg_allochsp_comp;
662 	allochsp_ioc.hs_id = d->msg_allochsp_hs_id;
663 	ret = metaioctl(MD_MN_ALLOCATE_HOTSPARE, &allochsp_ioc,
664 	    &(allochsp_ioc.mde), NULL);
665 	resp->mmr_exitval = ret;
666 }
667 
668 /*
669  * handler for MD_MN_MSG_RESYNC_STARTING,MD_MN_MSG_RESYNC_FIRST,
670  * MD_MN_MSG_RESYNC_NEXT, MD_MN_MSG_RESYNC_FINISH, MD_MN_MSG_RESYNC_PHASE_DONE
671  */
672 /*ARGSUSED*/
673 void
674 mdmn_do_resync(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
675 {
676 	md_mn_msg_resync_t	*d;
677 	md_mn_rs_params_t	respar;
678 	int			ret;
679 	int			smi;
680 
681 	resp->mmr_out_size = 0;
682 	resp->mmr_err_size = 0;
683 	resp->mmr_out = NULL;
684 	resp->mmr_err = NULL;
685 	resp->mmr_comm_state = MDMNE_ACK;
686 	d = (md_mn_msg_resync_t *)((void *)(msg->msg_event_data));
687 
688 	(void) memset(&respar, 0, sizeof (respar));
689 	MD_SETDRIVERNAME(&respar, MD_MIRROR,
690 	    MD_MIN2SET(d->msg_resync_mnum))
691 	respar.msg_type = (int)msg->msg_type;
692 	respar.mnum = d->msg_resync_mnum;
693 	respar.rs_type = d->msg_resync_type;
694 	respar.rs_start = d->msg_resync_start;
695 	respar.rs_size = d->msg_resync_rsize;
696 	respar.rs_done = d->msg_resync_done;
697 	respar.rs_2_do = d->msg_resync_2_do;
698 	respar.rs_originator = d->msg_originator;
699 	respar.rs_flags = d->msg_resync_flags;
700 
701 	for (smi = 0; smi < NMIRROR; smi++) {
702 		respar.rs_sm_state[smi] = d->msg_sm_state[smi];
703 		respar.rs_sm_flags[smi] = d->msg_sm_flags[smi];
704 	}
705 
706 	ret = metaioctl(MD_MN_RESYNC, &respar, &respar.mde, NULL);
707 
708 	resp->mmr_exitval = ret;
709 }
710 
711 /*
712  * handler for MD_MN_MSG_SETSYNC
713  */
714 /*ARGSUSED*/
715 void
716 mdmn_do_setsync(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
717 {
718 	md_mn_msg_setsync_t	*d;
719 	md_resync_ioctl_t	ri;
720 	int			ret;
721 
722 	resp->mmr_out_size = 0;
723 	resp->mmr_err_size = 0;
724 	resp->mmr_out = NULL;
725 	resp->mmr_err = NULL;
726 	resp->mmr_comm_state = MDMNE_ACK;
727 	d = (md_mn_msg_setsync_t *)((void *)(msg->msg_event_data));
728 
729 	(void) memset(&ri, 0, sizeof (ri));
730 	MD_SETDRIVERNAME(&ri, MD_MIRROR, MD_MIN2SET(d->setsync_mnum))
731 	ri.ri_mnum = d->setsync_mnum;
732 	ri.ri_copysize = d->setsync_copysize;
733 	ri.ri_flags = d->setsync_flags;
734 
735 	ret = metaioctl(MD_MN_SETSYNC, &ri, &ri.mde, NULL);
736 
737 	resp->mmr_exitval = ret;
738 }
739 
740 /*
741  * handler for MD_MN_MSG_SET_CAP. As this handler can deal with both mirrors
742  * and soft partitions, the driver name that is required for the ioctl call
743  * is included in the message.
744  */
745 /*ARGSUSED*/
746 void
747 mdmn_do_set_cap(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
748 {
749 	md_mn_msg_setcap_t	*d;
750 	md_mn_setcap_params_t	setcap_ioc;
751 	minor_t			mnum;
752 	int			ret;
753 
754 	resp->mmr_out_size = 0;
755 	resp->mmr_err_size = 0;
756 	resp->mmr_out = NULL;
757 	resp->mmr_err = NULL;
758 	resp->mmr_comm_state = MDMNE_ACK;
759 	d = (md_mn_msg_setcap_t *)((void *)(msg->msg_event_data));
760 	mnum = d->msg_setcap_mnum;
761 
762 	(void) memset(&setcap_ioc, 0, sizeof (setcap_ioc));
763 
764 	MD_SETDRIVERNAME(&setcap_ioc, d->msg_setcap_driver, MD_MIN2SET(mnum));
765 	setcap_ioc.mnum = mnum;
766 	setcap_ioc.sc_set = d->msg_setcap_set;
767 
768 	ret = metaioctl(MD_MN_SET_CAP, &setcap_ioc, &setcap_ioc.mde, NULL);
769 
770 	resp->mmr_exitval = ret;
771 }
772 
773 /*
774  * Dummy handler for various CLASS0 messages like
775  * MD_MN_MSG_VERBOSITY / MD_MN_MSG_RESUME / MD_MN_MSG_SUSPEND ...
776  */
777 /*ARGSUSED*/
778 void
779 mdmn_do_dummy(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
780 {
781 	resp->mmr_out_size = 0;
782 	resp->mmr_err_size = 0;
783 	resp->mmr_out = NULL;
784 	resp->mmr_err = NULL;
785 	resp->mmr_exitval = 0;
786 	resp->mmr_comm_state = MDMNE_ACK;
787 }
788 
789 /*
790  * Overall description of mdcommd support that keeps all nodes in-sync
791  * with the ondisk diskset mddbs.
792  *
793  * All configuration changes to the mddb - addition/deletion of metadevices
794  * or replicas must use a CLASS1 message to block out these changes.
795  * Changes to the state of existing replicas do not need to block CLASS1
796  * since there is no conflict when just updating the state of a replica.
797  *
798  * Error encountered when master writes to mddbs:
799  *	As the master updates parts of the mddbs, flags are updated describing
800  *	what has been written.  When all locks are dropped (either in
801  *	mddb_setexit or mdioctl), a PARSE message will be generated to all
802  *	nodes with an index list of known good mddbs and the parse flags.
803  *	The master node ignores the parse message since it sent it.
804  *	The slave nodes re-read in the changed part of the mddb using the list
805  *	of known good replicas that was passed.
806  *	PARSE message does not block CLASS1.
807  *	The PARSE message must be the highest class message.  Since this
808  *	message could be sent on any ioctl, this PARSE message class must
809  *	be higher than any other class message that could issue an ioctl.
810  *
811  *	Master		Slave1		Slave2
812  * 	Handles_error
813  *	PARSE		PARSE		PARSE
814  *
815  *
816  * Add/Delete mddbs can occur from the following commands:
817  *	metadb -s set_name -a/-d
818  *	metaset -s set_name -a/-d disk
819  *	metaset -s set_name -b
820  *
821  *	The metadb/metaset command is run on the node executing the command
822  *	and sends an ATTACH/DETACH message to the master node blocking CLASS1
823  *	messages on all nodes until this message is finished.  The master
824  *	node generates 3 submessages of BLOCK, SM_ATTACH/SM_DETACH, UNBLOCK.
825  *	The BLOCK message is only run on the master node and will BLOCK
826  *	the PARSE messages from being sent to the nodes.
827  *	The SM_ATTACH/SM_DETACH message is run on all nodes and actually adds or
828  *	removes the replica(s) from the given disk slice.
829  *	The UNBLOCK message is only run on the master node and allows the
830  *	sending of PARSE messages.
831  *
832  *	Master		Slave1		Slave2
833  *			Add mddb cmd
834  *			ATTACH msg to master
835  *	BLOCK
836  *	ATTACH		ATTACH		ATTACH
837  *	UNBLOCK
838  *	PARSE		PARSE		PARSE
839  *	ATTACH msg finished
840  *
841  * Add/Delete host side information from the following commands:
842  *	metaset -s set_name -a/-d -h
843  *
844  *	The metaset command is run on the node executing the command and
845  *	sends a DB_NEWSIDE/DB_DELSIDE message and a MD_NEWSIDE/MD_DELSIDE
846  *	message whenever a host is added to or deleted from the diskset.
847  *
848  *	The side information contains the major name and minor number
849  *	associated with a disk slice from a certain node's perspective
850  *	in a (failed) effort to support clustered systems that don't have the
851  *	same device name for a physical device. (The original designers of
852  *	SVM eventually took the shortcut of assuming that all device names
853  *	are the same on all systems, but left the side information in the
854  *	mddb and namespace.)  The side information is used for disk slices
855  *	that contain mddbs and/or are components for metadevices.
856  *
857  *	The DB_NEWSIDE/DELSIDE command adds or deletes the side information
858  *	for each mddb for the host being added or deleted.
859  *	The MD_ADDSIDE/MD_DELSIDE command adds or deletes the side information
860  *	for all disk slice components that are in the namespace records for
861  *	the host being added or deleted.
862  *
863  *	The DB_NEWSIDE/DB_DELSIDE message does not change any mddb records
864  *	and only needs to be executed on the master node since the slave
865  *	nodes will be brought up to date by the PARSE message that is
866  *	generated as a result of a change to the mddb.
867  *	The MD_ADDSIDE/MD_DELSIDE message does modify the records in the mddb
868  *	and needs to be run on all nodes.  The message must block class1
869  *	messages so that record changing commands don't interfere.
870  *
871  *	Master		Slave1		Slave2
872  *			Add host
873  *			DB_NEWSIDE msg to master
874  *	DB_NEWSIDE
875  *	PARSE		PARSE		PARSE
876  *	DB_NEWSIDE msg finished
877  *			MD_NEWSIDE msg to master
878  *	MD_NEWSIDE	MD_NEWSIDE	MD_NEWSIDE
879  *	MD_NEWSIDE msg finished
880  *
881  *
882  * Optimized resync record failure:
883  *	When any node sees a failure to write an optimized resync record
884  *	that node notifies the master node of the replica that failed.
885  *	The master node handles the error and updates the rest of the
886  *	nodes using a PARSE message.  The PARSE message also calls
887  *	fixoptrecord on each slave node causing each node to fix up
888  * 	the optimized resync records that are owned by that node (the mirror
889  *	owner code also sets the optimized resync record owner).  The master
890  *	node will fix up all optimized resync records that have no owner or
891  *	are owned by the master node.
892  *
893  *	Master		Slave1		Slave2
894  *					Optimized Record Failure
895  *					OPTRECERR msg to master
896  *	Master handles opt rec failure
897  *	PARSE		PARSE		PARSE
898  *	OPTRECERR msg finished
899  *					Slave rewrites optimized record
900  *
901  */
902 
903 /*
904  * Handler for MD_MN_MSG_MDDB_PARSE which send parse messages to the
905  * slave nodes in order to keep the incore view of the mddbs the
906  * same on all nodes.
907  *
908  * Since master node generated the mddb parse message, do nothing
909  * if this is the master node.
910  *
911  * If this is a slave node, send the parse message down to the kernel
912  * where this node will re-read in parts of the mddbs.
913  *
914  */
915 void
916 mdmn_do_mddb_parse(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
917 {
918 	md_mn_msg_mddb_parse_t	*d;
919 	mddb_parse_parm_t	mpp;
920 	int			ret = 0;
921 	int			i;
922 
923 	resp->mmr_out_size = 0;
924 	resp->mmr_err_size = 0;
925 	resp->mmr_out = NULL;
926 	resp->mmr_err = NULL;
927 	resp->mmr_comm_state = MDMNE_ACK;
928 	d = (md_mn_msg_mddb_parse_t *)((void *)(msg->msg_event_data));
929 
930 	if (flags & MD_MSGF_ON_MASTER)
931 		return;
932 
933 	(void) memset(&mpp, 0, sizeof (mpp));
934 	mpp.c_setno = msg->msg_setno;
935 	mpp.c_parse_flags = d->msg_parse_flags;
936 	for (i = 0; i < MDDB_NLB; i++) {
937 		mpp.c_lb_flags[i] = d->msg_lb_flags[i];
938 	}
939 	ret = metaioctl(MD_MN_MDDB_PARSE, &mpp, &mpp.c_mde, NULL);
940 	if (ret)
941 		(void) mdstealerror(&(resp->mmr_ep), &mpp.c_mde);
942 
943 	resp->mmr_exitval = ret;
944 }
945 
946 /*
947  * Handler for MD_MN_MSG_MDDB_BLOCK which blocks the generation
948  * of parse messages from this node.
949  *
950  * This is needed when attaching/detaching mddbs on the master and the
951  * slave node is unable to handle a parse message until the slave node
952  * has done the attach/detach of the mddbs.  So, master node will block
953  * the parse messages, execute the attach/detach on all nodes and
954  * then unblock the parse messages which causes the parse message to
955  * be sent to all nodes.
956  */
957 /*ARGSUSED*/
958 void
959 mdmn_do_mddb_block(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
960 {
961 	md_mn_msg_mddb_block_t	*d;
962 	mddb_block_parm_t	mbp;
963 	int			ret;
964 
965 	resp->mmr_out_size = 0;
966 	resp->mmr_err_size = 0;
967 	resp->mmr_out = NULL;
968 	resp->mmr_err = NULL;
969 	resp->mmr_comm_state = MDMNE_ACK;
970 	d = (md_mn_msg_mddb_block_t *)((void *)(msg->msg_event_data));
971 
972 	(void) memset(&mbp, 0, sizeof (mbp));
973 	mbp.c_setno = msg->msg_setno;
974 	mbp.c_blk_flags = d->msg_block_flags;
975 	ret = metaioctl(MD_MN_MDDB_BLOCK, &mbp, &mbp.c_mde, NULL);
976 	if (ret)
977 		(void) mdstealerror(&(resp->mmr_ep), &mbp.c_mde);
978 
979 	resp->mmr_exitval = ret;
980 }
981 
982 /*
983  * Submessage generator for MD_MN_MSG_META_DB_ATTACH which generates
984  * a BLOCK message on the master node only, a MD_MN_MSG_SM_MDDB_ATTACH
985  * message on all nodes and then an UNBLOCK message on the master only.
986  */
987 int
988 mdmn_smgen_mddb_attach(md_mn_msg_t *msg, md_mn_msg_t *msglist[])
989 {
990 	md_mn_msg_t			*nmsg;
991 	md_mn_msg_meta_db_attach_t	*d;
992 	md_mn_msg_meta_db_attach_t	*attach_d;
993 	md_mn_msg_mddb_block_t		*block_d;
994 
995 	d = (md_mn_msg_meta_db_attach_t *)(void *)msg->msg_event_data;
996 
997 	nmsg = Zalloc(sizeof (md_mn_msg_t));
998 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
999 
1000 	nmsg->msg_flags		= (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST);
1001 	nmsg->msg_setno		= msg->msg_setno;
1002 	nmsg->msg_type		= MD_MN_MSG_MDDB_BLOCK;
1003 	nmsg->msg_event_size	= sizeof (md_mn_msg_mddb_block_t);
1004 	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_mddb_block_t));
1005 	block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data;
1006 	block_d->msg_block_flags = MDDB_BLOCK_PARSE;
1007 	msglist[0] = nmsg;
1008 
1009 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1010 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1011 
1012 	/* Don't log submessages and panic on inconsistent results */
1013 	nmsg->msg_flags		= MD_MSGF_NO_LOG |
1014 				    MD_MSGF_PANIC_WHEN_INCONSISTENT;
1015 	nmsg->msg_setno		= msg->msg_setno;
1016 	nmsg->msg_type		= MD_MN_MSG_SM_MDDB_ATTACH;
1017 	nmsg->msg_event_size	= sizeof (md_mn_msg_meta_db_attach_t);
1018 	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_meta_db_attach_t));
1019 	attach_d = (md_mn_msg_meta_db_attach_t *)
1020 			(void *)nmsg->msg_event_data;
1021 	attach_d->msg_l_dev = d->msg_l_dev;
1022 	attach_d->msg_cnt = d->msg_cnt;
1023 	attach_d->msg_dbsize = d->msg_dbsize;
1024 	(void) strncpy(attach_d->msg_dname, d->msg_dname, 16);
1025 	attach_d->msg_splitname = d->msg_splitname;
1026 	attach_d->msg_options = d->msg_options;
1027 	msglist[1] = nmsg;
1028 
1029 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1030 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1031 
1032 	nmsg->msg_flags		= (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST);
1033 	nmsg->msg_setno		= msg->msg_setno;
1034 	nmsg->msg_type		= MD_MN_MSG_MDDB_BLOCK;
1035 	nmsg->msg_event_size	= sizeof (md_mn_msg_mddb_block_t);
1036 	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_mddb_block_t));
1037 	block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data;
1038 	block_d->msg_block_flags = MDDB_UNBLOCK_PARSE;
1039 	msglist[2] = nmsg;
1040 
1041 	return (3); /* Return the number of submessages generated */
1042 }
1043 
1044 /*
1045  * Submessage generator for MD_MN_MSG_META_DB_DETACH which generates
1046  * a BLOCK message on the master node only, a MD_MN_MSG_SM_MDDB_DETACH
1047  * message on all nodes and then an UNBLOCK message on the master only.
1048  */
1049 int
1050 mdmn_smgen_mddb_detach(md_mn_msg_t *msg, md_mn_msg_t *msglist[])
1051 {
1052 	md_mn_msg_t			*nmsg;
1053 	md_mn_msg_meta_db_detach_t	*d;
1054 	md_mn_msg_meta_db_detach_t	*detach_d;
1055 	md_mn_msg_mddb_block_t		*block_d;
1056 
1057 	d = (md_mn_msg_meta_db_detach_t *)(void *)msg->msg_event_data;
1058 
1059 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1060 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1061 
1062 	nmsg->msg_flags		= (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST);
1063 	nmsg->msg_setno		= msg->msg_setno;
1064 	nmsg->msg_type		= MD_MN_MSG_MDDB_BLOCK;
1065 	nmsg->msg_event_size	= sizeof (md_mn_msg_mddb_block_t);
1066 	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_mddb_block_t));
1067 	block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data;
1068 	block_d->msg_block_flags = MDDB_BLOCK_PARSE;
1069 	msglist[0] = nmsg;
1070 
1071 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1072 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1073 
1074 	/* Don't log submessages and panic on inconsistent results */
1075 	nmsg->msg_flags		= MD_MSGF_NO_LOG |
1076 				    MD_MSGF_PANIC_WHEN_INCONSISTENT;
1077 	nmsg->msg_setno		= msg->msg_setno;
1078 	nmsg->msg_type		= MD_MN_MSG_SM_MDDB_DETACH;
1079 	nmsg->msg_event_size	= sizeof (md_mn_msg_meta_db_detach_t);
1080 	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_meta_db_detach_t));
1081 	detach_d = (md_mn_msg_meta_db_detach_t *)
1082 			(void *)nmsg->msg_event_data;
1083 	detach_d->msg_splitname = d->msg_splitname;
1084 	msglist[1] = nmsg;
1085 
1086 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1087 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1088 
1089 	nmsg->msg_flags		= (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST);
1090 	nmsg->msg_setno		= msg->msg_setno;
1091 	nmsg->msg_type		= MD_MN_MSG_MDDB_BLOCK;
1092 	nmsg->msg_event_size	= sizeof (md_mn_msg_mddb_block_t);
1093 	nmsg->msg_event_data	= Zalloc(sizeof (md_mn_msg_mddb_block_t));
1094 	block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data;
1095 	block_d->msg_block_flags = MDDB_UNBLOCK_PARSE;
1096 	msglist[2] = nmsg;
1097 
1098 	return (3); /* Return the number of submessages generated */
1099 }
1100 
1101 /*
1102  * Handler for MD_MN_MSG_SM_MDDB_ATTACH which is used to attach mddbs.
1103  *
1104  * Used when running:
1105  *	metadb -s set_name -a
1106  * 	metaset -s set_name -a/-d disk
1107  *	metaset -s set_name -b
1108  */
1109 /*ARGSUSED*/
1110 void
1111 mdmn_do_sm_mddb_attach(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1112 {
1113 	md_mn_msg_meta_db_attach_t	*d;
1114 	struct mddb_config		c;
1115 	int				i;
1116 	int				ret = 0;
1117 	md_error_t			ep = mdnullerror;
1118 	char				*name, *add_name;
1119 	mdname_t			*np;
1120 	mdsetname_t			*sp;
1121 
1122 	resp->mmr_out_size = 0;
1123 	resp->mmr_err_size = 0;
1124 	resp->mmr_out = NULL;
1125 	resp->mmr_err = NULL;
1126 	resp->mmr_comm_state = MDMNE_ACK;
1127 	d = (md_mn_msg_meta_db_attach_t *)((void *)(msg->msg_event_data));
1128 
1129 	(void) memset(&c, 0, sizeof (c));
1130 	c.c_setno = msg->msg_setno;
1131 	c.c_locator.l_dev = meta_cmpldev(d->msg_l_dev);
1132 	(void) strncpy(c.c_locator.l_driver, d->msg_dname,
1133 		sizeof (c.c_locator.l_driver));
1134 	c.c_devname = d->msg_splitname;
1135 	c.c_locator.l_mnum = meta_getminor(d->msg_l_dev);
1136 	c.c_multi_node = 1;
1137 	if ((sp = metasetnosetname(c.c_setno, &ep)) == NULL) {
1138 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1139 		resp->mmr_exitval = -1;
1140 		return;
1141 	}
1142 	(void) strcpy(c.c_setname, sp->setname);
1143 	c.c_sideno = getmyside(sp, &ep);
1144 	if (c.c_sideno == MD_SIDEWILD) {
1145 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1146 		resp->mmr_exitval = -1;
1147 		return;
1148 	}
1149 
1150 	name = splicename(&d->msg_splitname);
1151 	np = metaname(&sp, name, LOGICAL_DEVICE, &ep);
1152 	Free(name);
1153 	if (np == NULL) {
1154 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1155 		resp->mmr_exitval = -1;
1156 		return;
1157 	}
1158 	/*
1159 	 * All nodes in MN diskset must do meta_check_replica
1160 	 * since this causes the shared namespace to be
1161 	 * populated by the md driver names while checking
1162 	 * to see if this device is already in use as a
1163 	 * metadevice.
1164 	 */
1165 	if (meta_check_replica(sp, np, d->msg_options, 0,
1166 	    (d->msg_cnt * d->msg_dbsize), &ep)) {
1167 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1168 		resp->mmr_exitval = -1;
1169 		return;
1170 	}
1171 
1172 	for (i = 0; i < d->msg_cnt; i++) {
1173 		c.c_locator.l_blkno = i * d->msg_dbsize + 16;
1174 		if (setup_med_cfg(sp, &c,
1175 		    (d->msg_options & MDCHK_SET_FORCE), &ep)) {
1176 			ret = -1;
1177 			(void) mdstealerror(&(resp->mmr_ep), &ep);
1178 			break;
1179 		}
1180 		ret = metaioctl(MD_DB_NEWDEV, &c, &c.c_mde, NULL);
1181 		/* If newdev was successful, continue with attach */
1182 		if (ret == 0) {
1183 			if (meta_db_addsidenms(sp, np, c.c_locator.l_blkno,
1184 			    DB_ADDSIDENMS_NO_BCAST, &ep)) {
1185 				ret = -1;
1186 				(void) mdstealerror(&(resp->mmr_ep), &ep);
1187 				break;
1188 			}
1189 		} else {
1190 			(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1191 			break;
1192 		}
1193 	}
1194 	add_name = splicename(&d->msg_splitname);
1195 	if ((np = metaname(&sp, add_name, LOGICAL_DEVICE, &ep)) != NULL) {
1196 		meta_invalidate_name(np);
1197 	} else {
1198 		ret = -1;
1199 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1200 	}
1201 	Free(add_name);
1202 
1203 	resp->mmr_exitval = ret;
1204 }
1205 
1206 /*
1207  * Handler for MD_MN_MSG_SM_MDDB_DETACH which is used to detach mddbs.
1208  *
1209  * Used when running:
1210  *	metadb -s set_name -d
1211  * 	metaset -s set_name -a/-d disk
1212  *	metaset -s set_name -b
1213  */
1214 /*ARGSUSED*/
1215 void
1216 mdmn_do_sm_mddb_detach(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1217 {
1218 	md_mn_msg_meta_db_detach_t	*d;
1219 	struct mddb_config		c;
1220 	int				i;
1221 	int				ret = 0;
1222 	md_error_t			ep = mdnullerror;
1223 	char				*name, *del_name;
1224 	mdname_t			*np;
1225 	mdsetname_t			*sp;
1226 
1227 	resp->mmr_out_size = 0;
1228 	resp->mmr_err_size = 0;
1229 	resp->mmr_out = NULL;
1230 	resp->mmr_err = NULL;
1231 	resp->mmr_comm_state = MDMNE_ACK;
1232 	d = (md_mn_msg_meta_db_detach_t *)((void *)(msg->msg_event_data));
1233 
1234 	if ((sp = metasetnosetname(msg->msg_setno, &ep)) == NULL) {
1235 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1236 		resp->mmr_exitval = -1;
1237 		return;
1238 	}
1239 
1240 	(void) memset(&c, 0, sizeof (c));
1241 	c.c_setno = msg->msg_setno;
1242 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1243 		resp->mmr_exitval = -1;
1244 		(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1245 		return;
1246 	}
1247 	i = 0;
1248 	del_name = splicename(&d->msg_splitname);
1249 	while (i < c.c_dbcnt) {
1250 		c.c_id = i;
1251 		if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1252 			ret = -1;
1253 			(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1254 			break;
1255 		}
1256 		name = splicename(&c.c_devname);
1257 		if (strcmp(name, del_name) != 0) {
1258 			Free(name);
1259 			i++;
1260 			continue;
1261 		}
1262 		Free(name);
1263 		/* Found a match - delete mddb */
1264 		if (metaioctl(MD_DB_DELDEV, &c, &c.c_mde, NULL) != 0) {
1265 			ret = -1;
1266 			(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1267 			break;
1268 		}
1269 		/* Not incrementing "i" intentionally (dbcnt is changed) */
1270 	}
1271 	if ((np = metaname(&sp, del_name, LOGICAL_DEVICE, &ep)) != NULL) {
1272 		meta_invalidate_name(np);
1273 	} else {
1274 		ret = -1;
1275 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1276 	}
1277 	Free(del_name);
1278 
1279 	resp->mmr_exitval = ret;
1280 }
1281 
1282 /*
1283  * Handler for MD_MN_MSG_META_DB_NEWSIDE which is used to update the
1284  * side information for each diskset mddb when a new host has been
1285  * added to the diskset.  The side information is the /dev/dsk/ctds name
1286  * that the new node would use to access each mddb.
1287  *
1288  * Since this routine makes no changes to the records in the diskset mddb,
1289  * this routine only needs to be run on the master node.  The master node's
1290  * kernel code will detect that portions of the mddb have changed and
1291  * will send a parse message to all nodes to re-parse parts of the mddb.
1292  *
1293  * Used when running:
1294  * 	metaset -s set_name -a -h new_hostname
1295  */
1296 /*ARGSUSED*/
1297 void
1298 mdmn_do_meta_db_newside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1299 {
1300 	md_mn_msg_meta_db_newside_t	*d;
1301 	struct mddb_config		c;
1302 	int				ret = 0;
1303 	mdsetname_t			*sp;
1304 	md_error_t			ep = mdnullerror;
1305 
1306 	resp->mmr_out_size = 0;
1307 	resp->mmr_err_size = 0;
1308 	resp->mmr_out = NULL;
1309 	resp->mmr_err = NULL;
1310 	resp->mmr_comm_state = MDMNE_ACK;
1311 	d = (md_mn_msg_meta_db_newside_t *)((void *)(msg->msg_event_data));
1312 
1313 	(void) memset(&c, 0, sizeof (c));
1314 	c.c_setno = msg->msg_setno;
1315 	c.c_locator.l_dev = meta_cmpldev(d->msg_l_dev);
1316 	c.c_locator.l_blkno = d->msg_blkno;
1317 	(void) strncpy(c.c_locator.l_driver, d->msg_dname,
1318 		sizeof (c.c_locator.l_driver));
1319 	c.c_devname = d->msg_splitname;
1320 	c.c_locator.l_mnum = d->msg_mnum;
1321 	c.c_multi_node = 1;
1322 	if ((sp = metasetnosetname(c.c_setno, &ep)) == NULL) {
1323 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1324 		resp->mmr_exitval = -1;
1325 		return;
1326 	}
1327 	(void) strcpy(c.c_setname, sp->setname);
1328 	c.c_sideno = d->msg_sideno;
1329 
1330 	if ((ret = metaioctl(MD_DB_NEWSIDE, &c, &c.c_mde, NULL)) != 0) {
1331 		(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1332 	}
1333 	resp->mmr_exitval = ret;
1334 }
1335 
1336 /*
1337  * Handler for MD_MN_MSG_META_DB_DELSIDE which is used to remove the
1338  * side information for each diskset mddb when a host has been
1339  * deleted from the diskset.  The side information is the /dev/dsk/ctds name
1340  * that the node would use to access each mddb.
1341  *
1342  * Since this routine makes no changes to the records in the diskset mddb,
1343  * this routine only needs to be run on the master node.  The master node's
1344  * kernel code will detect that portions of the mddb have changed and
1345  * will send a parse message to all nodes to re-parse parts of the mddb.
1346  *
1347  * Used when running:
1348  * 	metaset -s set_name -d -h hostname
1349  */
1350 /*ARGSUSED*/
1351 void
1352 mdmn_do_meta_db_delside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1353 {
1354 	md_mn_msg_meta_db_delside_t	*d;
1355 	mddb_config_t			c;
1356 	int				ret = 0;
1357 	mdsetname_t			*sp;
1358 	md_error_t			ep = mdnullerror;
1359 
1360 	resp->mmr_out_size = 0;
1361 	resp->mmr_err_size = 0;
1362 	resp->mmr_out = NULL;
1363 	resp->mmr_err = NULL;
1364 	resp->mmr_comm_state = MDMNE_ACK;
1365 	d = (md_mn_msg_meta_db_delside_t *)((void *)(msg->msg_event_data));
1366 
1367 	(void) memset(&c, 0, sizeof (c));
1368 	c.c_setno = msg->msg_setno;
1369 	c.c_locator.l_dev = meta_cmpldev(d->msg_l_dev);
1370 	c.c_locator.l_blkno = d->msg_blkno;
1371 	c.c_multi_node = 1;
1372 	if ((sp = metasetnosetname(c.c_setno, &ep)) == NULL) {
1373 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1374 		resp->mmr_exitval = -1;
1375 		return;
1376 	}
1377 	(void) strcpy(c.c_setname, sp->setname);
1378 	c.c_sideno = d->msg_sideno;
1379 
1380 	if ((ret = metaioctl(MD_DB_DELSIDE, &c, &c.c_mde, NULL)) != 0) {
1381 		(void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1382 	}
1383 	resp->mmr_exitval = ret;
1384 }
1385 
1386 /*
1387  * Handler for MD_MN_MSG_META_MD_ADDSIDE which is used to add the
1388  * side information for each diskset metadevice component (if that
1389  * component is a disk) when a host has been added to the diskset.
1390  * The side information is the /dev/dsk/ctds name that the node would
1391  * use to access the metadevice component.
1392  *
1393  * This routine makes changes to the mddb records and must be run
1394  * on all nodes.
1395  *
1396  * Used when running:
1397  * 	metaset -s set_name -a -h new_hostname
1398  */
1399 /*ARGSUSED*/
1400 void
1401 mdmn_do_meta_md_addside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1402 {
1403 	md_mn_msg_meta_md_addside_t	*d;
1404 	mdnm_params_t			nm;
1405 	mdsetname_t			*sp;
1406 	char				*cname, *dname;
1407 	minor_t				mnum;
1408 	int				done, i;
1409 	md_error_t			ep = mdnullerror;
1410 
1411 	resp->mmr_out_size = 0;
1412 	resp->mmr_err_size = 0;
1413 	resp->mmr_out = NULL;
1414 	resp->mmr_err = NULL;
1415 	resp->mmr_comm_state = MDMNE_ACK;
1416 	d = (md_mn_msg_meta_md_addside_t *)((void *)(msg->msg_event_data));
1417 
1418 	(void) memset(&nm, 0, sizeof (nm));
1419 	if ((sp = metasetnosetname(msg->msg_setno, &ep)) == NULL) {
1420 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1421 		resp->mmr_exitval = -1;
1422 		return;
1423 	}
1424 	/* While loop continues until IOCNXTKEY_NM gives nm.key of KEYWILD */
1425 	/*CONSTCOND*/
1426 	while (1) {
1427 		nm.mde = mdnullerror;
1428 		nm.setno = msg->msg_setno;
1429 		nm.side = d->msg_otherside;
1430 		if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0) {
1431 			(void) mdstealerror(&(resp->mmr_ep), &nm.mde);
1432 			resp->mmr_exitval = -1;
1433 			return;
1434 		}
1435 
1436 		/* Normal exit path is to eventually get a KEYWILD */
1437 		if (nm.key == MD_KEYWILD) {
1438 			resp->mmr_exitval = 0;
1439 			return;
1440 		}
1441 
1442 		nm.devname = (uintptr_t)meta_getnmbykey(msg->msg_setno,
1443 			d->msg_otherside, nm.key, &ep);
1444 		if (nm.devname == NULL) {
1445 			(void) mdstealerror(&(resp->mmr_ep), &ep);
1446 			resp->mmr_exitval = -1;
1447 			return;
1448 		}
1449 		nm.side = d->msg_sideno;
1450 		if ((done = meta_getside_devinfo(sp,
1451 		    (char *)(uintptr_t)nm.devname,
1452 		    d->msg_sideno, &cname, &dname, &mnum, &ep)) == -1) {
1453 			(void) mdstealerror(&(resp->mmr_ep), &ep);
1454 			Free((void *)(uintptr_t)nm.devname);
1455 			resp->mmr_exitval = -1;
1456 			return;
1457 		}
1458 		Free((void *)(uintptr_t)nm.devname);
1459 		if (done != 1) {
1460 			Free(cname);
1461 			Free(dname);
1462 			resp->mmr_exitval = -1;
1463 			return;
1464 		}
1465 
1466 		/*
1467 		 * The device reference count can be greater than 1 if
1468 		 * more than one softpart is configured on top of the
1469 		 * same device.  If this is the case then we want to
1470 		 * increment the count to sync up with the other sides.
1471 		 */
1472 		for (i = 0; i < nm.ref_count; i++) {
1473 			if (add_name(sp, d->msg_sideno, nm.key, dname, mnum,
1474 			    cname, &ep) == -1) {
1475 				(void) mdstealerror(&(resp->mmr_ep), &ep);
1476 				Free(cname);
1477 				Free(dname);
1478 				resp->mmr_exitval = -1;
1479 				return;
1480 			}
1481 		}
1482 		Free(cname);
1483 		Free(dname);
1484 	}
1485 
1486 	/*NOTREACHED*/
1487 }
1488 /*
1489  * Handler for MD_MN_MSG_META_MD_DELSIDE which is used to delete the
1490  * side information for each diskset metadevice component (if that
1491  * component is a disk) when a host has been removed from the diskset.
1492  * The side information is the /dev/dsk/ctds name that the node would
1493  * use to access the metadevice component.
1494  *
1495  * This routine makes changes to the mddb records and must be run
1496  * on all nodes.
1497  *
1498  * Used when running:
1499  * 	metaset -s set_name -d -h hostname
1500  */
1501 /*ARGSUSED*/
1502 void
1503 mdmn_do_meta_md_delside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1504 {
1505 	md_mn_msg_meta_md_delside_t	*d;
1506 	mdnm_params_t			nm;
1507 	mdsetname_t			*sp;
1508 	md_error_t			ep = mdnullerror;
1509 	int				i;
1510 
1511 	resp->mmr_out_size = 0;
1512 	resp->mmr_err_size = 0;
1513 	resp->mmr_out = NULL;
1514 	resp->mmr_err = NULL;
1515 	resp->mmr_comm_state = MDMNE_ACK;
1516 	d = (md_mn_msg_meta_md_delside_t *)((void *)(msg->msg_event_data));
1517 
1518 	if ((sp = metasetnosetname(msg->msg_setno, &ep)) == NULL) {
1519 		(void) mdstealerror(&(resp->mmr_ep), &ep);
1520 		resp->mmr_exitval = -1;
1521 		return;
1522 	}
1523 
1524 	(void) memset(&nm, 0, sizeof (nm));
1525 	nm.key = MD_KEYWILD;
1526 	/*CONSTCOND*/
1527 	while (1) {
1528 		nm.mde = mdnullerror;
1529 		nm.setno = msg->msg_setno;
1530 		nm.side = MD_SIDEWILD;
1531 		if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0) {
1532 			(void) mdstealerror(&(resp->mmr_ep), &nm.mde);
1533 			resp->mmr_exitval = -1;
1534 			return;
1535 		}
1536 
1537 		/* Normal exit path is to eventually get a KEYWILD */
1538 		if (nm.key == MD_KEYWILD) {
1539 			resp->mmr_exitval = 0;
1540 			return;
1541 		}
1542 
1543 		/*
1544 		 * The device reference count can be greater than 1 if
1545 		 * more than one softpart is configured on top of the
1546 		 * same device.  If this is the case then we want to
1547 		 * decrement the count to zero so the entry can be
1548 		 * actually removed.
1549 		 */
1550 		for (i = 0; i < nm.ref_count; i++) {
1551 			if (del_name(sp, d->msg_sideno, nm.key, &ep) == -1) {
1552 				(void) mdstealerror(&(resp->mmr_ep), &ep);
1553 				resp->mmr_exitval = -1;
1554 				return;
1555 			}
1556 		}
1557 	}
1558 
1559 	/*NOTREACHED*/
1560 }
1561 
1562 /*
1563  * Handler for MD_MN_MSG_MDDB_OPTRECERR which is used to notify
1564  * the master node that a node has seen an error when attempting to
1565  * write to the optimized resync records that reside on 2 of the diskset
1566  * mddbs.  Master node will mark the failed replica in error and this
1567  * will send a parse message to all nodes to re-read parts of the mddb
1568  * and to fix their optimized resync records based on this information.
1569  */
1570 /*ARGSUSED*/
1571 void
1572 mdmn_do_mddb_optrecerr(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1573 {
1574 	md_mn_msg_mddb_optrecerr_t	*d;
1575 	mddb_optrec_parm_t		mop;
1576 	int				ret;
1577 	int				i;
1578 
1579 	resp->mmr_out_size = 0;
1580 	resp->mmr_err_size = 0;
1581 	resp->mmr_out = NULL;
1582 	resp->mmr_err = NULL;
1583 	resp->mmr_comm_state = MDMNE_ACK;
1584 	d = (md_mn_msg_mddb_optrecerr_t *)((void *)(msg->msg_event_data));
1585 
1586 	(void) memset(&mop, 0, sizeof (mop));
1587 	mop.c_setno = msg->msg_setno;
1588 	for (i = 0; i < 2; i++) {
1589 		mop.c_recerr[i] = d->msg_recerr[i];
1590 	}
1591 	ret = metaioctl(MD_MN_MDDB_OPTRECFIX, &mop, &mop.c_mde, NULL);
1592 	if (ret)
1593 		(void) mdstealerror(&(resp->mmr_ep), &mop.c_mde);
1594 
1595 	resp->mmr_exitval = ret;
1596 }
1597 
1598 int
1599 mdmn_smgen_test6(md_mn_msg_t *msg, md_mn_msg_t **msglist)
1600 {
1601 	md_mn_msg_t	*nmsg;
1602 
1603 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1604 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1605 
1606 	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
1607 	nmsg->msg_setno		= msg->msg_setno;
1608 	nmsg->msg_type		= MD_MN_MSG_TEST2;
1609 	nmsg->msg_event_size	= sizeof ("test2");
1610 	nmsg->msg_event_data	= Strdup("test2");
1611 	msglist[0] = nmsg;
1612 
1613 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1614 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1615 
1616 	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
1617 	nmsg->msg_setno		= msg->msg_setno;
1618 	nmsg->msg_type		= MD_MN_MSG_TEST2;
1619 	nmsg->msg_event_size	= sizeof ("test2");
1620 	nmsg->msg_event_data	= Strdup("test2");
1621 	msglist[1] = nmsg;
1622 
1623 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1624 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1625 
1626 	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
1627 	nmsg->msg_setno		= msg->msg_setno;
1628 	nmsg->msg_type		= MD_MN_MSG_TEST3;
1629 	nmsg->msg_event_size	= sizeof ("test3");
1630 	nmsg->msg_event_data	= Strdup("test3");
1631 	msglist[2] = nmsg;
1632 
1633 	nmsg = Zalloc(sizeof (md_mn_msg_t));
1634 	MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1635 
1636 	nmsg->msg_flags		= MD_MSGF_NO_LOG; /* Don't log submessages */
1637 	nmsg->msg_setno		= msg->msg_setno;
1638 	nmsg->msg_type		= MD_MN_MSG_TEST4;
1639 	nmsg->msg_event_size	= sizeof ("test4");
1640 	nmsg->msg_event_data	= Strdup("test4");
1641 	msglist[3] = nmsg;
1642 
1643 	return (4); /* Return the number of submessages generated */
1644 }
1645 
1646 /*
1647  * This is to send an MD_IOCSET ioctl to all nodes to create a soft
1648  * partition.
1649  */
1650 /*ARGSUSED*/
1651 void
1652 mdmn_do_iocset(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1653 {
1654 	md_mn_msg_iocset_t	*d;
1655 	int			ret;
1656 	set_t			setno;
1657 	mdsetname_t		*sp;
1658 	mdname_t		*np;
1659 	md_error_t		mde = mdnullerror;
1660 
1661 	resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1662 	resp->mmr_out_size = 0;
1663 	resp->mmr_err_size = 0;
1664 	resp->mmr_out = NULL;
1665 	resp->mmr_err = NULL;
1666 	d = (md_mn_msg_iocset_t *)(void *)msg->msg_event_data;
1667 
1668 	setno = MD_MIN2SET(d->iocset_params.mnum);
1669 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
1670 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1671 		    "MD_MN_MSG_IOCSET: Invalid setno %d\n"), setno);
1672 		resp->mmr_exitval = 1;
1673 		return;
1674 	}
1675 
1676 	/*
1677 	 * Device should be in the namespace already
1678 	 */
1679 	if ((np = metamnumname(&sp, d->iocset_params.mnum, 1, &mde)) == NULL) {
1680 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1681 		    "MD_MN_MSG_IOCSET: Invalid mnum %d\n"),
1682 		    d->iocset_params.mnum);
1683 		resp->mmr_exitval = 1;
1684 		return;
1685 	}
1686 
1687 	/*
1688 	 * Create unit structure
1689 	 */
1690 	d->iocset_params.mdp = (uintptr_t)&d->unit; /* set pointer to unit */
1691 	ret = metaioctl(MD_IOCSET, &(d->iocset_params), &mde, np->cname);
1692 	resp->mmr_exitval = ret;
1693 }
1694 
1695 /*
1696  * This is to update the status of a softpart
1697  */
1698 /*ARGSUSED*/
1699 void
1700 mdmn_do_sp_setstat(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1701 {
1702 	md_mn_msg_sp_setstat_t	*d;
1703 	int			ret;
1704 	set_t			setno;
1705 	mdsetname_t		*sp;
1706 	minor_t			mnum;
1707 	md_error_t		mde = mdnullerror;
1708 
1709 	resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1710 	resp->mmr_out_size = 0;
1711 	resp->mmr_err_size = 0;
1712 	resp->mmr_out = NULL;
1713 	resp->mmr_err = NULL;
1714 	d = (md_mn_msg_sp_setstat_t *)(void *)msg->msg_event_data;
1715 
1716 	mnum = d->sp_setstat_mnum;
1717 	setno = MD_MIN2SET(mnum);
1718 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
1719 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1720 		    "MD_MN_MSG_IOCSET: Invalid setno %d\n"), setno);
1721 		resp->mmr_exitval = 1;
1722 		return;
1723 	}
1724 
1725 	ret = meta_sp_setstatus(sp, &mnum, 1, d->sp_setstat_status, &mde);
1726 	resp->mmr_exitval = ret;
1727 }
1728 
1729 /*
1730  * This is to add a key to the namespace
1731  */
1732 /*ARGSUSED*/
1733 void
1734 mdmn_do_addkeyname(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1735 {
1736 	md_mn_msg_addkeyname_t	*d;
1737 	int			ret;
1738 	set_t			setno;
1739 	mdsetname_t		*sp;
1740 	md_error_t		mde = mdnullerror;
1741 	mdname_t		*compnp;
1742 
1743 	resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1744 	resp->mmr_out_size = 0;
1745 	resp->mmr_err_size = 0;
1746 	resp->mmr_out = NULL;
1747 	resp->mmr_err = NULL;
1748 	d = (md_mn_msg_addkeyname_t *)(void *)msg->msg_event_data;
1749 
1750 	setno = d->addkeyname_setno;
1751 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
1752 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1753 		    "MD_MN_ADDKEYNAME: Invalid setno %d\n"), setno);
1754 		resp->mmr_exitval = -1;
1755 		return;
1756 	}
1757 
1758 	compnp = metaname(&sp, d->addkeyname_name, UNKNOWN, &mde);
1759 	if (compnp != NULL) {
1760 		ret = add_key_name(sp, compnp, NULL, &mde);
1761 		if (ret < 0)
1762 			resp->mmr_exitval = -1;
1763 		else
1764 			resp->mmr_exitval = compnp->key;
1765 	} else {
1766 		resp->mmr_exitval = -1;
1767 	}
1768 }
1769 
1770 /*
1771  * This is to delete a key from the namespace
1772  */
1773 /*ARGSUSED*/
1774 void
1775 mdmn_do_delkeyname(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1776 {
1777 	md_mn_msg_delkeyname_t	*d;
1778 	int			ret;
1779 	set_t			setno;
1780 	mdsetname_t		*sp;
1781 	md_error_t		mde = mdnullerror;
1782 	mdname_t		*compnp;
1783 
1784 	resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1785 	resp->mmr_out_size = 0;
1786 	resp->mmr_err_size = 0;
1787 	resp->mmr_out = NULL;
1788 	resp->mmr_err = NULL;
1789 	d = (md_mn_msg_delkeyname_t *)(void *)msg->msg_event_data;
1790 
1791 	setno = d->delkeyname_setno;
1792 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
1793 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1794 		    "MD_MN_DELKEYNAME: Invalid setno %d\n"), setno);
1795 		resp->mmr_exitval = -1;
1796 		return;
1797 	}
1798 
1799 	compnp = metadevname(&sp, d->delkeyname_dev, &mde);
1800 	if (compnp != NULL) {
1801 		/*
1802 		 * Reset the key value for the name. This is required because
1803 		 * any previous call of del_key_name for the same component
1804 		 * will have resulted in the key value being reset to MD_KEYBAD
1805 		 * even though there may still be references to this component.
1806 		 */
1807 		compnp->key = d->delkeyname_key;
1808 		ret = del_key_name(sp, compnp, &mde);
1809 		resp->mmr_exitval = ret;
1810 	} else {
1811 		resp->mmr_exitval = -1;
1812 	}
1813 }
1814 
1815 /*
1816  * This is to get the value of tstate from the master node. We use this
1817  * to get the ABR state of a metadevice from the master.
1818  */
1819 /*ARGSUSED*/
1820 void
1821 mdmn_do_get_tstate(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1822 {
1823 	md_mn_msg_gettstate_t	*d;
1824 	int			ret;
1825 	uint_t			tstate;
1826 	md_error_t		mde = mdnullerror;
1827 
1828 	resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1829 	resp->mmr_out_size = 0;
1830 	resp->mmr_err_size = 0;
1831 	resp->mmr_out = NULL;
1832 	resp->mmr_err = NULL;
1833 	d = (md_mn_msg_gettstate_t *)(void *)msg->msg_event_data;
1834 
1835 	ret = meta_get_tstate(d->gettstate_dev, &tstate, &mde);
1836 	if (ret != 0) {
1837 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1838 		    "MD_MN_GET_TSTATE: Invalid dev %llx\n"), d->gettstate_dev);
1839 		tstate = 0;
1840 	}
1841 	resp->mmr_exitval = tstate;
1842 }
1843 
1844 /*
1845  * This is to get the mirror ABR state and the state of its submirrors from
1846  * the master node. We need this to ensure consistent output from metastat
1847  * when a new node joins the cluster during a resync. Without this the
1848  * submirror status will be incorrect until the whole resync is complete which
1849  * may take days for very large metadevices.
1850  */
1851 /*ARGSUSED*/
1852 void
1853 mdmn_do_get_mirstate(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1854 {
1855 	md_mn_msg_mir_state_t		*d;
1856 	md_mn_msg_mir_state_res_t	*res;		/* Results */
1857 	set_t				setno;
1858 	mdsetname_t			*sp;		/* Set name */
1859 	mdname_t			*mirnp;		/* Mirror name */
1860 	md_error_t			mde = mdnullerror;
1861 	mm_unit_t			*mm;		/* Mirror */
1862 	int				smi;
1863 	uint_t				tstate;
1864 
1865 	resp->mmr_comm_state = MDMNE_ACK;
1866 	resp->mmr_out_size = sizeof (md_mn_msg_mir_state_res_t);
1867 	resp->mmr_err_size = 0;
1868 	resp->mmr_out = Malloc(resp->mmr_out_size);
1869 	resp->mmr_err = NULL;
1870 	d = (md_mn_msg_mir_state_t *)(void *)msg->msg_event_data;
1871 	res = (md_mn_msg_mir_state_res_t *)(void *)resp->mmr_out;
1872 
1873 	/* Validate set information from minor number */
1874 	setno = MD_MIN2SET(d->mir_state_mnum);
1875 	sp = metasetnosetname(setno, &mde);
1876 	if (sp == NULL) {
1877 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1878 		    "MD_MN_GET_MIRROR_STATE: Invalid set %d\n"), setno);
1879 		resp->mmr_exitval = 1;	/* Failure */
1880 		Free(resp->mmr_out);
1881 		resp->mmr_out_size = 0;
1882 		return;
1883 	}
1884 
1885 	/* Construct mirror name from minor number */
1886 	mirnp = metamnumname(&sp, d->mir_state_mnum, 0, &mde);
1887 	if (mirnp == NULL) {
1888 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1889 		    "MD_MN_GET_MIRROR_STATE: Invalid minor %lx\n"),
1890 		    d->mir_state_mnum);
1891 		resp->mmr_exitval = 2;	/* Failure */
1892 		Free(resp->mmr_out);
1893 		resp->mmr_out_size = 0;
1894 		return;
1895 	}
1896 
1897 	/* Get common mirror structure */
1898 	mm = (mm_unit_t *)meta_get_mdunit(sp, mirnp, &mde);
1899 	if (mm == NULL) {
1900 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1901 		    "MD_MN_GET_MIRROR_STATE: Invalid mirror minor %x\n"),
1902 		    d->mir_state_mnum);
1903 		resp->mmr_exitval = 3;	/* Failure */
1904 		Free(resp->mmr_out);
1905 		resp->mmr_out_size = 0;
1906 		return;
1907 	}
1908 
1909 	if (meta_get_tstate(d->mir_state_mnum, &tstate, &mde) != 0) {
1910 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1911 		    "MD_MN_GET_MIRROR_STATE: Invalid minor %lx\n"),
1912 		    d->mir_state_mnum);
1913 		resp->mmr_exitval = 4;	/* Failure */
1914 		Free(resp->mmr_out);
1915 		resp->mmr_out_size = 0;
1916 		return;
1917 	}
1918 	/*
1919 	 * Fill in the sm_state/sm_flags value in the results structure which
1920 	 * gets passed back to the message originator
1921 	 */
1922 	resp->mmr_exitval = 0;
1923 	for (smi = 0; (smi < NMIRROR); smi++) {
1924 		mm_submirror_t *mmsp = &mm->un_sm[smi];
1925 		res->sm_state[smi] = mmsp->sm_state;
1926 		res->sm_flags[smi] = mmsp->sm_flags;
1927 	}
1928 	/* Returm value of tstate for mirror */
1929 	res->mir_tstate = tstate;
1930 }
1931 
1932 /*
1933  * This is to issue an ioctl to call poke_hotspares
1934  */
1935 /*ARGSUSED*/
1936 void
1937 mdmn_do_poke_hotspares(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1938 {
1939 
1940 	md_mn_poke_hotspares_t	pokehsp;
1941 	md_mn_msg_pokehsp_t	*d;
1942 
1943 	resp->mmr_out_size = 0;
1944 	resp->mmr_err_size = 0;
1945 	resp->mmr_out = NULL;
1946 	resp->mmr_err = NULL;
1947 	resp->mmr_comm_state = MDMNE_ACK;
1948 	d = (md_mn_msg_pokehsp_t *)(void *)msg->msg_event_data;
1949 
1950 	(void) memset(&pokehsp, 0, sizeof (pokehsp));
1951 	MD_SETDRIVERNAME(&pokehsp, MD_MIRROR, d->pokehsp_setno);
1952 
1953 	resp->mmr_exitval = metaioctl(MD_MN_POKE_HOTSPARES, &pokehsp,
1954 	    &pokehsp.mde, NULL);
1955 }
1956 
1957 /*
1958  * Called to create a softpart during a metarecover operation
1959  */
1960 /*ARGSUSED*/
1961 void
1962 mdmn_do_addmdname(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1963 {
1964 	md_mn_msg_addmdname_t	*d;
1965 	md_error_t		mde = mdnullerror;
1966 	mdsetname_t		*sp;
1967 	int			init = 0;
1968 	mdkey_t			key;
1969 	minor_t			mnum;
1970 
1971 	resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1972 	resp->mmr_out_size = 0;
1973 	resp->mmr_err_size = 0;
1974 	resp->mmr_out = NULL;
1975 	resp->mmr_err = NULL;
1976 	d = (md_mn_msg_addmdname_t *)(void *)msg->msg_event_data;
1977 
1978 	if ((sp = metasetnosetname(d->addmdname_setno, &mde)) == NULL) {
1979 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1980 		    "MD_MN_MSG_ADDMDNAME: Invalid setno %d\n"),
1981 		    d->addmdname_setno);
1982 		resp->mmr_exitval = 1;
1983 		return;
1984 	}
1985 
1986 	/*
1987 	 * If device node does not exist then init it
1988 	 */
1989 	if (!is_existing_meta_hsp(sp, d->addmdname_name)) {
1990 	    if ((key = meta_init_make_device(&sp, d->addmdname_name,
1991 		&mde)) <= 0) {
1992 		    syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1993 			"MD_MN_MSG_ADDMDNAME: Invalid name %s\n"),
1994 			d->addmdname_name);
1995 		    resp->mmr_exitval = 1;
1996 		    return;
1997 		}
1998 
1999 		init = 1;
2000 	}
2001 
2002 	/*
2003 	 * We should have it
2004 	 */
2005 	if (metaname(&sp, d->addmdname_name, META_DEVICE, &mde) == NULL) {
2006 
2007 	    if (init) {
2008 		if (meta_getnmentbykey(sp->setno, MD_SIDEWILD,
2009 		    key, NULL, &mnum, NULL, &mde) != NULL) {
2010 			(void) metaioctl(MD_IOCREM_DEV, &mnum,
2011 				&mde, NULL);
2012 		}
2013 		(void) del_self_name(sp, key, &mde);
2014 	    }
2015 
2016 	    resp->mmr_exitval = 1;
2017 	    return;
2018 	}
2019 
2020 	resp->mmr_exitval = 0;
2021 }
2022