xref: /onnv-gate/usr/src/lib/lvm/libmeta/common/meta_set_hst.c (revision 62:5e51ad5d0496)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Just in case we're not in a build environment, make sure that
31  * TEXT_DOMAIN gets set to something.
32  */
33 #if !defined(TEXT_DOMAIN)
34 #define	TEXT_DOMAIN "SYS_TEST"
35 #endif
36 
37 /*
38  * Metadevice diskset interfaces
39  */
40 
41 #include "meta_set_prv.h"
42 #include <meta.h>
43 #include <sys/lvm/md_crc.h>
44 #include <sys/time.h>
45 #include <sdssc.h>
46 
47 static int
48 add_db_sidenms(
49 	mdsetname_t	*sp,
50 	md_error_t	*ep
51 )
52 {
53 	md_replicalist_t	*rlp = NULL;
54 	md_replicalist_t	*rl;
55 	int			rval = 0;
56 
57 	if (metareplicalist(sp, MD_FULLNAME_ONLY, &rlp, ep) < 0)
58 		return (-1);
59 
60 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
61 		md_replica_t	*r = rl->rl_repp;
62 
63 		/*
64 		 * This is not the first replica being added to the
65 		 * diskset so call with ADDSIDENMS_BCAST.  If this
66 		 * is a traditional diskset, the bcast flag is ignored
67 		 * since traditional disksets don't use the rpc.mdcommd.
68 		 */
69 		if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno,
70 		    DB_ADDSIDENMS_BCAST, ep)) {
71 			rval = -1;
72 			goto out;
73 		}
74 	}
75 
76 out:
77 	metafreereplicalist(rlp);
78 	return (rval);
79 }
80 
81 static int
82 add_drvs_to_hosts(
83 	mdsetname_t	*sp,
84 	int		node_c,
85 	char		**node_v,
86 	md_error_t	*ep
87 )
88 {
89 	int		i;
90 	md_set_desc	*sd;
91 	md_drive_desc	*dd;
92 	md_timeval32_t	now;
93 	ulong_t		genid;
94 
95 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
96 		return (-1);
97 
98 	if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL) {
99 		if (! mdisok(ep))
100 			return (-1);
101 		return (0);
102 	}
103 
104 	now = sd->sd_ctime;
105 	genid = sd->sd_genid - 1;
106 
107 	for (i = 0; i < node_c; i++) {
108 		if (clnt_adddrvs(node_v[i], sp, dd, now, genid, ep) == -1)
109 			return (-1);
110 	}
111 
112 	return (0);
113 }
114 
115 static int
116 add_md_sidenms(mdsetname_t *sp, side_t sideno, side_t otherside, md_error_t *ep)
117 {
118 	mdnm_params_t	nm;
119 	char		*cname, *dname;
120 	side_t		tmp_sideno;
121 	minor_t		mnum;
122 	int		done, i;
123 	int		rval = 0;
124 	md_set_desc	*sd;
125 
126 	(void) memset(&nm, '\0', sizeof (nm));
127 	nm.key   = MD_KEYWILD;
128 
129 	if (!metaislocalset(sp)) {
130 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
131 			return (-1);
132 	}
133 	/* Use rpc.mdcommd to add md side info from all nodes */
134 	if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
135 	    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
136 		md_mn_result_t			*resultp = NULL;
137 		md_mn_msg_meta_md_addside_t	md_as;
138 		int				send_rval;
139 
140 		md_as.msg_sideno = sideno;
141 		md_as.msg_otherside = otherside;
142 		/*
143 		 * If reconfig cycle has been started, this node is stuck in
144 		 * in the return step until this command has completed.  If
145 		 * mdcommd is suspended, ask send_message to fail (instead of
146 		 * retrying) so that metaset can finish allowing the
147 		 * reconfig cycle to proceed.
148 		 */
149 		send_rval = mdmn_send_message(sp->setno,
150 		    MD_MN_MSG_META_MD_ADDSIDE,
151 		    MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT,
152 		    (char *)&md_as, sizeof (md_mn_msg_meta_md_addside_t),
153 		    &resultp, ep);
154 		if (send_rval != 0) {
155 			(void) mdstealerror(ep, &(resultp->mmr_ep));
156 			if (resultp)
157 				free_result(resultp);
158 			return (-1);
159 		}
160 		if (resultp)
161 			free_result(resultp);
162 		return (0);
163 	} else {
164 		/*CONSTCOND*/
165 		while (1) {
166 			nm.mde   = mdnullerror;
167 			nm.setno = sp->setno;
168 			nm.side  = otherside;
169 			if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0)
170 				return (mdstealerror(ep, &nm.mde));
171 
172 			if (nm.key == MD_KEYWILD)
173 				return (0);
174 
175 			nm.devname = (uintptr_t)meta_getnmbykey(sp->setno,
176 				otherside, nm.key, ep);
177 			if (nm.devname == NULL)
178 				return (-1);
179 
180 			nm.side = sideno;
181 			if (MD_MNSET_DESC(sd)) {
182 				tmp_sideno = sideno;
183 			} else {
184 				tmp_sideno = sideno - 1;
185 			}
186 
187 			if ((done = meta_getnextside_devinfo(sp,
188 			    (char *)(uintptr_t)nm.devname, &tmp_sideno,
189 			    &cname, &dname, &mnum, ep)) == -1) {
190 				Free((void *)(uintptr_t)nm.devname);
191 				return (-1);
192 			}
193 
194 			assert(done == 1);
195 			Free((void *)(uintptr_t)nm.devname);
196 
197 			/*
198 			 * The device reference count can be greater than 1 if
199 			 * more than one softpart is configured on top of the
200 			 * same device.  If this is the case then we want to
201 			 * increment the count to sync up with the other sides.
202 			 */
203 			for (i = 0; i < nm.ref_count; i++) {
204 			    if (add_name(sp, sideno, nm.key, dname, mnum, cname,
205 				ep) == -1)
206 				rval = -1;
207 			}
208 
209 			Free(cname);
210 			Free(dname);
211 
212 			if (rval != 0)
213 				return (rval);
214 		}
215 	}
216 
217 	/*NOTREACHED*/
218 }
219 
220 static int
221 check_setdrvs_againstnode(mdsetname_t *sp, char *node, md_error_t *ep)
222 {
223 	mddrivename_t	*dp;
224 	md_drive_desc	*dd, *ddp;
225 
226 	if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL)
227 		if (! mdisok(ep))
228 			return (-1);
229 
230 	for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
231 		dp = ddp->dd_dnp;
232 
233 		if (checkdrive_onnode(sp, dp, node, ep))
234 			return (-1);
235 	}
236 
237 	return (0);
238 }
239 
240 static int
241 create_multinode_set_on_hosts(
242 	mdsetname_t	*sp,
243 	int		node_c,		/* Number of new nodes */
244 	char		**node_v,	/* Nodes which are being added */
245 	int		new_set,
246 	md_error_t	*ep
247 )
248 {
249 	int				i;
250 	md_set_desc			*sd;
251 	md_timeval32_t			now;
252 	ulong_t				genid;
253 	int				rval = 0;
254 	md_mnnode_desc			*nd, *ndm = NULL;
255 	md_mnnode_desc			*nd_prev, *nd_curr;
256 	int				nodecnt;
257 	mndiskset_membershiplist_t	*nl, *nl2;
258 
259 	if (!new_set) {
260 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
261 			return (-1);
262 		now = sd->sd_ctime;
263 		genid = sd->sd_genid - 1;
264 		if (sd->sd_drvs)
265 			genid--;
266 	} else {
267 		sd = Zalloc(sizeof (*sd));
268 
269 		if (meta_gettimeofday(&now) == -1) {
270 			(void) mdsyserror(ep, errno,
271 			    dgettext(TEXT_DOMAIN, "meta_gettimeofday()"));
272 			rval = -1;
273 			goto out;
274 		}
275 
276 		/* Put the new entries into the set */
277 		/*
278 		 * Get membershiplist from API routine.  If there's
279 		 * an error, fail to create set and pass back error.
280 		 */
281 		if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
282 			rval = -1;
283 			goto out;
284 		}
285 
286 		/*
287 		 * meta_set_addhosts has already verified that
288 		 * this node list is in the membership list
289 		 * so set ALIVE flag.
290 		 * Since this is a new set, all hosts being
291 		 * added are new to the set, so also set ADD flag.
292 		 */
293 		for (i = 0; i < node_c; i++) {
294 			nd = Zalloc(sizeof (*nd));
295 			(void) strcpy(nd->nd_nodename, node_v[i]);
296 			nd->nd_ctime = now;
297 			nd->nd_flags = (MD_MN_NODE_ALIVE |
298 				MD_MN_NODE_ADD);
299 			nl2 = nl;
300 			while (nl2) {
301 			    if (strcmp(nl2->msl_node_name,
302 				node_v[i]) == 0) {
303 				    nd->nd_nodeid = nl2->msl_node_id;
304 				    (void) strcpy(nd->nd_priv_ic,
305 					nl2->msl_node_addr);
306 				    break;
307 			    }
308 			    nl2 = nl2->next;
309 			}
310 
311 			/*
312 			 * Nodelist must be kept in ascending
313 			 * nodeid order.
314 			 */
315 			if (sd->sd_nodelist == NULL) {
316 				/* Nothing in list, just add it */
317 				sd->sd_nodelist = nd;
318 			} else if (nd->nd_nodeid < sd->sd_nodelist->nd_nodeid) {
319 				/* Add to head of list */
320 				nd->nd_next = sd->sd_nodelist;
321 				sd->sd_nodelist = nd;
322 			} else {
323 				nd_curr = sd->sd_nodelist->nd_next;
324 				nd_prev = sd->sd_nodelist;
325 				/* Search for place ot add it */
326 				while (nd_curr) {
327 					if (nd->nd_nodeid <
328 					    nd_curr->nd_nodeid) {
329 						/* Add before nd_curr */
330 						nd->nd_next = nd_curr;
331 						nd_prev->nd_next = nd;
332 						break;
333 					}
334 					nd_prev = nd_curr;
335 					nd_curr = nd_curr->nd_next;
336 				}
337 				/* Add to end of list */
338 				if (nd_curr == NULL) {
339 					nd_prev->nd_next = nd;
340 				}
341 
342 			}
343 			/* Set master to be first node added */
344 			if (ndm == NULL)
345 				ndm = nd;
346 		}
347 
348 		meta_free_nodelist(nl);
349 		/*
350 		 * Creating mnset for first time.
351 		 * Set master to be invalid until first drive is
352 		 * in set.
353 		 */
354 		(void) strcpy(sd->sd_mn_master_nodenm, "");
355 		sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
356 		sd->sd_mn_masternode = ndm;
357 		sd->sd_ctime = now;
358 		genid = sd->sd_genid = 0;
359 	}
360 
361 	/* Create the set where needed */
362 	for (i = 0; i < node_c; i++) {
363 		/*
364 		 * Create the set on each new node.  If the set already
365 		 * exists, then the node list being created on each new node
366 		 * is the current node list from before the new nodes
367 		 * were added.  If the set doesn't exist, then the node
368 		 * list being created on each new node is the entire
369 		 * new node list.
370 		 */
371 		if (clnt_mncreateset(node_v[i], sp, sd->sd_nodelist,
372 		    now, genid, sd->sd_mn_master_nodenm,
373 		    sd->sd_mn_master_nodeid, ep) == -1) {
374 			rval = -1;
375 			break;
376 		}
377 	}
378 
379 out:
380 	if (new_set) {
381 		nd = sd->sd_nodelist;
382 		while (nd) {
383 			sd->sd_nodelist = nd->nd_next;
384 			Free(nd);
385 			nd = sd->sd_nodelist;
386 		}
387 		Free(sd);
388 	}
389 
390 	if (rval != 0 || new_set)
391 		return (rval);
392 
393 	/*
394 	 * Add the drive records to the new sets
395 	 * and names for the new sides.
396 	 */
397 	return (add_drvs_to_hosts(sp, node_c, node_v, ep));
398 }
399 
400 
401 static int
402 create_traditional_set_on_hosts(
403 	mdsetname_t	*sp,
404 	int		node_c,		/* Number of new nodes */
405 	char		**node_v,	/* Nodes which are being added */
406 	int		new_set,
407 	md_error_t	*ep
408 )
409 {
410 	int		i;
411 	md_set_desc	*sd;
412 	md_timeval32_t	now;
413 	ulong_t		genid;
414 	int		rval = 0;
415 
416 	if (!new_set) {
417 
418 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
419 			return (-1);
420 		now = sd->sd_ctime;
421 
422 		genid = sd->sd_genid;
423 
424 		if (sd->sd_drvs)
425 			genid--;
426 	} else {
427 		if (node_c > MD_MAXSIDES)
428 			return (mddserror(ep, MDE_DS_SIDENUMNOTAVAIL,
429 			    sp->setno, NULL, NULL, sp->setname));
430 
431 		sd = Zalloc(sizeof (*sd));
432 
433 		/* Put the new entries into the set */
434 		for (i = 0; i < node_c; i++) {
435 			(void) strcpy(sd->sd_nodes[i], node_v[i]);
436 		}
437 
438 		if (meta_gettimeofday(&now) == -1) {
439 			(void) mdsyserror(ep, errno, "meta_gettimeofday()");
440 			rval = -1;
441 			goto out;
442 		}
443 
444 		sd->sd_ctime = now;
445 		genid = sd->sd_genid = 0;
446 	}
447 
448 	/* Create the set where needed */
449 	for (i = 0; i < node_c; i++) {
450 		/*
451 		 * Create the set on each new host
452 		 */
453 		if (clnt_createset(node_v[i], sp, sd->sd_nodes, now, genid,
454 		    ep) == -1) {
455 			rval = -1;
456 			break;
457 		}
458 	}
459 
460 out:
461 	if (new_set)
462 		Free(sd);
463 
464 	if (rval != 0 || new_set)
465 		return (rval);
466 
467 	/*
468 	 * Add the drive records to the new sets
469 	 * and names for the new sides.
470 	 */
471 	return (add_drvs_to_hosts(sp, node_c, node_v, ep));
472 }
473 
474 static int
475 create_set_on_hosts(
476 	mdsetname_t	*sp,
477 	int		multi_node,	/* Multi_node diskset or not? */
478 	int		node_c,		/* Number of new nodes */
479 	char		**node_v,	/* Nodes which are being added */
480 	int		new_set,
481 	md_error_t	*ep
482 )
483 {
484 	if (multi_node)
485 		return (create_multinode_set_on_hosts(sp, node_c, node_v,
486 		    new_set, ep));
487 	else
488 		return (create_traditional_set_on_hosts(sp, node_c, node_v,
489 		    new_set, ep));
490 }
491 
492 static int
493 create_set(
494 	mdsetname_t	*sp,
495 	int		multi_node,	/* Multi-node diskset or not? */
496 	int		node_c,
497 	char		**node_v,
498 	int		auto_take,
499 	md_error_t	*ep
500 )
501 {
502 	int		i;
503 	int		rval = 0;
504 	set_t		max_sets;
505 	set_t		setno;
506 	int		bool;
507 	uint_t		sr_flags;
508 	sigset_t	oldsigs;
509 	md_setkey_t	*cl_sk;
510 	int		rb_level = 0;
511 	md_error_t	xep = mdnullerror;
512 	rval_e		sdssc_rval;
513 	int		lock_flag = 0;
514 	int		sig_flag = 0;
515 
516 	if ((max_sets = get_max_sets(ep)) == 0)
517 		return (-1);
518 
519 	/* We must be a member of the set we are creating */
520 	if (! strinlst(mynode(), node_c, node_v))
521 		return (mddserror(ep, MDE_DS_SELFNOTIN,
522 		    sp->setno, mynode(), NULL, sp->setname));
523 
524 	/*
525 	 * If auto_take then we must be the only member of the set
526 	 * that we are creating.
527 	 */
528 	if (auto_take && node_c > 1)
529 		return (mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL,
530 		    sp->setname));
531 
532 	/*
533 	 * If we're part of SC3.0 we'll already have allocated the
534 	 * set number so we can skip the allocation algorithm used.
535 	 * Set number is unique across traditional and MN disksets.
536 	 */
537 	if ((sdssc_rval = sdssc_get_index(sp->setname, &setno))
538 	    == SDSSC_NOT_BOUND) {
539 
540 		for (i = 0; i < node_c; i++) {
541 			int	has_set;
542 
543 			/* Skip my node */
544 			if (strcmp(mynode(), node_v[i]) == 0)
545 				continue;
546 
547 			/*
548 			 * Make sure this set name is not used on the
549 			 * other hosts
550 			 */
551 			has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep);
552 			if (has_set < 0) {
553 				if (! mdiserror(ep, MDE_NO_SET)) {
554 					rval = -1;
555 					goto out;
556 				}
557 				mdclrerror(ep);
558 				continue;
559 			}
560 
561 			if (has_set) {
562 				(void) mddserror(ep, MDE_DS_NODEHASSET,
563 				    sp->setno, node_v[i], NULL, sp->setname);
564 				rval = -1;
565 				goto out;
566 			}
567 		}
568 
569 		for (setno = 1; setno < max_sets; setno++) {
570 			for (i = 0; i < node_c; i++) {
571 				if (clnt_setnumbusy(node_v[i], setno,
572 				    &bool, ep) == -1) {
573 					rval = -1;
574 					goto out;
575 				}
576 
577 				if (bool == TRUE)
578 					break;
579 			}
580 			if (i == node_c)
581 				break;
582 		}
583 	} else if (sdssc_rval != SDSSC_OKAY) {
584 		(void) mddserror(ep, MDE_DS_SETNUMNOTAVAIL, MD_SET_BAD, NULL,
585 		    NULL, sp->setname);
586 		rval = -1;
587 		goto out;
588 	}
589 
590 	if (setno == max_sets) {
591 		(void) mddserror(ep, MDE_DS_SETNUMNOTAVAIL, MD_SET_BAD, NULL,
592 		    NULL, sp->setname);
593 		rval = -1;
594 		goto out;
595 	}
596 
597 	sp->setno = setno;
598 
599 	/*
600 	 * Lock the set on current set members.
601 	 * Set locking done much earlier for MN diskset than for traditional
602 	 * diskset since lock_set is used to protect against
603 	 * other meta* commands running on the other nodes.
604 	 * Don't issue mdcommd SUSPEND command since there is nothing
605 	 * to suspend since there currently is no set.
606 	 */
607 	if (multi_node) {
608 		/* Make sure we are blocking all signals */
609 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
610 			mdclrerror(&xep);
611 		sig_flag = 1;
612 
613 		/* Lock the set on new set members */
614 		for (i = 0; i < node_c; i++) {
615 			if (clnt_lock_set(node_v[i], sp, ep)) {
616 				rval = -1;
617 				goto out;
618 			}
619 			lock_flag = 1;
620 		}
621 		/* Now have the diskset locked, verify set number is still ok */
622 		for (i = 0; i < node_c; i++) {
623 			if (clnt_setnumbusy(node_v[i], setno,
624 			    &bool, ep) == -1) {
625 				rval = -1;
626 				goto out;
627 			}
628 		}
629 	}
630 
631 
632 	if (meta_set_checkname(sp->setname, ep)) {
633 		rval = -1;
634 		goto out;
635 	}
636 
637 	for (i = 0; i < node_c; i++) {
638 		if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1) {
639 			rval = -1;
640 			goto out;
641 		}
642 		if (bool == FALSE) {
643 			(void) mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno,
644 			    node_v[i], NULL, sp->setname);
645 			rval = -1;
646 			goto out;
647 		}
648 	}
649 
650 	/* END CHECK CODE */
651 
652 	/* Lock the set on new set members */
653 	if (!multi_node) {
654 		md_rb_sig_handling_on();
655 		sig_flag = 1;
656 		for (i = 0; i < node_c; i++) {
657 			if (clnt_lock_set(node_v[i], sp, ep)) {
658 				rval = -1;
659 				goto out;
660 			}
661 			lock_flag = 1;
662 		}
663 	}
664 
665 	RB_TEST(1, "create_set", ep)
666 
667 	RB_PREEMPT;
668 	rb_level = 1;	/* level 1 */
669 
670 	RB_TEST(2, "create_set", ep)
671 
672 	if ((rval = create_set_on_hosts(sp, multi_node, node_c, node_v,
673 	    1, ep)) == -1)
674 		goto rollback;
675 
676 	RB_TEST(3, "create_set", ep)
677 
678 	if (auto_take)
679 		sr_flags = MD_SR_OK | MD_SR_AUTO_TAKE;
680 	else
681 		sr_flags = MD_SR_OK;
682 
683 	/*
684 	 * Mark the set record MD_SR_OK
685 	 */
686 	for (i = 0; i < node_c; i++)
687 		if (clnt_upd_sr_flags(node_v[i], sp, sr_flags, ep))
688 			goto rollback;
689 
690 	rb_level = 2;	/* level 2 */
691 
692 	/*
693 	 * For MN diskset:
694 	 * On each added node, set the node record for that node
695 	 * to OK.  Then set all node records for the newly added
696 	 * nodes on all nodes to ok.
697 	 *
698 	 * By setting a node's own node record to ok first, even if
699 	 * the node adding the hosts panics, the rest of the nodes can
700 	 * determine the same node list during the choosing of the master
701 	 * during reconfig.  So, only nodes considered for mastership
702 	 * are nodes that have both MD_MN_NODE_OK and MD_SR_OK set
703 	 * on that node's rpc.metad.  If all nodes have MD_SR_OK set,
704 	 * but no node has its own MD_MN_NODE_OK set, then the set will
705 	 * be removed during reconfig since a panic occurred during the
706 	 * creation of the initial diskset.
707 	 */
708 
709 	if (multi_node) {
710 		md_mnnode_desc	*nd, *saved_nd_next;
711 		md_set_desc	*sd;
712 
713 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
714 			goto rollback;
715 		}
716 
717 		for (i = 0; i < node_c; i++) {
718 			nd = sd->sd_nodelist;
719 			/* All nodes are guaranteed to be ALIVE */
720 			while (nd) {
721 				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
722 					break;
723 				nd = nd->nd_next;
724 			}
725 			/* Something wrong, will pick this up in next loop */
726 			if (nd == NULL)
727 				continue;
728 
729 			/* Only changing my local cache of node list */
730 			saved_nd_next = nd->nd_next;
731 			nd->nd_next = NULL;
732 
733 			/* Set node record for added host to ok on that host */
734 			if (clnt_upd_nr_flags(node_v[i], sp,
735 			    nd, MD_NR_OK, NULL, ep)) {
736 				nd->nd_next = saved_nd_next;
737 				goto rollback;
738 			}
739 			nd->nd_next = saved_nd_next;
740 		}
741 
742 		/* Now set all node records on all nodes to be ok */
743 		nd = sd->sd_nodelist;
744 		/* All nodes are guaranteed to be ALIVE */
745 		while (nd) {
746 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
747 			    sd->sd_nodelist, MD_NR_OK, NULL, ep)) {
748 				goto rollback;
749 			}
750 			nd = nd->nd_next;
751 		}
752 	}
753 
754 	RB_TEST(4, "create_set", ep)
755 
756 out:
757 	if ((rval == 0) && multi_node) {
758 		/*
759 		 * Set successfully created.
760 		 * Notify rpc.mdcommd on all nodes of a nodelist change.
761 		 * Send reinit command to mdcommd which forces it to get
762 		 * fresh set description.  Then send resume.
763 		 * Resume on class 0 will resume all classes.
764 		 */
765 		for (i = 0; i < node_c; i++) {
766 			/* Class is ignored for REINIT */
767 			if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT,
768 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
769 				if (rval == 0)
770 					(void) mdstealerror(ep, &xep);
771 				rval = -1;
772 				mde_perror(ep, dgettext(TEXT_DOMAIN,
773 				    "Unable to reinit rpc.mdcommd.\n"));
774 			}
775 		}
776 		for (i = 0; i < node_c; i++) {
777 			if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME,
778 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
779 				if (rval == 0)
780 					(void) mdstealerror(ep, &xep);
781 				rval = -1;
782 				mde_perror(ep, dgettext(TEXT_DOMAIN,
783 				    "Unable to resume rpc.mdcommd.\n"));
784 			}
785 		}
786 		meta_ping_mnset(sp->setno);
787 	}
788 	if (lock_flag) {
789 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
790 		for (i = 0; i < node_c; i++) {
791 			if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
792 				if (rval == 0)
793 					(void) mdstealerror(ep, &xep);
794 				rval = -1;
795 			}
796 		}
797 		cl_set_setkey(NULL);
798 	}
799 
800 	if (sig_flag) {
801 		if (multi_node) {
802 			/* release signals back to what they were on entry */
803 			if (procsigs(FALSE, &oldsigs, &xep) < 0)
804 				mdclrerror(&xep);
805 		} else {
806 			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
807 		}
808 	}
809 
810 	return (rval);
811 
812 rollback:
813 	/* all signals already blocked for MN disket */
814 	if (!multi_node) {
815 		/* Make sure we are blocking all signals */
816 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
817 			mdclrerror(&xep);
818 	}
819 
820 	rval = -1;
821 
822 	/*
823 	 * For MN diskset:
824 	 * On each added node (which is now each node to be deleted),
825 	 * set the node record for that node to DEL.  Then set all
826 	 * node records for the newly added (soon to be deleted) nodes
827 	 * on all nodes to ok.
828 	 *
829 	 * By setting a node's own node record to DEL first, even if
830 	 * the node doing the rollback panics, the rest of the nodes can
831 	 * determine the same node list during the choosing of the master
832 	 * during reconfig.
833 	 */
834 
835 	/* level 3 */
836 	if ((rb_level > 1) && (multi_node)) {
837 		md_mnnode_desc	*nd, *saved_nd_next;
838 		md_set_desc	*sd;
839 
840 		if ((sd = metaget_setdesc(sp, &xep)) == NULL) {
841 			mdclrerror(&xep);
842 		}
843 
844 		for (i = 0; i < node_c; i++) {
845 			nd = sd->sd_nodelist;
846 			/* All nodes are guaranteed to be ALIVE */
847 			while (nd) {
848 				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
849 					break;
850 				nd = nd->nd_next;
851 			}
852 			/* Something wrong, will pick this up in next loop */
853 			if (nd == NULL)
854 				continue;
855 
856 			/* Only changing my local cache of node list */
857 			saved_nd_next = nd->nd_next;
858 			nd->nd_next = NULL;
859 
860 			/* Set node record for added host to DEL on that host */
861 			if (clnt_upd_nr_flags(node_v[i], sp,
862 			    nd, MD_NR_DEL, NULL, &xep)) {
863 				nd->nd_next = saved_nd_next;
864 				mdclrerror(&xep);
865 			}
866 			nd->nd_next = saved_nd_next;
867 		}
868 
869 		/* Now set all node records on all nodes to be DEL */
870 		nd = sd->sd_nodelist;
871 		/* All nodes are guaranteed to be ALIVE */
872 		while (nd) {
873 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
874 			    sd->sd_nodelist, MD_NR_DEL, NULL, &xep)) {
875 				mdclrerror(&xep);
876 			}
877 			nd = nd->nd_next;
878 		}
879 
880 		/* Mark set record on all hosts to be DELETED */
881 		for (i = 0; i < node_c; i++) {
882 			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, &xep)) {
883 				mdclrerror(&xep);
884 			}
885 		}
886 	}
887 	/* level 1 */
888 	if (rb_level > 0) {
889 		for (i = 0; i < node_c; i++) {
890 			if (clnt_delset(node_v[i], sp, &xep) == -1)
891 				mdclrerror(&xep);
892 		}
893 	}
894 
895 	/* level 0 */
896 	/* Don't test lock flag since guaranteed to be set if in rollback */
897 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
898 	for (i = 0; i < node_c; i++) {
899 		if (clnt_unlock_set(node_v[i], cl_sk, &xep))
900 			mdclrerror(&xep);
901 	}
902 	cl_set_setkey(NULL);
903 
904 	/* release signals back to what they were on entry */
905 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
906 		mdclrerror(&xep);
907 
908 	if ((sig_flag) && (!multi_node))
909 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
910 
911 	return (rval);
912 }
913 
914 static int
915 del_db_sidenms(
916 	mdsetname_t	*sp,
917 	side_t		sideno,
918 	md_error_t	*ep
919 )
920 {
921 	md_replicalist_t	*rlp = NULL;
922 	md_replicalist_t	*rl;
923 	int			rval = 0;
924 
925 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0)
926 		return (-1);
927 
928 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
929 		md_replica_t	*r = rl->rl_repp;
930 
931 		if (meta_db_delsidenm(sp, sideno, r->r_namep, r->r_blkno, ep)) {
932 			rval = -1;
933 			goto out;
934 		}
935 	}
936 
937 out:
938 	metafreereplicalist(rlp);
939 	return (rval);
940 }
941 
942 static int
943 del_drvs_from_hosts(
944 	mdsetname_t	*sp,
945 	md_set_desc	*sd,
946 	md_drive_desc	*dd,
947 	int		node_c,
948 	char		**node_v,
949 	int		oha,
950 	md_error_t	*ep
951 )
952 {
953 	int 		i;
954 	md_mnnode_desc	*nd;
955 
956 	for (i = 0; i < node_c; i++) {
957 		if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
958 			/*
959 			 * During OHA mode, don't issue RPCs to
960 			 * non-alive nodes since there is no reason to
961 			 * wait for RPC timeouts.
962 			 */
963 			nd = sd->sd_nodelist;
964 			while (nd) {
965 				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
966 					break;
967 				nd = nd->nd_next;
968 			}
969 			if (nd == NULL) {
970 				return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
971 				    sp->setno, nd->nd_nodename,
972 				    NULL, sp->setname));
973 			}
974 
975 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
976 				continue;
977 			}
978 			if (clnt_deldrvs(node_v[i], sp, dd, ep)) {
979 				return (-1);
980 			}
981 		} else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
982 			/*
983 			 * All nodes should be alive in non-oha mode.
984 			 */
985 			if (clnt_deldrvs(node_v[i], sp, dd, ep)) {
986 				return (-1);
987 			}
988 		} else {
989 			/*
990 			 * For traditional diskset, issue the RPC and
991 			 * ignore RPC failure if in OHA mode.
992 			 */
993 			if (clnt_deldrvs(node_v[i], sp, dd, ep)) {
994 				if (oha == TRUE && mdanyrpcerror(ep)) {
995 					mdclrerror(ep);
996 					continue;
997 				}
998 				return (-1);
999 			}
1000 		}
1001 	}
1002 
1003 	return (0);
1004 }
1005 
1006 static int
1007 del_host_noset(
1008 	mdsetname_t	*sp,
1009 	char		**anode,
1010 	md_error_t	*ep
1011 )
1012 {
1013 	int		rval = 0;
1014 	md_setkey_t	*cl_sk;
1015 	md_drive_desc	*dd;
1016 	md_error_t	xep = mdnullerror;
1017 	md_set_desc	*sd;
1018 
1019 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1020 		return (-1);
1021 
1022 	/* Make sure we own the set */
1023 	if (meta_check_ownership(sp, ep) != 0)
1024 		return (-1);
1025 
1026 	/* Lock the set on our side */
1027 	if (clnt_lock_set(mynode(), sp, ep)) {
1028 		rval = -1;
1029 		goto out;
1030 	}
1031 
1032 	if (clnt_delhosts(mynode(), sp, 1, anode, ep)) {
1033 		rval = -1;
1034 		goto out;
1035 	}
1036 
1037 	if (!MD_MNSET_DESC(sd)) {
1038 		if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
1039 		    ep)) == NULL) {
1040 			if (! mdisok(ep)) {
1041 				rval = -1;
1042 				goto out;
1043 			}
1044 		}
1045 
1046 		/* If we have drives */
1047 		if (dd != NULL) {
1048 			if (clnt_del_drv_sidenms(mynode(), sp, ep)) {
1049 				rval = -1;
1050 				goto out;
1051 			}
1052 		}
1053 	}
1054 
1055 out:
1056 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1057 	if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1058 		if (rval == 0)
1059 			(void) mdstealerror(ep, &xep);
1060 		rval = -1;
1061 	}
1062 	cl_set_setkey(NULL);
1063 
1064 	metaflushsetname(sp);
1065 
1066 	return (rval);
1067 }
1068 
1069 static int
1070 del_md_sidenms(mdsetname_t *sp, side_t sideno, md_error_t *ep)
1071 {
1072 	mdnm_params_t		nm;
1073 	md_set_desc		*sd;
1074 	int			i;
1075 
1076 	if (!metaislocalset(sp)) {
1077 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1078 			return (-1);
1079 	}
1080 	/* Use rpc.mdcommd to add md side info from all nodes */
1081 	if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1082 	    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1083 		md_mn_result_t			*resultp = NULL;
1084 		md_mn_msg_meta_md_delside_t	md_ds;
1085 		int				send_rval;
1086 
1087 		md_ds.msg_sideno = sideno;
1088 		/*
1089 		 * If reconfig cycle has been started, this node is stuck in
1090 		 * in the return step until this command has completed.  If
1091 		 * mdcommd is suspended, ask send_message to fail (instead of
1092 		 * retrying) so that metaset can finish allowing the
1093 		 * reconfig cycle to proceed.
1094 		 */
1095 		send_rval = mdmn_send_message(sp->setno,
1096 		    MD_MN_MSG_META_MD_DELSIDE,
1097 		    MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT,
1098 		    (char *)&md_ds, sizeof (md_mn_msg_meta_md_delside_t),
1099 		    &resultp, ep);
1100 		if (send_rval != 0) {
1101 			(void) mdstealerror(ep, &(resultp->mmr_ep));
1102 			if (resultp)
1103 				free_result(resultp);
1104 			return (-1);
1105 		}
1106 		if (resultp)
1107 			free_result(resultp);
1108 	} else {
1109 		(void) memset(&nm, '\0', sizeof (nm));
1110 		nm.key   = MD_KEYWILD;
1111 
1112 		/*CONSTCOND*/
1113 		while (1) {
1114 			nm.mde   = mdnullerror;
1115 			nm.setno = sp->setno;
1116 			nm.side  = MD_SIDEWILD;
1117 			if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0)
1118 				return (mdstealerror(ep, &nm.mde));
1119 
1120 			if (nm.key == MD_KEYWILD)
1121 				return (0);
1122 
1123 			/*
1124 			 * The device reference count can be greater than 1 if
1125 			 * more than one softpart is configured on top of the
1126 			 * same device.  If this is the case then we want to
1127 			 * decrement the count to zero so the entry can be
1128 			 * actually removed.
1129 			 */
1130 			for (i = 0; i < nm.ref_count; i++) {
1131 			    if (del_name(sp, sideno, nm.key, ep) == -1)
1132 				return (-1);
1133 			}
1134 		}
1135 	}
1136 	return (0);
1137 }
1138 
1139 static void
1140 recreate_set(
1141 	mdsetname_t		*sp,
1142 	md_set_desc		*sd
1143 )
1144 {
1145 	int			i;
1146 	int			has_set;
1147 	md_error_t		xep = mdnullerror;
1148 	md_mnnode_desc		*nd;
1149 
1150 	if (MD_MNSET_DESC(sd)) {
1151 		nd = sd->sd_nodelist;
1152 		while (nd) {
1153 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1154 				nd = nd->nd_next;
1155 				continue;
1156 			}
1157 			has_set = nodehasset(sp, nd->nd_nodename,
1158 				NHS_NST_EQ, &xep);
1159 
1160 			if (has_set >= 0) {
1161 				nd = nd->nd_next;
1162 				continue;
1163 			}
1164 
1165 			mdclrerror(&xep);
1166 
1167 			if (clnt_mncreateset(nd->nd_nodename, sp,
1168 			    sd->sd_nodelist,
1169 			    sd->sd_ctime, sd->sd_genid,
1170 			    sd->sd_mn_master_nodenm,
1171 			    sd->sd_mn_master_nodeid, &xep) == -1)
1172 				mdclrerror(&xep);
1173 			nd = nd->nd_next;
1174 		}
1175 	} else {
1176 		for (i = 0; i < MD_MAXSIDES; i++) {
1177 			/* Skip empty slots */
1178 			if (sd->sd_nodes[i][0] == '\0')
1179 				continue;
1180 
1181 			has_set = nodehasset(sp, sd->sd_nodes[i],
1182 				NHS_NST_EQ, &xep);
1183 
1184 			if (has_set >= 0)
1185 				continue;
1186 
1187 			mdclrerror(&xep);
1188 
1189 			if (clnt_createset(sd->sd_nodes[i], sp, sd->sd_nodes,
1190 			    sd->sd_ctime, sd->sd_genid, &xep) == -1)
1191 				mdclrerror(&xep);
1192 		}
1193 	}
1194 }
1195 
1196 /*
1197  * If a MN diskset, set is already locked on all nodes via clnt_lock_set.
1198  */
1199 static int
1200 del_set_nodrives(
1201 	mdsetname_t		*sp,
1202 	int			node_c,
1203 	char			**node_v,
1204 	int			oha,
1205 	md_error_t		*ep
1206 )
1207 {
1208 	md_set_desc		*sd;
1209 	int			i;
1210 	sigset_t		oldsigs;
1211 	md_setkey_t		*cl_sk;
1212 	int			rb_level = 0;
1213 	ulong_t			max_genid = 0;
1214 	int			rval = 0;
1215 	md_error_t		xep = mdnullerror;
1216 	md_mnnode_desc		*nd;
1217 	int			delete_end = 1;
1218 
1219 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1220 		return (-1);
1221 
1222 	if (MD_MNSET_DESC(sd)) {
1223 		/* Make sure we are blocking all signals */
1224 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
1225 			mdclrerror(&xep);
1226 	} else {
1227 		md_rb_sig_handling_on();
1228 	}
1229 
1230 	/*
1231 	 * Lock the set on current set members for traditional disksets.
1232 	 */
1233 	if (!(MD_MNSET_DESC(sd))) {
1234 		for (i = 0; i < node_c; i++) {
1235 			/*
1236 			 * For traditional diskset, issue the RPC and
1237 			 * ignore RPC failure if in OHA mode.
1238 			 */
1239 			if (clnt_lock_set(node_v[i], sp, ep)) {
1240 				if (oha == TRUE && mdanyrpcerror(ep)) {
1241 					mdclrerror(ep);
1242 					continue;
1243 				}
1244 				rval = -1;
1245 				goto out;
1246 			}
1247 		}
1248 	}
1249 
1250 
1251 	RB_TEST(1, "deletehosts", ep)
1252 
1253 	RB_PREEMPT;
1254 	rb_level = 1;	/* level 1 */
1255 
1256 	RB_TEST(2, "deletehosts", ep)
1257 
1258 	/*
1259 	 * Mark the set record MD_SR_DEL
1260 	 */
1261 	for (i = 0; i < node_c; i++) {
1262 
1263 		RB_TEST(3, "deletehosts", ep)
1264 
1265 		if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1266 			/*
1267 			 * During OHA mode, don't issue RPCs to
1268 			 * non-alive nodes since there is no reason to
1269 			 * wait for RPC timeouts.
1270 			 */
1271 			nd = sd->sd_nodelist;
1272 			while (nd) {
1273 				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
1274 					break;
1275 				nd = nd->nd_next;
1276 			}
1277 			if (nd == NULL) {
1278 				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1279 				    sp->setno, nd->nd_nodename,
1280 				    NULL, sp->setname);
1281 				goto rollback;
1282 			}
1283 
1284 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1285 				continue;
1286 			}
1287 
1288 			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
1289 				goto rollback;
1290 			}
1291 		} else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1292 			/*
1293 			 * All nodes should be alive in non-oha mode.
1294 			 */
1295 			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
1296 				goto rollback;
1297 			}
1298 		} else {
1299 			/*
1300 			 * For traditional diskset, issue the RPC and
1301 			 * ignore RPC failure if in OHA mode.
1302 			 */
1303 			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
1304 				if (oha == TRUE && mdanyrpcerror(ep)) {
1305 					mdclrerror(ep);
1306 					continue;
1307 				}
1308 				goto rollback;
1309 			}
1310 		}
1311 
1312 		RB_TEST(4, "deletehosts", ep)
1313 	}
1314 
1315 	RB_TEST(5, "deletehosts", ep)
1316 
1317 	RB_PREEMPT;
1318 	rb_level = 2;	/* level 2 */
1319 
1320 	RB_TEST(6, "deletehosts", ep)
1321 
1322 	if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR)
1323 		if (metad_isautotakebyname(sp->setname))
1324 			delete_end = 0;
1325 		else
1326 			goto rollback;
1327 
1328 	/* The set is OK to delete, make it so. */
1329 	for (i = 0; i < node_c; i++) {
1330 
1331 		RB_TEST(7, "deletehosts", ep)
1332 
1333 		if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1334 			/*
1335 			 * During OHA mode, don't issue RPCs to
1336 			 * non-alive nodes since there is no reason to
1337 			 * wait for RPC timeouts.
1338 			 */
1339 			nd = sd->sd_nodelist;
1340 			while (nd) {
1341 				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
1342 					break;
1343 				nd = nd->nd_next;
1344 			}
1345 			if (nd == NULL) {
1346 				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1347 				    sp->setno, nd->nd_nodename,
1348 				    NULL, sp->setname);
1349 				goto rollback;
1350 			}
1351 
1352 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1353 				continue;
1354 			}
1355 
1356 			if (clnt_delset(node_v[i], sp, ep) == -1) {
1357 				goto rollback;
1358 			}
1359 		} else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1360 			/*
1361 			 * All nodes should be alive in non-oha mode.
1362 			 */
1363 			if (clnt_delset(node_v[i], sp, ep) == -1) {
1364 				goto rollback;
1365 			}
1366 		} else {
1367 			/*
1368 			 * For traditional diskset, issue the RPC and
1369 			 * ignore RPC failure if in OHA mode.
1370 			 */
1371 			if (clnt_delset(node_v[i], sp, ep) == -1) {
1372 				if (oha == TRUE && mdanyrpcerror(ep)) {
1373 					mdclrerror(ep);
1374 					continue;
1375 				}
1376 				goto rollback;
1377 			}
1378 		}
1379 
1380 		RB_TEST(8, "deletehosts", ep)
1381 	}
1382 
1383 	RB_TEST(9, "deletehosts", ep)
1384 
1385 out:
1386 	/*
1387 	 * Unlock the set on current set members
1388 	 * for traditional disksets.
1389 	 */
1390 	if (!(MD_MNSET_DESC(sd))) {
1391 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1392 		for (i = 0; i < node_c; i++) {
1393 			/*
1394 			 * For traditional diskset, issue the RPC and
1395 			 * ignore RPC failure if in OHA mode.
1396 			 */
1397 			if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
1398 				if (oha == TRUE && mdanyrpcerror(&xep)) {
1399 					mdclrerror(&xep);
1400 					continue;
1401 				}
1402 				if (rval == 0)
1403 					(void) mdstealerror(ep, &xep);
1404 				rval = -1;
1405 			}
1406 		}
1407 		cl_set_setkey(NULL);
1408 	}
1409 
1410 	/*
1411 	 * A MN diskset has the clnt_locks held by meta_set_deletehosts so
1412 	 * don't flush that data until meta_set_deletehosts has finished
1413 	 * with it.  meta_set_deletehosts will handle the flush of the
1414 	 * setname.
1415 	 */
1416 	if (!(MD_MNSET_DESC(sd))) {
1417 		metaflushsetname(sp);
1418 	}
1419 
1420 	if (delete_end &&
1421 	    sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
1422 		rval = -1;
1423 
1424 	if (MD_MNSET_DESC(sd)) {
1425 		/* release signals back to what they were on entry */
1426 		if (procsigs(FALSE, &oldsigs, &xep) < 0)
1427 			mdclrerror(&xep);
1428 	} else {
1429 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1430 	}
1431 
1432 	return (rval);
1433 
1434 rollback:
1435 	/* all signals already blocked for MN disket */
1436 	if (!(MD_MNSET_DESC(sd))) {
1437 		/* Make sure we are blocking all signals */
1438 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
1439 			mdclrerror(&xep);
1440 	}
1441 
1442 	rval = -1;
1443 
1444 	max_genid = sd->sd_genid;
1445 
1446 	/* level 2 */
1447 	if (rb_level > 1) {
1448 		recreate_set(sp, sd);
1449 		max_genid++;
1450 
1451 		if (delete_end)
1452 			(void) sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
1453 	}
1454 
1455 	/* level 1 */
1456 	if (rb_level > 0) {
1457 		max_genid++;
1458 		resync_genid(sp, sd, max_genid, node_c, node_v);
1459 	}
1460 
1461 	/* level 0 */
1462 	/*
1463 	 * Unlock the set on current set members
1464 	 * for traditional disksets.
1465 	 */
1466 	if (!(MD_MNSET_DESC(sd))) {
1467 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1468 		for (i = 0; i < node_c; i++) {
1469 			/*
1470 			 * For traditional diskset, issue the RPC and
1471 			 * ignore RPC failure if in OHA mode.
1472 			 */
1473 			if (clnt_unlock_set(node_v[i], cl_sk, &xep))
1474 				mdclrerror(&xep);
1475 		}
1476 		cl_set_setkey(NULL);
1477 	}
1478 
1479 	/* release signals back to what they were on entry */
1480 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1481 		mdclrerror(&xep);
1482 
1483 	/*
1484 	 * A MN diskset has the clnt_locks held by meta_set_deletehosts so
1485 	 * don't flush that data until meta_set_deletehosts has finished
1486 	 * with it.  meta_set_deletehosts will handle the flush of the
1487 	 * setname.
1488 	 */
1489 	if (!(MD_MNSET_DESC(sd))) {
1490 		metaflushsetname(sp);
1491 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1492 	}
1493 
1494 	return (rval);
1495 }
1496 
1497 /*
1498  * On entry:
1499  *   procsigs already called for MN diskset.
1500  *   md_rb_sig_handling already called for traditional diskset.
1501  */
1502 static int
1503 del_set_on_hosts(
1504 	mdsetname_t		*sp,
1505 	md_set_desc		*sd,
1506 	md_drive_desc		*dd,
1507 	int			node_c,		/* Number of nodes */
1508 	char			**node_v,	/* Nodes being deleted */
1509 	int			oha,
1510 	md_error_t		*ep
1511 )
1512 {
1513 	int			i;
1514 	int			j;
1515 	side_t			sideno;
1516 	md_replicalist_t	*rlp = NULL;
1517 	sigset_t		oldsigs;
1518 	md_setkey_t		*cl_sk;
1519 	ulong_t			max_genid = 0;
1520 	int			rb_level = 1;	/* This is a special case */
1521 	md_error_t		xep = mdnullerror;
1522 	md_mnnode_desc		*nd;
1523 
1524 	RB_PREEMPT;
1525 
1526 	RB_TEST(7, "deletehosts", ep)
1527 
1528 	if (dd != NULL) {
1529 		/*
1530 		 * May need this to re-add sidenames on roll back.
1531 		 */
1532 		if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
1533 		    ep) < 0)
1534 			goto rollback;
1535 
1536 		RB_TEST(8, "deletehosts", ep)
1537 
1538 		RB_PREEMPT;
1539 		rb_level = 2;	/* level 2 */
1540 
1541 		RB_TEST(9, "deletehosts", ep)
1542 
1543 		if (del_drvs_from_hosts(sp, sd, dd, node_c, node_v, oha, ep))
1544 			goto rollback;
1545 
1546 		RB_TEST(10, "deletehosts", ep)
1547 
1548 		RB_PREEMPT;
1549 		rb_level = 3;	/* level 3 */
1550 
1551 		RB_TEST(11, "deletehosts", ep)
1552 
1553 		/*
1554 		 * Delete the db replica sides
1555 		 * This is done before the next loop, so that
1556 		 * the db does not get unloaded before we are finished
1557 		 * deleting the sides.
1558 		 */
1559 		if (MD_MNSET_DESC(sd)) {
1560 			nd = sd->sd_nodelist;
1561 			while (nd) {
1562 				/* Skip hosts not being deleted */
1563 				if (! strinlst(nd->nd_nodename, node_c,
1564 				    node_v)) {
1565 					nd = nd->nd_next;
1566 					continue;
1567 				}
1568 
1569 				if (del_db_sidenms(sp, nd->nd_nodeid, ep))
1570 					goto rollback;
1571 
1572 				RB_TEST(12, "deletehosts", ep)
1573 				nd = nd->nd_next;
1574 			}
1575 		} else {
1576 			for (sideno = 0; sideno < MD_MAXSIDES; sideno++) {
1577 				/* Skip empty slots */
1578 				if (sd->sd_nodes[sideno][0] == '\0')
1579 					continue;
1580 
1581 				/* Skip hosts not being deleted */
1582 				if (! strinlst(sd->sd_nodes[sideno], node_c,
1583 				    node_v))
1584 					continue;
1585 
1586 				if (del_db_sidenms(sp, sideno, ep))
1587 					goto rollback;
1588 
1589 				RB_TEST(12, "deletehosts", ep)
1590 			}
1591 		}
1592 
1593 		RB_TEST(13, "deletehosts", ep)
1594 
1595 		RB_PREEMPT;
1596 		rb_level = 4;	/* level 4 */
1597 
1598 		RB_TEST(14, "deletehosts", ep)
1599 
1600 		/* Delete the names from the namespace */
1601 		if (MD_MNSET_DESC(sd)) {
1602 			nd = sd->sd_nodelist;
1603 			while (nd) {
1604 				/* Skip hosts not being deleted */
1605 				if (! strinlst(nd->nd_nodename, node_c,
1606 				    node_v)) {
1607 					nd = nd->nd_next;
1608 					continue;
1609 				}
1610 
1611 				if (del_md_sidenms(sp, nd->nd_nodeid, ep))
1612 					goto rollback;
1613 
1614 				RB_TEST(15, "deletehosts", ep)
1615 				nd = nd->nd_next;
1616 			}
1617 		} else {
1618 			for (sideno = 0; sideno < MD_MAXSIDES; sideno++) {
1619 				/* Skip empty slots */
1620 				if (sd->sd_nodes[sideno][0] == '\0')
1621 					continue;
1622 
1623 				/* Skip hosts not being deleted */
1624 				if (! strinlst(sd->sd_nodes[sideno], node_c,
1625 				    node_v))
1626 					continue;
1627 
1628 				if (del_md_sidenms(sp, sideno, ep))
1629 					goto rollback;
1630 
1631 				RB_TEST(15, "deletehosts", ep)
1632 			}
1633 		}
1634 	}
1635 
1636 	RB_TEST(16, "deletehosts", ep)
1637 
1638 	RB_PREEMPT;
1639 	rb_level = 5;	/* level 6 */
1640 
1641 	RB_TEST(17, "deletehosts", ep)
1642 
1643 	for (i = 0; i < node_c; i++) {
1644 		if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1645 			/*
1646 			 * During OHA mode, don't issue RPCs to
1647 			 * non-alive nodes since there is no reason to
1648 			 * wait for RPC timeouts.
1649 			 */
1650 			nd = sd->sd_nodelist;
1651 			while (nd) {
1652 				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
1653 					break;
1654 				nd = nd->nd_next;
1655 			}
1656 			if (nd == NULL) {
1657 				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1658 				    sp->setno, nd->nd_nodename,
1659 				    NULL, sp->setname);
1660 				goto rollback;
1661 			}
1662 
1663 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1664 				continue;
1665 			}
1666 
1667 			if (clnt_delset(node_v[i], sp, ep) == -1) {
1668 				goto rollback;
1669 			}
1670 		} else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1671 			/*
1672 			 * All nodes should be alive in non-oha mode.
1673 			 */
1674 			if (clnt_delset(node_v[i], sp, ep) == -1) {
1675 				goto rollback;
1676 			}
1677 		} else {
1678 			/*
1679 			 * For traditional diskset, issue the RPC and
1680 			 * ignore RPC failure if in OHA mode.
1681 			 */
1682 			if (clnt_delset(node_v[i], sp, ep) == -1) {
1683 				if (oha == TRUE && mdanyrpcerror(ep)) {
1684 					mdclrerror(ep);
1685 					continue;
1686 				}
1687 				goto rollback;
1688 			}
1689 		}
1690 
1691 		RB_TEST(18, "deletehosts", ep)
1692 	}
1693 
1694 	metafreereplicalist(rlp);
1695 
1696 	if (MD_MNSET_DESC(sd)) {
1697 		/* release signals back to what they were on entry */
1698 		if (procsigs(FALSE, &oldsigs, &xep) < 0)
1699 			mdclrerror(&xep);
1700 	} else {
1701 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1702 	}
1703 
1704 	return (0);
1705 
1706 rollback:
1707 	/* all signals already blocked for MN disket */
1708 	if (!(MD_MNSET_DESC(sd))) {
1709 		/* Make sure we are blocking all signals */
1710 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
1711 			mdclrerror(&xep);
1712 	}
1713 
1714 	max_genid = sd->sd_genid;
1715 
1716 	/* level 5 */
1717 	if (rb_level > 4) {
1718 		recreate_set(sp, sd);
1719 		max_genid++;
1720 	}
1721 
1722 	/* level 2 */
1723 	if (rb_level > 1 && dd != NULL) {
1724 		/*
1725 		 * See if we have to re-add the drives specified.
1726 		 */
1727 		for (i = 0; i < node_c; i++) {
1728 			md_set_record	*sr;
1729 
1730 			if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1731 				/*
1732 				 * During OHA mode, don't issue RPCs to
1733 				 * non-alive nodes since there is no reason to
1734 				 * wait for RPC timeouts.
1735 				 */
1736 				nd = sd->sd_nodelist;
1737 				while (nd) {
1738 					if (strcmp(nd->nd_nodename, node_v[i])
1739 					    == 0)
1740 						break;
1741 					nd = nd->nd_next;
1742 				}
1743 				if (nd == NULL)
1744 					continue;
1745 
1746 				if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
1747 					continue;
1748 			}
1749 
1750 			/* Don't care if set record is MN or not */
1751 			if (clnt_getset(node_v[i], sp->setname,
1752 			    MD_SET_BAD, &sr, &xep) == -1) {
1753 				mdclrerror(&xep);
1754 				continue;
1755 			}
1756 
1757 			/* Drive already added, skip to next node */
1758 			if (sr->sr_drivechain != NULL) {
1759 				/*
1760 				 * Set record structure was allocated from RPC
1761 				 * routine getset so this structure is only of
1762 				 * size md_set_record even if the MN flag is
1763 				 * set.  So, clear the flag so that the free
1764 				 * code doesn't attempt to free a structure
1765 				 * the size of md_mnset_record.
1766 				 */
1767 				sr->sr_flags &= ~MD_SR_MN;
1768 				free_sr(sr);
1769 				continue;
1770 			}
1771 
1772 			if (clnt_adddrvs(node_v[i], sp, dd,
1773 			    sr->sr_ctime, sr->sr_genid, &xep) == -1)
1774 				mdclrerror(&xep);
1775 
1776 			if (clnt_upd_dr_flags(node_v[i], sp, dd,
1777 			    MD_DR_OK, &xep) == -1)
1778 				mdclrerror(&xep);
1779 
1780 			/*
1781 			 * Set record structure was allocated from RPC routine
1782 			 * getset so this structure is only of size
1783 			 * md_set_record even if the MN flag is set.  So,
1784 			 * clear the flag so that the free code doesn't
1785 			 * attempt to free a structure the size of
1786 			 * md_mnset_record.
1787 			 */
1788 			sr->sr_flags &= ~MD_SR_MN;
1789 			free_sr(sr);
1790 		}
1791 		max_genid += 3;
1792 	}
1793 
1794 	/* level 3 */
1795 	if (rb_level > 2 && dd != NULL) {
1796 		md_replicalist_t	*rl;
1797 
1798 		for (rl = rlp; rl != NULL; rl = rl->rl_next) {
1799 			md_replica_t	*r = rl->rl_repp;
1800 
1801 			/*
1802 			 * This is not the first replica being added to the
1803 			 * diskset so call with ADDSIDENMS_BCAST.  If this
1804 			 * is a traditional diskset, the bcast flag is ignored
1805 			 * since traditional disksets don't use the rpc.mdcommd.
1806 			 */
1807 			if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno,
1808 			    DB_ADDSIDENMS_BCAST, &xep))
1809 				mdclrerror(&xep);
1810 		}
1811 	}
1812 
1813 	/* level 4 */
1814 	if (rb_level > 3 && dd != NULL) {
1815 		int	nodeid_addsides = 0;
1816 		/*
1817 		 * Add the device names for the new sides into the namespace,
1818 		 * on all hosts not being deleted.
1819 		 */
1820 		if (MD_MNSET_DESC(sd)) {
1821 			nd = sd->sd_nodelist;
1822 			while (nd) {
1823 				/* Find a node that is not being deleted */
1824 				if (! strinlst(nd->nd_nodename, node_c,
1825 				    node_v)) {
1826 					nodeid_addsides = nd->nd_nodeid;
1827 					break;
1828 				}
1829 				nd = nd->nd_next;
1830 			}
1831 		} else {
1832 			for (j = 0; j < MD_MAXSIDES; j++) {
1833 				/* Skip empty slots */
1834 				if (sd->sd_nodes[j][0] == '\0')
1835 					continue;
1836 
1837 				/* Find a node that is not being deleted */
1838 				if (! strinlst(sd->sd_nodes[j], node_c,
1839 				    node_v))
1840 					break;
1841 			}
1842 			nodeid_addsides = j;
1843 		}
1844 
1845 		if (MD_MNSET_DESC(sd)) {
1846 			nd = sd->sd_nodelist;
1847 			while (nd) {
1848 				/* Skip nodes not being deleted */
1849 				if (!strinlst(nd->nd_nodename, node_c,
1850 				    node_v)) {
1851 					nd = nd->nd_next;
1852 					continue;
1853 				}
1854 
1855 				/* this side was just created, add the names */
1856 				if (add_md_sidenms(sp, nd->nd_nodeid,
1857 				    nodeid_addsides, &xep))
1858 					mdclrerror(&xep);
1859 				nd = nd->nd_next;
1860 			}
1861 		} else {
1862 			for (i = 0; i < MD_MAXSIDES; i++) {
1863 				/* Skip empty slots */
1864 				if (sd->sd_nodes[i][0] == '\0')
1865 					continue;
1866 
1867 				/* Skip nodes not being deleted */
1868 				if (!strinlst(sd->sd_nodes[i], node_c, node_v))
1869 					continue;
1870 
1871 				/* this side was just created, add the names */
1872 				if (add_md_sidenms(sp, i, nodeid_addsides,
1873 				    &xep))
1874 					mdclrerror(&xep);
1875 			}
1876 		}
1877 	}
1878 
1879 	/* level 1 */
1880 	if (rb_level > 0) {
1881 		max_genid++;
1882 		resync_genid(sp, sd, max_genid, node_c, node_v);
1883 	}
1884 
1885 	/* level 0 */
1886 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1887 	if (MD_MNSET_DESC(sd)) {
1888 		nd = sd->sd_nodelist;
1889 		while (nd) {
1890 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
1891 				continue;
1892 			/* To balance lock/unlock; can send to dead node */
1893 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
1894 				mdclrerror(&xep);
1895 			nd = nd->nd_next;
1896 		}
1897 	} else {
1898 		for (i = 0; i < MD_MAXSIDES; i++) {
1899 			/* Skip empty slots */
1900 			if (sd->sd_nodes[i][0] == '\0')
1901 				continue;
1902 
1903 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
1904 				mdclrerror(&xep);
1905 		}
1906 	}
1907 	cl_set_setkey(NULL);
1908 
1909 	/* release signals back to what they were on entry */
1910 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1911 		mdclrerror(&xep);
1912 
1913 	metafreereplicalist(rlp);
1914 
1915 	if (!(MD_MNSET_DESC(sd))) {
1916 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1917 	}
1918 
1919 	return (-1);
1920 }
1921 
1922 static int
1923 make_sideno_sidenm(
1924 	mdsetname_t	*sp,
1925 	mddrivename_t	*dnp,
1926 	side_t		sideno,
1927 	md_error_t	*ep
1928 )
1929 {
1930 	mdsidenames_t	*sn, **sn_next;
1931 	md_set_desc	*sd;
1932 	mdname_t	*np;
1933 	uint_t		rep_slice;
1934 	int		err = 0;
1935 
1936 	assert(dnp->side_names_key != MD_KEYWILD);
1937 
1938 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1939 		return (-1);
1940 
1941 	/* find the end of the link list */
1942 	for (sn = dnp->side_names; sn->next != NULL; sn = sn->next);
1943 	sn_next = &sn->next;
1944 
1945 	if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
1946 		return (-1);
1947 
1948 	if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
1949 		return (-1);
1950 
1951 	sn = Zalloc(sizeof (*sn));
1952 	sn->sideno = sideno;
1953 
1954 	if (MD_MNSET_DESC(sd)) {
1955 		/*
1956 		 * For MO diskset the sideno is not an index into
1957 		 * the array of nodes.  Hence getside_devinfo is
1958 		 * used instead of meta_getnextside_devinfo.
1959 		 */
1960 		if (meta_getside_devinfo(sp, np->bname, sideno, &sn->cname,
1961 			&sn->dname, &sn->mnum, ep) == -1)
1962 			err = -1;
1963 	} else {
1964 		/* decrement sideno, to look like the previous sideno */
1965 		sideno--;
1966 		if (meta_getnextside_devinfo(sp, np->bname, &sideno, &sn->cname,
1967 			&sn->dname, &sn->mnum, ep) == -1)
1968 			err = -1;
1969 	}
1970 
1971 	if (err) {
1972 		Free(sn);
1973 		return (err);
1974 	}
1975 	assert(sn->sideno == sideno);
1976 
1977 	/* Add to the end of the linked list */
1978 	*sn_next = sn;
1979 	return (0);
1980 }
1981 
1982 static int
1983 validate_nodes(
1984 	mdsetname_t	*sp,
1985 	int		node_c,
1986 	char		**node_v,
1987 	md_error_t	*ep
1988 )
1989 {
1990 	char		*hostname;
1991 	int		i;
1992 
1993 
1994 	for (i = 0; i < node_c; i++) {
1995 		if (strlen(node_v[i]) > (size_t)MD_MAX_NODENAME)
1996 			return (mddserror(ep, MDE_DS_NODENAMETOOLONG,
1997 			    sp->setno, node_v[i], NULL, sp->setname));
1998 		if (clnt_hostname(node_v[i], &hostname, ep))
1999 			return (-1);
2000 		if (strcmp(node_v[i], hostname) != 0) {
2001 			Free(hostname);
2002 			return (mddserror(ep, MDE_DS_NOTNODENAME, sp->setno,
2003 			    node_v[i], NULL, sp->setname));
2004 		}
2005 		Free(hostname);
2006 	}
2007 	return (0);
2008 }
2009 
2010 /*
2011  * Exported Entry Points
2012  */
2013 
2014 /*
2015  * Check the given disk set name for syntactic correctness.
2016  */
2017 int
2018 meta_set_checkname(char *setname, md_error_t *ep)
2019 {
2020 	char	*cp;
2021 
2022 	if (strlen(setname) > (size_t)MD_MAX_SETNAME)
2023 		return (mddserror(ep, MDE_DS_SETNAMETOOLONG,
2024 		    MD_SET_BAD, NULL, NULL, setname));
2025 
2026 	for (cp = setname; *cp; cp++)
2027 		if (!isprint(*cp) || strchr(INVALID_IN_NAMES, *cp) != NULL)
2028 			return (mddserror(ep, MDE_DS_INVALIDSETNAME,
2029 			    MD_SET_BAD, NULL, NULL, setname));
2030 	return (0);
2031 }
2032 
2033 /*
2034  * Add host(s) to the multi-node diskset provided in sp.
2035  * 	- create set if non-existent.
2036  */
2037 static int
2038 meta_multinode_set_addhosts(
2039 	mdsetname_t	*sp,
2040 	int		multi_node,
2041 	int		node_c,
2042 	char		**node_v,
2043 	int		auto_take,
2044 	md_error_t	*ep
2045 )
2046 {
2047 	md_set_desc			*sd;
2048 	md_drive_desc			*dd, *p;
2049 	int				rval = 0;
2050 	int				bool;
2051 	int				nodeindex;
2052 	int 				i;
2053 	int				has_set;
2054 	sigset_t			oldsigs;
2055 	md_setkey_t			*cl_sk;
2056 	int				rb_level = 0;
2057 	md_error_t			xep = mdnullerror;
2058 	md_mnnode_desc			*nd, *nd_curr, *nd_prev;
2059 	md_timeval32_t			now;
2060 	int				nodecnt;
2061 	mndiskset_membershiplist_t	*nl, *nl2;
2062 	int				suspendall_flag = 0;
2063 	int				suspend1_flag = 0;
2064 	int				lock_flag = 0;
2065 	int				stale_flag = 0;
2066 	md_mnnode_desc			*saved_nd_next;
2067 	int				remote_sets_created = 0;
2068 
2069 	/*
2070 	 * Check membershiplist first.  If there's
2071 	 * an error, fail to create set and pass back error.
2072 	 */
2073 	if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
2074 		return (-1);
2075 	}
2076 	/* Verify that all nodes are in member list */
2077 	for (i = 0; i < node_c; i++) {
2078 		/*
2079 		 * If node in list isn't a member of the membership,
2080 		 * just return error.
2081 		 */
2082 		if (meta_is_member(node_v[i], NULL, nl) == 0) {
2083 			meta_free_nodelist(nl);
2084 			return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
2085 			    sp->setno, node_v[i], NULL, sp->setname));
2086 		}
2087 	}
2088 	/*
2089 	 * Node list is needed later, but there is a lot of error
2090 	 * checking and possible failures between here and there, so
2091 	 * just re-get the list later if there are no errors.
2092 	 */
2093 	meta_free_nodelist(nl);
2094 	nl = NULL;
2095 
2096 	/*
2097 	 * Verify that list of nodes being added contains no
2098 	 * duplicates.
2099 	 */
2100 	if (nodesuniq(sp, node_c, node_v, ep))
2101 		return (-1);
2102 
2103 	/*
2104 	 * Verify that each node being added thinks that its nodename
2105 	 * is the same as the nodename given.
2106 	 */
2107 	if (validate_nodes(sp, node_c, node_v, ep))
2108 		return (-1);
2109 
2110 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2111 		if (! mdiserror(ep, MDE_NO_SET))
2112 			return (-1);
2113 		mdclrerror(ep);
2114 		return (create_set(sp, multi_node, node_c, node_v, auto_take,
2115 		    ep));
2116 	} else {
2117 		/*
2118 		 * If this node and another node were both attempting to
2119 		 * create the same setname at the same time, and the other
2120 		 * node has just created the set on this node then sd would
2121 		 * be non-NULL, but sp->setno would be null (setno is filled
2122 		 * in by the create_set). If this is true, then fail since
2123 		 * the other node has already won this race.
2124 		 */
2125 		if (sp->setno == NULL) {
2126 			return (mddserror(ep, MDE_DS_NODEINSET,
2127 			    NULL, mynode(), NULL, sp->setname));
2128 		}
2129 	}
2130 
2131 	/* The auto_take behavior is inconsistent with multiple hosts. */
2132 	if (auto_take || sd->sd_flags & MD_SR_AUTO_TAKE) {
2133 		(void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL,
2134 		    sp->setname);
2135 		return (-1);
2136 	}
2137 
2138 	/*
2139 	 * We already have the set.
2140 	 */
2141 
2142 	/* Make sure we own the set */
2143 	if (meta_check_ownership(sp, ep) != 0)
2144 		return (-1);
2145 
2146 	/*
2147 	 * The drive and node records are stored in the local mddbs of each
2148 	 * node in the diskset.  Each node's rpc.metad daemon reads in the set,
2149 	 * drive and node records from that node's local mddb and caches them
2150 	 * internally. Any process needing diskset information contacts its
2151 	 * local rpc.metad to get this information.  Since each node in the
2152 	 * diskset is independently reading the set information from its local
2153 	 * mddb, the set, drive and node records in the local mddbs must stay
2154 	 * in-sync, so that all nodes have a consistent view of the diskset.
2155 	 *
2156 	 * For a multinode diskset, explicitly verify that all nodes in the
2157 	 * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
2158 	 * fail this operation since all nodes must be ALIVE in order to add
2159 	 * the new node record to their local mddb.  If a panic of this node
2160 	 * leaves the local mddbs set, node and drive records out-of-sync, the
2161 	 * reconfig cycle will fix the local mddbs and force them back into
2162 	 * synchronization.
2163 	 */
2164 	nd = sd->sd_nodelist;
2165 	while (nd) {
2166 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2167 			return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
2168 			    sp->setno, nd->nd_nodename, NULL,
2169 			    sp->setname));
2170 		}
2171 		nd = nd->nd_next;
2172 	}
2173 
2174 	/*
2175 	 * Check if node is already in set.
2176 	 */
2177 	for (i = 0; i < node_c; i++) {
2178 		/* Is node already in set? */
2179 		nd = sd->sd_nodelist;
2180 		while (nd) {
2181 			if (strcmp(nd->nd_nodename, node_v[i]) == 0)
2182 				break;
2183 			nd = nd->nd_next;
2184 		}
2185 		if (nd) {
2186 			return (mddserror(ep, MDE_DS_NODEINSET,
2187 			    sp->setno, node_v[i], NULL,
2188 			    sp->setname));
2189 		}
2190 	}
2191 
2192 	/*
2193 	 * Lock the set on current set members.
2194 	 * Set locking done much earlier for MN diskset than for traditional
2195 	 * diskset since lock_set and SUSPEND are used to protect against
2196 	 * other meta* commands running on the other nodes.
2197 	 */
2198 	/* Make sure we are blocking all signals */
2199 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2200 		mdclrerror(&xep);
2201 
2202 	nd = sd->sd_nodelist;
2203 	/* All nodes are guaranteed to be ALIVE */
2204 	while (nd) {
2205 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2206 			rval = -1;
2207 			goto out;
2208 		}
2209 		lock_flag = 1;
2210 		nd = nd->nd_next;
2211 	}
2212 	/*
2213 	 * Lock out other meta* commands by suspending
2214 	 * class 1 messages across the diskset.
2215 	 */
2216 	nd = sd->sd_nodelist;
2217 	/* Send suspend to nodes in nodelist before addhosts call */
2218 	/* All nodes are guaranteed to be ALIVE */
2219 	while (nd) {
2220 		if (clnt_mdcommdctl(nd->nd_nodename,
2221 		    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
2222 		    MD_MSCF_NO_FLAGS, ep)) {
2223 			rval = -1;
2224 			goto out;
2225 		}
2226 		suspend1_flag = 1;
2227 		nd = nd->nd_next;
2228 	}
2229 
2230 	/* Lock the set on new set members */
2231 	for (i = 0; i < node_c; i++) {
2232 		/* Already verified to be alive */
2233 		if (clnt_lock_set(node_v[i], sp, ep)) {
2234 			rval = -1;
2235 			goto out;
2236 		}
2237 		lock_flag = 1;
2238 	}
2239 
2240 	/*
2241 	 * Perform the required checks for new hosts
2242 	 */
2243 	for (i = 0; i < node_c; i++) {
2244 		/* Make sure this set name is not used on the other hosts */
2245 		has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep);
2246 		if (has_set < 0) {
2247 			if (! mdiserror(ep, MDE_NO_SET)) {
2248 				rval = -1;
2249 				goto out;
2250 			}
2251 			/* Keep on truck'n */
2252 			mdclrerror(ep);
2253 		} else if (has_set) {
2254 			(void) mddserror(ep, MDE_DS_NODEHASSET, sp->setno,
2255 			    node_v[i], NULL, sp->setname);
2256 			rval = -1;
2257 			goto out;
2258 		}
2259 
2260 		if (clnt_setnumbusy(node_v[i], sp->setno, &bool, ep) == -1) {
2261 			rval = -1;
2262 			goto out;
2263 		}
2264 
2265 		if (bool == TRUE) {
2266 			(void) mddserror(ep, MDE_DS_SETNUMBUSY, sp->setno,
2267 			    node_v[i], NULL, sp->setname);
2268 			rval = -1;
2269 			goto out;
2270 		}
2271 
2272 		if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1) {
2273 			rval = -1;
2274 			goto out;
2275 		}
2276 
2277 		if (bool == FALSE) {
2278 			(void) mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno,
2279 			    node_v[i], NULL, sp->setname);
2280 			rval = -1;
2281 			goto out;
2282 		}
2283 
2284 		if (check_setdrvs_againstnode(sp, node_v[i], ep)) {
2285 			rval = -1;
2286 			goto out;
2287 		}
2288 	}
2289 
2290 	/* Get drive descriptors for the set */
2291 	if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL) {
2292 		if (! mdisok(ep)) {
2293 			rval = -1;
2294 			goto out;
2295 		}
2296 	}
2297 
2298 	/* END CHECK CODE */
2299 
2300 	RB_TEST(1, "addhosts", ep)
2301 
2302 	RB_PREEMPT;
2303 	rb_level = 1;	/* level 1 */
2304 
2305 	RB_TEST(2, "addhosts", ep)
2306 
2307 	/*
2308 	 * Create the set where needed
2309 	 */
2310 	if (create_set_on_hosts(sp, multi_node, node_c, node_v, 0, ep)) {
2311 		goto rollback;
2312 	}
2313 
2314 	/*
2315 	 * Send suspend to rpc.mdcommd on nodes where a set has been
2316 	 * created since rpc.mdcommd must now be running on the remote nodes.
2317 	 */
2318 	remote_sets_created = 1;
2319 	for (i = 0; i < node_c; i++) {
2320 		/*
2321 		 * Lock out other meta* commands by suspending
2322 		 * class 1 messages across the diskset.
2323 		 */
2324 		if (clnt_mdcommdctl(node_v[i],
2325 		    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
2326 		    MD_MSCF_NO_FLAGS, ep)) {
2327 			rval = -1;
2328 			goto rollback;
2329 		}
2330 	}
2331 
2332 	/*
2333 	 * Merge the new entries into the set with the existing sides.
2334 	 * Get membershiplist from API routine.  If there's
2335 	 * an error, fail to create set and pass back error.
2336 	 */
2337 	if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
2338 		goto rollback;
2339 	}
2340 	if (meta_gettimeofday(&now) == -1) {
2341 		meta_free_nodelist(nl);
2342 		(void) mdsyserror(ep, errno,
2343 		    dgettext(TEXT_DOMAIN, "meta_gettimeofday()"));
2344 		goto rollback;
2345 	}
2346 	for (nodeindex = 0; nodeindex < node_c; nodeindex++) {
2347 		nd = Zalloc(sizeof (*nd));
2348 		(void) strcpy(nd->nd_nodename, node_v[nodeindex]);
2349 		nd->nd_ctime = now;
2350 		nl2 = nl;
2351 		while (nl2) {
2352 		    if (strcmp(nl2->msl_node_name,
2353 			node_v[nodeindex]) == 0) {
2354 			    nd->nd_nodeid = nl2->msl_node_id;
2355 			    (void) strcpy(nd->nd_priv_ic,
2356 				nl2->msl_node_addr);
2357 			    break;
2358 		    }
2359 		    nl2 = nl2->next;
2360 		}
2361 
2362 		/*
2363 		 * Nodelist must be kept in ascending nodeid order.
2364 		 */
2365 		if (sd->sd_nodelist == NULL) {
2366 			/* Nothing in list, just add it */
2367 			sd->sd_nodelist = nd;
2368 		} else if (nd->nd_nodeid <
2369 		    sd->sd_nodelist->nd_nodeid) {
2370 			/* Add to head of list */
2371 			nd->nd_next = sd->sd_nodelist;
2372 			sd->sd_nodelist = nd;
2373 		} else {
2374 			nd_curr = sd->sd_nodelist->nd_next;
2375 			nd_prev = sd->sd_nodelist;
2376 			/* Search for place to add it */
2377 			while (nd_curr) {
2378 				if (nd->nd_nodeid < nd_curr->nd_nodeid) {
2379 					/* Add before nd_curr */
2380 					nd->nd_next = nd_curr;
2381 					nd_prev->nd_next = nd;
2382 					break;
2383 				}
2384 				nd_prev = nd_curr;
2385 				nd_curr = nd_curr->nd_next;
2386 			}
2387 			/* Add to end of list */
2388 			if (nd_curr == NULL) {
2389 				nd_prev->nd_next = nd;
2390 			}
2391 
2392 		}
2393 		/* Node already verified to be in membership */
2394 		nd->nd_flags |= MD_MN_NODE_ALIVE;
2395 	}
2396 	meta_free_nodelist(nl);
2397 
2398 	/* If we have drives */
2399 	if (dd != NULL) {
2400 		/*
2401 		 * For all the hosts being added, create a sidename structure
2402 		 */
2403 		nd = sd->sd_nodelist;
2404 		while (nd) {
2405 			/* Skip nodes not being added */
2406 			if (!strinlst(nd->nd_nodename, node_c, node_v)) {
2407 				nd = nd->nd_next;
2408 				continue;
2409 			}
2410 			for (p = dd; p != NULL; p = p->dd_next) {
2411 				if (make_sideno_sidenm(sp, p->dd_dnp,
2412 				    nd->nd_nodeid, ep) != 0)
2413 					goto rollback;
2414 			}
2415 			nd = nd->nd_next;
2416 		}
2417 
2418 		RB_PREEMPT;
2419 		rb_level = 2;   /* level 2 */
2420 
2421 		RB_TEST(4, "addhosts", ep)
2422 
2423 		/*
2424 		 * Add the new sidename for each drive to all the hosts
2425 		 *
2426 		 * If a multi-node diskset, each host only stores
2427 		 * the side information for itself.  So, only send
2428 		 * side information to the new hosts where each host
2429 		 * will add the appropriate side information to its
2430 		 * local mddb.
2431 		 */
2432 		nd = sd->sd_nodelist;
2433 		while (nd) {
2434 			/* Skip nodes not being added */
2435 			if (!strinlst(nd->nd_nodename, node_c,
2436 			    node_v)) {
2437 				nd = nd->nd_next;
2438 				continue;
2439 			}
2440 
2441 			/* Add side info to new hosts */
2442 			if (clnt_add_drv_sidenms(nd->nd_nodename,
2443 			    mynode(), sp, sd, node_c, node_v, ep))
2444 				goto rollback;
2445 
2446 			nd = nd->nd_next;
2447 		}
2448 
2449 		RB_TEST(5, "addhosts", ep)
2450 
2451 		RB_PREEMPT;
2452 		rb_level = 3;	/* level 3 */
2453 
2454 		RB_TEST(6, "addhosts", ep)
2455 
2456 		/*
2457 		 * Add the device names for the new sides into the namespace
2458 		 * for all hosts being added.  This is adding the side
2459 		 * names to the diskset's mddb so add sidenames for all
2460 		 * of the new hosts.
2461 		 */
2462 		nd = sd->sd_nodelist;
2463 		while (nd) {
2464 			/* Skip nodes not being added */
2465 			if (!strinlst(nd->nd_nodename, node_c, node_v)) {
2466 				nd = nd->nd_next;
2467 				continue;
2468 			}
2469 
2470 			/* this side was just created, add the names */
2471 			if (add_md_sidenms(sp, nd->nd_nodeid,
2472 			    MD_SIDEWILD, ep))
2473 				goto rollback;
2474 
2475 			nd = nd->nd_next;
2476 		}
2477 
2478 		RB_TEST(7, "addhosts", ep)
2479 
2480 		RB_PREEMPT;
2481 		rb_level = 4;   /* level 4 */
2482 
2483 		RB_TEST(8, "addhosts", ep)
2484 
2485 		if (add_db_sidenms(sp, ep))
2486 			goto rollback;
2487 
2488 	} else {
2489 		RB_PREEMPT;
2490 		rb_level = 4;
2491 	}
2492 
2493 	RB_TEST(9, "addhosts", ep)
2494 
2495 	RB_PREEMPT;
2496 	rb_level = 5;	/* level 5 */
2497 
2498 	RB_TEST(10, "addhosts", ep)
2499 
2500 	if (dd != NULL) {
2501 		/*
2502 		 * Notify rpc.mdcommd on all nodes of a nodelist change.
2503 		 * Start by suspending rpc.mdcommd (which drains it of all
2504 		 * messages), then change the nodelist followed by a reinit
2505 		 * and resume.
2506 		 */
2507 		nd = sd->sd_nodelist;
2508 		/* Send suspend_all to nodes in nodelist (existing + new) */
2509 		/* All nodes are guaranteed to be ALIVE */
2510 		while (nd) {
2511 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2512 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2513 				rval = -1;
2514 				goto rollback;
2515 			}
2516 			suspendall_flag = 1;
2517 			nd = nd->nd_next;
2518 		}
2519 	}
2520 
2521 	/* Add the node(s) to each host that is currently in the set */
2522 	nd = sd->sd_nodelist;
2523 	/* All nodes are guaranteed to be ALIVE */
2524 	while (nd) {
2525 		if (clnt_addhosts(nd->nd_nodename, sp, node_c, node_v, ep)) {
2526 			goto rollback;
2527 		}
2528 		nd = nd->nd_next;
2529 	}
2530 
2531 	RB_TEST(11, "addhosts", ep)
2532 
2533 	if (dd != NULL) {
2534 		/*
2535 		 * Mark the drives MD_DR_OK.
2536 		 */
2537 		nd = sd->sd_nodelist;
2538 		/* All nodes are guaranteed to be ALIVE */
2539 		while (nd) {
2540 			if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
2541 			    MD_DR_OK, ep) == -1)
2542 				goto rollback;
2543 			nd = nd->nd_next;
2544 		}
2545 	}
2546 
2547 	RB_TEST(12, "addhosts", ep)
2548 
2549 	RB_PREEMPT;
2550 	rb_level = 6;   /* level 6 */
2551 
2552 	RB_TEST(13, "addhosts", ep)
2553 
2554 
2555 	/* Add the mediator information to all hosts in the set. */
2556 	nd = sd->sd_nodelist;
2557 	/* All nodes are guaranteed to be ALIVE */
2558 	while (nd) {
2559 		if (clnt_updmeds(nd->nd_nodename, sp, &sd->sd_med, ep))
2560 			goto rollback;
2561 		nd = nd->nd_next;
2562 	}
2563 
2564 	RB_TEST(14, "addhosts", ep)
2565 
2566 	/*
2567 	 * If a MN diskset and there are drives in the set,
2568 	 * set the master on the new nodes and
2569 	 * automatically join the new nodes into the set.
2570 	 */
2571 	if (dd != NULL) {
2572 		mddb_config_t   c;
2573 		/*
2574 		 * Is current set STALE?
2575 		 */
2576 		(void) memset(&c, 0, sizeof (c));
2577 		c.c_id = 0;
2578 		c.c_setno = sp->setno;
2579 		if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
2580 			(void) mdstealerror(ep, &c.c_mde);
2581 			rval = -1;
2582 			goto out;
2583 		}
2584 		if (c.c_flags & MDDB_C_STALE) {
2585 			stale_flag = MNSET_IS_STALE;
2586 		}
2587 
2588 		/* Set master on newly added nodes */
2589 		for (i = 0; i < node_c; i++) {
2590 			if (clnt_mnsetmaster(node_v[i], sp,
2591 			    sd->sd_mn_master_nodenm,
2592 			    sd->sd_mn_master_nodeid, ep)) {
2593 				goto rollback;
2594 			}
2595 		}
2596 		/* Join newly added nodes to diskset and set OWN flag */
2597 		for (i = 0; i < node_c; i++) {
2598 			if (clnt_joinset(node_v[i], sp, stale_flag, ep))
2599 				goto rollback;
2600 			nd = sd->sd_nodelist;
2601 			while (nd) {
2602 				if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
2603 					nd->nd_flags |= MD_MN_NODE_OWN;
2604 					/*
2605 					 * Also set ADD flag since this flag
2606 					 * is already set in rpc.metad - it's
2607 					 * just not in the local copy.
2608 					 * Could flush local cache and call
2609 					 * metaget_setdesc, but this just
2610 					 * adds time.  Since this node knows
2611 					 * the state of the node flags in
2612 					 * rpc.metad, just set the ADD
2613 					 * flag and save time.
2614 					 */
2615 					nd->nd_flags |= MD_MN_NODE_ADD;
2616 					break;
2617 				}
2618 				nd = nd->nd_next;
2619 			}
2620 		}
2621 
2622 		/* Send new node flag list to all Owner nodes */
2623 		nd = sd->sd_nodelist;
2624 		while (nd) {
2625 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2626 				nd = nd->nd_next;
2627 				continue;
2628 			}
2629 			/*
2630 			 * Will effectively set OWN flag in records kept
2631 			 * cached in rpc.metad.  The ADD flag would have
2632 			 * already been set by the call to clnt_addhosts.
2633 			 */
2634 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2635 			    sd->sd_nodelist, MD_NR_SET, NULL, ep)) {
2636 				goto rollback;
2637 			}
2638 			nd = nd->nd_next;
2639 		}
2640 	}
2641 
2642 	/*
2643 	 * Mark the set record MD_SR_OK
2644 	 */
2645 	nd = sd->sd_nodelist;
2646 	/* All nodes are guaranteed to be ALIVE */
2647 	while (nd) {
2648 		if (clnt_upd_sr_flags(nd->nd_nodename, sp, MD_SR_OK,
2649 		    ep)) {
2650 			goto rollback;
2651 		}
2652 		nd = nd->nd_next;
2653 	}
2654 
2655 	/*
2656 	 * For MN diskset:
2657 	 * On each newly added node, set the node record for that node
2658 	 * to OK.  Then set all node records for the newly added
2659 	 * nodes on all nodes to ok.
2660 	 *
2661 	 * By setting a node's own node record to ok first, even if
2662 	 * the node adding the hosts panics, the rest of the nodes can
2663 	 * determine the same node list during the choosing of the master
2664 	 * during reconfig.  So, only nodes considered for mastership
2665 	 * are nodes that have both MD_MN_NODE_OK and MD_SR_OK set
2666 	 * on that node's rpc.metad.  If all nodes have MD_SR_OK set,
2667 	 * but no node has its own MD_MN_NODE_OK set, then the set will
2668 	 * be removed during reconfig since a panic occurred during the
2669 	 * creation of the initial diskset.
2670 	 */
2671 
2672 	for (i = 0; i < node_c; i++) {
2673 		nd = sd->sd_nodelist;
2674 		/* All nodes are guaranteed to be ALIVE */
2675 		while (nd) {
2676 			if (strcmp(nd->nd_nodename, node_v[i]) == 0)
2677 				break;
2678 			nd = nd->nd_next;
2679 		}
2680 		/* Something wrong, will pick this up in next loop */
2681 		if (nd == NULL)
2682 			continue;
2683 
2684 		/* Only changing my local cache of node list */
2685 		saved_nd_next = nd->nd_next;
2686 		nd->nd_next = NULL;
2687 
2688 		/* Set node record for added host to ok on that host */
2689 		if (clnt_upd_nr_flags(node_v[i], sp,
2690 		    nd, MD_NR_OK, NULL, ep)) {
2691 			nd->nd_next = saved_nd_next;
2692 			goto rollback;
2693 		}
2694 		nd->nd_next = saved_nd_next;
2695 	}
2696 
2697 	/* Now set all node records on all nodes to be ok */
2698 	nd = sd->sd_nodelist;
2699 	/* All nodes are guaranteed to be ALIVE */
2700 	while (nd) {
2701 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2702 		    sd->sd_nodelist, MD_NR_OK, NULL, ep)) {
2703 			goto rollback;
2704 		}
2705 		nd = nd->nd_next;
2706 	}
2707 
2708 	RB_TEST(15, "addhosts", ep)
2709 out:
2710 	/*
2711 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2712 	 * Send reinit command to mdcommd which forces it to get
2713 	 * fresh set description.  Then send resume.
2714 	 * Resume on class 0 will resume all classes, so can skip
2715 	 * doing an explicit resume of class1 (ignore suspend1_flag).
2716 	 */
2717 	if (suspendall_flag) {
2718 		/*
2719 		 * Don't know if nodelist contains the nodes being added
2720 		 * or not, so do reinit to nodes not being added (by skipping
2721 		 * any nodes in the nodelist being added) and then do
2722 		 * reinit to nodes being added if remote_sets_created is 1.
2723 		 */
2724 		nd = sd->sd_nodelist;
2725 		/* All nodes are guaranteed to be ALIVE */
2726 		while (nd) {
2727 			/* Skip nodes being added - handled later */
2728 			if (strinlst(nd->nd_nodename, node_c, node_v)) {
2729 				nd = nd->nd_next;
2730 				continue;
2731 			}
2732 			/* Class is ignored for REINIT */
2733 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2734 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2735 				if (rval == 0)
2736 					(void) mdstealerror(ep, &xep);
2737 				rval = -1;
2738 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2739 				    "Unable to reinit rpc.mdcommd.\n"));
2740 			}
2741 			nd = nd->nd_next;
2742 		}
2743 		/*
2744 		 * Send reinit to added nodes that had a set created since
2745 		 * rpc.mdcommd is running on the nodes with a set.
2746 		 */
2747 		if (remote_sets_created == 1) {
2748 		    for (i = 0; i < node_c; i++) {
2749 			if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT,
2750 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2751 				if (rval == 0)
2752 					(void) mdstealerror(ep, &xep);
2753 				rval = -1;
2754 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2755 				    "Unable to reinit rpc.mdcommd.\n"));
2756 			}
2757 		    }
2758 		}
2759 	}
2760 	if ((suspend1_flag) || (suspendall_flag)) {
2761 		/*
2762 		 * Unlock diskset by resuming messages across the diskset.
2763 		 * Just resume all classes so that resume is the same whether
2764 		 * just one class was locked or all classes were locked.
2765 		 *
2766 		 * Don't know if nodelist contains the nodes being added
2767 		 * or not, so do resume_all to nodes not being added (by
2768 		 * skipping any nodes in the nodelist being added) and then do
2769 		 * resume_all to nodes being added if remote_sets_created is 1.
2770 		 */
2771 		nd = sd->sd_nodelist;
2772 		/* All nodes are guaranteed to be ALIVE */
2773 		while (nd) {
2774 			/* Skip nodes being added - handled later */
2775 			if (strinlst(nd->nd_nodename, node_c, node_v)) {
2776 				nd = nd->nd_next;
2777 				continue;
2778 			}
2779 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2780 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2781 				if (rval == 0)
2782 					(void) mdstealerror(ep, &xep);
2783 				rval = -1;
2784 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2785 				    "Unable to resume rpc.mdcommd.\n"));
2786 			}
2787 			nd = nd->nd_next;
2788 		}
2789 		/*
2790 		 * Send resume to added nodes that had a set created since
2791 		 * rpc.mdcommd is running on the nodes with a set.
2792 		 */
2793 		if (remote_sets_created == 1) {
2794 		    for (i = 0; i < node_c; i++) {
2795 			/* Already verified to be alive */
2796 			if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME,
2797 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2798 				if (rval == 0)
2799 					(void) mdstealerror(ep, &xep);
2800 				rval = -1;
2801 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2802 				    "Unable to resume rpc.mdcommd.\n"));
2803 			}
2804 		    }
2805 		}
2806 		meta_ping_mnset(sp->setno);
2807 		/*
2808 		 * Start a resync thread on the newly added nodes
2809 		 * if set is not stale. Also start a thread to update the
2810 		 * abr state of all soft partitions
2811 		 */
2812 		if (stale_flag != MNSET_IS_STALE) {
2813 			for (i = 0; i < node_c; i++) {
2814 				if (clnt_mn_mirror_resync_all(node_v[i],
2815 				    sp->setno, &xep)) {
2816 					if (rval == 0)
2817 						(void) mdstealerror(ep, &xep);
2818 					rval = -1;
2819 					mde_perror(ep, dgettext(TEXT_DOMAIN,
2820 					    "Unable to start resync "
2821 					    "thread.\n"));
2822 				}
2823 				if (clnt_mn_sp_update_abr(node_v[i],
2824 				    sp->setno, &xep)) {
2825 					if (rval == 0)
2826 						(void) mdstealerror(ep, &xep);
2827 					rval = -1;
2828 					mde_perror(ep, dgettext(TEXT_DOMAIN,
2829 					    "Unable to start sp update "
2830 					    "thread.\n"));
2831 				}
2832 			}
2833 		}
2834 	}
2835 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
2836 	/*
2837 	 * Don't know if nodelist contains the nodes being added
2838 	 * or not, so do clnt_unlock_set to nodes not being added (by
2839 	 * skipping any nodes in the nodelist being added) and then do
2840 	 * clnt_unlock_set to nodes being added.
2841 	 */
2842 	if (lock_flag) {
2843 		nd = sd->sd_nodelist;
2844 		/* All nodes are guaranteed to be ALIVE */
2845 		while (nd) {
2846 			/* Skip hosts we get in the next loop */
2847 			if (strinlst(nd->nd_nodename, node_c, node_v)) {
2848 				nd = nd->nd_next;
2849 				continue;
2850 			}
2851 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2852 				if (rval == 0)
2853 					(void) mdstealerror(ep, &xep);
2854 				rval = -1;
2855 			}
2856 			nd = nd->nd_next;
2857 		}
2858 		for (i = 0; i < node_c; i++) {
2859 			/* Already verified to be alive */
2860 			if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
2861 				if (rval == 0)
2862 					(void) mdstealerror(ep, &xep);
2863 				rval = -1;
2864 			}
2865 		}
2866 	}
2867 	cl_set_setkey(NULL);
2868 
2869 	metaflushsetname(sp);
2870 
2871 	/* release signals back to what they were on entry */
2872 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
2873 		mdclrerror(&xep);
2874 
2875 	return (rval);
2876 
2877 rollback:
2878 	rval = -1;
2879 
2880 	/* level 6 */
2881 	if (rb_level > 5) {
2882 		/*
2883 		 * For each node being deleted, set DEL flag and
2884 		 * reset OK flag on that node first.
2885 		 * Until a node has turned off its own
2886 		 * rpc.metad's NODE_OK flag, that node could be
2887 		 * considered for master during a reconfig.
2888 		 */
2889 		for (i = 0; i < node_c; i++) {
2890 			nd = sd->sd_nodelist;
2891 			/* All nodes are guaranteed to be ALIVE */
2892 			while (nd) {
2893 				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
2894 					break;
2895 				nd = nd->nd_next;
2896 			}
2897 			/* Something wrong, handle this in next loop */
2898 			if (nd == NULL)
2899 				continue;
2900 
2901 			/* Only changing my local cache of node list */
2902 			saved_nd_next = nd->nd_next;
2903 			nd->nd_next = NULL;
2904 
2905 			/* Set flags for del host to DEL on that host */
2906 			if (clnt_upd_nr_flags(node_v[i], sp,
2907 			    nd, MD_NR_DEL, NULL, &xep)) {
2908 				mdclrerror(&xep);
2909 			}
2910 			nd->nd_next = saved_nd_next;
2911 		}
2912 
2913 		for (i = 0; i < node_c; i++) {
2914 			if (dd != NULL) {
2915 				/* Reset master on newly added node */
2916 				if (clnt_mnsetmaster(node_v[i], sp, "",
2917 				    MD_MN_INVALID_NID, &xep))
2918 					mdclrerror(&xep);
2919 				/* Withdraw set on newly added node */
2920 				if (clnt_withdrawset(node_v[i], sp, &xep))
2921 					mdclrerror(&xep);
2922 			}
2923 			/*
2924 			 * Turn off owner flag in nodes to be deleted
2925 			 * if there are drives in the set.
2926 			 * Also, turn off NODE_OK and turn on NODE_DEL
2927 			 * for nodes to be deleted.
2928 			 * These flags are used to set the node
2929 			 * record flags in all nodes in the set.
2930 			 */
2931 			nd = sd->sd_nodelist;
2932 			while (nd) {
2933 				if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
2934 					if (dd != NULL) {
2935 						nd->nd_flags &= ~MD_MN_NODE_OWN;
2936 					}
2937 					nd->nd_flags |= MD_MN_NODE_DEL;
2938 					nd->nd_flags &= ~MD_MN_NODE_OK;
2939 					break;
2940 				}
2941 				nd = nd->nd_next;
2942 			}
2943 		}
2944 
2945 		/*
2946 		 * Now, reset owner and set delete flags for the deleted
2947 		 * nodes on all nodes.
2948 		 */
2949 		nd = sd->sd_nodelist;
2950 		while (nd) {
2951 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2952 			    sd->sd_nodelist, MD_NR_SET, NULL, &xep)) {
2953 				mdclrerror(&xep);
2954 			}
2955 			nd = nd->nd_next;
2956 		}
2957 
2958 		/*
2959 		 * On each node being deleted, set the set record
2960 		 * to be in DEL state.
2961 		 */
2962 		for (i = 0; i < node_c; i++) {
2963 			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, &xep)) {
2964 				mdclrerror(&xep);
2965 			}
2966 		}
2967 	}
2968 
2969 	/* level 5 */
2970 	if (rb_level > 4) {
2971 		nd = sd->sd_nodelist;
2972 		/* All nodes are guaranteed to be ALIVE */
2973 		while (nd) {
2974 			if (clnt_delhosts(nd->nd_nodename, sp, node_c,
2975 			    node_v, &xep) == -1)
2976 				mdclrerror(&xep);
2977 			nd = nd->nd_next;
2978 		}
2979 	}
2980 
2981 	/*
2982 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2983 	 * Send reinit command to mdcommd which forces it to get
2984 	 * fresh set description.  Then send resume.
2985 	 * Nodelist contains all nodes (existing + added).
2986 	 */
2987 	if (suspendall_flag) {
2988 		/* Send reinit */
2989 		nd = sd->sd_nodelist;
2990 		/* All nodes are guaranteed to be ALIVE */
2991 		/* Send reinit to nodes in nodelist before addhosts call */
2992 		while (nd) {
2993 			/*
2994 			 * Skip nodes being added if remote sets were not
2995 			 * created since rpc.mdcommd may not be running
2996 			 * on the remote nodes.
2997 			 */
2998 			if ((remote_sets_created == 0) &&
2999 			    (strinlst(nd->nd_nodename, node_c, node_v))) {
3000 				nd = nd->nd_next;
3001 				continue;
3002 			}
3003 			/* Class is ignored for REINIT */
3004 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
3005 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
3006 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
3007 				    "Unable to reinit rpc.mdcommd.\n"));
3008 				mdclrerror(&xep);
3009 			}
3010 			nd = nd->nd_next;
3011 		}
3012 
3013 		/* Send resume */
3014 		nd = sd->sd_nodelist;
3015 		/* All nodes are guaranteed to be ALIVE */
3016 		while (nd) {
3017 			/*
3018 			 * Skip nodes being added if remote sets were not
3019 			 * created since rpc.mdcommd may not be running
3020 			 * on the remote nodes.
3021 			 */
3022 			if ((remote_sets_created == 0) &&
3023 			    (strinlst(nd->nd_nodename, node_c, node_v))) {
3024 				nd = nd->nd_next;
3025 				continue;
3026 			}
3027 			/*
3028 			 * Resume all classes but class 1 so that lock is held
3029 			 * against meta* commands.
3030 			 * Send resume_all_but_1 to nodes in nodelist
3031 			 * before addhosts call.
3032 			 */
3033 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
3034 			    sp, MD_MSG_CLASS0, MD_MSCF_DONT_RESUME_CLASS1,
3035 			    &xep)) {
3036 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
3037 				    "Unable to resume rpc.mdcommd.\n"));
3038 				mdclrerror(&xep);
3039 			}
3040 			nd = nd->nd_next;
3041 		}
3042 		meta_ping_mnset(sp->setno);
3043 	}
3044 
3045 	/* level 4 */
3046 	/* Nodelist may or may not contain nodes being added. */
3047 	if (rb_level > 3 && dd != NULL) {
3048 		nd = sd->sd_nodelist;
3049 		while (nd) {
3050 			/* Skip nodes not being added */
3051 			if (!strinlst(nd->nd_nodename, node_c, node_v)) {
3052 				nd = nd->nd_next;
3053 				continue;
3054 			}
3055 
3056 			if (del_db_sidenms(sp, nd->nd_nodeid, &xep))
3057 				mdclrerror(&xep);
3058 			nd = nd->nd_next;
3059 		}
3060 	}
3061 
3062 	/* level 3 */
3063 	/* Nodelist may or may not contain nodes being added. */
3064 	if (rb_level > 2 && dd != NULL) {
3065 		nd = sd->sd_nodelist;
3066 		while (nd) {
3067 			/* Skip nodes not being added */
3068 			if (!strinlst(nd->nd_nodename, node_c, node_v)) {
3069 				nd = nd->nd_next;
3070 				continue;
3071 			}
3072 
3073 			if (del_md_sidenms(sp, nd->nd_nodeid, &xep))
3074 				mdclrerror(&xep);
3075 			nd = nd->nd_next;
3076 		}
3077 	}
3078 
3079 	/* level 1 */
3080 	if (rb_level > 0) {
3081 		if (dd != NULL) {
3082 			/* delete the drive records */
3083 			for (i = 0; i < node_c; i++) {
3084 				if (clnt_deldrvs(node_v[i], sp, dd, &xep) == -1)
3085 					mdclrerror(&xep);
3086 			}
3087 		}
3088 
3089 		/* delete the set record */
3090 		for (i = 0; i < node_c; i++) {
3091 			if (clnt_delset(node_v[i], sp, &xep) == -1)
3092 				mdclrerror(&xep);
3093 		}
3094 	}
3095 
3096 	/* level 0 */
3097 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3098 	/* Don't test lock flag since guaranteed to be set if in rollback */
3099 	/* Nodelist may or may not contain nodes being added. */
3100 	/*
3101 	 * Unlock diskset by resuming messages across the diskset.
3102 	 * Just resume all classes so that resume is the same whether
3103 	 * just one class was locked or all classes were locked.
3104 	 */
3105 	if ((suspend1_flag) || (suspendall_flag)) {
3106 		/* All nodes are guaranteed to be ALIVE */
3107 		nd = sd->sd_nodelist;
3108 		while (nd) {
3109 			/*
3110 			 * Skip nodes being added since remote sets
3111 			 * were either created and then deleted or
3112 			 * were never created.  Either way - rpc.mdcommd
3113 			 * may not be running on the remote node.
3114 			 */
3115 			if (strinlst(nd->nd_nodename, node_c, node_v)) {
3116 				nd = nd->nd_next;
3117 				continue;
3118 			}
3119 			if (clnt_mdcommdctl(nd->nd_nodename,
3120 			    COMMDCTL_RESUME, sp, MD_MSG_CLASS0,
3121 			    MD_MSCF_NO_FLAGS, &xep)) {
3122 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
3123 				    "Unable to resume rpc.mdcommd.\n"));
3124 				mdclrerror(&xep);
3125 			}
3126 			nd = nd->nd_next;
3127 		}
3128 		meta_ping_mnset(sp->setno);
3129 	}
3130 	nd = sd->sd_nodelist;
3131 	/* All nodes are guaranteed to be ALIVE */
3132 	while (nd) {
3133 		/* Skip hosts we get in the next loop */
3134 		if (strinlst(nd->nd_nodename, node_c, node_v)) {
3135 			nd = nd->nd_next;
3136 			continue;
3137 		}
3138 
3139 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
3140 			mdclrerror(&xep);
3141 		nd = nd->nd_next;
3142 	}
3143 
3144 	for (i = 0; i < node_c; i++)
3145 		if (clnt_unlock_set(node_v[i], cl_sk, &xep))
3146 			mdclrerror(&xep);
3147 	cl_set_setkey(NULL);
3148 
3149 	/* release signals back to what they were on entry */
3150 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
3151 		mdclrerror(&xep);
3152 
3153 	metaflushsetname(sp);
3154 
3155 	return (rval);
3156 }
3157 
3158 /*
3159  * Add host(s) to the traditional diskset provided in sp.
3160  *	- create set if non-existent.
3161  */
3162 static int
3163 meta_traditional_set_addhosts(
3164 	mdsetname_t	*sp,
3165 	int		multi_node,
3166 	int		node_c,
3167 	char		**node_v,
3168 	int		auto_take,
3169 	md_error_t	*ep
3170 )
3171 {
3172 	md_set_desc	*sd;
3173 	md_drive_desc	*dd, *p;
3174 	med_rec_t	medr;
3175 	med_rec_t	rb_medr;
3176 	int		rval = 0;
3177 	int		bool;
3178 	int		nodeindex;
3179 	int 		i;
3180 	int		has_set;
3181 	int		numsides;
3182 	sigset_t	oldsigs;
3183 	md_setkey_t	*cl_sk;
3184 	int		rb_level = 0;
3185 	md_error_t	xep = mdnullerror;
3186 	int		max_meds;
3187 
3188 	if (nodesuniq(sp, node_c, node_v, ep))
3189 		return (-1);
3190 
3191 	if (validate_nodes(sp, node_c, node_v, ep))
3192 		return (-1);
3193 
3194 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
3195 		if (! mdiserror(ep, MDE_NO_SET))
3196 			return (-1);
3197 		mdclrerror(ep);
3198 		return (create_set(sp, multi_node, node_c, node_v, auto_take,
3199 		    ep));
3200 	}
3201 
3202 	/* The auto_take behavior is inconsistent with multiple hosts. */
3203 	if (auto_take || sd->sd_flags & MD_SR_AUTO_TAKE) {
3204 		(void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL,
3205 		    sp->setname);
3206 		return (-1);
3207 	}
3208 
3209 	/*
3210 	 * We already have the set.
3211 	 */
3212 
3213 	/* Make sure we own the set */
3214 	if (meta_check_ownership(sp, ep) != 0)
3215 		return (-1);
3216 
3217 	/*
3218 	 * Perform the required checks for new hosts
3219 	 */
3220 	for (i = 0; i < node_c; i++) {
3221 		if (getnodeside(node_v[i], sd) != MD_SIDEWILD)
3222 			return (mddserror(ep, MDE_DS_NODEINSET, sp->setno,
3223 			    node_v[i], NULL, sp->setname));
3224 
3225 		/* Make sure this set name is not used on the other hosts */
3226 		has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep);
3227 		if (has_set < 0) {
3228 			if (! mdiserror(ep, MDE_NO_SET))
3229 				return (-1);
3230 			/* Keep on truck'n */
3231 			mdclrerror(ep);
3232 		} else if (has_set)
3233 			return (mddserror(ep, MDE_DS_NODEHASSET, sp->setno,
3234 			    node_v[i], NULL, sp->setname));
3235 
3236 		if (clnt_setnumbusy(node_v[i], sp->setno, &bool, ep) == -1)
3237 			return (-1);
3238 
3239 		if (bool == TRUE)
3240 			return (mddserror(ep, MDE_DS_SETNUMBUSY, sp->setno,
3241 			    node_v[i], NULL, sp->setname));
3242 
3243 		if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1)
3244 			return (-1);
3245 
3246 		if (bool == FALSE)
3247 			return (mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno,
3248 			    node_v[i], NULL, sp->setname));
3249 
3250 		if (check_setdrvs_againstnode(sp, node_v[i], ep))
3251 			return (-1);
3252 	}
3253 
3254 	/* Count the number of occupied slots */
3255 	numsides = 0;
3256 	for (i = 0; i < MD_MAXSIDES; i++) {
3257 		/* Count occupied slots */
3258 		if (sd->sd_nodes[i][0] != '\0')
3259 			numsides++;
3260 	}
3261 
3262 	/* Make sure that we have space to add the new sides */
3263 	if ((numsides + node_c) > MD_MAXSIDES) {
3264 		(void) mddserror(ep, MDE_DS_SIDENUMNOTAVAIL, sp->setno, NULL,
3265 		    NULL, sp->setname);
3266 		return (-1);
3267 	}
3268 
3269 	/* Get drive descriptors for the set */
3270 	if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL)
3271 		if (! mdisok(ep))
3272 			return (-1);
3273 
3274 	/* Setup the mediator record roll-back structure */
3275 	(void) memset(&rb_medr, '\0', sizeof (med_rec_t));
3276 	rb_medr.med_rec_mag = MED_REC_MAGIC;
3277 	rb_medr.med_rec_rev = MED_REC_REV;
3278 	rb_medr.med_rec_fl  = 0;
3279 	rb_medr.med_rec_sn  = sp->setno;
3280 	(void) strcpy(rb_medr.med_rec_snm, sp->setname);
3281 	for (i = 0; i < MD_MAXSIDES; i++)
3282 		(void) strcpy(rb_medr.med_rec_nodes[i], sd->sd_nodes[i]);
3283 	rb_medr.med_rec_meds = sd->sd_med;	/* structure assigment */
3284 	(void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
3285 	rb_medr.med_rec_foff = 0;
3286 	crcgen(&rb_medr, &rb_medr.med_rec_cks, sizeof (med_rec_t), NULL);
3287 
3288 	if ((max_meds = get_max_meds(ep)) == 0)
3289 		return (-1);
3290 
3291 	/* END CHECK CODE */
3292 
3293 	md_rb_sig_handling_on();
3294 
3295 	/* Lock the set on current set members */
3296 	for (i = 0; i < MD_MAXSIDES; i++) {
3297 		/* Skip empty slots */
3298 		if (sd->sd_nodes[i][0] == '\0')
3299 			continue;
3300 
3301 		if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
3302 			rval = -1;
3303 			goto out;
3304 		}
3305 	}
3306 
3307 	/* Lock the set on new set members */
3308 	for (i = 0; i < node_c; i++) {
3309 		if (clnt_lock_set(node_v[i], sp, ep)) {
3310 			rval = -1;
3311 			goto out;
3312 		}
3313 	}
3314 
3315 	RB_TEST(1, "addhosts", ep)
3316 
3317 	RB_PREEMPT;
3318 	rb_level = 1;	/* level 1 */
3319 
3320 	RB_TEST(2, "addhosts", ep)
3321 
3322 	/*
3323 	 * Add the new hosts to the existing set record on the existing hosts
3324 	 */
3325 	for (i = 0; i < MD_MAXSIDES; i++) {
3326 		/* skip empty slots */
3327 		if (sd->sd_nodes[i][0] == '\0')
3328 			continue;
3329 
3330 		if (clnt_addhosts(sd->sd_nodes[i], sp, node_c, node_v, ep))
3331 			goto rollback;
3332 	}
3333 
3334 	RB_PREEMPT;
3335 	rb_level = 2;	/* level 2 */
3336 
3337 	RB_TEST(3, "addhosts", ep);
3338 
3339 	/* Merge the new entries into the set with the existing sides */
3340 	nodeindex = 0;
3341 	for (i = 0; i < MD_MAXSIDES; i++) {
3342 		/* Skip full slots */
3343 		if (sd->sd_nodes[i][0] != '\0')
3344 			continue;
3345 
3346 		(void) strcpy(sd->sd_nodes[i], node_v[nodeindex++]);
3347 		if (nodeindex == node_c)
3348 			break;
3349 	}
3350 
3351 	/* If we have drives */
3352 	if (dd != NULL) {
3353 		/*
3354 		 * For all the hosts being added, create a sidename structure
3355 		 */
3356 		for (i = 0; i < MD_MAXSIDES; i++) {
3357 			/* Skip empty slots */
3358 			if (sd->sd_nodes[i][0] == '\0')
3359 				continue;
3360 
3361 			/* Skip nodes not being added */
3362 			if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3363 				continue;
3364 
3365 			for (p = dd; p != NULL; p = p->dd_next) {
3366 				if (make_sideno_sidenm(sp, p->dd_dnp, i,
3367 				    ep) != 0)
3368 					goto rollback;
3369 			}
3370 		}
3371 
3372 		/*
3373 		 * Add the new sidename for each drive to the existing hosts
3374 		 */
3375 		for (i = 0; i < MD_MAXSIDES; i++) {
3376 			/* Skip empty slots */
3377 			if (sd->sd_nodes[i][0] == '\0')
3378 				continue;
3379 
3380 			/* Skip nodes being added */
3381 			if (strinlst(sd->sd_nodes[i], node_c, node_v))
3382 				continue;
3383 
3384 			if (clnt_add_drv_sidenms(sd->sd_nodes[i], mynode(), sp,
3385 			    sd, node_c, node_v, ep)) {
3386 				goto rollback;
3387 			}
3388 		}
3389 
3390 		RB_TEST(4, "addhosts", ep)
3391 
3392 		RB_PREEMPT;
3393 		rb_level = 3;	/* level 3 */
3394 
3395 		RB_TEST(5, "addhosts", ep)
3396 
3397 		if (add_db_sidenms(sp, ep)) {
3398 			goto rollback;
3399 		}
3400 
3401 	} else {
3402 		RB_PREEMPT;
3403 		rb_level = 3;
3404 	}
3405 
3406 	RB_TEST(6, "addhosts", ep)
3407 
3408 	RB_PREEMPT;
3409 	rb_level = 4;	/* level 4 */
3410 
3411 	RB_TEST(7, "addhosts", ep)
3412 
3413 
3414 	/* create the set on the new nodes, this adds the drives as well */
3415 	if (create_set_on_hosts(sp, multi_node, node_c, node_v, 0, ep)) {
3416 		goto rollback;
3417 	}
3418 
3419 	RB_TEST(8, "addhosts", ep)
3420 
3421 	RB_PREEMPT;
3422 	rb_level = 5;	/* level 5 */
3423 
3424 	RB_TEST(9, "addhosts", ep)
3425 
3426 	if (dd != NULL) {
3427 
3428 		/*
3429 		 * Add the device entries for the new sides into the namespace.
3430 		 */
3431 		for (i = 0; i < MD_MAXSIDES; i++) {
3432 			/* Skip empty slots */
3433 			if (sd->sd_nodes[i][0] == '\0')
3434 				continue;
3435 
3436 			/* Skip nodes not being added */
3437 			if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3438 				continue;
3439 
3440 			if (add_md_sidenms(sp, i, MD_SIDEWILD, ep))
3441 				goto rollback;
3442 		}
3443 	}
3444 
3445 	RB_TEST(10, "addhosts", ep)
3446 
3447 	RB_PREEMPT;
3448 	rb_level = 6;	/* level 6 */
3449 
3450 	RB_TEST(11, "addhosts", ep);
3451 
3452 	if (dd != NULL) {
3453 		/*
3454 		 * Mark the drives MD_DR_OK.
3455 		 */
3456 		for (i = 0; i < MD_MAXSIDES; i++) {
3457 			/* Skip empty slots */
3458 			if (sd->sd_nodes[i][0] == '\0')
3459 				continue;
3460 
3461 			if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
3462 			    MD_DR_OK, ep) == -1) {
3463 				goto rollback;
3464 			}
3465 		}
3466 	}
3467 
3468 	RB_TEST(12, "addhosts", ep)
3469 
3470 	/* Bring the mediator record up to date with the set record */
3471 	medr = rb_medr;				/* structure assignment */
3472 	for (i = 0; i < MD_MAXSIDES; i++)
3473 		(void) strcpy(medr.med_rec_nodes[i], sd->sd_nodes[i]);
3474 	crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
3475 
3476 	/* Inform the mediator hosts of the new node list */
3477 	for (i = 0; i < max_meds; i++) {
3478 		if (sd->sd_med.n_lst[i].a_cnt == 0)
3479 			continue;
3480 
3481 		if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
3482 			goto rollback;
3483 	}
3484 
3485 	/* Add the mediator information to all hosts in the set */
3486 	for (i = 0; i < MD_MAXSIDES; i++) {
3487 		/* Skip empty slots */
3488 		if (sd->sd_nodes[i][0] == '\0')
3489 			continue;
3490 
3491 		if (clnt_updmeds(sd->sd_nodes[i], sp, &sd->sd_med, ep))
3492 			goto rollback;
3493 	}
3494 
3495 	RB_TEST(13, "addhosts", ep)
3496 
3497 	/*
3498 	 * Mark the set record MD_SR_OK
3499 	 */
3500 	for (i = 0; i < MD_MAXSIDES; i++) {
3501 		/* Skip empty slots */
3502 		if (sd->sd_nodes[i][0] == '\0')
3503 			continue;
3504 
3505 		if (clnt_upd_sr_flags(sd->sd_nodes[i], sp, MD_SR_OK, ep))
3506 			goto rollback;
3507 	}
3508 
3509 	RB_TEST(14, "addhosts", ep)
3510 
3511 out:
3512 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3513 	for (i = 0; i < MD_MAXSIDES; i++) {
3514 		/* Skip empty slots */
3515 		if (sd->sd_nodes[i][0] == '\0')
3516 			continue;
3517 
3518 		/* Skip hosts we get in the next loop */
3519 		if (strinlst(sd->sd_nodes[i], node_c, node_v))
3520 			continue;
3521 
3522 		if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
3523 			if (rval == 0)
3524 				(void) mdstealerror(ep, &xep);
3525 			rval = -1;
3526 		}
3527 	}
3528 
3529 	if (rval == 0) {
3530 		for (i = 0; i < node_c; i++)
3531 			if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
3532 				if (rval == 0)
3533 					(void) mdstealerror(ep, &xep);
3534 				rval = -1;
3535 			}
3536 	}
3537 	cl_set_setkey(NULL);
3538 
3539 	metaflushsetname(sp);
3540 
3541 	md_rb_sig_handling_off(md_got_sig(), md_which_sig());
3542 
3543 	return (rval);
3544 
3545 rollback:
3546 	/* Make sure we are blocking all signals */
3547 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
3548 		mdclrerror(&xep);
3549 
3550 	rval = -1;
3551 
3552 	/* level 6 */
3553 	if (rb_level > 5) {
3554 		for (i = 0; i < max_meds; i++) {
3555 			if (sd->sd_med.n_lst[i].a_cnt == 0)
3556 				continue;
3557 
3558 			if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
3559 			    &rb_medr, &xep))
3560 				mdclrerror(&xep);
3561 		}
3562 		if (dd != NULL) {
3563 			for (i = 0; i < MD_MAXSIDES; i++) {
3564 				/* Skip empty slots */
3565 				if (sd->sd_nodes[i][0] == '\0')
3566 					continue;
3567 
3568 				/* Skip nodes not being added */
3569 				if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3570 					continue;
3571 
3572 				if (del_md_sidenms(sp, i, &xep))
3573 					mdclrerror(&xep);
3574 			}
3575 		}
3576 	}
3577 
3578 	/* level 5 */
3579 	if (rb_level > 4) {
3580 		if (dd != NULL) {
3581 			/* delete the drive records */
3582 			for (i = 0; i < node_c; i++) {
3583 				if (clnt_deldrvs(node_v[i], sp, dd, &xep) == -1)
3584 					mdclrerror(&xep);
3585 			}
3586 		}
3587 		/* delete the set record on the 'new' hosts */
3588 		for (i = 0; i < node_c; i++) {
3589 			if (clnt_delset(node_v[i], sp, &xep) == -1)
3590 				mdclrerror(&xep);
3591 		}
3592 	}
3593 
3594 	/* level 4 */
3595 	if (rb_level > 3 && dd != NULL) {
3596 		for (i = 0; i < MD_MAXSIDES; i++) {
3597 			/* Skip empty slots */
3598 			if (sd->sd_nodes[i][0] == '\0')
3599 				continue;
3600 
3601 			/* Skip nodes not being added */
3602 			if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3603 				continue;
3604 
3605 			if (del_db_sidenms(sp, i, &xep))
3606 				mdclrerror(&xep);
3607 		}
3608 	}
3609 
3610 	/* level 3 */
3611 	if (rb_level > 2 && dd != NULL) {
3612 		for (i = 0; i < MD_MAXSIDES; i++) {
3613 			/* Skip empty slots */
3614 			if (sd->sd_nodes[i][0] == '\0')
3615 				continue;
3616 
3617 			/* Skip nodes not being added */
3618 			if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3619 				continue;
3620 
3621 			if (clnt_del_drv_sidenms(sd->sd_nodes[i], sp,
3622 			    &xep) == -1)
3623 				mdclrerror(&xep);
3624 		}
3625 	}
3626 
3627 	/* level 2 */
3628 	if (rb_level > 1) {
3629 		for (i = 0; i < MD_MAXSIDES; i++) {
3630 			/* Skip empty slots */
3631 			if (sd->sd_nodes[i][0] == '\0')
3632 				continue;
3633 
3634 			if (clnt_delhosts(sd->sd_nodes[i], sp, node_c, node_v,
3635 			    &xep) == -1)
3636 				mdclrerror(&xep);
3637 		}
3638 	}
3639 
3640 	/* level 1 */
3641 	if (rb_level > 0) {
3642 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
3643 		for (i = 0; i < MD_MAXSIDES; i++) {
3644 			/* Skip empty slots */
3645 			if (sd->sd_nodes[i][0] == '\0')
3646 				continue;
3647 
3648 			/* Skip hosts we get in the next loop */
3649 			if (strinlst(sd->sd_nodes[i], node_c, node_v))
3650 				continue;
3651 
3652 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
3653 				mdclrerror(&xep);
3654 		}
3655 
3656 		for (i = 0; i < node_c; i++)
3657 			if (clnt_unlock_set(node_v[i], cl_sk, &xep))
3658 				mdclrerror(&xep);
3659 		cl_set_setkey(NULL);
3660 	}
3661 
3662 	/* release signals back to what they were on entry */
3663 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
3664 		mdclrerror(&xep);
3665 
3666 	metaflushsetname(sp);
3667 
3668 	md_rb_sig_handling_off(md_got_sig(), md_which_sig());
3669 
3670 	return (rval);
3671 }
3672 
3673 /*
3674  * Add host(s) to the diskset provided in sp.
3675  * 	- create set if non-existent.
3676  */
3677 int
3678 meta_set_addhosts(
3679 	mdsetname_t	*sp,
3680 	int		multi_node,
3681 	int		node_c,
3682 	char		**node_v,
3683 	int		auto_take,
3684 	md_error_t	*ep
3685 )
3686 {
3687 	if (multi_node)
3688 		return (meta_multinode_set_addhosts(sp, multi_node, node_c,
3689 		    node_v, auto_take, ep));
3690 	else
3691 		return (meta_traditional_set_addhosts(sp, multi_node, node_c,
3692 		    node_v, auto_take, ep));
3693 }
3694 
3695 /*
3696  * Delete host(s) from the diskset provided in sp.
3697  * 	- destroy set if last host in set is removed.
3698  */
3699 int
3700 meta_set_deletehosts(
3701 	mdsetname_t		*sp,
3702 	int			node_c,
3703 	char			**node_v,
3704 	int			forceflg,
3705 	md_error_t		*ep
3706 )
3707 {
3708 	md_set_desc		*sd;
3709 	md_drive_desc		*dd;
3710 	med_rec_t		medr;
3711 	med_rec_t		rb_medr;
3712 	int			i, j;
3713 	int			has_set;
3714 	int			numsides = 0;
3715 	int			oha = FALSE;
3716 	sigset_t		oldsigs;
3717 	mhd_mhiargs_t		mhiargs;
3718 	md_replicalist_t	*rlp = NULL;
3719 	md_setkey_t		*cl_sk;
3720 	ulong_t			max_genid = 0;
3721 	int			rval = 0;
3722 	int			rb_level = 0;
3723 	int			max_meds = 0;
3724 	md_error_t		xep = mdnullerror;
3725 	md_mnnode_desc		*nd;
3726 	md_mnnode_record	*nr;
3727 	int			delete_master = 0;
3728 	int			suspendall_flag = 0, suspendall_flag_rb = 0;
3729 	int			suspend1_flag = 0;
3730 	int			lock_flag = 0;
3731 	int			stale_flag = 0;
3732 	int			*node_id_list = NULL;
3733 	int			remote_sets_deleted = 0;
3734 
3735 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
3736 		return (-1);
3737 
3738 	/*
3739 	 * Verify that list of nodes being deleted contains no
3740 	 * duplicates.
3741 	 */
3742 	if (nodesuniq(sp, node_c, node_v, ep))
3743 		return (-1);
3744 
3745 	/* Make sure we own the set */
3746 	if (meta_check_ownership(sp, ep) != 0)
3747 		return (-1);
3748 
3749 	/*
3750 	 * The drive and node records are stored in the local mddbs of each
3751 	 * node in the diskset.  Each node's rpc.metad daemon reads in the set,
3752 	 * drive and node records from that node's local mddb and caches them
3753 	 * internally. Any process needing diskset information contacts its
3754 	 * local rpc.metad to get this information.  Since each node in the
3755 	 * diskset is independently reading the set information from its local
3756 	 * mddb, the set, drive and node records in the local mddbs must stay
3757 	 * in-sync, so that all nodes have a consistent view of the diskset.
3758 	 *
3759 	 * For a multinode diskset, explicitly verify that all nodes in the
3760 	 * diskset are ALIVE (i.e. are in the API membership list) if the
3761 	 * forceflag is FALSE.  (The case of forceflag being TRUE is handled
3762 	 * in the OHA check below.)
3763 	 *
3764 	 * If forceflag is FALSE and a node in the diskset is not in
3765 	 * the membership list, then fail this operation since all nodes must
3766 	 * be ALIVE in order to delete the node record from their local mddb.
3767 	 * If a panic of this node leaves the local mddbs set, node and drive
3768 	 * records out-of-sync, the reconfig cycle will fix the local mddbs
3769 	 * and force them back into synchronization.
3770 	 */
3771 	if ((forceflg == FALSE) && (MD_MNSET_DESC(sd))) {
3772 		nd = sd->sd_nodelist;
3773 		while (nd) {
3774 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3775 				return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
3776 				    sp->setno, nd->nd_nodename,
3777 				    NULL, sp->setname));
3778 			}
3779 			nd = nd->nd_next;
3780 		}
3781 	}
3782 
3783 
3784 	/*
3785 	 * Lock the set on current set members.
3786 	 * Set locking done much earlier for MN diskset than for traditional
3787 	 * diskset since lock_set and SUSPEND are used to protect against
3788 	 * other meta* commands running on the other nodes.
3789 	 */
3790 	if (MD_MNSET_DESC(sd)) {
3791 		/* Make sure we are blocking all signals */
3792 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
3793 			mdclrerror(&xep);
3794 
3795 		nd = sd->sd_nodelist;
3796 		while (nd) {
3797 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3798 				nd = nd->nd_next;
3799 				continue;
3800 			}
3801 
3802 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
3803 				rval = -1;
3804 				goto out2;
3805 			}
3806 			lock_flag = 1;
3807 			nd = nd->nd_next;
3808 		}
3809 		/*
3810 		 * Lock out other meta* commands by suspending
3811 		 * class 1 messages across the diskset.
3812 		 */
3813 		nd = sd->sd_nodelist;
3814 		while (nd) {
3815 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3816 				nd = nd->nd_next;
3817 				continue;
3818 			}
3819 			if (clnt_mdcommdctl(nd->nd_nodename,
3820 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
3821 			    MD_MSCF_NO_FLAGS, ep)) {
3822 				rval = -1;
3823 				goto out2;
3824 			}
3825 			suspend1_flag = 1;
3826 			nd = nd->nd_next;
3827 		}
3828 	}
3829 
3830 	for (i = 0; i < node_c; i++)
3831 		if (getnodeside(node_v[i], sd) == MD_SIDEWILD) {
3832 			(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
3833 			    node_v[i], NULL, sp->setname);
3834 			rval = -1;
3835 			goto out2;
3836 		}
3837 
3838 	/*
3839 	 * Count the number of nodes currently in the set.
3840 	 */
3841 	if (MD_MNSET_DESC(sd)) {
3842 		nd = sd->sd_nodelist;
3843 		while (nd) {
3844 			numsides++;
3845 			nd = nd->nd_next;
3846 		}
3847 	} else {
3848 		for (i = 0; i < MD_MAXSIDES; i++)
3849 			/* Count full slots */
3850 			if (sd->sd_nodes[i][0] != '\0')
3851 				numsides++;
3852 	}
3853 
3854 	/*
3855 	 * OHA mode == -f -h <hostname>
3856 	 * OHA is One Host Administration that occurs when the forceflag (-f)
3857 	 * is set and at least one host in the diskset isn't responding
3858 	 * to RPC requests.
3859 	 *
3860 	 * When in OHA mode, a node cannot delete itself from a diskset.
3861 	 * When in OHA mode, a node can delete a list of nodes from a diskset
3862 	 * even if some of the nodes in the diskset are unresponsive.
3863 	 *
3864 	 * For multinode diskset, only allow OHA mode when the nodes that
3865 	 * aren't responding in the diskset are not in the membership list
3866 	 * (i.e. nodes that aren't responding are not marked ALIVE).
3867 	 * Nodes that aren't in the membership list will be rejoining
3868 	 * the diskset through a reconfig cycle and the local mddb set
3869 	 * and node records can be reconciled during the reconfig cycle.
3870 	 *
3871 	 * If a node isn't responding, but is still in the membership list,
3872 	 * fail the request since the node may not be responding because
3873 	 * rpc.metad died and is restarting.  In this case, no reconfig
3874 	 * cycle will be started, so there's no way to recover if
3875 	 * the host delete operation was allowed.
3876 	 *
3877 	 * NOTE: if nodes that weren't in the membership when the OHA host
3878 	 * delete occurred are now the only nodes in membership list,
3879 	 * those nodes will see the old view of the diskset.  As soon as
3880 	 * a node re-enters the cluster that was present in the cluster
3881 	 * during the host deletion, the diskset will reflect the host
3882 	 * deletion on all nodes presently in the cluster.
3883 	 */
3884 	if (forceflg == TRUE) {
3885 		if (MD_MNSET_DESC(sd)) {
3886 			nd = sd->sd_nodelist;
3887 			while (nd) {
3888 				/*
3889 				 * If a node isn't ALIVE (in member list),
3890 				 * then allow a force-able delete in OHA mode.
3891 				 */
3892 				if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3893 					oha = TRUE;
3894 					break;
3895 				}
3896 				/*
3897 				 * Don't test for clnt_nullproc since already
3898 				 * tested the RPC connections by clnt_lock_set.
3899 				 */
3900 				nd = nd->nd_next;
3901 			}
3902 		} else {
3903 			for (i = 0; i < MD_MAXSIDES; i++) {
3904 				/* Skip empty slots */
3905 				if (sd->sd_nodes[i][0] == '\0')
3906 					continue;
3907 
3908 				if (clnt_nullproc(sd->sd_nodes[i], ep) == -1) {
3909 					/*
3910 					 * If we timeout to at least one
3911 					 * client, then we can allow OHA mode,
3912 					 * otherwise, we are in normal mode.
3913 					 */
3914 					if (mdanyrpcerror(ep)) {
3915 						mdclrerror(ep);
3916 						if (strinlst(sd->sd_nodes[i],
3917 						    node_c, node_v)) {
3918 							oha = TRUE;
3919 							break;
3920 						}
3921 					}
3922 				}
3923 			}
3924 		}
3925 	}
3926 
3927 	/*
3928 	 * Don't allow this for MN diskset since meta_set_destroy of 1 node
3929 	 * does NOT remove this node's node record from the other node's set
3930 	 * records in their local mddb.  This leaves a MN diskset in a very
3931 	 * messed up state.
3932 	 */
3933 	if (!(MD_MNSET_DESC(sd))) {
3934 		/* Destroy set */
3935 		if (forceflg == TRUE && node_c == 1 &&
3936 		    strcmp(mynode(), node_v[0]) == 0) {
3937 			/* Can return since !MN diskset so nothing to unlock */
3938 			return (meta_set_destroy(sp, TRUE, ep));
3939 		}
3940 	}
3941 
3942 
3943 	/*
3944 	 * In multinode diskset, can only delete self if this
3945 	 * is the last node in the set or if all nodes in
3946 	 * the set are being deleted.  The traditional diskset code
3947 	 * allows a node to delete itself (when there are other nodes
3948 	 * in the diskset) when using the force flag, but that code
3949 	 * path doesn't have the node remove itself from
3950 	 * the set node list on the other nodes.  Since this isn't
3951 	 * satisfactory for the multinode diskset, just don't
3952 	 * allow this operation.
3953 	 */
3954 	if (MD_MNSET_DESC(sd) && (numsides > 1) && (node_c != numsides) &&
3955 	    strinlst(mynode(), node_c, node_v)) {
3956 		(void) mddserror(ep, MDE_DS_MNCANTDELSELF, sp->setno,
3957 		    mynode(), NULL, sp->setname);
3958 		rval = -1;
3959 		goto out2;
3960 	}
3961 
3962 	/*
3963 	 * In multinode diskset, don't allow deletion of master node unless
3964 	 * this is the only node left or unless all nodes are being
3965 	 * deleted since there is no way to switch
3966 	 * master ownership (unless via a cluster reconfig cycle).
3967 	 */
3968 	delete_master = strinlst(sd->sd_mn_master_nodenm, node_c, node_v);
3969 	if (MD_MNSET_DESC(sd) && (numsides > 1) && (node_c != numsides) &&
3970 	    delete_master) {
3971 		(void) mddserror(ep, MDE_DS_CANTDELMASTER, sp->setno,
3972 		    sd->sd_mn_master_nodenm, NULL, sp->setname);
3973 		rval = -1;
3974 		goto out2;
3975 	}
3976 
3977 
3978 	/* Deleting self w/o forceflg */
3979 	if (forceflg == FALSE && numsides > 1 &&
3980 	    strinlst(mynode(), node_c, node_v)) {
3981 		(void) mddserror(ep, MDE_DS_CANTDELSELF, sp->setno,
3982 		    mynode(), NULL, sp->setname);
3983 		rval = -1;
3984 		goto out2;
3985 	}
3986 
3987 	/*
3988 	 * Setup the mediator record roll-back structure for a trad diskset.
3989 	 *
3990 	 * For a MN diskset, the deletion of a host in the diskset
3991 	 * does not cause an update of the mediator record.  If the
3992 	 * host deletion will cause the diskset to be removed (this is
3993 	 * the last host being removed or all hosts are being removed)
3994 	 * then the mediator record must have already been removed by the
3995 	 * user or this delete host operation will fail (a check for
3996 	 * this is done later in this routine).
3997 	 */
3998 	if (!(MD_MNSET_DESC(sd))) {
3999 		(void) memset(&rb_medr, '\0', sizeof (med_rec_t));
4000 		rb_medr.med_rec_mag = MED_REC_MAGIC;
4001 		rb_medr.med_rec_rev = MED_REC_REV;
4002 		rb_medr.med_rec_fl = 0;
4003 		rb_medr.med_rec_sn  = sp->setno;
4004 		(void) strcpy(rb_medr.med_rec_snm, sp->setname);
4005 		for (i = 0; i < MD_MAXSIDES; i++)
4006 		    (void) strcpy(rb_medr.med_rec_nodes[i], sd->sd_nodes[i]);
4007 		rb_medr.med_rec_meds = sd->sd_med;  /* structure assigment */
4008 		(void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
4009 		rb_medr.med_rec_foff = 0;
4010 		crcgen(&rb_medr, &rb_medr.med_rec_cks,
4011 		    sizeof (med_rec_t), NULL);
4012 
4013 		/* Bring the mediator record up to date with the set record */
4014 		medr = rb_medr;			/* structure assignment */
4015 
4016 		if ((max_meds = get_max_meds(ep)) == 0) {
4017 			rval = -1;
4018 			goto out2;
4019 		}
4020 	}
4021 
4022 	/*
4023 	 * For traditional diskset:
4024 	 * Check to see if all the hosts we are trying to delete the set from
4025 	 * have a set "setname" that is the same as ours, i.e. - same name,
4026 	 * same time stamp, same genid.  We only do this if forceflg is not
4027 	 * specified or we are in OHA mode.
4028 	 */
4029 	if (!(MD_MNSET_DESC(sd)) && (forceflg == FALSE || oha == TRUE)) {
4030 		int	fix_node_v = FALSE;
4031 		int	j;
4032 
4033 		for (i = 0; i < node_c; i++) {
4034 			/* We skip this side */
4035 			if (strcmp(mynode(), node_v[i]) == 0)
4036 				continue;
4037 
4038 			has_set = nodehasset(sp, node_v[i], NHS_NSTG_EQ, ep);
4039 
4040 			if (has_set < 0) {
4041 				char	 *anode[1];
4042 
4043 				/*
4044 				 * Can't talk to the host only allowed in OHA
4045 				 * mode.
4046 				 */
4047 				if (oha == TRUE && mdanyrpcerror(ep)) {
4048 					mdclrerror(ep);
4049 					continue;
4050 				}
4051 
4052 				/*
4053 				 * We got an error we do not, or are not,
4054 				 * prepared to handle.
4055 				 */
4056 				if (! mdiserror(ep, MDE_NO_SET) &&
4057 				    ! mdismddberror(ep, MDE_DB_NODB)) {
4058 					rval = -1;
4059 					goto out2;
4060 				}
4061 				mdclrerror(ep);
4062 
4063 				/*
4064 				 * If we got here: both hosts are up; a host in
4065 				 * our set record does not have the set. So we
4066 				 * delete the host from our set and invalidate
4067 				 * the node.
4068 				 */
4069 				anode[0] = Strdup(node_v[i]);
4070 
4071 				rval = del_host_noset(sp, anode, ep);
4072 
4073 				/*
4074 				 * If we delete a host, make sure the mediator
4075 				 * hosts are made aware of this.
4076 				 */
4077 				for (j = 0; j < MD_MAXSIDES; j++) {
4078 					if (strcmp(medr.med_rec_nodes[j],
4079 					    node_v[i]) != 0)
4080 						continue;
4081 					(void) memset(&medr.med_rec_nodes[j],
4082 					    '\0', sizeof (md_node_nm_t));
4083 				}
4084 				crcgen(&medr, &medr.med_rec_cks,
4085 				    sizeof (med_rec_t), NULL);
4086 
4087 				rb_medr = medr;		/* struct assignment */
4088 
4089 				Free(anode[0]);
4090 
4091 				if (rval == -1)
4092 					goto out2;
4093 
4094 				node_v[i][0] = '\0';
4095 				fix_node_v = TRUE;
4096 				continue;
4097 			}
4098 
4099 			/*
4100 			 * If we can talk to the host, and they do not have the
4101 			 * exact set, then we disallow the operation.
4102 			 */
4103 			if (has_set == FALSE) {
4104 				(void) mddserror(ep, MDE_DS_NODENOSET,
4105 				    sp->setno, node_v[i], NULL, sp->setname);
4106 				rval = -1;
4107 				goto out2;
4108 			}
4109 		}
4110 
4111 		/*
4112 		 * Here we prune the node_v's that were invalidated above.
4113 		 */
4114 		if (fix_node_v == TRUE) {
4115 			i = 0;
4116 			while (i < node_c) {
4117 				if (node_v[i][0] == '\0') {
4118 					for (j = i; (j + 1) < node_c; j++)
4119 						node_v[j] = node_v[j + 1];
4120 					node_c--;
4121 				}
4122 				i++;
4123 			}
4124 			/*
4125 			 * If we are left with no nodes, then we have
4126 			 * completed the operation.
4127 			 */
4128 			if (node_c == 0) {
4129 				/*
4130 				 * Inform the mediator hosts of the new node
4131 				 * list
4132 				 */
4133 				for (i = 0; i < max_meds; i++) {
4134 					if (sd->sd_med.n_lst[i].a_cnt == 0)
4135 						continue;
4136 
4137 					if (clnt_med_upd_rec(
4138 					    &sd->sd_med.n_lst[i], sp, &medr,
4139 					    ep))
4140 						mdclrerror(ep);
4141 				}
4142 				rval = 0;
4143 				goto out2;
4144 			}
4145 		}
4146 	}
4147 
4148 	/*
4149 	 * For multinode diskset:
4150 	 * If forceflag is FALSE then check to see if all the hosts we
4151 	 * are trying to delete the set from have a set "setname" that
4152 	 * is the same as ours, i.e. - same name, same time stamp, same genid.
4153 	 * If forceflag is TRUE, then we don't care if the hosts being
4154 	 * deleted have the same set information or not since user is forcing
4155 	 * those hosts to be deleted.
4156 	 */
4157 	if ((MD_MNSET_DESC(sd)) && (forceflg == FALSE)) {
4158 		for (i = 0; i < node_c; i++) {
4159 			/* We skip this node since comparing against it */
4160 			if (strcmp(mynode(), node_v[i]) == 0)
4161 				continue;
4162 
4163 			has_set = nodehasset(sp, node_v[i], NHS_NSTG_EQ, ep);
4164 
4165 			if (has_set < 0) {
4166 				rval = -1;
4167 				goto out2;
4168 			}
4169 
4170 			/*
4171 			 * If we can talk to the host, and they do not have the
4172 			 * exact set, then we disallow the operation.
4173 			 */
4174 			if (has_set == FALSE) {
4175 				(void) mddserror(ep, MDE_DS_NODENOSET,
4176 				    sp->setno, node_v[i], NULL, sp->setname);
4177 				rval = -1;
4178 				goto out2;
4179 			}
4180 		}
4181 	}
4182 
4183 	/*
4184 	 * For traditional diskset:
4185 	 * Can't allow user to delete their node (without deleting all nodes)
4186 	 * out of a set in OHA mode, would leave a real mess.
4187 	 * This action was already failed above for a MN diskset.
4188 	 */
4189 	if (!(MD_MNSET_DESC(sd)) && (oha == TRUE) &&
4190 	    strinlst(mynode(), node_c, node_v)) {
4191 		/* Can directly return since !MN diskset; nothing to unlock */
4192 		return (mddserror(ep, MDE_DS_OHACANTDELSELF, sp->setno,
4193 		    mynode(), NULL, sp->setname));
4194 	}
4195 
4196 
4197 	/* Get the drive descriptors for this set */
4198 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4199 	    ep)) == NULL) {
4200 		if (! mdisok(ep)) {
4201 			rval = -1;
4202 			goto out2;
4203 		}
4204 	}
4205 
4206 	/*
4207 	 * We have been asked to delete all the hosts in the set, i.e. - delete
4208 	 * the whole set.
4209 	 */
4210 	if (node_c == numsides) {
4211 		/*
4212 		 * This is only a valid operation if all drives have been
4213 		 * removed first.
4214 		 */
4215 
4216 		if (dd != NULL) {
4217 			(void) mddserror(ep, MDE_DS_HASDRIVES, sp->setno,
4218 			    NULL, NULL, sp->setname);
4219 			rval = -1;
4220 			goto out2;
4221 		}
4222 
4223 		/*
4224 		 * If a mediator is currently associated with this set,
4225 		 * fail the deletion of the last host(s).
4226 		 */
4227 		if (sd->sd_med.n_cnt != 0) {
4228 			(void) mddserror(ep, MDE_DS_HASMED, sp->setno,
4229 			    NULL, NULL, sp->setname);
4230 			rval = -1;
4231 			goto out2;
4232 		}
4233 
4234 		if (! mdisok(ep)) {
4235 			rval = -1;
4236 			goto out2;
4237 		}
4238 
4239 		rval = del_set_nodrives(sp, node_c, node_v, oha, ep);
4240 		remote_sets_deleted = 1;
4241 		goto out2;
4242 	}
4243 
4244 	/*
4245 	 * Get timeout values in case we need to roll back
4246 	 */
4247 	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
4248 	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) != 0) {
4249 		rval = -1;
4250 		goto out2;
4251 	}
4252 
4253 	if (dd != NULL) {
4254 		/*
4255 		 * We need this around for re-adding DB side names later.
4256 		 */
4257 		if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) {
4258 			rval = -1;
4259 			goto out2;
4260 		}
4261 
4262 		/*
4263 		 * Alloc nodeid list if drives are present in diskset.
4264 		 * nodeid list is used to reset mirror owners if the
4265 		 * owner is a deleted node.
4266 		 */
4267 		if (MD_MNSET_DESC(sd)) {
4268 			node_id_list = Zalloc(sizeof (int) * node_c);
4269 		}
4270 	}
4271 
4272 	/* Lock the set on current set members */
4273 	if (!(MD_MNSET_DESC(sd))) {
4274 		md_rb_sig_handling_on();
4275 		for (i = 0; i < MD_MAXSIDES; i++) {
4276 			/* Skip empty slots */
4277 			if (sd->sd_nodes[i][0] == '\0')
4278 				continue;
4279 
4280 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
4281 				if (oha == TRUE && mdanyrpcerror(ep)) {
4282 					mdclrerror(ep);
4283 					continue;
4284 				}
4285 				rval = -1;
4286 				goto out2;
4287 			}
4288 			lock_flag = 1;
4289 		}
4290 	}
4291 
4292 	RB_TEST(1, "deletehosts", ep)
4293 
4294 	RB_PREEMPT;
4295 	rb_level = 1;	/* level 1 */
4296 
4297 	RB_TEST(2, "deletehosts", ep)
4298 
4299 	if (MD_MNSET_DESC(sd)) {
4300 		md_mnnode_desc		*saved_nd_next;
4301 		mddb_config_t		c;
4302 
4303 		if (dd != NULL) {
4304 			/*
4305 			 * Notify rpc.mdcommd on all nodes of a nodelist change.
4306 			 * Start by suspending rpc.mdcommd (which drains it of
4307 			 * all messages), then change the nodelist followed
4308 			 * by a reinit and resume.
4309 			 */
4310 			nd = sd->sd_nodelist;
4311 			while (nd) {
4312 				if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4313 					nd = nd->nd_next;
4314 					continue;
4315 				}
4316 				if (clnt_mdcommdctl(nd->nd_nodename,
4317 				    COMMDCTL_SUSPEND, sp,
4318 				    MD_MSG_CLASS0,
4319 				    MD_MSCF_NO_FLAGS, ep)) {
4320 					rval = -1;
4321 					goto out2;
4322 				}
4323 				suspendall_flag = 1;
4324 				nd = nd->nd_next;
4325 			}
4326 			/*
4327 			 * Is current set STALE?
4328 			 * Need to know this if delete host fails and node
4329 			 * is re-joined to diskset.
4330 			 */
4331 			(void) memset(&c, 0, sizeof (c));
4332 			c.c_id = 0;
4333 			c.c_setno = sp->setno;
4334 			if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
4335 				(void) mdstealerror(ep, &c.c_mde);
4336 				rval = -1;
4337 				goto out2;
4338 			}
4339 			if (c.c_flags & MDDB_C_STALE) {
4340 				stale_flag = MNSET_IS_STALE;
4341 			}
4342 		}
4343 
4344 		/*
4345 		 * For each node being deleted, set DEL flag and
4346 		 * reset OK flag on that node first.
4347 		 * Until a node has turned off its own
4348 		 * rpc.metad's NODE_OK flag, that node could be
4349 		 * considered for master during a reconfig.
4350 		 */
4351 		for (i = 0; i < node_c; i++) {
4352 			/*
4353 			 * During OHA mode, don't issue RPCs to
4354 			 * non-alive nodes since there is no reason to
4355 			 * wait for RPC timeouts.
4356 			 */
4357 			nd = sd->sd_nodelist;
4358 			while (nd) {
4359 				if (strcmp(nd->nd_nodename, node_v[i]) == 0)
4360 					break;
4361 				nd = nd->nd_next;
4362 			}
4363 			/* Something wrong, handle this in next loop */
4364 			if (nd == NULL)
4365 				continue;
4366 
4367 			/* If node_id_list is alloc'd, fill in for later use */
4368 			if (node_id_list)
4369 				node_id_list[i] = nd->nd_nodeid;
4370 
4371 			/* All nodes are guaranteed to be ALIVE unless OHA */
4372 			if ((oha == TRUE) &&
4373 			    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4374 				continue;
4375 			}
4376 
4377 			/* Only changing my local cache of node list */
4378 			saved_nd_next = nd->nd_next;
4379 			nd->nd_next = NULL;
4380 
4381 			/* Set flags for del host to DEL on that host */
4382 			if (clnt_upd_nr_flags(node_v[i], sp,
4383 			    nd, MD_NR_DEL, NULL, ep)) {
4384 				nd->nd_next = saved_nd_next;
4385 				goto rollback;
4386 			}
4387 			nd->nd_next = saved_nd_next;
4388 		}
4389 		for (i = 0; i < node_c; i++) {
4390 			/*
4391 			 * Turn off owner flag in nodes to be deleted
4392 			 * if this node has been joined.
4393 			 * Also, turn off NODE_OK and turn on NODE_DEL
4394 			 * for nodes to be deleted.
4395 			 * These flags are used to set the node
4396 			 * record flags in all nodes in the set.
4397 			 * Only withdraw nodes that are joined.
4398 			 */
4399 			nd = sd->sd_nodelist;
4400 			while (nd) {
4401 				/*
4402 				 * Don't communicate with non-ALIVE node if
4403 				 * in OHA - but set flags in master list so
4404 				 * alive nodes are updated correctly.
4405 				 */
4406 				if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
4407 				    if ((oha == TRUE) &&
4408 					(!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4409 						nd->nd_flags |= MD_MN_NODE_DEL;
4410 						nd->nd_flags &= ~MD_MN_NODE_OK;
4411 						nd = nd->nd_next;
4412 						continue;
4413 				    }
4414 				    if (nd->nd_flags & MD_MN_NODE_OWN) {
4415 					/*
4416 					 * Going to set locally cached node
4417 					 * flags to rollback join so in case
4418 					 * of error, the rollback code knows
4419 					 * which nodes to re-join.
4420 					 * rpc.metad ignores the RB_JOIN flag.
4421 					 */
4422 					nd->nd_flags |= MD_MN_NODE_RB_JOIN;
4423 					nd->nd_flags &= ~MD_MN_NODE_OWN;
4424 
4425 					/*
4426 					 * Be careful in ordering of following
4427 					 * steps so that recovery from a panic
4428 					 * between the steps is viable.
4429 					 * Only reset master info in rpc.metad
4430 					 * - don't reset local cached info
4431 					 * which will be used to set master
4432 					 * info back if failure (rollback).
4433 					 */
4434 					if (clnt_withdrawset(nd->nd_nodename,
4435 					    sp, ep))
4436 						goto rollback;
4437 
4438 					/* Reset master on deleted node */
4439 					if (clnt_mnsetmaster(node_v[i], sp, "",
4440 					    MD_MN_INVALID_NID, ep))
4441 						goto rollback;
4442 				    }
4443 
4444 				    nd->nd_flags |= MD_MN_NODE_DEL;
4445 				    nd->nd_flags &= ~MD_MN_NODE_OK;
4446 				}
4447 				nd = nd->nd_next;
4448 			}
4449 		}
4450 
4451 		/*
4452 		 * Now, reset owner and set delete flags for the
4453 		 * deleted nodes on all nodes.
4454 		 */
4455 		nd = sd->sd_nodelist;
4456 		while (nd) {
4457 			/* Skip non-ALIVE node if in OHA */
4458 			if ((oha == TRUE) &&
4459 			    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4460 				nd = nd->nd_next;
4461 				continue;
4462 			}
4463 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
4464 			    sd->sd_nodelist, MD_NR_SET, NULL, ep)) {
4465 				goto rollback;
4466 			}
4467 			nd = nd->nd_next;
4468 		}
4469 		/*
4470 		 * Notify rpc.mdcommd on all nodes of a nodelist change.
4471 		 * Send reinit command to mdcommd which forces it to get
4472 		 * fresh set description.
4473 		 */
4474 		if (suspendall_flag) {
4475 			/* Send reinit */
4476 			nd = sd->sd_nodelist;
4477 			while (nd) {
4478 			    if ((oha == TRUE) &&
4479 				(!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4480 				    nd = nd->nd_next;
4481 				    continue;
4482 			    }
4483 			    /* Class is ignored for REINIT */
4484 			    if (clnt_mdcommdctl(nd->nd_nodename,
4485 				COMMDCTL_REINIT,
4486 				sp, NULL, MD_MSCF_NO_FLAGS, ep)) {
4487 				    mde_perror(ep, dgettext(TEXT_DOMAIN,
4488 					"Unable to reinit rpc.mdcommd.\n"));
4489 				    goto rollback;
4490 			    }
4491 			    nd = nd->nd_next;
4492 			}
4493 			/* Send resume */
4494 			nd = sd->sd_nodelist;
4495 			while (nd) {
4496 			    if ((oha == TRUE) &&
4497 				(!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4498 				    nd = nd->nd_next;
4499 				    continue;
4500 			    }
4501 			    if (clnt_mdcommdctl(nd->nd_nodename,
4502 				COMMDCTL_RESUME, sp, MD_MSG_CLASS0,
4503 				MD_MSCF_DONT_RESUME_CLASS1, ep)) {
4504 				    mde_perror(ep, dgettext(TEXT_DOMAIN,
4505 					"Unable to resume rpc.mdcommd.\n"));
4506 				    goto rollback;
4507 			    }
4508 			    nd = nd->nd_next;
4509 			}
4510 			meta_ping_mnset(sp->setno);
4511 		}
4512 	}
4513 
4514 
4515 	/*
4516 	 * Mark the set record MD_SR_DEL on the hosts we are deleting
4517 	 * If a MN diskset and OHA mode, don't issue RPC to nodes that
4518 	 * are not ALIVE.
4519 	 * If a MN diskset and not in OHA mode, then all nodes must respond
4520 	 * to RPC (be alive) or this routine will return failure.
4521 	 * If a traditional diskset, ignore all RPC failures if in OHA mode.
4522 	 */
4523 	for (i = 0; i < node_c; i++) {
4524 
4525 		RB_TEST(3, "deletehosts", ep)
4526 
4527 		if ((MD_MNSET_DESC(sd)) && (oha == TRUE)) {
4528 			/*
4529 			 * During OHA mode, don't issue RPCs to
4530 			 * non-alive nodes since there is no reason to
4531 			 * wait for RPC timeouts.
4532 			 */
4533 			nd = sd->sd_nodelist;
4534 			while (nd) {
4535 				if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
4536 					break;
4537 				}
4538 				nd = nd->nd_next;
4539 			}
4540 			if (nd == NULL) {
4541 				(void) mddserror(ep, MDE_DS_NODENOTINSET,
4542 				    sp->setno, node_v[i], NULL, sp->setname);
4543 				goto rollback;
4544 			} else if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4545 				/* Skip non-ALIVE node if in OHA mode */
4546 				continue;
4547 			} else {
4548 				if (clnt_upd_sr_flags(node_v[i], sp,
4549 				    MD_SR_DEL, ep)) {
4550 					goto rollback;
4551 				}
4552 			}
4553 		} else if ((MD_MNSET_DESC(sd)) && (oha == FALSE)) {
4554 			/*
4555 			 * All nodes should be alive in non-oha mode.
4556 			 */
4557 			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
4558 				goto rollback;
4559 			}
4560 		} else {
4561 			/*
4562 			 * For traditional diskset, issue the RPC and
4563 			 * ignore RPC failure if in OHA mode.
4564 			 */
4565 			if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
4566 				if (oha == TRUE && mdanyrpcerror(ep)) {
4567 					mdclrerror(ep);
4568 					continue;
4569 				}
4570 				goto rollback;
4571 			}
4572 		}
4573 
4574 		RB_TEST(4, "deletehosts", ep)
4575 	}
4576 
4577 	RB_TEST(5, "deletehosts", ep)
4578 
4579 	RB_PREEMPT;
4580 	rb_level = 2;	/* level 2 */
4581 
4582 	RB_TEST(6, "deletehosts", ep)
4583 
4584 	/* Delete the set on the hosts we are deleting */
4585 	if (del_set_on_hosts(sp, sd, dd, node_c, node_v, oha, ep)) {
4586 		if (node_id_list)
4587 			Free(node_id_list);
4588 		/*
4589 		 * Failure during del_set_on_hosts would have recreated
4590 		 * the diskset on the remote hosts, but for multi-owner
4591 		 * disksets need to set node flags properly and REINIT and
4592 		 * RESUME rpc.mdcommd, so just let the rollback code
4593 		 * do this.
4594 		 */
4595 		if (MD_MNSET_DESC(sd))
4596 			goto rollback;
4597 		return (-1);
4598 	}
4599 	remote_sets_deleted = 1;
4600 
4601 	RB_TEST(19, "deletehosts", ep)
4602 
4603 	RB_PREEMPT;
4604 	rb_level = 3;	/* level 3 */
4605 
4606 	RB_TEST(20, "deletehosts", ep)
4607 
4608 	/* Delete the host from sets on hosts not being deleted */
4609 	if (MD_MNSET_DESC(sd)) {
4610 		nd = sd->sd_nodelist;
4611 		/* All nodes are guaranteed to be ALIVE unless in oha mode */
4612 		while (nd) {
4613 			/*
4614 			 * During OHA mode, don't issue RPCs to
4615 			 * non-alive nodes since there is no reason to
4616 			 * wait for RPC timeouts.
4617 			 */
4618 			if ((oha == TRUE) &&
4619 			    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4620 				nd = nd->nd_next;
4621 				continue;
4622 			}
4623 
4624 			/* Skip nodes being deleted */
4625 			if (strinlst(nd->nd_nodename, node_c, node_v)) {
4626 				nd = nd->nd_next;
4627 				continue;
4628 			}
4629 			if (clnt_delhosts(nd->nd_nodename, sp, node_c, node_v,
4630 			    ep) == -1) {
4631 				goto rollback;
4632 			}
4633 
4634 			RB_TEST(21, "deletehosts", ep)
4635 			nd = nd->nd_next;
4636 		}
4637 	} else {
4638 		for (i = 0; i < MD_MAXSIDES; i++) {
4639 			/* Skip empty slots */
4640 			if (sd->sd_nodes[i][0] == '\0')
4641 				continue;
4642 
4643 			/* Skip nodes being deleted */
4644 			if (strinlst(sd->sd_nodes[i], node_c, node_v))
4645 				continue;
4646 
4647 			if (clnt_delhosts(sd->sd_nodes[i], sp, node_c, node_v,
4648 			    ep) == -1) {
4649 				if (oha == TRUE && mdanyrpcerror(ep)) {
4650 					mdclrerror(ep);
4651 					continue;
4652 				}
4653 				goto rollback;
4654 			}
4655 
4656 			RB_TEST(21, "deletehosts", ep)
4657 		}
4658 	}
4659 
4660 	/* We have drives */
4661 	if (dd != NULL) {
4662 		RB_TEST(22, "deletehosts", ep)
4663 
4664 		RB_PREEMPT;
4665 		rb_level = 4;	/* level 4 */
4666 
4667 		RB_TEST(23, "deletehosts", ep)
4668 
4669 		/*
4670 		 * Delete the old sidename for each drive on all the hosts.
4671 		 * If a multi-node diskset, each host only stores
4672 		 * the side information for itself.  So, a multi-node
4673 		 * diskset doesn't delete the old sidename for
4674 		 * an old host.
4675 		 *
4676 		 * If a MN diskset, reset owners of mirrors that are
4677 		 * owned by the deleted nodes.
4678 		 */
4679 		if (!(MD_MNSET_DESC(sd))) {
4680 			for (i = 0; i < MD_MAXSIDES; i++) {
4681 				/* Skip empty slots */
4682 				if (sd->sd_nodes[i][0] == '\0')
4683 					continue;
4684 
4685 				/* Skip nodes being deleted */
4686 				if (strinlst(sd->sd_nodes[i], node_c, node_v))
4687 					continue;
4688 
4689 				if (clnt_del_drv_sidenms(sd->sd_nodes[i], sp,
4690 				    ep)) {
4691 					if (oha == TRUE && mdanyrpcerror(ep)) {
4692 						mdclrerror(ep);
4693 						continue;
4694 					}
4695 					metaflushsetname(sp);
4696 					goto rollback;
4697 				}
4698 
4699 				RB_TEST(24, "deletehosts", ep)
4700 			}
4701 		} else {
4702 		    nd = sd->sd_nodelist;
4703 		    /* All nodes guaranteed to be ALIVE unless in oha mode */
4704 		    while (nd) {
4705 			/*
4706 			 * If mirror owner was set to a deleted node, then
4707 			 * each existing node resets mirror owner to NULL.
4708 			 *
4709 			 * During OHA mode, don't issue RPCs to
4710 			 * non-alive nodes since there is no reason to
4711 			 * wait for RPC timeouts.
4712 			 */
4713 			if ((oha == TRUE) &&
4714 			    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4715 				nd = nd->nd_next;
4716 				continue;
4717 			}
4718 
4719 			/* Skip nodes being deleted */
4720 			if (strinlst(nd->nd_nodename, node_c, node_v)) {
4721 				nd = nd->nd_next;
4722 				continue;
4723 			}
4724 
4725 			/*
4726 			 * If mirror owner is a deleted node, reset mirror
4727 			 * owners to NULL.  If an error occurs, print a
4728 			 * warning and continue.  Don't fail metaset
4729 			 * because of mirror owner reset problem since next
4730 			 * node to grab mirror will resolve this issue.
4731 			 * Before next node grabs mirrors, metaset will show
4732 			 * the deleted node as owner which is why an attempt
4733 			 * to reset the mirror owner is made.
4734 			 */
4735 			if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
4736 			    node_c, &node_id_list[0], &xep) == -1) {
4737 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
4738 				    "Unable to reset mirror owner on"
4739 				    " node %s\n"), nd->nd_nodename);
4740 				mdclrerror(&xep);
4741 			}
4742 
4743 			RB_TEST(21, "deletehosts", ep)
4744 			nd = nd->nd_next;
4745 		    }
4746 		}
4747 	}
4748 
4749 	RB_TEST(25, "deletehosts", ep)
4750 
4751 	RB_PREEMPT;
4752 	rb_level = 4;	/* level 4 */
4753 
4754 	RB_TEST(26, "deletehosts", ep)
4755 
4756 	/*
4757 	 * Bring the mediator record up to date with the set record for
4758 	 * traditional diskset.
4759 	 */
4760 	if (!(MD_MNSET_DESC(sd))) {
4761 		medr = rb_medr;			/* structure assignment */
4762 		for (i = 0; i < MD_MAXSIDES; i++) {
4763 			if (strinlst(sd->sd_nodes[i], node_c, node_v))
4764 				(void) memset(&medr.med_rec_nodes[i],
4765 					'\0', sizeof (md_node_nm_t));
4766 			else
4767 				(void) strcpy(medr.med_rec_nodes[i],
4768 					sd->sd_nodes[i]);
4769 		}
4770 		crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
4771 
4772 		/* Inform the mediator hosts of the new node list */
4773 		for (i = 0; i < max_meds; i++) {
4774 			if (sd->sd_med.n_lst[i].a_cnt == 0)
4775 				continue;
4776 
4777 			if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
4778 			    &medr, ep)) {
4779 				if (oha == TRUE && mdanyrpcerror(ep)) {
4780 					mdclrerror(ep);
4781 					continue;
4782 				}
4783 				goto rollback;
4784 			}
4785 		}
4786 	}
4787 
4788 	RB_TEST(27, "deletehosts", ep)
4789 
4790 	/*
4791 	 * For traditional diskset:
4792 	 * We are deleting ourselves out of the set and we have drives to
4793 	 * consider; so we need to halt the set, release the drives and
4794 	 * reset the timeout.  **** THIS IS A ONE WAY TICKET, NO ROLL BACK
4795 	 * IS POSSIBLE AS SOON AS THE HALT SET COMPLETES, SO THIS IS DONE
4796 	 * WITH ALL SIGNALS BLOCKED AND LAST ****
4797 	 *
4798 	 * This situation cannot occur in a MN diskset since a node can't
4799 	 * delete itself unless all nodes are being deleted and a diskset
4800 	 * cannot contain any drives if all nodes are being deleted.
4801 	 * So, don't even test for this if a MN diskset.
4802 	 */
4803 	if (!(MD_MNSET_DESC(sd)) && (dd != NULL) &&
4804 	    strinlst(mynode(), node_c, node_v)) {
4805 		/* Make sure we are blocking all signals */
4806 		if (procsigs(TRUE, &oldsigs, ep) < 0) {
4807 			rval = -1;
4808 			goto out1;
4809 		}
4810 
4811 		if (halt_set(sp, ep)) {
4812 			rval = -1;
4813 			goto out1;
4814 		}
4815 
4816 		if (rel_own_bydd(sp, dd, FALSE, ep))
4817 			rval = -1;
4818 
4819 out1:
4820 		/* release signals back to what they were on entry */
4821 		if (procsigs(FALSE, &oldsigs, &xep) < 0) {
4822 			if (rval == 0)
4823 				(void) mdstealerror(ep, &xep);
4824 			rval = -1;
4825 		}
4826 	}
4827 
4828 out2:
4829 	/*
4830 	 * Unlock diskset by resuming messages across the diskset.
4831 	 * Just resume all classes so that resume is the same whether
4832 	 * just one class was locked or all classes were locked.
4833 	 */
4834 	if ((suspend1_flag) || (suspendall_flag)) {
4835 		/* Send resume */
4836 		nd = sd->sd_nodelist;
4837 		while (nd) {
4838 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4839 				nd = nd->nd_next;
4840 				continue;
4841 			}
4842 			/*
4843 			 * Skip nodes being deleted if remote set
4844 			 * was deleted since rpc.mdcommd may no longer
4845 			 * be running on remote node.
4846 			 */
4847 			if ((remote_sets_deleted == 1) &&
4848 			    (strinlst(nd->nd_nodename, node_c, node_v))) {
4849 				nd = nd->nd_next;
4850 				continue;
4851 			}
4852 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
4853 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
4854 				if (rval == 0)
4855 					(void) mdstealerror(ep, &xep);
4856 				rval = -1;
4857 				mde_perror(ep, dgettext(TEXT_DOMAIN,
4858 				    "Unable to resume rpc.mdcommd.\n"));
4859 			}
4860 			nd = nd->nd_next;
4861 		}
4862 		meta_ping_mnset(sp->setno);
4863 	}
4864 
4865 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4866 	if (lock_flag) {
4867 		if (MD_MNSET_DESC(sd)) {
4868 			nd = sd->sd_nodelist;
4869 			while (nd) {
4870 				/*
4871 				 * During OHA mode, don't issue RPCs to
4872 				 * non-alive nodes since there is no reason to
4873 				 * wait for RPC timeouts.
4874 				 */
4875 				if ((oha == TRUE) &&
4876 				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4877 					nd = nd->nd_next;
4878 					continue;
4879 				}
4880 				if (clnt_unlock_set(nd->nd_nodename,
4881 				    cl_sk, &xep)) {
4882 					if (rval == 0)
4883 						(void) mdstealerror(ep, &xep);
4884 					rval = -1;
4885 				}
4886 				nd = nd->nd_next;
4887 			}
4888 		} else {
4889 			for (i = 0; i < MD_MAXSIDES; i++) {
4890 				/* Skip empty slots */
4891 				if (sd->sd_nodes[i][0] == '\0')
4892 					continue;
4893 
4894 				if (clnt_unlock_set(sd->sd_nodes[i],
4895 				    cl_sk, &xep)) {
4896 					if (oha == TRUE &&
4897 					    mdanyrpcerror(&xep)) {
4898 						mdclrerror(&xep);
4899 						continue;
4900 					}
4901 					if (rval == 0)
4902 						(void) mdstealerror(ep, &xep);
4903 					rval = -1;
4904 				}
4905 			}
4906 		}
4907 	}
4908 	cl_set_setkey(NULL);
4909 
4910 out3:
4911 	metafreereplicalist(rlp);
4912 	if (node_id_list)
4913 		Free(node_id_list);
4914 
4915 	metaflushsetname(sp);
4916 
4917 	if (MD_MNSET_DESC(sd)) {
4918 		/* release signals back to what they were on entry */
4919 		if (procsigs(FALSE, &oldsigs, &xep) < 0)
4920 			mdclrerror(&xep);
4921 	} else {
4922 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
4923 	}
4924 
4925 
4926 	return (rval);
4927 
4928 rollback:
4929 	/* all signals already blocked for MN diskset */
4930 	if (!(MD_MNSET_DESC(sd))) {
4931 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
4932 			mdclrerror(&xep);
4933 	}
4934 
4935 	rval = -1;
4936 
4937 	max_genid = sd->sd_genid;
4938 
4939 
4940 	/*
4941 	 * Send reinit command to rpc.mdcommd which forces it to get
4942 	 * fresh set description and resume all classes but class 0.
4943 	 * Don't send any commands to rpc.mdcommd if set on that node
4944 	 * has been removed.
4945 	 */
4946 	if (suspendall_flag) {
4947 		/* Send reinit */
4948 		nd = sd->sd_nodelist;
4949 		while (nd) {
4950 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4951 				nd = nd->nd_next;
4952 				continue;
4953 			}
4954 			/*
4955 			 * If the remote set was deleted, rpc.mdcommd
4956 			 * may no longer be running so send nothing to it.
4957 			 */
4958 			if ((remote_sets_deleted == 1) &&
4959 			    (strinlst(nd->nd_nodename, node_c, node_v))) {
4960 				nd = nd->nd_next;
4961 				continue;
4962 			}
4963 			/* Class is ignored for REINIT */
4964 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
4965 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
4966 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
4967 				    "Unable to reinit rpc.mdcommd.\n"));
4968 				mdclrerror(&xep);
4969 			}
4970 			nd = nd->nd_next;
4971 		}
4972 		/* Send resume */
4973 		nd = sd->sd_nodelist;
4974 		while (nd) {
4975 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4976 				nd = nd->nd_next;
4977 				continue;
4978 			}
4979 			/*
4980 			 * If the remote set was deleted, rpc.mdcommd
4981 			 * may no longer be running so send nothing to it.
4982 			 */
4983 			if ((remote_sets_deleted == 1) &&
4984 			    (strinlst(nd->nd_nodename, node_c, node_v))) {
4985 				nd = nd->nd_next;
4986 				continue;
4987 			}
4988 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
4989 			    sp, MD_MSG_CLASS0, MD_MSCF_DONT_RESUME_CLASS1,
4990 			    &xep)) {
4991 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
4992 				    "Unable to resume rpc.mdcommd.\n"));
4993 				mdclrerror(&xep);
4994 			}
4995 			nd = nd->nd_next;
4996 		}
4997 		meta_ping_mnset(sp->setno);
4998 	}
4999 
5000 	/* level 2 */
5001 	if (rb_level > 1) {
5002 		md_set_record		*sr;
5003 		md_replicalist_t	*rl;
5004 
5005 		recreate_set(sp, sd);
5006 
5007 		/*
5008 		 * Lock out other meta* commands on nodes with the newly
5009 		 * re-created sets by suspending class 1 messages
5010 		 * across the diskset.
5011 		 */
5012 		nd = sd->sd_nodelist;
5013 		while (nd) {
5014 			/* Skip nodes not being deleted */
5015 			if (!(strinlst(nd->nd_nodename, node_c, node_v))) {
5016 				nd = nd->nd_next;
5017 				continue;
5018 			}
5019 			/* Suspend commd on nodes with re-created sets */
5020 			if (clnt_mdcommdctl(nd->nd_nodename,
5021 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
5022 			    MD_MSCF_NO_FLAGS, &xep)) {
5023 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
5024 				    "Unable to suspend rpc.mdcommd.\n"));
5025 				mdclrerror(&xep);
5026 			}
5027 			nd = nd->nd_next;
5028 		}
5029 
5030 		max_genid++;
5031 
5032 		/*
5033 		 * See if we have to re-add the drives specified.
5034 		 */
5035 		for (i = 0; i < node_c; i++) {
5036 			if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
5037 				/*
5038 				 * During OHA mode, don't issue RPCs to
5039 				 * non-alive nodes since there is no reason to
5040 				 * wait for RPC timeouts.
5041 				 */
5042 				nd = sd->sd_nodelist;
5043 				while (nd) {
5044 					if (strcmp(nd->nd_nodename, node_v[i])
5045 					    == 0) {
5046 						break;
5047 					}
5048 					nd = nd->nd_next;
5049 				}
5050 				if (nd == 0)
5051 					continue;
5052 				if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
5053 					continue;
5054 			}
5055 
5056 			/* Don't care if set record is MN or not */
5057 			if (clnt_getset(node_v[i], sp->setname, MD_SET_BAD, &sr,
5058 			    &xep) == -1) {
5059 				mdclrerror(&xep);
5060 				continue;
5061 			}
5062 
5063 			/* Drive already added, skip to next node */
5064 			if (sr->sr_drivechain != NULL) {
5065 				/*
5066 				 * Set record structure was allocated from RPC
5067 				 * routine getset so this structure is only of
5068 				 * size md_set_record even if the MN flag is
5069 				 * set.  So, clear the flag so that the free
5070 				 * code doesn't attempt to free a structure
5071 				 * the size of md_mnset_record.
5072 				 */
5073 				sr->sr_flags &= ~MD_SR_MN;
5074 				free_sr(sr);
5075 				continue;
5076 			}
5077 
5078 			if (clnt_adddrvs(node_v[i], sp, dd, sr->sr_ctime,
5079 			    sr->sr_genid, &xep) == -1)
5080 				mdclrerror(&xep);
5081 
5082 			if (clnt_upd_dr_flags(node_v[i], sp, dd, MD_DR_OK,
5083 			    &xep) == -1)
5084 				mdclrerror(&xep);
5085 
5086 			/*
5087 			 * Set record structure was allocated from RPC routine
5088 			 * getset so this structure is only of size
5089 			 * md_set_record even if the MN flag is set.  So,
5090 			 * clear the flag so that the free code doesn't
5091 			 * attempt to free a structure the size of
5092 			 * md_mnset_record.
5093 			 */
5094 			sr->sr_flags &= ~MD_SR_MN;
5095 			free_sr(sr);
5096 		}
5097 		max_genid += 3;
5098 
5099 		for (rl = rlp; rl != NULL; rl = rl->rl_next) {
5100 			md_replica_t	*r = rl->rl_repp;
5101 			/*
5102 			 * This is not the first replica being added to the
5103 			 * diskset so call with ADDSIDENMS_BCAST.  If this
5104 			 * is a traditional diskset, the bcast flag is ignored
5105 			 * since traditional disksets don't use the rpc.mdcommd.
5106 			 */
5107 			if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno,
5108 			    DB_ADDSIDENMS_BCAST, &xep))
5109 				mdclrerror(&xep);
5110 		}
5111 
5112 		/*
5113 		 * Add the device names for the new sides into the namespace,
5114 		 * on all hosts not being deleted.
5115 		 */
5116 		if (MD_MNSET_DESC(sd)) {
5117 			nd = sd->sd_nodelist;
5118 			while (nd) {
5119 				/* Find a node that is not being deleted */
5120 				if (!strinlst(nd->nd_nodename, node_c,
5121 				    node_v)) {
5122 					j = nd->nd_nodeid;
5123 					break;
5124 				}
5125 				nd = nd->nd_next;
5126 			}
5127 		} else {
5128 			for (j = 0; j < MD_MAXSIDES; j++) {
5129 				/* Skip empty slots */
5130 				if (sd->sd_nodes[j][0] == '\0')
5131 					continue;
5132 
5133 				/* Find a node that is not being deleted */
5134 				if (!strinlst(sd->sd_nodes[j], node_c, node_v))
5135 					break;
5136 			}
5137 		}
5138 
5139 		if (MD_MNSET_DESC(sd)) {
5140 			nd = sd->sd_nodelist;
5141 			while (nd) {
5142 				/* Skip nodes not being deleted */
5143 				if (!strinlst(nd->nd_nodename, node_c,
5144 				    node_v)) {
5145 					nd = nd->nd_next;
5146 					continue;
5147 				}
5148 
5149 				/* this side was just created, add the names */
5150 				if (add_md_sidenms(sp, nd->nd_nodeid, j, &xep))
5151 					mdclrerror(&xep);
5152 				nd = nd->nd_next;
5153 			}
5154 		} else {
5155 			for (i = 0; i < MD_MAXSIDES; i++) {
5156 				/* Skip empty slots */
5157 				if (sd->sd_nodes[i][0] == '\0')
5158 					continue;
5159 
5160 				/* Skip nodes not being deleted */
5161 				if (!strinlst(sd->sd_nodes[i], node_c, node_v))
5162 					continue;
5163 
5164 				/* this side was just created, add the names */
5165 				if (add_md_sidenms(sp, i, j, &xep))
5166 					mdclrerror(&xep);
5167 			}
5168 		}
5169 	}
5170 
5171 	/* level 4 */
5172 	if (rb_level > 3 && dd != NULL) {
5173 		/*
5174 		 * Add the new sidename for each drive to all the hosts
5175 		 * Multi-node disksets only store the sidename for
5176 		 * that host, so there is nothing to re-add.
5177 		 */
5178 		if (!(MD_MNSET_DESC(sd))) {
5179 			for (j = 0; j < MD_MAXSIDES; j++) {
5180 				/* Skip empty slots */
5181 				if (sd->sd_nodes[j][0] == '\0')
5182 					continue;
5183 
5184 				/* Find a node that is not being deleted */
5185 				if (!strinlst(sd->sd_nodes[j], node_c, node_v))
5186 					break;
5187 			}
5188 			for (i = 0; i < MD_MAXSIDES; i++) {
5189 				/* Skip empty slots */
5190 				if (sd->sd_nodes[i][0] == '\0')
5191 					continue;
5192 
5193 				if (clnt_add_drv_sidenms(sd->sd_nodes[i],
5194 				    sd->sd_nodes[j], sp, sd, node_c, node_v,
5195 				    &xep))
5196 					mdclrerror(&xep);
5197 			}
5198 		}
5199 
5200 	}
5201 
5202 	/* level 5 */
5203 	if ((rb_level > 4) && (!(MD_MNSET_DESC(sd)))) {
5204 		/* rollback the mediator record */
5205 		for (i = 0; i < max_meds; i++) {
5206 			if (sd->sd_med.n_lst[i].a_cnt == 0)
5207 				continue;
5208 
5209 			if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
5210 			    &rb_medr, &xep))
5211 				mdclrerror(&xep);
5212 		}
5213 	}
5214 
5215 	/* level 3 */
5216 	if (rb_level > 2) {
5217 		md_set_record		*sr;
5218 		md_mnset_record		*mnsr;
5219 
5220 		if (MD_MNSET_DESC(sd)) {
5221 			nd = sd->sd_nodelist;
5222 			/*
5223 			 * During OHA mode, don't issue RPCs to
5224 			 * non-alive nodes since there is no reason to
5225 			 * wait for RPC timeouts.
5226 			 */
5227 			while (nd) {
5228 				if ((oha == TRUE) &&
5229 				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5230 					nd = nd->nd_next;
5231 					continue;
5232 				}
5233 				/* Record should be for a multi-node diskset */
5234 				if (clnt_mngetset(nd->nd_nodename, sp->setname,
5235 				    MD_SET_BAD, &mnsr, &xep) == -1) {
5236 					mdclrerror(&xep);
5237 					nd = nd->nd_next;
5238 					continue;
5239 				}
5240 
5241 				has_set = 1;
5242 
5243 				nr = mnsr->sr_nodechain;
5244 				while (nr) {
5245 					if (nd->nd_nodeid == nr->nr_nodeid) {
5246 						break;
5247 					}
5248 					nr = nr->nr_next;
5249 				}
5250 				if (nr == NULL)
5251 					has_set = 0;
5252 
5253 				free_sr((struct md_set_record *)mnsr);
5254 				if (has_set) {
5255 					nd = nd->nd_next;
5256 					continue;
5257 				}
5258 
5259 				if (clnt_addhosts(nd->nd_nodename, sp, node_c,
5260 				    node_v, &xep) == -1)
5261 					mdclrerror(&xep);
5262 
5263 				nd = nd->nd_next;
5264 			}
5265 		} else {
5266 			for (i = 0; i < MD_MAXSIDES; i++) {
5267 				/* Skip empty slots */
5268 				if (sd->sd_nodes[i][0] == '\0')
5269 					continue;
5270 
5271 				/* Record should be for a non-multi-node set */
5272 				if (clnt_getset(sd->sd_nodes[i], sp->setname,
5273 				    MD_SET_BAD, &sr, &xep) == -1) {
5274 					mdclrerror(&xep);
5275 					continue;
5276 				}
5277 
5278 				/*
5279 				 * Set record structure was allocated from RPC
5280 				 * routine getset so this structure is only of
5281 				 * size md_set_record even if the MN flag is
5282 				 * set.  So, clear the flag so that the free
5283 				 * code doesn't attempt to free a structure
5284 				 * the size of md_mnset_record.
5285 				 */
5286 				if (MD_MNSET_REC(sr)) {
5287 					sr->sr_flags &= ~MD_SR_MN;
5288 					free_sr(sr);
5289 					continue;
5290 				}
5291 
5292 				has_set = 1;
5293 				for (j = 0; j < MD_MAXSIDES; j++) {
5294 					/* Skip empty slots */
5295 					if (sd->sd_nodes[j][0] == '\0')
5296 						continue;
5297 
5298 					if (sr->sr_nodes[j][0] == '\0') {
5299 						has_set = 0;
5300 						break;
5301 					}
5302 				}
5303 
5304 				free_sr(sr);
5305 				if (has_set)
5306 					continue;
5307 
5308 				if (clnt_addhosts(sd->sd_nodes[i], sp, node_c,
5309 				    node_v, &xep) == -1)
5310 					mdclrerror(&xep);
5311 			}
5312 		}
5313 		max_genid++;
5314 	}
5315 
5316 	/* level 1 */
5317 	if (rb_level > 0) {
5318 		max_genid++;
5319 		/* Sets MD_SR_OK on given nodes. */
5320 		resync_genid(sp, sd, max_genid, node_c, node_v);
5321 
5322 		/*
5323 		 * For MN diskset:
5324 		 * On each newly re-added node, set the node record for that
5325 		 * node to OK.  Then set all node records for the newly added
5326 		 * nodes on all nodes to ok.
5327 		 *
5328 		 * By setting a node's own node record to ok first, even if
5329 		 * the node re-adding the hosts panics, the rest of the nodes
5330 		 * can determine the same node list during the choosing of the
5331 		 * master during reconfig.  So, only nodes considered for
5332 		 * mastership are nodes that have both MD_MN_NODE_OK and
5333 		 * MD_SR_OK set on that node's rpc.metad.  If all nodes have
5334 		 * MD_SR_OK set, but no node has its own MD_MN_NODE_OK set,
5335 		 * then the set will be removed during reconfig since a panic
5336 		 * occurred during the re-creation of the deletion of
5337 		 * the initial diskset.
5338 		 */
5339 		if (MD_MNSET_DESC(sd)) {
5340 			md_mnnode_desc	*saved_nd_next;
5341 			if (dd != NULL) {
5342 				/*
5343 				 * Notify rpc.mdcommd on all nodes of a
5344 				 * nodelist change.  Start by suspending
5345 				 * rpc.mdcommd (which drains it of all
5346 				 * messages), then change the nodelist
5347 				 * followed by a reinit and resume.
5348 				 */
5349 				nd = sd->sd_nodelist;
5350 				while (nd) {
5351 					if (!(nd->nd_flags &
5352 					    MD_MN_NODE_ALIVE)) {
5353 						nd = nd->nd_next;
5354 						continue;
5355 					}
5356 					if (clnt_mdcommdctl(nd->nd_nodename,
5357 					    COMMDCTL_SUSPEND, sp,
5358 					    MD_MSG_CLASS0,
5359 					    MD_MSCF_NO_FLAGS, &xep)) {
5360 						mde_perror(&xep,
5361 						    dgettext(TEXT_DOMAIN,
5362 						    "Unable to suspend "
5363 						    "rpc.mdcommd.\n"));
5364 						mdclrerror(&xep);
5365 					}
5366 					suspendall_flag_rb = 1;
5367 					nd = nd->nd_next;
5368 				}
5369 			}
5370 			for (i = 0; i < node_c; i++) {
5371 				/*
5372 				 * During OHA mode, don't issue RPCs to
5373 				 * non-alive nodes since there is no reason to
5374 				 * wait for RPC timeouts.
5375 				 */
5376 				nd = sd->sd_nodelist;
5377 				while (nd) {
5378 					if (strcmp(nd->nd_nodename, node_v[i])
5379 					    == 0)
5380 						break;
5381 					nd = nd->nd_next;
5382 				}
5383 				/* Something wrong, finish this in next loop */
5384 				if (nd == NULL)
5385 					continue;
5386 
5387 				if ((oha == TRUE) &&
5388 				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5389 					continue;
5390 				}
5391 
5392 				if (dd != NULL) {
5393 					/* Set master on re-joining node. */
5394 					if (clnt_mnsetmaster(node_v[i], sp,
5395 					    sd->sd_mn_master_nodenm,
5396 					    sd->sd_mn_master_nodeid, &xep)) {
5397 						mdclrerror(&xep);
5398 					}
5399 
5400 					/*
5401 					 * Re-join set to same state as
5402 					 * before - stale or non-stale.
5403 					 */
5404 					if (clnt_joinset(node_v[i], sp,
5405 					    stale_flag, &xep)) {
5406 						mdclrerror(&xep);
5407 					}
5408 				}
5409 
5410 				/* Only changing my local cache of node list */
5411 				saved_nd_next = nd->nd_next;
5412 				nd->nd_next = NULL;
5413 
5414 				/* Set record for host to ok on that host */
5415 				if (clnt_upd_nr_flags(node_v[i], sp,
5416 				    nd, MD_NR_OK, NULL, &xep)) {
5417 					mdclrerror(&xep);
5418 				}
5419 				nd->nd_next = saved_nd_next;
5420 			}
5421 
5422 			/* Now set all node records on all nodes to be ok */
5423 			nd = sd->sd_nodelist;
5424 			while (nd) {
5425 				/*
5426 				 * During OHA mode, don't issue RPCs to
5427 				 * non-alive nodes since there is no reason to
5428 				 * wait for RPC timeouts.
5429 				 */
5430 				if ((oha == TRUE) &&
5431 				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5432 					nd = nd->nd_next;
5433 					continue;
5434 				}
5435 				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5436 				    sd->sd_nodelist, MD_NR_OK, NULL, &xep)) {
5437 					mdclrerror(&xep);
5438 				}
5439 				nd = nd->nd_next;
5440 			}
5441 		}
5442 	}
5443 
5444 	/*
5445 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
5446 	 * Send reinit command to mdcommd which forces it to get
5447 	 * fresh set description.
5448 	 */
5449 	if (suspendall_flag_rb) {
5450 		/* Send reinit */
5451 		nd = sd->sd_nodelist;
5452 		while (nd) {
5453 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5454 				nd = nd->nd_next;
5455 				continue;
5456 			}
5457 
5458 			/* Class is ignored for REINIT */
5459 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
5460 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
5461 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
5462 				    "Unable to reinit rpc.mdcommd.\n"));
5463 				mdclrerror(&xep);
5464 			}
5465 			nd = nd->nd_next;
5466 		}
5467 	}
5468 
5469 	/*
5470 	 * Unlock diskset by resuming messages across the diskset.
5471 	 * Just resume all classes so that resume is the same whether
5472 	 * just one class was locked or all classes were locked.
5473 	 */
5474 	if ((suspend1_flag) || (suspendall_flag) || (suspendall_flag_rb)) {
5475 		/* Send resume */
5476 		nd = sd->sd_nodelist;
5477 		while (nd) {
5478 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5479 				nd = nd->nd_next;
5480 				continue;
5481 			}
5482 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
5483 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
5484 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
5485 				    "Unable to resume rpc.mdcommd.\n"));
5486 			}
5487 			nd = nd->nd_next;
5488 		}
5489 		meta_ping_mnset(sp->setno);
5490 	}
5491 
5492 	/*
5493 	 * Start a resync thread on the re-added nodes
5494 	 * if set is not stale. Also start a thread to update the
5495 	 * abr state of all soft partitions
5496 	 */
5497 	if (stale_flag != MNSET_IS_STALE) {
5498 		for (i = 0; i < node_c; i++) {
5499 			/*
5500 			 * During OHA mode, don't issue RPCs to
5501 			 * non-alive nodes since there is no reason to
5502 			 * wait for RPC timeouts.
5503 			 */
5504 			nd = sd->sd_nodelist;
5505 			while (nd) {
5506 				if (strcmp(nd->nd_nodename, node_v[i])
5507 				    == 0)
5508 					break;
5509 				nd = nd->nd_next;
5510 			}
5511 			if (nd == NULL)
5512 				continue;
5513 
5514 			if ((oha == TRUE) &&
5515 			    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5516 				continue;
5517 			}
5518 
5519 			if (dd != 0) {
5520 				if (clnt_mn_mirror_resync_all(node_v[i],
5521 				    sp->setno, &xep)) {
5522 					mde_perror(ep, dgettext(TEXT_DOMAIN,
5523 					    "Unable to start resync "
5524 					    "thread.\n"));
5525 				}
5526 				if (clnt_mn_sp_update_abr(node_v[i],
5527 				    sp->setno, &xep)) {
5528 					mde_perror(ep, dgettext(TEXT_DOMAIN,
5529 					    "Unable to start sp update "
5530 					    "thread.\n"));
5531 				}
5532 			}
5533 		}
5534 	}
5535 
5536 	/* level 0 */
5537 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
5538 	/* Don't test lock flag since guaranteed to be set if in rollback */
5539 	if (MD_MNSET_DESC(sd)) {
5540 		nd = sd->sd_nodelist;
5541 		while (nd) {
5542 			/*
5543 			 * During OHA mode, don't issue RPCs to
5544 			 * non-alive nodes since there is no reason to
5545 			 * wait for RPC timeouts.
5546 			 */
5547 			if ((oha == TRUE) &&
5548 			    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5549 				nd = nd->nd_next;
5550 				continue;
5551 			}
5552 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
5553 				mdclrerror(&xep);
5554 			nd = nd->nd_next;
5555 		}
5556 	} else {
5557 		for (i = 0; i < MD_MAXSIDES; i++) {
5558 			/* Skip empty slots */
5559 			if (sd->sd_nodes[i][0] == '\0')
5560 				continue;
5561 
5562 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
5563 				mdclrerror(&xep);
5564 		}
5565 	}
5566 	cl_set_setkey(NULL);
5567 
5568 	/* release signals back to what they were on entry */
5569 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
5570 		mdclrerror(&xep);
5571 
5572 	metafreereplicalist(rlp);
5573 	if (node_id_list)
5574 		Free(node_id_list);
5575 
5576 	metaflushsetname(sp);
5577 
5578 	if (!(MD_MNSET_DESC(sd))) {
5579 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
5580 	}
5581 
5582 	return (rval);
5583 }
5584 
5585 int
5586 meta_set_auto_take(
5587 	mdsetname_t	*sp,
5588 	int		take_val,
5589 	md_error_t	*ep
5590 )
5591 {
5592 	int		i;
5593 	md_set_desc	*sd;
5594 	int		rval = 0;
5595 	md_setkey_t	*cl_sk;
5596 	md_error_t	xep = mdnullerror;
5597 	char		*hostname;
5598 	md_drive_desc	*dd;
5599 
5600 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
5601 		return (-1);
5602 
5603 	/* Make sure we own the set */
5604 	if (meta_check_ownership(sp, ep) != 0)
5605 		return (-1);
5606 
5607 	hostname = mynode();
5608 
5609 	/* Lock the set on our side */
5610 	if (clnt_lock_set(hostname, sp, ep)) {
5611 	    rval = -1;
5612 	    goto out;
5613 	}
5614 
5615 	if (take_val) {
5616 	    /* enable auto_take but only if it is not already set */
5617 	    if (! (sd->sd_flags & MD_SR_AUTO_TAKE)) {
5618 		/* verify that we're the only host in the set */
5619 		for (i = 0; i < MD_MAXSIDES; i++) {
5620 		    if (sd->sd_nodes[i] == NULL || sd->sd_nodes[i][0] == '\0')
5621 			continue;
5622 
5623 		    if (strcmp(sd->sd_nodes[i], hostname) != 0) {
5624 			(void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL,
5625 			    NULL, sp->setname);
5626 			rval = -1;
5627 			goto out;
5628 		    }
5629 		}
5630 
5631 		if (clnt_enable_sr_flags(hostname, sp, MD_SR_AUTO_TAKE, ep))
5632 		    rval = -1;
5633 
5634 		/* Disable SCSI reservations */
5635 		if (sd->sd_flags & MD_SR_MB_DEVID)
5636 		    dd = metaget_drivedesc(sp, MD_BASICNAME_OK | PRINT_FAST,
5637 			&xep);
5638 		else
5639 		    dd = metaget_drivedesc(sp, MD_BASICNAME_OK, &xep);
5640 		if (! mdisok(&xep))
5641 		    mdclrerror(&xep);
5642 
5643 		if (dd != NULL) {
5644 		    if (rel_own_bydd(sp, dd, TRUE, &xep))
5645 			mdclrerror(&xep);
5646 		}
5647 	    }
5648 
5649 	} else {
5650 	    /* disable auto_take, if set, or error */
5651 	    if (sd->sd_flags & MD_SR_AUTO_TAKE) {
5652 		if (clnt_disable_sr_flags(hostname, sp, MD_SR_AUTO_TAKE, ep))
5653 		    rval = -1;
5654 
5655 		/* Enable SCSI reservations */
5656 		if (sd->sd_flags & MD_SR_MB_DEVID)
5657 		    dd = metaget_drivedesc(sp, MD_BASICNAME_OK | PRINT_FAST,
5658 			&xep);
5659 		else
5660 		    dd = metaget_drivedesc(sp, MD_BASICNAME_OK, &xep);
5661 		if (! mdisok(&xep))
5662 		    mdclrerror(&xep);
5663 
5664 		if (dd != NULL) {
5665 		    mhd_mhiargs_t	mhiargs = defmhiargs;
5666 
5667 		    if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
5668 			mdclrerror(&xep);
5669 		}
5670 
5671 	    } else {
5672 		(void) mddserror(ep, MDE_DS_AUTONOTSET, sp->setno, NULL, NULL,
5673 		    sp->setname);
5674 		rval = -1;
5675 	    }
5676 	}
5677 
5678 out:
5679 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
5680 	if (clnt_unlock_set(hostname, cl_sk, &xep)) {
5681 	    if (rval == 0)
5682 		(void) mdstealerror(ep, &xep);
5683 	    rval = -1;
5684 	}
5685 	cl_set_setkey(NULL);
5686 
5687 	return (rval);
5688 }
5689