xref: /onnv-gate/usr/src/lib/lvm/libmeta/common/meta_set_drv.c (revision 2150:e99313126b1a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Metadevice diskset interfaces
30  */
31 
32 #include <meta.h>
33 #include <mdmn_changelog.h>
34 #include "meta_set_prv.h"
35 #include "meta_repartition.h"
36 
37 static int
38 check_setnodes_againstdrivelist(
39 	mdsetname_t		*sp,
40 	mddrivenamelist_t	*dnlp,
41 	md_error_t		*ep
42 )
43 {
44 	md_set_desc		*sd;
45 	mddrivenamelist_t	*p;
46 	int 			i;
47 	md_mnnode_desc		*nd;
48 
49 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
50 		return (-1);
51 
52 	if (MD_MNSET_DESC(sd)) {
53 		nd = sd->sd_nodelist;
54 		while (nd) {
55 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
56 				nd = nd->nd_next;
57 				continue;
58 			}
59 			for (p = dnlp; p != NULL; p = p->next)
60 				if (checkdrive_onnode(sp, p->drivenamep,
61 				    nd->nd_nodename, ep))
62 					return (-1);
63 			nd = nd->nd_next;
64 		}
65 	} else {
66 		for (i = 0; i < MD_MAXSIDES; i++) {
67 			/* Skip empty slots */
68 			if (sd->sd_nodes[i][0] == '\0')
69 				continue;
70 
71 			for (p = dnlp; p != NULL; p = p->next)
72 				if (checkdrive_onnode(sp, p->drivenamep,
73 				    sd->sd_nodes[i], ep))
74 					return (-1);
75 		}
76 	}
77 	return (0);
78 }
79 
80 static int
81 drvsuniq(mdsetname_t *sp, mddrivenamelist_t *dnlp, md_error_t *ep)
82 {
83 	mddrivenamelist_t *dl1, *dl2;
84 	mddrivename_t *dn1, *dn2;
85 
86 	for (dl1 = dnlp; dl1 != NULL; dl1 = dl1->next) {
87 		dn1 = dl1->drivenamep;
88 
89 		for (dl2 = dl1->next; dl2 != NULL; dl2 = dl2->next) {
90 			dn2 = dl2->drivenamep;
91 			if (strcmp(dn1->cname, dn2->cname) != 0)
92 				continue;
93 
94 			return (mddserror(ep, MDE_DS_DUPDRIVE, sp->setno,
95 			    NULL, dn1->cname, sp->setname));
96 		}
97 	}
98 	return (0);
99 }
100 
101 static md_drive_desc *
102 metaget_drivedesc_fromdrivelist(
103 	mdsetname_t		*sp,
104 	mddrivenamelist_t	*dnlp,
105 	uint_t			flags,
106 	md_error_t		*ep
107 )
108 {
109 	mddrivenamelist_t	*p;
110 	md_drive_desc		*dd = NULL;
111 	md_set_desc		*sd;
112 
113 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
114 		return (NULL);
115 
116 	for (p = dnlp; p != NULL; p = p->next) {
117 		(void) metadrivedesc_append(&dd, p->drivenamep, 0, 0,
118 		    sd->sd_ctime, sd->sd_genid, flags);
119 	}
120 
121 	return (dd);
122 }
123 
124 /*
125  * Exported Entry Points
126  */
127 
128 int
129 meta_make_sidenmlist(
130 	mdsetname_t		*sp,
131 	mddrivename_t		*dnp,
132 	int			import_flag, /* flags partial import */
133 	md_im_drive_info_t	*midp,	/* import drive information */
134 	md_error_t		*ep
135 )
136 {
137 	mdsidenames_t		*sn, **sn_next;
138 	mdname_t		*np;
139 	int			done;
140 	side_t			sideno = MD_SIDEWILD;
141 	uint_t			rep_slice;
142 	char			*bname;
143 
144 	if (!import_flag) {
145 		/*
146 		 * Normal (aka NOT partial import) code path.
147 		 */
148 		if (meta_replicaslice(dnp, &rep_slice, ep) != 0) {
149 			return (-1);
150 		}
151 
152 		dnp->side_names_key = MD_KEYWILD;
153 
154 		if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
155 			return (-1);
156 		bname = Strdup(np->bname);
157 	} else {
158 		/*
159 		 * When doing a partial import, we'll get the needed
160 		 * information from somewhere other than the system.
161 		 */
162 		dnp->side_names_key = MD_KEYWILD;
163 		bname = Strdup(midp->mid_devname);
164 	}
165 	metaflushsidenames(dnp);
166 	sn_next = &dnp->side_names;
167 	/*CONSTCOND*/
168 	while (1) {
169 		sn = Zalloc(sizeof (*sn));
170 
171 		if ((done = meta_getnextside_devinfo(sp, bname, &sideno,
172 		    &sn->cname, &sn->dname, &sn->mnum, ep)) == -1) {
173 			if (import_flag) {
174 				mdclrerror(ep);
175 				sn->dname = Strdup(midp->mid_driver_name);
176 				sn->mnum = midp->mid_mnum;
177 			} else {
178 				Free(sn);
179 				Free(bname);
180 				return (-1);
181 			}
182 		}
183 
184 		if (done == 0) {
185 			Free(sn);
186 			Free(bname);
187 			return (0);
188 		}
189 
190 		sn->sideno = sideno;
191 
192 		/* Add to the end of the linked list */
193 		assert(*sn_next == NULL);
194 		*sn_next = sn;
195 		sn_next = &sn->next;
196 	}
197 	/*NOTREACHED*/
198 }
199 
200 int
201 meta_set_adddrives(
202 	mdsetname_t		*sp,
203 	mddrivenamelist_t	*dnlp,
204 	daddr_t			dbsize,
205 	int			force_label,
206 	md_error_t		*ep
207 )
208 {
209 	md_set_desc		*sd;
210 	md_drive_desc		*dd = NULL, *curdd = NULL, *ddp;
211 	int			i;
212 	mddrivenamelist_t	*p;
213 	mhd_mhiargs_t		mhiargs;
214 	int			rval = 0;
215 	md_timeval32_t		now;
216 	sigset_t		oldsigs;
217 	ulong_t			genid;
218 	ulong_t			max_genid = 0;
219 	md_setkey_t		*cl_sk;
220 	int			rb_level = 0;
221 	md_error_t		xep = mdnullerror;
222 	md_mnnode_desc		*nd;
223 	int			suspendall_flag = 0;
224 	int			suspend1_flag = 0;
225 	int			lock_flag = 0;
226 	int			flush_set_onerr = 0;
227 	md_replicalist_t	*rlp = NULL, *rl;
228 
229 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
230 		return (-1);
231 
232 	/* Make sure we own the set */
233 	if (meta_check_ownership(sp, ep) != 0)
234 		return (-1);
235 
236 	/*
237 	 * The drive and node records are stored in the local mddbs of each
238 	 * node in the diskset.  Each node's rpc.metad daemon reads in the set,
239 	 * drive and node records from that node's local mddb and caches them
240 	 * internally. Any process needing diskset information contacts its
241 	 * local rpc.metad to get this information.  Since each node in the
242 	 * diskset is independently reading the set information from its local
243 	 * mddb, the set, drive and node records in the local mddbs must stay
244 	 * in-sync, so that all nodes have a consistent view of the diskset.
245 	 *
246 	 * For a multinode diskset, explicitly verify that all nodes in the
247 	 * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
248 	 * fail this operation since all nodes must be ALIVE in order to add
249 	 * the new drive record to their local mddb.  If a panic of this node
250 	 * leaves the local mddbs set, node and drive records out-of-sync, the
251 	 * reconfig cycle will fix the local mddbs and force them back into
252 	 * synchronization.
253 	 */
254 	if (MD_MNSET_DESC(sd)) {
255 		nd = sd->sd_nodelist;
256 		while (nd) {
257 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
258 				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
259 					sp->setno,
260 					nd->nd_nodename, NULL, sp->setname);
261 				return (-1);
262 			}
263 			nd = nd->nd_next;
264 		}
265 	}
266 
267 	if (drvsuniq(sp, dnlp, ep) == -1)
268 		return (-1);
269 
270 	/*
271 	 * Lock the set on current set members.
272 	 * Set locking done much earlier for MN diskset than for traditional
273 	 * diskset since lock_set and SUSPEND are used to protect against
274 	 * other meta* commands running on the other nodes.
275 	 */
276 	if (MD_MNSET_DESC(sd)) {
277 		/* Make sure we are blocking all signals */
278 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
279 			mdclrerror(&xep);
280 
281 		nd = sd->sd_nodelist;
282 		/* All nodes are guaranteed to be ALIVE */
283 		while (nd) {
284 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
285 				rval = -1;
286 				goto out;
287 			}
288 			lock_flag = 1;
289 			nd = nd->nd_next;
290 		}
291 		/*
292 		 * Lock out other meta* commands by suspending
293 		 * class 1 messages across the diskset.
294 		 */
295 		nd = sd->sd_nodelist;
296 		/* All nodes are guaranteed to be ALIVE */
297 		while (nd) {
298 			if (clnt_mdcommdctl(nd->nd_nodename,
299 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
300 			    MD_MSCF_NO_FLAGS, ep)) {
301 				rval = -1;
302 				goto out;
303 			}
304 			suspend1_flag = 1;
305 			nd = nd->nd_next;
306 		}
307 	}
308 
309 	if (check_setnodes_againstdrivelist(sp, dnlp, ep)) {
310 		rval = -1;
311 		goto out;
312 	}
313 
314 	for (p = dnlp; p != NULL; p = p->next) {
315 		mdsetname_t	*tmp;
316 
317 		if (meta_is_drive_in_anyset(p->drivenamep, &tmp, FALSE,
318 		    ep) == -1) {
319 			rval = -1;
320 			goto out;
321 		}
322 
323 		if (tmp != NULL) {
324 			(void) mddserror(ep, MDE_DS_DRIVEINSET, sp->setno,
325 			    tmp->setname, p->drivenamep->cname, sp->setname);
326 			rval = -1;
327 			goto out;
328 		}
329 	}
330 
331 	/* END CHECK CODE */
332 
333 	/*
334 	 * This is a separate loop (from above) so that we validate all the
335 	 * drives handed to us before we repartition any one drive.
336 	 */
337 	for (p = dnlp; p != NULL; p = p->next) {
338 		if (meta_repartition_drive(sp,
339 		    p->drivenamep, force_label == TRUE ? MD_REPART_FORCE : 0,
340 		    NULL, /* Don't return the VTOC. */
341 		    ep) != 0) {
342 			rval = -1;
343 			goto out;
344 		}
345 		/*
346 		 * Create the names for the drives we are adding per side.
347 		 */
348 		if (meta_make_sidenmlist(sp, p->drivenamep, 0, NULL,
349 		    ep) == -1) {
350 			rval = -1;
351 			goto out;
352 		}
353 	}
354 
355 	/*
356 	 * Get the list of drives descriptors that we are adding.
357 	 */
358 	dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep);
359 
360 	if (! mdisok(ep)) {
361 		rval = -1;
362 		goto out;
363 	}
364 
365 	/*
366 	 * Get the set timeout information.
367 	 */
368 	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
369 	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
370 		rval = -1;
371 		goto out;
372 	}
373 
374 	/*
375 	 * Get timestamp and generation id for new records
376 	 */
377 	now = sd->sd_ctime;
378 	genid = sd->sd_genid;
379 
380 
381 	/* At this point, in case of error, set should be flushed. */
382 	flush_set_onerr = 1;
383 
384 	/* Lock the set on current set members */
385 	if (!(MD_MNSET_DESC(sd))) {
386 		md_rb_sig_handling_on();
387 		for (i = 0; i < MD_MAXSIDES; i++) {
388 			/* Skip empty slots */
389 			if (sd->sd_nodes[i][0] == '\0')
390 				continue;
391 
392 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
393 				rval = -1;
394 				goto out;
395 			}
396 			lock_flag = 1;
397 		}
398 	}
399 
400 	/*
401 	 * Get drive descriptors for the drives that are currently in the set.
402 	 */
403 	curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
404 	if (! mdisok(ep))
405 		goto rollback;
406 
407 	/*
408 	 * If first drive being added to set, set the mastership
409 	 * of the multinode diskset to be this node.
410 	 * Only set it on this node.  If all goes well
411 	 * and there are no errors, the mastership of this node will be set
412 	 * on all nodes in user space and in the kernel.
413 	 */
414 	if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
415 		if (clnt_mnsetmaster(mynode(), sp,
416 		    sd->sd_mn_mynode->nd_nodename,
417 		    sd->sd_mn_mynode->nd_nodeid, ep)) {
418 			goto rollback;
419 		}
420 		/*
421 		 * Set this up in my local cache of the set desc so that
422 		 * the set descriptor won't have to be gotten again from
423 		 * rpc.metad.  If it is flushed and gotten again, these
424 		 * values will be set in sr2setdesc.
425 		 */
426 		sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
427 		(void) strcpy(sd->sd_mn_master_nodenm,
428 		    sd->sd_mn_mynode->nd_nodename);
429 		sd->sd_mn_am_i_master = 1;
430 	}
431 
432 	RB_TEST(1, "adddrives", ep)
433 
434 	RB_PREEMPT;
435 	rb_level = 1;	/* level 1 */
436 
437 	RB_TEST(2, "adddrives", ep)
438 
439 	/*
440 	 * Add the drive records for the drives that we are adding to
441 	 * each host in the set.  Marks the drive as MD_DR_ADD.
442 	 */
443 	if (MD_MNSET_DESC(sd)) {
444 		nd = sd->sd_nodelist;
445 		/* All nodes are guaranteed to be ALIVE */
446 		while (nd) {
447 			if (clnt_adddrvs(nd->nd_nodename, sp, dd, now, genid,
448 			    ep) == -1)
449 				goto rollback;
450 
451 			RB_TEST(3, "adddrives", ep)
452 			nd = nd->nd_next;
453 		}
454 	} else {
455 		for (i = 0; i < MD_MAXSIDES; i++) {
456 			/* Skip empty slots */
457 			if (sd->sd_nodes[i][0] == '\0')
458 				continue;
459 
460 			if (clnt_adddrvs(sd->sd_nodes[i], sp, dd, now, genid,
461 			    ep) == -1)
462 				goto rollback;
463 
464 			RB_TEST(3, "adddrives", ep)
465 		}
466 	}
467 
468 	RB_TEST(4, "adddrives", ep)
469 
470 	RB_PREEMPT;
471 	rb_level = 2;	/* level 2 */
472 
473 	RB_TEST(5, "adddrives", ep)
474 
475 	/*
476 	 * Take ownership of the added drives.
477 	 */
478 	if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
479 		if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep))
480 			goto rollback;
481 	}
482 
483 	RB_TEST(6, "adddrives", ep)
484 
485 	RB_PREEMPT;
486 	rb_level = 3;	/* level 3 */
487 
488 	RB_TEST(7, "adddrives", ep)
489 
490 	/*
491 	 * Balance the DB's according to the list of existing drives and the
492 	 * list of added drives.
493 	 */
494 	if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
495 		goto rollback;
496 
497 	/*
498 	 * Slam a dummy master block on all the disks that we are adding
499 	 * that don't have replicas on them.
500 	 * Used by diskset import if the disksets are remotely replicated
501 	 */
502 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) {
503 		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
504 			uint_t		rep_slice;
505 			int		fd = -1;
506 			mdname_t	*np = NULL;
507 			char		*drive_name;
508 
509 			drive_name = ddp->dd_dnp->cname;
510 
511 			for (rl = rlp; rl != NULL; rl = rl->rl_next) {
512 				char	*rep_name;
513 
514 				rep_name =
515 				    rl->rl_repp->r_namep->drivenamep->cname;
516 
517 				if (strcmp(drive_name, rep_name) == 0) {
518 					/*
519 					 * Disk has a replica on it so don't
520 					 * add dummy master block.
521 					 */
522 					break;
523 				}
524 			}
525 			if (rl == NULL) {
526 				/*
527 				 * Drive doesn't have a replica on it so
528 				 * we need a dummy master block. Add it.
529 				 */
530 				if (meta_replicaslice(ddp->dd_dnp, &rep_slice,
531 				    &xep) != 0) {
532 					mdclrerror(&xep);
533 					continue;
534 				}
535 
536 				if ((np = metaslicename(ddp->dd_dnp, rep_slice,
537 				    &xep)) == NULL) {
538 					mdclrerror(&xep);
539 					continue;
540 				}
541 
542 				if ((fd = open(np->rname, O_RDWR)) >= 0) {
543 					meta_mkdummymaster(sp, fd, 16);
544 					(void) close(fd);
545 				}
546 			}
547 		}
548 	}
549 
550 	if ((curdd == NULL) && (MD_MNSET_DESC(sd))) {
551 		/*
552 		 * Notify rpc.mdcommd on all nodes of a nodelist change.
553 		 * Start by suspending rpc.mdcommd (which drains it of all
554 		 * messages), then change the nodelist followed by a reinit
555 		 * and resume.
556 		 */
557 		nd = sd->sd_nodelist;
558 		/* All nodes are guaranteed to be ALIVE */
559 		while (nd) {
560 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
561 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
562 				rval = -1;
563 				goto out;
564 			}
565 			suspendall_flag = 1;
566 			nd = nd->nd_next;
567 		}
568 	}
569 
570 	/*
571 	 * If a MN diskset and this is the first disk(s) being added
572 	 * to set, then pre-allocate change log records here.
573 	 * When the other nodes are joined into the MN diskset, the
574 	 * USER records will just be snarfed in.
575 	 */
576 	if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
577 		if (mdmn_allocate_changelog(sp, ep) != 0)
578 			goto rollback;
579 	}
580 
581 	/*
582 	 * Mark the drives MD_DR_OK.
583 	 * If first drive being added to MN diskset, then set
584 	 * master on all nodes to be this node and then join
585 	 * all alive nodes (nodes in membership list) to set.
586 	 */
587 	if (MD_MNSET_DESC(sd)) {
588 		nd = sd->sd_nodelist;
589 		/* All nodes are guaranteed to be ALIVE */
590 		while (nd) {
591 			/* don't set master on this node - done earlier */
592 			if ((curdd == NULL) && (nd->nd_nodeid !=
593 			    sd->sd_mn_mynode->nd_nodeid)) {
594 				/*
595 				 * Set master on all alive nodes since
596 				 * all alive nodes will become joined nodes.
597 				 */
598 				if (clnt_mnsetmaster(nd->nd_nodename, sp,
599 				    sd->sd_mn_mynode->nd_nodename,
600 				    sd->sd_mn_mynode->nd_nodeid, ep)) {
601 					goto rollback;
602 				}
603 			}
604 
605 			if (curdd == NULL) {
606 				/*
607 				 * No special flags for join set.  Since
608 				 * all nodes are joining if 1st drive is being
609 				 * added to set then all nodes will be either
610 				 * STALE or non-STALE and each node can
611 				 * determine this on its own.
612 				 */
613 				if (clnt_joinset(nd->nd_nodename, sp,
614 				    NULL, ep)) {
615 					goto rollback;
616 				}
617 				/* Sets join node flag on all nodes in list */
618 				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
619 				    sd->sd_nodelist, MD_NR_JOIN, NULL, ep)) {
620 					goto rollback;
621 				}
622 			}
623 
624 			/*
625 			 * Set MD_DR_OK as last thing before unlock.
626 			 * In case of panic on this node, recovery
627 			 * code can check for MD_DR_OK to determine
628 			 * status of diskset.
629 			 */
630 			if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
631 			    MD_DR_OK, ep) == -1)
632 				goto rollback;
633 
634 
635 			RB_TEST(8, "adddrives", ep)
636 			nd = nd->nd_next;
637 		}
638 	} else {
639 		for (i = 0; i < MD_MAXSIDES; i++) {
640 			/* Skip empty slots */
641 			if (sd->sd_nodes[i][0] == '\0')
642 				continue;
643 
644 			if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd, MD_DR_OK,
645 			    ep) == -1)
646 				goto rollback;
647 
648 			RB_TEST(8, "adddrives", ep)
649 		}
650 	}
651 
652 	RB_TEST(9, "adddrives", ep)
653 
654 out:
655 	/*
656 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
657 	 * Send reinit command to mdcommd which forces it to get
658 	 * fresh set description.
659 	 */
660 	if (suspendall_flag) {
661 		/* Send reinit */
662 		nd = sd->sd_nodelist;
663 		/* All nodes are guaranteed to be ALIVE */
664 		while (nd) {
665 			/* Class is ignored for REINIT */
666 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
667 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
668 				if (rval == 0)
669 					(void) mdstealerror(ep, &xep);
670 				rval = -1;
671 				mde_perror(ep, dgettext(TEXT_DOMAIN,
672 				    "Unable to reinit rpc.mdcommd.\n"));
673 			}
674 			nd = nd->nd_next;
675 		}
676 	}
677 	/*
678 	 * Unlock diskset by resuming messages across the diskset.
679 	 * Just resume all classes so that resume is the same whether
680 	 * just one class was locked or all classes were locked.
681 	 */
682 	if ((suspend1_flag) || (suspendall_flag)) {
683 		nd = sd->sd_nodelist;
684 		/* All nodes are guaranteed to be ALIVE */
685 		while (nd) {
686 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
687 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
688 				if (rval == 0)
689 					(void) mdstealerror(ep, &xep);
690 				rval = -1;
691 				mde_perror(ep, dgettext(TEXT_DOMAIN,
692 				    "Unable to resume rpc.mdcommd.\n"));
693 			}
694 			nd = nd->nd_next;
695 		}
696 		meta_ping_mnset(sp->setno);
697 	}
698 
699 	if (lock_flag) {
700 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
701 		if (MD_MNSET_DESC(sd)) {
702 			nd = sd->sd_nodelist;
703 			/* All nodes are guaranteed to be ALIVE */
704 			while (nd) {
705 				if (clnt_unlock_set(nd->nd_nodename,
706 				    cl_sk, &xep)) {
707 					if (rval == 0)
708 						(void) mdstealerror(ep, &xep);
709 					rval = -1;
710 				}
711 				nd = nd->nd_next;
712 			}
713 		} else {
714 			for (i = 0; i < MD_MAXSIDES; i++) {
715 				/* Skip empty slots */
716 				if (sd->sd_nodes[i][0] == '\0')
717 					continue;
718 
719 				if (clnt_unlock_set(sd->sd_nodes[i],
720 				    cl_sk, &xep)) {
721 					if (rval == 0)
722 						(void) mdstealerror(ep, &xep);
723 					rval = -1;
724 				}
725 			}
726 		}
727 		cl_set_setkey(NULL);
728 	}
729 
730 	metafreedrivedesc(&dd);
731 
732 	if (flush_set_onerr) {
733 		metaflushsetname(sp);
734 		if (!(MD_MNSET_DESC(sd))) {
735 			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
736 		}
737 	}
738 
739 	if (MD_MNSET_DESC(sd)) {
740 		/* release signals back to what they were on entry */
741 		if (procsigs(FALSE, &oldsigs, &xep) < 0)
742 			mdclrerror(&xep);
743 	}
744 
745 	return (rval);
746 
747 rollback:
748 	/* all signals already blocked for MN disket */
749 	if (!(MD_MNSET_DESC(sd))) {
750 		/* Make sure we are blocking all signals */
751 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
752 			mdclrerror(&xep);
753 	}
754 
755 	rval = -1;
756 
757 	max_genid = sd->sd_genid;
758 
759 	/* level 3 */
760 	if (rb_level > 2) {
761 		/*
762 		 * Since the add drive operation is failing, need
763 		 * to reset config back to the way it was
764 		 * before the add drive opration.
765 		 * If a MN diskset and this is the first drive being added,
766 		 * then reset master on all ALIVE nodes (which is all nodes)
767 		 * since the master would have not been set previously.
768 		 * Don't reset master on this node, since this
769 		 * is done later.
770 		 * This is ok to fail since next node to add first
771 		 * disk to diskset will also set the master on all nodes.
772 		 *
773 		 * Also, if this is the first drive being added,
774 		 * need to have each node withdraw itself from the set.
775 		 */
776 		if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
777 			nd = sd->sd_nodelist;
778 			/* All nodes are guaranteed to be ALIVE */
779 			while (nd) {
780 				/*
781 				 * Be careful with ordering in case of
782 				 * panic between the steps and the
783 				 * effect on recovery during reconfig.
784 				 */
785 				if (clnt_withdrawset(nd->nd_nodename, sp, &xep))
786 					mdclrerror(&xep);
787 
788 				/* Sets withdraw flag on all nodes in list */
789 				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
790 				    sd->sd_nodelist, MD_NR_WITHDRAW,
791 				    NULL, &xep)) {
792 					mdclrerror(&xep);
793 				}
794 
795 				/* Skip this node */
796 				if (nd->nd_nodeid ==
797 				    sd->sd_mn_mynode->nd_nodeid) {
798 					nd = nd->nd_next;
799 					continue;
800 				}
801 				/* Reset master on all of the other nodes. */
802 				if (clnt_mnsetmaster(nd->nd_nodename, sp,
803 				    "", MD_MN_INVALID_NID, &xep))
804 					mdclrerror(&xep);
805 				nd = nd->nd_next;
806 			}
807 		}
808 	}
809 
810 	/*
811 	 * Send resume command to mdcommd.  Don't send reinit command
812 	 * since nodelist should not have changed.
813 	 * If suspendall_flag is set, then user would have been adding
814 	 * first drives to set.  Since this failed, there is certainly
815 	 * no reinit message to send to rpc.commd since no nodes will
816 	 * be joined to set at the end of this metaset command.
817 	 */
818 	if (suspendall_flag) {
819 		/* Send resume */
820 		nd = sd->sd_nodelist;
821 		/* All nodes are guaranteed to be ALIVE */
822 		while (nd) {
823 			/*
824 			 * Resume all classes but class 1 so that lock is held
825 			 * against meta* commands.
826 			 * To later resume class1, must issue a class0 resume.
827 			 */
828 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
829 			    sp, MD_MSG_CLASS0,
830 			    MD_MSCF_DONT_RESUME_CLASS1, &xep)) {
831 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
832 				    "Unable to resume rpc.mdcommd.\n"));
833 				mdclrerror(&xep);
834 			}
835 			nd = nd->nd_next;
836 		}
837 		meta_ping_mnset(sp->setno);
838 	}
839 
840 	/* level 3 */
841 	if (rb_level > 2) {
842 		mdnamelist_t	*nlp;
843 		mdname_t	*np;
844 
845 		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
846 			uint_t	rep_slice;
847 
848 			if ((meta_replicaslice(ddp->dd_dnp,
849 			    &rep_slice, &xep) != 0) ||
850 			    ((np = metaslicename(ddp->dd_dnp, rep_slice,
851 				&xep)) == NULL)) {
852 				mdclrerror(&xep);
853 				continue;
854 			}
855 			nlp = NULL;
856 			(void) metanamelist_append(&nlp, np);
857 
858 			if (meta_db_detach(sp, nlp,
859 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, &xep))
860 				mdclrerror(&xep);
861 
862 			metafreenamelist(nlp);
863 		}
864 
865 		/* Re-balance */
866 		if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1)
867 			mdclrerror(&xep);
868 
869 		/* Only if we are adding the first drive */
870 		/* Handled MN diskset above. */
871 		if ((curdd == NULL) && !(MD_MNSET_DESC(sd))) {
872 			if (clnt_stimeout(mynode(), sp, &defmhiargs,
873 			    &xep) == -1)
874 				mdclrerror(&xep);
875 
876 			/* This is needed because of a corner case */
877 			if (halt_set(sp, &xep))
878 				mdclrerror(&xep);
879 		}
880 		max_genid++;
881 	}
882 
883 	/* level 2 */
884 	if (rb_level > 1) {
885 		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
886 			if (rel_own_bydd(sp, dd, TRUE, &xep))
887 				mdclrerror(&xep);
888 		}
889 	}
890 
891 	/* level 1 */
892 	if (rb_level > 0) {
893 		if (MD_MNSET_DESC(sd)) {
894 			nd = sd->sd_nodelist;
895 			/* All nodes are guaranteed to be ALIVE */
896 			while (nd) {
897 				if (clnt_deldrvs(nd->nd_nodename, sp, dd,
898 				    &xep) == -1)
899 					mdclrerror(&xep);
900 				nd = nd->nd_next;
901 			}
902 		} else {
903 			for (i = 0; i < MD_MAXSIDES; i++) {
904 				/* Skip empty slots */
905 				if (sd->sd_nodes[i][0] == '\0')
906 					continue;
907 
908 				if (clnt_deldrvs(sd->sd_nodes[i], sp, dd,
909 				    &xep) == -1)
910 					mdclrerror(&xep);
911 			}
912 		}
913 		max_genid += 2;
914 		resync_genid(sp, sd, max_genid, 0, NULL);
915 	}
916 
917 	if ((suspend1_flag) || (suspendall_flag)) {
918 		/* Send resume */
919 		nd = sd->sd_nodelist;
920 		/* All nodes are guaranteed to be ALIVE */
921 		while (nd) {
922 			/*
923 			 * Just resume all classes so that resume is the
924 			 * same whether just one class was locked or all
925 			 * classes were locked.
926 			 */
927 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
928 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
929 				mdclrerror(&xep);
930 			}
931 			nd = nd->nd_next;
932 		}
933 		meta_ping_mnset(sp->setno);
934 	}
935 
936 	/* level 0 */
937 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
938 	/* Don't test lock flag since guaranteed to be set if in rollback */
939 	if (MD_MNSET_DESC(sd)) {
940 		/*
941 		 * Since the add drive operation is failing, need
942 		 * to reset config back to the way it was
943 		 * before the add drive opration.
944 		 * If a MN diskset and this is the first drive being
945 		 * added, then reset master on this node since
946 		 * the master would have not been set previously.
947 		 * This is ok to fail since next node to add first
948 		 * disk to diskset will also set the master on all nodes.
949 		 */
950 		if (curdd == NULL) {
951 			/* Reset master on mynode */
952 			if (clnt_mnsetmaster(mynode(), sp, "",
953 			    MD_MN_INVALID_NID, &xep))
954 				mdclrerror(&xep);
955 		}
956 		nd = sd->sd_nodelist;
957 		/* All nodes are guaranteed to be ALIVE */
958 		while (nd) {
959 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
960 				mdclrerror(&xep);
961 			nd = nd->nd_next;
962 		}
963 	} else {
964 		for (i = 0; i < MD_MAXSIDES; i++) {
965 			/* Skip empty slots */
966 			if (sd->sd_nodes[i][0] == '\0')
967 				continue;
968 
969 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
970 				mdclrerror(&xep);
971 		}
972 	}
973 	cl_set_setkey(NULL);
974 
975 	/* release signals back to what they were on entry */
976 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
977 		mdclrerror(&xep);
978 
979 	metafreedrivedesc(&dd);
980 
981 	if (flush_set_onerr) {
982 		metaflushsetname(sp);
983 		if (!(MD_MNSET_DESC(sd))) {
984 			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
985 		}
986 	}
987 
988 	return (rval);
989 }
990 
991 /*
992  * Add drives routine used during import of a diskset.
993  */
994 int
995 meta_imp_set_adddrives(
996 	mdsetname_t		*sp,
997 	mddrivenamelist_t	*dnlp,
998 	md_im_set_desc_t	*misp,
999 	md_error_t		*ep
1000 )
1001 {
1002 	md_set_desc		*sd;
1003 	mddrivenamelist_t	*p;
1004 	md_drive_desc		*dd = NULL, *ddp;
1005 	int			flush_set_onerr = 0;
1006 	md_timeval32_t		now;
1007 	ulong_t			genid;
1008 	mhd_mhiargs_t		mhiargs;
1009 	md_im_replica_info_t	*mirp;
1010 	md_im_drive_info_t	*midp;
1011 	int			rval = 0;
1012 	sigset_t		oldsigs;
1013 	ulong_t			max_genid = 0;
1014 	int			rb_level = 0;
1015 	md_error_t		xep = mdnullerror;
1016 
1017 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1018 		return (-1);
1019 
1020 	for (p = dnlp; p != NULL; p = p->next) {
1021 		int		imp_flag = 0;
1022 
1023 		/*
1024 		 * If we have a partial diskset, meta_make_sidenmlist will
1025 		 * need information from midp to complete making the
1026 		 * side name structure.
1027 		 */
1028 		if (misp->mis_partial) {
1029 			imp_flag = MDDB_C_IMPORT;
1030 			for (midp = misp->mis_drives; midp != NULL;
1031 			    midp = midp->mid_next) {
1032 				if (midp->mid_dnp == p->drivenamep)
1033 					break;
1034 			}
1035 			if (midp == NULL) {
1036 				(void) mddserror(ep, MDE_DS_SETNOTIMP,
1037 				    MD_SET_BAD, mynode(), NULL, sp->setname);
1038 				rval = -1;
1039 				goto out;
1040 			}
1041 		}
1042 		/*
1043 		 * Create the names for the drives we are adding per side.
1044 		 */
1045 		if (meta_make_sidenmlist(sp, p->drivenamep, imp_flag,
1046 		    midp, ep) == -1) {
1047 			rval = -1;
1048 			goto out;
1049 		}
1050 	}
1051 
1052 	/*
1053 	 * Get the list of drives descriptors that we are adding.
1054 	 */
1055 	dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep);
1056 
1057 	if (! mdisok(ep)) {
1058 		rval = -1;
1059 		goto out;
1060 	}
1061 
1062 	/*
1063 	 * Get the set timeout information.
1064 	 */
1065 	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
1066 	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
1067 		rval = -1;
1068 		goto out;
1069 	}
1070 
1071 	/*
1072 	 * Get timestamp and generation id for new records
1073 	 */
1074 	now = sd->sd_ctime;
1075 	genid = sd->sd_genid;
1076 
1077 	/* At this point, in case of error, set should be flushed. */
1078 	flush_set_onerr = 1;
1079 
1080 	rb_level = 1;   /* level 1 */
1081 
1082 	for (midp = misp->mis_drives; midp != NULL; midp = midp->mid_next) {
1083 		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
1084 			if (ddp->dd_dnp == midp->mid_dnp) {
1085 				/* same disk */
1086 				ddp->dd_dnp->devid =
1087 				    devid_str_encode(midp->mid_devid,
1088 				    midp->mid_minor_name);
1089 
1090 				ddp->dd_dbcnt = 0;
1091 				mirp = midp->mid_replicas;
1092 				if (mirp) {
1093 					ddp->dd_dbsize = mirp->mir_length;
1094 					for (; mirp != NULL;
1095 					    mirp = mirp->mir_next) {
1096 						ddp->dd_dbcnt++;
1097 					}
1098 				}
1099 				if ((midp->mid_available &
1100 				    MD_IM_DISK_NOT_AVAILABLE) &&
1101 				    (misp->mis_flags & MD_IM_SET_REPLICATED)) {
1102 					ddp->dd_flags = MD_DR_UNRSLV_REPLICATED;
1103 				}
1104 			}
1105 		}
1106 	}
1107 
1108 	/*
1109 	 * Add the drive records for the drives that we are adding to
1110 	 * each host in the set.  Marks the drive records as MD_DR_ADD.
1111 	 * May also mark a drive record as MD_DR_UNRSLV_REPLICATED if
1112 	 * this flag was set in the dd_flags for that drive.
1113 	 */
1114 	if (clnt_imp_adddrvs(mynode(), sp, dd, now, genid, ep) == -1)
1115 		goto rollback;
1116 
1117 	rb_level = 2;   /* level 2 */
1118 
1119 	/*
1120 	 * Take ownership of the added drives.
1121 	 */
1122 	if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep))
1123 		goto rollback;
1124 
1125 out:
1126 	metafreedrivedesc(&dd);
1127 
1128 	if (flush_set_onerr) {
1129 		metaflushsetname(sp);
1130 	}
1131 
1132 	return (rval);
1133 
1134 rollback:
1135 	/* Make sure we are blocking all signals */
1136 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
1137 		mdclrerror(&xep);
1138 
1139 	rval = -1;
1140 
1141 	max_genid = sd->sd_genid;
1142 
1143 	/* level 2 */
1144 	if (rb_level > 1) {
1145 		if (!MD_ATSET_DESC(sd)) {
1146 			if (rel_own_bydd(sp, dd, TRUE, &xep)) {
1147 				mdclrerror(&xep);
1148 			}
1149 		}
1150 	}
1151 
1152 	/* level 1 */
1153 	if (rb_level > 0) {
1154 		if (clnt_deldrvs(mynode(), sp, dd, &xep) == -1) {
1155 			mdclrerror(&xep);
1156 		}
1157 		max_genid += 2;
1158 		resync_genid(sp, sd, max_genid, 0, NULL);
1159 	}
1160 
1161 	/* level 0 */
1162 
1163 	/* release signals back to what they were on entry */
1164 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1165 		mdclrerror(&xep);
1166 
1167 	metafreedrivedesc(&dd);
1168 
1169 	if (flush_set_onerr) {
1170 		metaflushsetname(sp);
1171 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1172 	}
1173 
1174 	return (rval);
1175 }
1176 
1177 int
1178 meta_set_deletedrives(
1179 	mdsetname_t		*sp,
1180 	mddrivenamelist_t	*dnlp,
1181 	int			forceflg,
1182 	md_error_t		*ep
1183 )
1184 {
1185 	md_set_desc		*sd;
1186 	md_drive_desc		*ddp, *dd = NULL, *curdd = NULL;
1187 	md_replicalist_t	*rlp = NULL, *rl;
1188 	mddrivenamelist_t	*p;
1189 	int			deldrvcnt = 0;
1190 	int			rval = 0;
1191 	mhd_mhiargs_t		mhiargs;
1192 	int			i;
1193 	sigset_t		oldsigs;
1194 	md_setkey_t		*cl_sk;
1195 	ulong_t			max_genid = 0;
1196 	int			rb_level = 0;
1197 	md_error_t		xep = mdnullerror;
1198 	md_mnnode_desc		*nd;
1199 	int			has_set;
1200 	int			current_drv_cnt = 0;
1201 	int			suspendall_flag = 0, suspendall_flag_rb = 0;
1202 	int			suspend1_flag = 0;
1203 	int			lock_flag = 0;
1204 	bool_t			stale_bool = FALSE;
1205 	int			flush_set_onerr = 0;
1206 	mdnamelist_t		*nlp;
1207 	mdname_t		*np;
1208 
1209 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1210 		return (-1);
1211 
1212 	/* Make sure we own the set */
1213 	if (meta_check_ownership(sp, ep) != 0)
1214 		return (-1);
1215 
1216 	if (drvsuniq(sp, dnlp, ep) == -1)
1217 		return (-1);
1218 
1219 	/*
1220 	 * Check and see if all the nodes have the set.
1221 	 *
1222 	 * The drive and node records are stored in the local mddbs of each
1223 	 * node in the diskset.  Each node's rpc.metad daemon reads in the set,
1224 	 * drive and node records from that node's local mddb and caches them
1225 	 * internally. Any process needing diskset information contacts its
1226 	 * local rpc.metad to get this information.  Since each node in the
1227 	 * diskset is independently reading the set information from its local
1228 	 * mddb, the set, drive and node records in the local mddbs must stay
1229 	 * in-sync, so that all nodes have a consistent view of the diskset.
1230 	 *
1231 	 * For a multinode diskset, explicitly verify that all nodes in the
1232 	 * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
1233 	 * fail this operation since all nodes must be ALIVE in order to delete
1234 	 * a drive record from their local mddb.  If a panic of this node
1235 	 * leaves the local mddbs set, node and drive records out-of-sync, the
1236 	 * reconfig cycle will fix the local mddbs and force them back into
1237 	 * synchronization.
1238 	 */
1239 	if (MD_MNSET_DESC(sd)) {
1240 		nd = sd->sd_nodelist;
1241 		while (nd) {
1242 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1243 				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1244 					sp->setno,
1245 					nd->nd_nodename, NULL, sp->setname);
1246 				return (-1);
1247 			}
1248 			nd = nd->nd_next;
1249 		}
1250 
1251 		/* Make sure we are blocking all signals */
1252 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
1253 			mdclrerror(&xep);
1254 
1255 		/*
1256 		 * Lock the set on current set members.
1257 		 * Set locking done much earlier for MN diskset than for
1258 		 * traditional diskset since lock_set and SUSPEND are used
1259 		 * to protect against other meta* commands running on the
1260 		 * other nodes.
1261 		 */
1262 		nd = sd->sd_nodelist;
1263 		/* All nodes are guaranteed to be ALIVE */
1264 		while (nd) {
1265 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
1266 				rval = -1;
1267 				goto out;
1268 			}
1269 			lock_flag = 1;
1270 			nd = nd->nd_next;
1271 		}
1272 		/*
1273 		 * Lock out other meta* commands by suspending
1274 		 * class 1 messages across the diskset.
1275 		 */
1276 		nd = sd->sd_nodelist;
1277 		/* All nodes are guaranteed to be ALIVE */
1278 		while (nd) {
1279 			if (clnt_mdcommdctl(nd->nd_nodename,
1280 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
1281 			    MD_MSCF_NO_FLAGS, ep)) {
1282 				rval = -1;
1283 				goto out;
1284 			}
1285 			suspend1_flag = 1;
1286 			nd = nd->nd_next;
1287 		}
1288 
1289 		nd = sd->sd_nodelist;
1290 		/* All nodes are guaranteed to be ALIVE */
1291 		while (nd) {
1292 			if (strcmp(nd->nd_nodename, mynode()) == 0) {
1293 				nd = nd->nd_next;
1294 				continue;
1295 			}
1296 
1297 			has_set = nodehasset(sp, nd->nd_nodename,
1298 				    NHS_NSTG_EQ, ep);
1299 			if (has_set < 0) {
1300 				rval = -1;
1301 				goto out;
1302 			}
1303 
1304 			if (! has_set) {
1305 				(void) mddserror(ep, MDE_DS_NODENOSET,
1306 					sp->setno, nd->nd_nodename,
1307 					NULL, sp->setname);
1308 				rval = -1;
1309 				goto out;
1310 			}
1311 			nd = nd->nd_next;
1312 		}
1313 	} else {
1314 		for (i = 0; i < MD_MAXSIDES; i++) {
1315 			/* Skip empty slots */
1316 			if (sd->sd_nodes[i][0] == '\0')
1317 				continue;
1318 
1319 			if (strcmp(sd->sd_nodes[i], mynode()) == 0)
1320 				continue;
1321 
1322 			has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NSTG_EQ,
1323 				ep);
1324 			if (has_set < 0) {
1325 				/*
1326 				 * Can directly return since !MN diskset;
1327 				 * nothing to unlock.
1328 				 */
1329 				return (-1);
1330 			}
1331 
1332 			if (! has_set) {
1333 				/*
1334 				 * Can directly return since !MN diskset;
1335 				 * nothing to unlock.
1336 				 */
1337 				return (mddserror(ep, MDE_DS_NODENOSET,
1338 				    sp->setno, sd->sd_nodes[i], NULL,
1339 				    sp->setname));
1340 			}
1341 		}
1342 	}
1343 
1344 	for (p = dnlp; p != NULL; p = p->next) {
1345 		int		is_it;
1346 		mddrivename_t	*dnp;
1347 
1348 		dnp = p->drivenamep;
1349 
1350 		if ((is_it = meta_is_drive_in_thisset(sp, dnp, FALSE, ep))
1351 		    == -1) {
1352 			rval = -1;
1353 			goto out;
1354 		}
1355 
1356 		if (! is_it) {
1357 			(void) mddserror(ep, MDE_DS_DRIVENOTINSET, sp->setno,
1358 			    NULL, dnp->cname, sp->setname);
1359 			rval = -1;
1360 			goto out;
1361 		}
1362 
1363 		if ((meta_check_drive_inuse(sp, dnp, FALSE, ep)) == -1) {
1364 			rval = -1;
1365 			goto out;
1366 		}
1367 
1368 		deldrvcnt++;
1369 	}
1370 	current_drv_cnt = deldrvcnt;
1371 
1372 	/*
1373 	 * Get drive descriptors for the drives that are currently in the set.
1374 	 */
1375 	curdd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
1376 	if (! mdisok(ep)) {
1377 		rval = -1;
1378 		goto out;
1379 	}
1380 
1381 	/*
1382 	 * Decrement the the delete drive count for each drive currently in the
1383 	 * set.
1384 	 */
1385 	for (ddp = curdd; ddp != NULL; ddp = ddp->dd_next)
1386 		deldrvcnt--;
1387 
1388 	/*
1389 	 * If the count of drives we are deleting is equal to the drives in the
1390 	 * set, and we haven't specified forceflg, return an error
1391 	 */
1392 	if (deldrvcnt == 0 && forceflg == FALSE) {
1393 		(void) mderror(ep, MDE_FORCE_DEL_ALL_DRV, NULL);
1394 		rval = -1;
1395 		goto out;
1396 	}
1397 
1398 	/*
1399 	 * Get the list of drive descriptors that we are deleting.
1400 	 */
1401 	dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_DEL, ep);
1402 	if (! mdisok(ep)) {
1403 		rval = -1;
1404 		goto out;
1405 	}
1406 
1407 	/*
1408 	 * Get the set timeout information in case we have to roll back.
1409 	 */
1410 	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
1411 	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
1412 		rval = -1;
1413 		goto out;
1414 	}
1415 
1416 	/* At this point, in case of error, set should be flushed. */
1417 	flush_set_onerr = 1;
1418 
1419 	/* END CHECK CODE */
1420 
1421 	/* Lock the set on current set members */
1422 	if (!(MD_MNSET_DESC(sd))) {
1423 		md_rb_sig_handling_on();
1424 		for (i = 0; i < MD_MAXSIDES; i++) {
1425 			/* Skip empty slots */
1426 			if (sd->sd_nodes[i][0] == '\0')
1427 				continue;
1428 
1429 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1430 				rval = -1;
1431 				goto out;
1432 			}
1433 			lock_flag = 1;
1434 		}
1435 	}
1436 
1437 	if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) {
1438 		mddb_config_t		c;
1439 		/*
1440 		 * Is current set STALE?
1441 		 */
1442 		(void) memset(&c, 0, sizeof (c));
1443 		c.c_id = 0;
1444 		c.c_setno = sp->setno;
1445 		if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1446 			(void) mdstealerror(ep, &c.c_mde);
1447 			rval = -1;
1448 			goto out;
1449 		}
1450 		if (c.c_flags & MDDB_C_STALE) {
1451 			stale_bool = TRUE;
1452 		}
1453 	}
1454 
1455 	RB_TEST(1, "deletedrives", ep)
1456 
1457 	RB_PREEMPT;
1458 	rb_level = 1;	/* level 1 */
1459 
1460 	RB_TEST(2, "deletedrives", ep)
1461 
1462 	/*
1463 	 * Mark the drives MD_DR_DEL
1464 	 */
1465 	if (MD_MNSET_DESC(sd)) {
1466 		nd = sd->sd_nodelist;
1467 		/* All nodes are guaranteed to be ALIVE */
1468 		while (nd) {
1469 			if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
1470 			    MD_DR_DEL, ep) == -1)
1471 				goto rollback;
1472 
1473 			RB_TEST(3, "deletedrives", ep)
1474 			nd = nd->nd_next;
1475 		}
1476 	} else {
1477 		for (i = 0; i < MD_MAXSIDES; i++) {
1478 			/* Skip empty slots */
1479 			if (sd->sd_nodes[i][0] == '\0')
1480 				continue;
1481 
1482 			if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
1483 			    MD_DR_DEL, ep) == -1)
1484 				goto rollback;
1485 
1486 			RB_TEST(3, "deletedrives", ep)
1487 		}
1488 	}
1489 
1490 	RB_TEST(4, "deletedrives", ep)
1491 
1492 	RB_PREEMPT;
1493 	rb_level = 2;	/* level 2 */
1494 
1495 	RB_TEST(5, "deletedrives", ep)
1496 
1497 	/*
1498 	 * Balance the DB's according to the list of existing drives and the
1499 	 * list of deleted drives.
1500 	 */
1501 	if (meta_db_balance(sp, dd, curdd, 0, ep) == -1)
1502 		goto rollback;
1503 
1504 	/*
1505 	 * If the drive(s) to be deleted cannot be accessed,
1506 	 * they haven't really been deleted yet. Check and delete now
1507 	 * if need be.
1508 	 */
1509 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) {
1510 		nlp = NULL;
1511 		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
1512 			char	*delete_name;
1513 
1514 			delete_name = ddp->dd_dnp->cname;
1515 
1516 			for (rl = rlp; rl != NULL; rl = rl->rl_next) {
1517 				char	*cur_name;
1518 
1519 				cur_name =
1520 				    rl->rl_repp->r_namep->drivenamep->cname;
1521 
1522 				if (strcmp(delete_name, cur_name) == 0) {
1523 					/* put it on the delete list */
1524 					np = rl->rl_repp->r_namep;
1525 					(void) metanamelist_append(&nlp, np);
1526 
1527 				}
1528 			}
1529 		}
1530 
1531 		if (nlp != NULL) {
1532 			if (meta_db_detach(sp, nlp,
1533 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL,
1534 			    ep) == -1) {
1535 				metafreenamelist(nlp);
1536 				goto rollback;
1537 			}
1538 			metafreenamelist(nlp);
1539 		}
1540 	}
1541 
1542 	RB_TEST(6, "deletedrives", ep)
1543 
1544 	RB_PREEMPT;
1545 	rb_level = 3;	/* level 3 */
1546 
1547 	RB_TEST(7, "deletedrives", ep)
1548 
1549 	/*
1550 	 * Cannot suspend set until after meta_db_balance since
1551 	 * meta_db_balance uses META_DB_ATTACH/DETACH messages.
1552 	 */
1553 	if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) {
1554 		/*
1555 		 * Notify rpc.mdcommd on all nodes of a nodelist change.
1556 		 * Start by suspending rpc.mdcommd (which drains it of all
1557 		 * messages), then change the nodelist followed by a reinit
1558 		 * and resume.
1559 		 */
1560 		nd = sd->sd_nodelist;
1561 		/* All nodes are guaranteed to be ALIVE */
1562 		while (nd) {
1563 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
1564 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
1565 				rval = -1;
1566 				goto out;
1567 			}
1568 			suspendall_flag = 1;
1569 			nd = nd->nd_next;
1570 		}
1571 	}
1572 
1573 	/*
1574 	 * Remove the drive records for the drives that were deleted from
1575 	 * each host in the set.  This removes the record and dr_flags.
1576 	 */
1577 	if (MD_MNSET_DESC(sd)) {
1578 		nd = sd->sd_nodelist;
1579 		/* All nodes are guaranteed to be ALIVE */
1580 		while (nd) {
1581 			if (clnt_deldrvs(nd->nd_nodename, sp, dd, ep) == -1)
1582 				goto rollback;
1583 
1584 			RB_TEST(8, "deletedrives", ep)
1585 			nd = nd->nd_next;
1586 		}
1587 	} else {
1588 		for (i = 0; i < MD_MAXSIDES; i++) {
1589 			/* Skip empty slots */
1590 			if (sd->sd_nodes[i][0] == '\0')
1591 				continue;
1592 
1593 			if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep) == -1)
1594 				goto rollback;
1595 
1596 			RB_TEST(8, "deletedrives", ep)
1597 		}
1598 	}
1599 
1600 	RB_TEST(9, "deletedrives", ep)
1601 
1602 	RB_PREEMPT;
1603 	rb_level = 4;	/* level 4 */
1604 
1605 	RB_TEST(10, "deletedrives", ep)
1606 
1607 	if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
1608 		if (rel_own_bydd(sp, dd, TRUE, ep))
1609 			goto rollback;
1610 	}
1611 
1612 	/* If we deleted all the drives, then we need to halt the set. */
1613 	if (deldrvcnt == 0) {
1614 		RB_TEST(11, "deletedrives", ep)
1615 
1616 		RB_PREEMPT;
1617 		rb_level = 5;	/* level 5 */
1618 
1619 		RB_TEST(12, "deletedrives", ep)
1620 
1621 		if (clnt_stimeout(mynode(), sp, &defmhiargs, ep) == -1)
1622 			goto rollback;
1623 
1624 		RB_TEST(13, "deletedrives", ep)
1625 
1626 		RB_PREEMPT;
1627 		rb_level = 6;	/* level 6 */
1628 
1629 		RB_TEST(14, "deletedrives", ep)
1630 
1631 		/* Halt MN diskset on all nodes by having node withdraw */
1632 		if (MD_MNSET_DESC(sd)) {
1633 			nd = sd->sd_nodelist;
1634 			/* All nodes are guaranteed to be ALIVE */
1635 			while (nd) {
1636 				/* Only withdraw nodes that are joined */
1637 				if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
1638 					nd = nd->nd_next;
1639 					continue;
1640 				}
1641 				/*
1642 				 * Going to set locally cached node flags to
1643 				 * rollback join so in case of error, the
1644 				 * rollback code knows which nodes to re-join.
1645 				 */
1646 				nd->nd_flags |= MD_MN_NODE_RB_JOIN;
1647 
1648 				/*
1649 				 * Be careful in ordering of following steps
1650 				 * so that recovery from a panic between
1651 				 * the steps is viable.
1652 				 * Only reset master info in rpc.metad -
1653 				 * don't reset local cached information
1654 				 * which will be used to set master information
1655 				 * back in case of failure (rollback).
1656 				 */
1657 				if (clnt_withdrawset(nd->nd_nodename, sp, ep))
1658 					goto rollback;
1659 				/* Sets withdraw flag on all nodes in list */
1660 				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
1661 				    sd->sd_nodelist, MD_NR_WITHDRAW,
1662 				    NULL, ep)) {
1663 					goto rollback;
1664 				}
1665 				if (clnt_mnsetmaster(nd->nd_nodename, sp,
1666 				    "", MD_MN_INVALID_NID, ep)) {
1667 					goto rollback;
1668 				}
1669 				nd = nd->nd_next;
1670 			}
1671 		} else {
1672 			if (halt_set(sp, ep))
1673 				goto rollback;
1674 		}
1675 
1676 		RB_TEST(15, "deletedrives", ep)
1677 	}
1678 
1679 	RB_TEST(16, "deletedrives", ep)
1680 
1681 out:
1682 	/*
1683 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
1684 	 * Send reinit command to mdcommd which forces it to get
1685 	 * fresh set description.
1686 	 */
1687 	if (suspendall_flag) {
1688 		/* Send reinit */
1689 		nd = sd->sd_nodelist;
1690 		/* All nodes are guaranteed to be ALIVE */
1691 		while (nd) {
1692 			/* Class is ignored for REINIT */
1693 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
1694 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
1695 				if (rval == 0)
1696 					(void) mdstealerror(ep, &xep);
1697 				rval = -1;
1698 				mde_perror(ep, dgettext(TEXT_DOMAIN,
1699 				    "Unable to reinit rpc.mdcommd.\n"));
1700 			}
1701 			nd = nd->nd_next;
1702 		}
1703 	}
1704 
1705 	/*
1706 	 * Just resume all classes so that resume is the same whether
1707 	 * just one class was locked or all classes were locked.
1708 	 */
1709 	if ((suspend1_flag) || (suspendall_flag)) {
1710 		/* Send resume */
1711 		nd = sd->sd_nodelist;
1712 		/* All nodes are guaranteed to be ALIVE */
1713 		while (nd) {
1714 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1715 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
1716 				if (rval == 0)
1717 					(void) mdstealerror(ep, &xep);
1718 				rval = -1;
1719 				mde_perror(ep, dgettext(TEXT_DOMAIN,
1720 				    "Unable to resume rpc.mdcommd.\n"));
1721 			}
1722 			nd = nd->nd_next;
1723 		}
1724 		meta_ping_mnset(sp->setno);
1725 	}
1726 	if (lock_flag) {
1727 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1728 		if (MD_MNSET_DESC(sd)) {
1729 			nd = sd->sd_nodelist;
1730 			/* All nodes are guaranteed to be ALIVE */
1731 			while (nd) {
1732 				if (clnt_unlock_set(nd->nd_nodename,
1733 				    cl_sk, &xep)) {
1734 					if (rval == 0)
1735 						(void) mdstealerror(ep, &xep);
1736 					rval = -1;
1737 				}
1738 				nd = nd->nd_next;
1739 			}
1740 		} else {
1741 			for (i = 0; i < MD_MAXSIDES; i++) {
1742 				/* Skip empty slots */
1743 				if (sd->sd_nodes[i][0] == '\0')
1744 					continue;
1745 
1746 				if (clnt_unlock_set(sd->sd_nodes[i],
1747 				    cl_sk, &xep)) {
1748 					if (rval == 0)
1749 						(void) mdstealerror(ep, &xep);
1750 					rval = -1;
1751 				}
1752 			}
1753 		}
1754 		cl_set_setkey(NULL);
1755 	}
1756 
1757 	metafreedrivedesc(&dd);
1758 
1759 	if (flush_set_onerr) {
1760 		metaflushsetname(sp);
1761 		if (!(MD_MNSET_DESC(sd))) {
1762 			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1763 		}
1764 	}
1765 
1766 	if (MD_MNSET_DESC(sd)) {
1767 		/* release signals back to what they were on entry */
1768 		if (procsigs(FALSE, &oldsigs, &xep) < 0)
1769 			mdclrerror(&xep);
1770 	}
1771 
1772 	return (rval);
1773 
1774 rollback:
1775 	/* all signals already blocked for MN disket */
1776 	if (!(MD_MNSET_DESC(sd))) {
1777 		/* Make sure we are blocking all signals */
1778 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
1779 			mdclrerror(&xep);
1780 	}
1781 
1782 	rval = -1;
1783 
1784 	max_genid = sd->sd_genid;
1785 
1786 	/* Set the master on all nodes first thing */
1787 	if (rb_level > 5) {
1788 		if (MD_MNSET_DESC(sd)) {
1789 			nd = sd->sd_nodelist;
1790 			/* All nodes are guaranteed to be ALIVE */
1791 			while (nd) {
1792 				if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
1793 					continue;
1794 				}
1795 				/*
1796 				 * Set master on all re-joining nodes to be
1797 				 * my cached view of master.
1798 				 */
1799 				if (clnt_mnsetmaster(nd->nd_nodename, sp,
1800 				    sd->sd_mn_master_nodenm,
1801 				    sd->sd_mn_master_nodeid, &xep)) {
1802 					mdclrerror(&xep);
1803 				}
1804 			}
1805 		}
1806 	}
1807 
1808 	/* level 3 */
1809 	if (rb_level > 2) {
1810 		md_set_record		*sr;
1811 		md_mnset_record		*mnsr;
1812 		md_drive_record		*dr;
1813 		int			sr_drive_cnt;
1814 
1815 		/*
1816 		 * See if we have to re-add the drives specified.
1817 		 */
1818 		if (MD_MNSET_DESC(sd)) {
1819 			nd = sd->sd_nodelist;
1820 			/* All nodes are guaranteed to be ALIVE */
1821 			while (nd) {
1822 				/*
1823 				 * Must get current set record from each
1824 				 * node to see what else must be done
1825 				 * to recover.
1826 				 * Record should be for a multi-node diskset.
1827 				 */
1828 				if (clnt_mngetset(nd->nd_nodename, sp->setname,
1829 				    MD_SET_BAD, &mnsr, &xep) == -1) {
1830 					mdclrerror(&xep);
1831 					nd = nd->nd_next;
1832 					continue;
1833 				}
1834 
1835 				/*
1836 				 * If all drives are already there, skip
1837 				 * to next node.
1838 				 */
1839 				sr_drive_cnt = 0;
1840 				dr = mnsr->sr_drivechain;
1841 				while (dr) {
1842 					sr_drive_cnt++;
1843 					dr = dr->dr_next;
1844 				}
1845 				if (sr_drive_cnt == current_drv_cnt) {
1846 					free_sr((md_set_record *)mnsr);
1847 					nd = nd->nd_next;
1848 					continue;
1849 				}
1850 
1851 				/* Readd all drives */
1852 				if (clnt_adddrvs(nd->nd_nodename, sp, dd,
1853 				    mnsr->sr_ctime, mnsr->sr_genid, &xep) == -1)
1854 					mdclrerror(&xep);
1855 
1856 				free_sr((struct md_set_record *)mnsr);
1857 				nd = nd->nd_next;
1858 			}
1859 		} else {
1860 			for (i = 0; i < MD_MAXSIDES; i++) {
1861 				/* Skip empty slots */
1862 				if (sd->sd_nodes[i][0] == '\0')
1863 					continue;
1864 
1865 				/* Record should be for a non-multi-node set */
1866 				if (clnt_getset(sd->sd_nodes[i], sp->setname,
1867 				    MD_SET_BAD, &sr, &xep) == -1) {
1868 					mdclrerror(&xep);
1869 					continue;
1870 				}
1871 
1872 				/*
1873 				 * Set record structure was allocated from RPC
1874 				 * routine getset so this structure is only of
1875 				 * size md_set_record even if the MN flag is
1876 				 * set.  So, clear the flag so that the free
1877 				 * code doesn't attempt to free a structure
1878 				 * the size of md_mnset_record.
1879 				 */
1880 				if (MD_MNSET_REC(sr)) {
1881 					sr->sr_flags &= ~MD_SR_MN;
1882 					free_sr(sr);
1883 					continue;
1884 				}
1885 
1886 				/* Drive already added, skip to next node */
1887 				if (sr->sr_drivechain != NULL) {
1888 					free_sr(sr);
1889 					continue;
1890 				}
1891 
1892 				if (clnt_adddrvs(sd->sd_nodes[i], sp, dd,
1893 				    sr->sr_ctime, sr->sr_genid, &xep) == -1)
1894 					mdclrerror(&xep);
1895 
1896 				free_sr(sr);
1897 			}
1898 		}
1899 		max_genid += 2;
1900 	}
1901 
1902 	/*
1903 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
1904 	 * At this point in time, don't know which nodes are joined
1905 	 * to the set.  So, send a reinit command to mdcommd
1906 	 * which forces it to get fresh set description.  Then send resume.
1907 	 *
1908 	 * Later, this code will use rpc.mdcommd messages to reattach disks
1909 	 * and then rpc.mdcommd may be suspended again, rest of the nodes
1910 	 * joined, rpc.mdcommd reinited and then resumed.
1911 	 */
1912 	if (suspendall_flag) {
1913 		/* Send reinit */
1914 		nd = sd->sd_nodelist;
1915 		/* All nodes are guaranteed to be ALIVE */
1916 		while (nd) {
1917 			/* Class is ignored for REINIT */
1918 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
1919 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
1920 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
1921 				    "Unable to reinit rpc.mdcommd.\n"));
1922 				mdclrerror(&xep);
1923 			}
1924 			nd = nd->nd_next;
1925 		}
1926 
1927 		/* Send resume */
1928 		nd = sd->sd_nodelist;
1929 		/* All nodes are guaranteed to be ALIVE */
1930 		while (nd) {
1931 			/*
1932 			 * Resume all classes but class 1 so that lock is held
1933 			 * against meta* commands.
1934 			 * To later resume class1, must issue a class0 resume.
1935 			 */
1936 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1937 			    sp, MD_MSG_CLASS0,
1938 			    MD_MSCF_DONT_RESUME_CLASS1, &xep)) {
1939 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
1940 				    "Unable to resume rpc.mdcommd.\n"));
1941 				mdclrerror(&xep);
1942 			}
1943 			nd = nd->nd_next;
1944 		}
1945 		meta_ping_mnset(sp->setno);
1946 	}
1947 
1948 	/* level 2 */
1949 	if (rb_level > 1) {
1950 		mdnamelist_t	*nlp;
1951 		mdname_t	*np;
1952 
1953 		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
1954 			uint_t	rep_slice;
1955 
1956 			if ((meta_replicaslice(ddp->dd_dnp,
1957 			    &rep_slice, &xep) != 0) ||
1958 			    ((np = metaslicename(ddp->dd_dnp, rep_slice,
1959 				&xep)) == NULL)) {
1960 				mdclrerror(&xep);
1961 				continue;
1962 			}
1963 			nlp = NULL;
1964 			(void) metanamelist_append(&nlp, np);
1965 
1966 			if (meta_db_attach(sp, nlp,
1967 			    (MDCHK_DRVINSET | MDCHK_SET_LOCKED),
1968 			    &sd->sd_ctime, ddp->dd_dbcnt, ddp->dd_dbsize,
1969 			    NULL, &xep) == -1)
1970 				mdclrerror(&xep);
1971 
1972 			metafreenamelist(nlp);
1973 		}
1974 		/* Re-balance */
1975 		if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1)
1976 			mdclrerror(&xep);
1977 	}
1978 
1979 	/* level 4 */
1980 	if (rb_level > 3) {
1981 		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
1982 			if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
1983 				mdclrerror(&xep);
1984 		}
1985 	}
1986 
1987 	/* level 5 */
1988 	if (rb_level > 4) {
1989 		if (clnt_stimeout(mynode(), sp, &mhiargs, &xep) == -1)
1990 			mdclrerror(&xep);
1991 	}
1992 
1993 	/*
1994 	 * If at least one node needs to be rejoined to MN diskset,
1995 	 * then suspend commd again.
1996 	 */
1997 	if (MD_MNSET_DESC(sd)) {
1998 		nd = sd->sd_nodelist;
1999 		/* All nodes are guaranteed to be ALIVE */
2000 		while (nd) {
2001 			if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
2002 				nd = nd->nd_next;
2003 				continue;
2004 			}
2005 			break;
2006 		}
2007 		if (nd) {
2008 			/*
2009 			 * Found node that will be rejoined so
2010 			 * notify rpc.mdcommd on all nodes of a nodelist change.
2011 			 * Start by suspending rpc.mdcommd (which drains it of
2012 			 * all messages), then change the nodelist followed by
2013 			 * a reinit and resume.
2014 			 */
2015 			nd = sd->sd_nodelist;
2016 			/* All nodes are guaranteed to be ALIVE */
2017 			while (nd) {
2018 				if (clnt_mdcommdctl(nd->nd_nodename,
2019 				    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS0,
2020 				    MD_MSCF_NO_FLAGS, &xep)) {
2021 					mdclrerror(&xep);
2022 				}
2023 				suspendall_flag_rb = 1;
2024 				nd = nd->nd_next;
2025 			}
2026 		}
2027 	}
2028 
2029 
2030 
2031 	/* level 6 */
2032 	if (rb_level > 5) {
2033 		if (MD_MNSET_DESC(sd)) {
2034 			int	join_flags = 0;
2035 
2036 			nd = sd->sd_nodelist;
2037 			/* All nodes are guaranteed to be ALIVE */
2038 			while (nd) {
2039 				/* Only rejoin nodes that were joined before */
2040 				if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
2041 					nd = nd->nd_next;
2042 					continue;
2043 				}
2044 				/*
2045 				 * Rejoin nodes to same state as before -
2046 				 * either STALE or non-STALE.
2047 				 */
2048 				if (stale_bool == TRUE)
2049 					join_flags = MNSET_IS_STALE;
2050 				if (clnt_joinset(nd->nd_nodename, sp,
2051 				    join_flags, &xep))
2052 					mdclrerror(&xep);
2053 				/* Sets OWN flag on all nodes in list */
2054 				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2055 				    sd->sd_nodelist, MD_NR_JOIN, NULL, &xep)) {
2056 					mdclrerror(&xep);
2057 				}
2058 				nd = nd->nd_next;
2059 			}
2060 		} else {
2061 			if (setup_db_bydd(sp, dd, TRUE, &xep) == -1)
2062 				mdclrerror(&xep);
2063 
2064 			/* No special flag for traditional diskset */
2065 			if (snarf_set(sp, NULL, &xep))
2066 				mdclrerror(&xep);
2067 		}
2068 	}
2069 
2070 	/* level 1 */
2071 	if (rb_level > 0) {
2072 		/*
2073 		 * Mark the drives as OK.
2074 		 */
2075 		if (MD_MNSET_DESC(sd)) {
2076 			nd = sd->sd_nodelist;
2077 			/* All nodes are guaranteed to be ALIVE */
2078 			while (nd) {
2079 				/*
2080 				 * Must be last action before unlock.
2081 				 * In case of panic, recovery code checks
2082 				 * for MD_DR_OK to know that drive
2083 				 * and possible master are fully added back.
2084 				 */
2085 				if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
2086 				    MD_DR_OK, &xep) == -1)
2087 					mdclrerror(&xep);
2088 				nd = nd->nd_next;
2089 			}
2090 		} else {
2091 			for (i = 0; i < MD_MAXSIDES; i++) {
2092 				/* Skip empty slots */
2093 				if (sd->sd_nodes[i][0] == '\0')
2094 					continue;
2095 
2096 				if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
2097 				    MD_DR_OK, &xep) == -1)
2098 					mdclrerror(&xep);
2099 
2100 			}
2101 		}
2102 		max_genid += 2;
2103 		resync_genid(sp, sd, max_genid, 0, NULL);
2104 	}
2105 	/*
2106 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2107 	 * Send a reinit command to mdcommd which forces it to get
2108 	 * fresh set description.
2109 	 */
2110 	if (suspendall_flag_rb) {
2111 		/* Send reinit */
2112 		nd = sd->sd_nodelist;
2113 		/* All nodes are guaranteed to be ALIVE */
2114 		while (nd) {
2115 			/* Class is ignored for REINIT */
2116 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2117 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2118 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
2119 				    "Unable to reinit rpc.mdcommd.\n"));
2120 				mdclrerror(&xep);
2121 			}
2122 			nd = nd->nd_next;
2123 		}
2124 	}
2125 
2126 	/*
2127 	 * Just resume all classes so that resume is the same whether
2128 	 * just one class was locked or all classes were locked.
2129 	 */
2130 	if ((suspend1_flag) || (suspendall_flag_rb) || (suspendall_flag)) {
2131 		/* Send resume */
2132 		nd = sd->sd_nodelist;
2133 		/* All nodes are guaranteed to be ALIVE */
2134 		while (nd) {
2135 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2136 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2137 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
2138 				    "Unable to resume rpc.mdcommd.\n"));
2139 				mdclrerror(&xep);
2140 			}
2141 			nd = nd->nd_next;
2142 		}
2143 		meta_ping_mnset(sp->setno);
2144 	}
2145 
2146 
2147 	/* level 0 */
2148 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
2149 	/* Don't test lock flag since guaranteed to be set if in rollback */
2150 	if (MD_MNSET_DESC(sd)) {
2151 		nd = sd->sd_nodelist;
2152 		/* All nodes are guaranteed to be ALIVE */
2153 		while (nd) {
2154 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
2155 				mdclrerror(&xep);
2156 			nd = nd->nd_next;
2157 		}
2158 	} else {
2159 		for (i = 0; i < MD_MAXSIDES; i++) {
2160 			/* Skip empty slots */
2161 			if (sd->sd_nodes[i][0] == '\0')
2162 				continue;
2163 
2164 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
2165 				mdclrerror(&xep);
2166 		}
2167 	}
2168 	cl_set_setkey(NULL);
2169 
2170 	/* release signals back to what they were on entry */
2171 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
2172 		mdclrerror(&xep);
2173 
2174 	metafreedrivedesc(&dd);
2175 
2176 	if (flush_set_onerr) {
2177 		metaflushsetname(sp);
2178 		if (!(MD_MNSET_DESC(sd))) {
2179 			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
2180 		}
2181 	}
2182 
2183 	return (rval);
2184 }
2185