xref: /onnv-gate/usr/src/lib/lvm/libmeta/common/meta_set_tkr.c (revision 1945:74cee1cd404b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Metadevice diskset interfaces
30  */
31 
32 #include "meta_set_prv.h"
33 #include <sys/lvm/md_crc.h>
34 
35 extern	char	*blkname(char *);
36 
37 static int
38 upd_dr_dbinfo(
39 	mdsetname_t		*sp,
40 	md_set_desc		*sd,
41 	md_drive_desc		*dd,
42 	md_replicalist_t	*rlp,
43 	int			forceflg,
44 	md_error_t		*ep
45 )
46 {
47 	md_drive_desc		*p;
48 	md_replica_t		*r;
49 	md_replicalist_t	*rl;
50 	int			i;
51 	int			dbcnt;
52 	int			rval = 0;
53 	daddr_t			nblks = 0;
54 	md_setkey_t		*cl_sk;
55 	md_error_t		xep = mdnullerror;
56 	md_mnnode_desc		*nd;
57 	ddi_devid_t		devid;
58 
59 	/* find the smallest existing replica */
60 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
61 		r = rl->rl_repp;
62 		nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
63 	}
64 
65 	if (nblks <= 0)
66 		nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
67 
68 	for (p = dd; p != NULL; p = p->dd_next) {
69 		dbcnt = 0;
70 		for (rl = rlp; rl != NULL; rl = rl->rl_next) {
71 			r = rl->rl_repp;
72 
73 			/*
74 			 * Before we bump up the dbcnt, if we're
75 			 * running with device ids in disksets, let's
76 			 * compare the device ids otherwise we compare
77 			 * the ctd names.
78 			 *
79 			 * There is a possibility the device ids might
80 			 * have changed. To account for that case, we
81 			 * fallback to comparing the ctd names if the
82 			 * device id comparison fails. If we aren't running
83 			 * in device id mode and a disk has moved, the ctd's
84 			 * won't match.
85 			 */
86 			if ((p->dd_dnp->devid != NULL) &&
87 			    (r->r_devid != NULL) && (!MD_MNSET_DESC(sd))) {
88 				(void) devid_str_decode(p->dd_dnp->devid,
89 				    &devid, NULL);
90 				if ((devid_compare(devid, r->r_devid) == 0) ||
91 				    (strcmp(r->r_namep->drivenamep->cname,
92 				    p->dd_dnp->cname) == 0))
93 					dbcnt++;
94 				devid_free(devid);
95 			} else {
96 				if (strcmp(r->r_namep->drivenamep->cname,
97 				    p->dd_dnp->cname) == 0)
98 					dbcnt++;
99 			}
100 		}
101 		p->dd_dbcnt = dbcnt;
102 		p->dd_dbsize = dbcnt > 0 ? nblks : 0;
103 	}
104 
105 	/* Lock the set on current set members */
106 	if (MD_MNSET_DESC(sd)) {
107 		nd = sd->sd_nodelist;
108 		while (nd) {
109 			/* If this is forced, don't lock other sides */
110 			if (forceflg && strcmp(mynode(), nd->nd_nodename)
111 			    != 0) {
112 				nd = nd->nd_next;
113 				continue;
114 			}
115 
116 			/* We already locked this side in the caller */
117 			if (strcmp(mynode(), nd->nd_nodename) == 0) {
118 				nd = nd->nd_next;
119 				continue;
120 			}
121 
122 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
123 				nd = nd->nd_next;
124 				continue;
125 			}
126 
127 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
128 				rval = -1;
129 				goto out;
130 			}
131 			nd = nd->nd_next;
132 		}
133 	} else {
134 		for (i = 0; i < MD_MAXSIDES; i++) {
135 			/* Skip empty slots */
136 			if (sd->sd_nodes[i][0] == '\0')
137 				continue;
138 
139 			/* If this is forced, don't lock other sides */
140 			if (forceflg && strcmp(mynode(), sd->sd_nodes[i]) != 0)
141 				continue;
142 
143 			/* We already locked this side in the caller */
144 			if (strcmp(mynode(), sd->sd_nodes[i]) == 0)
145 				continue;
146 
147 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
148 				rval = -1;
149 				goto out;
150 			}
151 		}
152 	}
153 
154 	if (MD_MNSET_DESC(sd)) {
155 		nd = sd->sd_nodelist;
156 		while (nd) {
157 			/* If this is forced, then only care about this node */
158 			if (forceflg && strcmp(mynode(), nd->nd_nodename)
159 			    != 0) {
160 				nd = nd->nd_next;
161 				continue;
162 			}
163 
164 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
165 				nd = nd->nd_next;
166 				continue;
167 			}
168 
169 			if (clnt_upd_dr_dbinfo(nd->nd_nodename, sp, dd,
170 			    ep) == -1) {
171 				if (! mdiserror(ep, MDE_NO_SET) &&
172 				    ! mdismddberror(ep, MDE_DB_NODB)) {
173 					rval = -1;
174 					break;
175 				}
176 				mdclrerror(ep);
177 			}
178 			nd = nd->nd_next;
179 		}
180 	} else {
181 		for (i = 0; i < MD_MAXSIDES; i++) {
182 			/* Skip empty slots */
183 			if (sd->sd_nodes[i][0] == '\0')
184 				continue;
185 
186 			/* If this is forced, then only care about this node */
187 			if (forceflg && strcmp(mynode(), sd->sd_nodes[i]) != 0)
188 				continue;
189 
190 			if (clnt_upd_dr_dbinfo(sd->sd_nodes[i], sp, dd,
191 			    ep) == -1) {
192 				if (! mdiserror(ep, MDE_NO_SET) &&
193 				    ! mdismddberror(ep, MDE_DB_NODB)) {
194 					rval = -1;
195 					break;
196 				}
197 				mdclrerror(ep);
198 			}
199 		}
200 	}
201 
202 out:
203 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
204 	if (MD_MNSET_DESC(sd)) {
205 		nd = sd->sd_nodelist;
206 		while (nd) {
207 			/* If this is forced, don't unlock other sides */
208 			if (forceflg && strcmp(mynode(), nd->nd_nodename)
209 			    != 0) {
210 				nd = nd->nd_next;
211 				continue;
212 			}
213 
214 			/* We will unlocked this side in the caller */
215 			if (strcmp(mynode(), nd->nd_nodename) == 0) {
216 				nd = nd->nd_next;
217 				continue;
218 			}
219 
220 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
221 				nd = nd->nd_next;
222 				continue;
223 			}
224 
225 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
226 				if (rval == 0)
227 					(void) mdstealerror(ep, &xep);
228 				rval = -1;
229 			}
230 			nd = nd->nd_next;
231 		}
232 	} else {
233 		for (i = 0; i < MD_MAXSIDES; i++) {
234 			/* Skip empty slots */
235 			if (sd->sd_nodes[i][0] == '\0')
236 				continue;
237 
238 			/* If this is forced, don't unlock other sides */
239 			if (forceflg && strcmp(mynode(), sd->sd_nodes[i]) != 0)
240 				continue;
241 
242 			/* We will unlocked this side in the caller */
243 			if (strcmp(mynode(), sd->sd_nodes[i]) == 0)
244 				continue;
245 
246 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
247 				if (rval == 0)
248 					(void) mdstealerror(ep, &xep);
249 				rval = -1;
250 			}
251 		}
252 	}
253 	/* Do not clear the key, via cl_set_setkey(NULL) this is nested */
254 
255 	return (rval);
256 }
257 
258 static int
259 usetag_take(set_t setno, int usetag, md_error_t *ep)
260 {
261 	mddb_dtag_use_parm_t	dtup;
262 
263 	(void) memset(&dtup, '\0', sizeof (mddb_dtag_use_parm_t));
264 	dtup.dtup_id = usetag;
265 	dtup.dtup_setno = setno;
266 
267 	if (metaioctl(MD_MED_USE_TAG, &dtup, &dtup.dtup_mde, NULL) != 0)
268 		return (mdstealerror(ep, &dtup.dtup_mde));
269 
270 	return (0);
271 }
272 
273 static int
274 useit_take(set_t setno, md_error_t *ep)
275 {
276 	mddb_accept_parm_t	accp;
277 
278 	(void) memset(&accp, '\0', sizeof (mddb_accept_parm_t));
279 	accp.accp_setno = setno;
280 
281 	if (metaioctl(MD_MED_ACCEPT, &accp, &accp.accp_mde, NULL) != 0)
282 		return (mdstealerror(ep, &accp.accp_mde));
283 
284 	return (0);
285 }
286 
287 /*
288  * Update the master block with the device id information for the disks
289  * in the diskset. The device id information will be consumed by the
290  * diskset import code in case of remotely replicated disksets.
291  *
292  * For the drives that have a valid diskset mddb on them, we add the
293  * device id for the drive to the unused portion of the mddb.
294  *
295  * For the drives that don't have a diskset mddb on them, we add a dummy
296  * master block that contains the device id for the drive. A dummy master
297  * block is signified by changing the master block magic number, mb_magic,
298  * to MDDB_MAGIC_DU.
299  *
300  * This code is responsible primarily for adding the appropriate device id
301  * information to diskset disks that didn't have the information. This would
302  * typically occur when the OS has been upgraded from an OS release prior to
303  * Solaris 10
304  *
305  * The error path in this routine is defined as - if an error occurs while
306  * updating the mddb for one disk in the diskset, don't bother updating *any*
307  * of the mddbs because it's game over anyways as far as disaster recovery for
308  * that diskset is concerned.
309  *
310  * This code will need to be revisited if and when support for importing
311  * partial disksets is added.
312  *
313  * NOTE: This code relies heavily on the meta_repartition() working correctly
314  * and reformatting a drive, so that there's enough room for a dummy master
315  * block, every time a drive is added to a diskset. Should
316  * the meta_repartition() code change in future, this code will have to be
317  * revisited.
318  *
319  * Returns 0 on success and -1 on failure
320  */
321 int
322 meta_update_mb(mdsetname_t *sp, md_drive_desc *drivedesc, md_error_t *ep)
323 {
324 	uint_t			sliceno, offset;
325 	void			*mb;
326 	mddb_mb_t		*mbp;
327 	int			fd = -1;
328 	ddi_devid_t		devid = NULL;
329 	md_drive_desc		*dd;
330 	mddrivename_t		*dnp;
331 	mdname_t		*rsp;
332 	int			dbcnt;
333 	int			dbsize;
334 	size_t 			len;
335 	md_set_desc		*sd;
336 
337 	/*
338 	 * Don't do anything for MN diskset for now.
339 	 */
340 	if (! metaislocalset(sp)) {
341 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
342 			return (-1);
343 
344 		if (MD_MNSET_DESC(sd))
345 			return (0);
346 	}
347 
348 	mb = Malloc(DEV_BSIZE);
349 	mbp = (mddb_mb_t *)mb;
350 
351 	/*
352 	 * For every drive in the drive descriptor, iterate through all
353 	 * the mddbs present on it and check to see if mb_devid_magic is
354 	 * set. If it isn't, then update the master block with the correct
355 	 * device id information
356 	 */
357 	for (dd = drivedesc; dd != NULL; dd = dd->dd_next) {
358 		int i = 0;
359 
360 		dnp = dd->dd_dnp;
361 		dbcnt = dd->dd_dbcnt;
362 		dbsize = dd->dd_dbsize;
363 
364 		/*
365 		 * When the import support for remotely replicated
366 		 * disksets gets implemented, we probably want to
367 		 * inform the user that the disks won't be self
368 		 * identifying if any of these calls fails
369 		 */
370 		if (meta_replicaslice(dnp, &sliceno, ep) != 0)
371 			return (-1);
372 
373 		if ((rsp = metaslicename(dnp, sliceno, ep)) == NULL)
374 			return (-1);
375 
376 		if ((fd = open(rsp->rname, O_RDWR)) < 0)
377 			goto cleanup;
378 
379 		/* if devid_str_decode fails, make sure devid is null */
380 		if (devid_str_decode(dnp->devid, &devid, NULL) != 0) {
381 			devid = NULL;
382 		}
383 
384 		do {
385 			int push = 0;
386 
387 			offset = (i * dbsize + 16);
388 			++i;
389 
390 			if (lseek(fd, (off_t)dbtob(offset), SEEK_SET) < 0)
391 				goto cleanup;
392 
393 			if (read(fd, mbp, DEV_BSIZE) != DEV_BSIZE)
394 				goto cleanup;
395 
396 			if (crcchk((uchar_t *)mbp, (uint_t *)&mbp->mb_checksum,
397 			    (uint_t)DEV_BSIZE, (crc_skip_t *)NULL))
398 				goto cleanup;
399 
400 			/*
401 			 * If the disk is one of the ones that doesn't
402 			 * have a shared mddb on it, we put a dummy
403 			 * master block on it.
404 			 */
405 			if (mbp->mb_devid_magic != MDDB_MAGIC_DE) {
406 				if (dbcnt == 0) {
407 					meta_mkdummymaster(sp, fd, 16);
408 					break;
409 				}
410 			}
411 
412 			/*
413 			 * if mb_setcreatetime is 0, this field was never
414 			 * filled in so do it now.
415 			 */
416 			if ((mbp->mb_setcreatetime.tv_sec == 0) &&
417 			    (mbp->mb_setcreatetime.tv_usec == 0)) {
418 				mbp->mb_setcreatetime =
419 				    meta_get_lb_inittime(sp, ep);
420 				push = 1;
421 			}
422 
423 			/*
424 			 * If MDDB_MAGIC_DE is set in the
425 			 * mb_devid_magic field then we know we
426 			 * have a valid device id and we don't
427 			 * need to add it to the master block.
428 			 *
429 			 * This would have to be revisited if device
430 			 * ids change as a result of device id
431 			 * algorithms changing or somesuch.
432 			 */
433 			if (mbp->mb_devid_magic != MDDB_MAGIC_DE) {
434 				if (devid != NULL) {
435 					len = devid_sizeof(devid);
436 					if (len <= (DEV_BSIZE -
437 					    sizeof (mddb_mb_t))) {
438 						/*
439 						 * there's enough space to
440 						 * store the devid
441 						 */
442 						mbp->mb_devid_magic =
443 						    MDDB_MAGIC_DE;
444 						mbp->mb_devid_len = len;
445 						(void) memcpy(mbp->mb_devid,
446 						    (char *)devid, len);
447 						push = 1;
448 					}
449 				}
450 			}
451 
452 			/*
453 			 * write out (push) any changes we have to the mb
454 			 */
455 			if (push) {
456 				crcgen((uchar_t *)mbp,
457 				    (uint_t *)&mbp->mb_checksum,
458 				    (uint_t)DEV_BSIZE, (crc_skip_t *)NULL);
459 
460 				if (lseek(fd, (off_t)dbtob(offset), SEEK_SET)
461 				    < 0)
462 					goto cleanup;
463 
464 				if (write(fd, mbp, DEV_BSIZE) != DEV_BSIZE)
465 					goto cleanup;
466 			}
467 			if (devid)
468 				devid_free(devid);
469 		} while (i < dbcnt);
470 		(void) close(fd);
471 	}
472 	/* success */
473 	return (0);
474 
475 cleanup:
476 	if (fd != -1)
477 		(void) close(fd);
478 	if (devid)
479 		devid_free(devid);
480 	return (-1);
481 }
482 
483 extern int *replicated_disk_list_built;
484 extern int replicated_disk_list_built_pass1;
485 /*
486  * Exported Entry Points
487  */
488 int
489 meta_set_take(
490 	mdsetname_t		*sp,
491 	mhd_mhiargs_t		*mhiargsp,
492 	int			flags,
493 	int			usetag,
494 	md_error_t		*ep
495 )
496 {
497 	md_set_desc		*sd;
498 	md_drive_desc		*dd;
499 	md_drive_desc		*d = NULL;
500 	char			*owner = NULL;
501 	int			rval = 0;
502 	int			pathname_return = 0;
503 	int			i;
504 	int			has_set;
505 	int			matches = 0;
506 	int			numsides = 0;
507 	md_replicalist_t	*rlp = NULL;
508 	sigset_t		oldsigs;
509 	md_setkey_t		*cl_sk;
510 	int			rb_level = 0;
511 	md_error_t		xep = mdnullerror;
512 	mdsetname_t		*local_sp = NULL;
513 	side_t			side;
514 	int			ret = 0;
515 	char			*newname = NULL;
516 	mdkey_t			side_names_key;
517 	int			unrslv_replicated = 0;
518 	mddrivenamelist_t	*dnlp = NULL;
519 	int			retake_flag = 0;
520 
521 	if ((flags & TAKE_USETAG) || (flags & TAKE_USEIT)) {
522 		if (flags & TAKE_USETAG) {
523 			if (usetag_take(sp->setno, usetag, ep))
524 				return (-1);
525 		} else {
526 			if (useit_take(sp->setno, ep))
527 				return (-1);
528 		}
529 
530 		if (meta_resync_all(sp, MD_DEF_RESYNC_BUF_SIZE, ep) != 0)
531 			mdclrerror(ep);
532 	}
533 
534 	/* Do we own the set? */
535 	i = own_set(sp, &owner, (flags & TAKE_FORCE), ep);
536 	if (! mdisok(ep)) {
537 		if (owner != NULL)
538 			Free(owner);
539 		return (-1);
540 	}
541 
542 	if (i == MD_SETOWNER_NO) {
543 		(void) mddserror(ep, MDE_DS_NOTOWNER, sp->setno, owner, NULL,
544 		    sp->setname);
545 		if (owner != NULL)
546 			Free(owner);
547 		return (-1);
548 	}
549 
550 	if (owner != NULL) {
551 		Free(owner);
552 		owner = NULL;
553 	}
554 
555 	/* We already own it, we are done. */
556 	if (i == MD_SETOWNER_YES)
557 		return (0);
558 
559 	if ((sd = metaget_setdesc(sp, &xep)) == NULL)
560 		return (-1);
561 
562 	/* You can not take ownership of a set that has no drives */
563 	if (sd->sd_flags & MD_SR_MB_DEVID)
564 		dd = metaget_drivedesc(sp, MD_BASICNAME_OK | PRINT_FAST, ep);
565 	else
566 		dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
567 
568 	if (dd == NULL) {
569 		if (! mdisok(ep))
570 			return (-1);
571 		return (0);
572 	}
573 
574 	/* END CHECK CODE */
575 
576 	md_rb_sig_handling_on();
577 
578 	/* Lock the set on our side */
579 	if (clnt_lock_set(mynode(), sp, ep)) {
580 		rval = -1;
581 		goto out;
582 	}
583 
584 	/*
585 	 * Find the "side" value so that it can be used to deal with
586 	 * the devids.
587 	 */
588 	side = getnodeside(mynode(), sd);
589 
590 	if (side == MD_SIDEWILD) {
591 	    (void) mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, mynode(),
592 		NULL, mynode());
593 	    rval = -1;
594 	    goto out;
595 	}
596 
597 	/*
598 	 * A local sets' side 0 references records associated with
599 	 * that node's local set. As this is a non-local set, "side"
600 	 * must be modified (by adding a SKEW) before we reference
601 	 * records in the local set [setno = 0] for the non-local set
602 	 * [setno = 1..n].
603 	 */
604 	side += SKEW;
605 
606 	/*
607 	 * If this set had been previously imported as a partial replicated
608 	 * diskset, then must attempt to updated any unresolved drive
609 	 * records in diskset with new devid information.  Must set
610 	 * flags in drivedesc list before loading up set so that the
611 	 * md driver will fix up names and devids correctly in the
612 	 * locator block.
613 	 */
614 	if (sd->sd_flags & MD_SR_UNRSLV_REPLICATED) {
615 		md_im_names_t		cnames = { 0, NULL};
616 		ddi_devid_t		old_devid, new_devid;
617 		char			*search_path = "/dev";
618 		devid_nmlist_t		*nmlist;
619 		int			indx;
620 		mddrivenamelist_t	**dnlpp = &dnlp;
621 
622 		if (meta_list_disks(ep, &cnames) != 0) {
623 			rval = -1;
624 			goto out;
625 		}
626 
627 		for (indx = 0; indx < cnames.min_count; ++indx) {
628 			mddrivename_t   *dnp;
629 			mdsetname_t	*sp =  metasetname(MD_LOCAL_NAME, ep);
630 			int		fd = -1;
631 			ddi_devid_t	devid1;
632 			char		*cdevidp;
633 			int		len;
634 			char		*fp;
635 
636 			/*
637 			 * We may have name collision here so we need to get
638 			 * the dnp using the devid and not the name.
639 			 */
640 			len = strlen(cnames.min_names[indx]) + strlen("s0");
641 			if ((fp = (char *)Malloc(len+1)) == NULL) {
642 				(void) mdsyserror(ep, ENOMEM, NULL);
643 				rval = -1;
644 				goto out;
645 			}
646 			(void) snprintf(fp, len + 1, "%ss0",
647 			    cnames.min_names[indx]);
648 			if ((fd = open(fp, O_RDONLY|O_NDELAY)) < 0) {
649 				(void) mdsyserror(ep, EIO, fp);
650 				rval = -1;
651 				goto out;
652 			}
653 			Free(fp);
654 			/* if no device id, what error?) */
655 			if (devid_get(fd, &devid1) != 0) {
656 				(void) mdsyserror(ep, EIO, fp);
657 				rval = -1;
658 				goto out;
659 			}
660 			if (close(fd) < 0) {
661 				(void) mdsyserror(ep, EIO, fp);
662 				rval = -1;
663 				goto out;
664 			}
665 			cdevidp = devid_str_encode(devid1, NULL);
666 			if (cdevidp == NULL) {
667 				(void) mdsyserror(ep, EIO, fp);
668 				rval = -1;
669 				goto out;
670 			}
671 			devid_free(devid1);
672 			dnp = metadrivenamebydevid(&sp, cdevidp,
673 			    cnames.min_names[indx], ep);
674 			devid_str_free(cdevidp);
675 			if (dnp == NULL) {
676 				/*
677 				 * Assuming we're interested in knowing about
678 				 * whatever error occurred, but not in stopping.
679 				 */
680 				mde_perror(ep, cnames.min_names[indx]);
681 				mdclrerror(ep);
682 				continue;
683 			}
684 
685 			dnlpp = meta_drivenamelist_append_wrapper(dnlpp, dnp);
686 		}
687 		/* Reget sd and dd since freed by meta_prune_cnames. */
688 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
689 			rval = -1;
690 			goto out;
691 		}
692 
693 		if (sd->sd_flags & MD_SR_MB_DEVID)
694 			dd = metaget_drivedesc(sp,
695 				MD_BASICNAME_OK | PRINT_FAST, ep);
696 		else
697 			dd = metaget_drivedesc(sp,
698 				MD_BASICNAME_OK, ep);
699 		/* If ep has error, then there was a failure, set rval */
700 		if (!mdisok(ep)) {
701 			rval = -1;
702 			goto out;
703 		}
704 
705 		/* Builds global replicated disk list */
706 		replicated_disk_list_built = &replicated_disk_list_built_pass1;
707 
708 		/* If success, then clear error structure */
709 		if (build_replicated_disks_list(ep, dnlp) == 1)
710 			mdclrerror(ep);
711 		/* If ep has error, then there was a failure, set rval */
712 		if (! mdisok(ep)) {
713 			rval = -1;
714 			goto out;
715 		}
716 
717 		for (d = dd; d != NULL; d = d->dd_next) {
718 			if (d->dd_flags & MD_DR_UNRSLV_REPLICATED) {
719 				/* Get old devid from drive record */
720 				(void) devid_str_decode(d->dd_dnp->devid,
721 				    &old_devid, NULL);
722 
723 				/*
724 				 * If the devid stored in the drive record
725 				 * (old_devid) matches a devid known by
726 				 * the system, then this disk has already
727 				 * been partially resolved.  This situation
728 				 * could occur if a panic happened during a
729 				 * previous take of this diskset.
730 				 * Set flag to later handle fixing the master
731 				 * block on disk and turning off the unresolved
732 				 * replicated flag.
733 				 */
734 				if (meta_deviceid_to_nmlist(search_path,
735 				    (ddi_devid_t)old_devid,
736 				    DEVID_MINOR_NAME_ALL,
737 				    &nmlist) == 0) {
738 					d->dd_flags |= MD_DR_FIX_MB_DID;
739 					retake_flag = 1;
740 					continue;
741 				}
742 
743 				/*
744 				 * If the devid stored in the drive record
745 				 * is on the list of replicated disks found
746 				 * during a system scan then set both flags
747 				 * so that the locator block, namespaces
748 				 * (diskset and local set), master block
749 				 * and unresolved replicated flag are updated.
750 				 */
751 				new_devid = replicated_list_lookup(
752 				    devid_sizeof((ddi_devid_t)old_devid),
753 				    old_devid);
754 				devid_free(old_devid);
755 
756 				/*
757 				 * If devid stored in the drive record is
758 				 * not found then set flag to mark
759 				 * that set is still unresolved and
760 				 * continue to next drive record.
761 				 */
762 				if (new_devid == NULL) {
763 					unrslv_replicated = 1;
764 					continue;
765 				}
766 
767 				/*
768 				 * Set flags to fix up the master block,
769 				 * locator block of the diskset, diskset
770 				 * namespace and the local set namespace.
771 				 */
772 				d->dd_flags |= (MD_DR_FIX_MB_DID |
773 						MD_DR_FIX_LB_NM_DID);
774 				retake_flag = 1;
775 			}
776 		}
777 
778 	}
779 
780 	/*
781 	 * Check the local devid namespace to see if the disks
782 	 * have been moved. Use the local set first of all as this contains
783 	 * entries for the disks in the set.
784 	 *
785 	 * This is being done before the tk_own_bydd because the disks
786 	 * in the dd list could be wrong! But it should be done with the lock
787 	 * held for the set.
788 	 */
789 	local_sp = metasetname(MD_LOCAL_NAME, ep);
790 	for (d = dd; d != NULL; d = d->dd_next) {
791 		/*
792 		 * Actually do the check of the disks.
793 		 */
794 		ret = meta_upd_ctdnames(&local_sp, 0, side, d->dd_dnp, &newname,
795 		    ep);
796 
797 		if ((ret == METADEVADM_ERR) ||
798 		    (ret == METADEVADM_DSKNAME_ERR)) {
799 			/* check failed in some unknown manner */
800 			rval = -1;
801 			goto out;
802 		} else if (ret == METADEVADM_DISKMOVE) {
803 
804 			/*
805 			 * Update the dd namelist so that the rpc.metamhd
806 			 * gets the correct disks to reserve - it is the rname
807 			 * we are interested in.
808 			 */
809 			if (newname != NULL) {
810 				char	*save_devid;
811 				/*
812 				 * Need to save the side names key as this
813 				 * points to the namespace entry that will
814 				 * need to be updated. In addition the call
815 				 * to meta_make_sidenmlist does not actually
816 				 * set the namespace key.
817 				 */
818 				side_names_key = d->dd_dnp->side_names_key;
819 
820 				/*
821 				 * There is the possibility that there
822 				 * will be multiple disks with the same
823 				 * name but different devids in the
824 				 * drivelist. Because of this, we need
825 				 * to look for a new dnp based on devid
826 				 * and not name.
827 				 */
828 				save_devid = Strdup(d->dd_dnp->devid);
829 				metafreedrivename(d->dd_dnp);
830 				d->dd_dnp = metadrivenamebydevid(&sp,
831 				    save_devid, newname, ep);
832 				Free(save_devid);
833 				Free(newname);
834 				/*
835 				 * null newname so we are reset for next time
836 				 * through
837 				 */
838 				newname = NULL;
839 				ret = meta_make_sidenmlist(sp,
840 					    d->dd_dnp, 0, NULL, ep);
841 				d->dd_dnp->side_names_key = side_names_key;
842 				if (ret == -1) {
843 					rval = -1;
844 					goto out;
845 				}
846 			}
847 		}
848 	}
849 
850 
851 	RB_TEST(1, "take", ep)
852 
853 	RB_PREEMPT;
854 	rb_level = 1;	/* level 1 */
855 
856 	RB_TEST(2, "take", ep)
857 
858 	if (!MD_ATSET_DESC(sd)) {
859 		if (tk_own_bydd(sp, dd, mhiargsp,
860 		    flags & MD_IM_PARTIAL_DISKSET, ep))
861 			goto rollback;
862 	}
863 
864 	RB_TEST(3, "take", ep)
865 
866 	RB_PREEMPT;
867 	rb_level = 2;	/* level 2 */
868 
869 	RB_TEST(4, "take", ep)
870 
871 	if (clnt_stimeout(mynode(), sp, mhiargsp, ep) == -1)
872 		goto rollback;
873 
874 	if (setup_db_bydd(sp, dd, (flags & TAKE_FORCE), ep) == -1) {
875 		if (! mdismddberror(ep, MDE_DB_ACCOK) &&
876 		    ! mdismddberror(ep, MDE_DB_TAGDATA))
877 			goto rollback;
878 		mdclrerror(ep);
879 	}
880 
881 	RB_TEST(5, "take", ep)
882 
883 	RB_PREEMPT;
884 	rb_level = 3;	/* level 3 */
885 
886 	RB_TEST(6, "take", ep)
887 
888 	/* Snarf set of traditional diskset doesn't use stale information */
889 	if (snarf_set(sp, FALSE, ep)) {
890 		if (mdismddberror(ep, MDE_DB_STALE) ||
891 		    mdismddberror(ep, MDE_DB_ACCOK) ||
892 		    mdismddberror(ep, MDE_DB_TAGDATA)) {
893 			rval = -1;
894 			goto out;
895 		}
896 
897 		if (! mdismddberror(ep, MDE_DB_NODB) &&
898 		    ! mdismddberror(ep, MDE_DB_NOTOWNER))
899 			goto rollback;
900 
901 		/*
902 		 * Look at the set on all other hosts, if every other host
903 		 * has the same set with a larger genid, then we destroy this
904 		 * copy.
905 		 */
906 		for (i = 0; i < MD_MAXSIDES; i++) {
907 			/* Skip empty slots */
908 			if (sd->sd_nodes[i][0] == '\0')
909 				continue;
910 
911 			/* Skip this node */
912 			if (strcmp(sd->sd_nodes[i], mynode()) == 0)
913 				continue;
914 
915 			numsides++;
916 
917 			has_set = nodehasset(sp, sd->sd_nodes[i],
918 			    NHS_NST_EQ_G_GT, &xep);
919 
920 			if (has_set < 0) {
921 				if (! mdiserror(&xep, MDE_NO_SET) &&
922 				    ! mdismddberror(&xep, MDE_DB_NODB))
923 					goto rollback;
924 				matches++;
925 				mdclrerror(&xep);
926 				continue;
927 			}
928 
929 			if (has_set)
930 				matches++;
931 		}
932 
933 		/* Destroy the set */
934 		if (numsides > 0 && (numsides - matches) == 0) {
935 			if (meta_set_destroy(sp, FALSE, &xep))
936 				mdclrerror(&xep);
937 			(void) mddserror(ep, MDE_DS_SETCLEANUP, sp->setno,
938 			    sp->setname, NULL, mynode());
939 			rval = -1;
940 		}
941 		goto rollback;
942 	}
943 
944 	/*
945 	 * If an unresolved replicated diskset, fix up diskset
946 	 * and local namespaces, master block and drive record
947 	 * with the new devid.  If all drives in diskset are
948 	 * now resolved, then clear set unresolved replicated flag.
949 	 * If an error is encountered, don't fail the take, but
950 	 * don't proceed any further in resolving the replicated disks.
951 	 */
952 	if (sd->sd_flags & MD_SR_UNRSLV_REPLICATED) {
953 		/* Fix up diskset and local namespaces with new devids */
954 		meta_unrslv_replicated_nm(sp, dd, dnlp, ep);
955 		if (mdisok(ep)) {
956 			/* Fix up master block with new devids  */
957 			meta_unrslv_replicated_mb(sp, dd, dnlp, ep);
958 		}
959 
960 		/* If all drives are resolved, set OK flag in set record. */
961 		if (mdisok(ep) && (unrslv_replicated == 0)) {
962 			/* Ignore failure since no bad effect. */
963 			(void) clnt_upd_sr_flags(mynode(), sp, MD_SR_OK, ep);
964 		}
965 		mdclrerror(ep);
966 
967 	}
968 
969 	pathname_return = pathname_reload(&sp, sp->setno, ep);
970 	if ((pathname_return == METADEVADM_ERR) ||
971 	    (pathname_return == METADEVADM_DSKNAME_ERR)) {
972 		goto rollback;
973 	}
974 
975 
976 	if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, ep) < 0)
977 		goto rollback;
978 
979 	if (upd_dr_dbinfo(sp, sd, dd, rlp, (flags & TAKE_FORCE), ep) < 0) {
980 		metafreereplicalist(rlp);
981 		goto rollback;
982 	}
983 
984 	metafreereplicalist(rlp);
985 
986 	/*
987 	 * If the set doesn't have the MD_SR_MB_DEVID bit set, i.e
988 	 * the drives in the set don't have the device id information,
989 	 * then stick it in if possible.
990 	 *
991 	 * If updating the master block fails for whatever reason, it's
992 	 * okay. It just means the disk(s) in the diskset won't be self
993 	 * identifying.
994 	 */
995 	if (!(sd->sd_flags & MD_SR_MB_DEVID)) {
996 		/* Lock the set on current set members */
997 		for (i = 0; i < MD_MAXSIDES; i++) {
998 			/* Skip empty slots */
999 			if (sd->sd_nodes[i][0] == '\0')
1000 				continue;
1001 
1002 			/* We already locked this side */
1003 			if (strcmp(mynode(), sd->sd_nodes[i]) == 0)
1004 				continue;
1005 
1006 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1007 				rval = -1;
1008 				goto out;
1009 			}
1010 		}
1011 		rb_level = 4;	/* level 4 */
1012 
1013 		if (meta_update_mb(sp, dd, ep) == 0)
1014 			/* update the sr_flags on all hosts */
1015 			for (i = 0; i < MD_MAXSIDES; i++) {
1016 				/* Skip empty slots */
1017 				if (sd->sd_nodes[i][0] == '\0')
1018 					continue;
1019 
1020 				if (clnt_upd_sr_flags(sd->sd_nodes[i],
1021 				    sp, (sd->sd_flags | MD_SR_MB_DEVID), ep))
1022 					goto rollback;
1023 			}
1024 
1025 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1026 		for (i = 0; i < MD_MAXSIDES; i++) {
1027 			/* Skip empty slots */
1028 			if (sd->sd_nodes[i][0] == '\0')
1029 				continue;
1030 
1031 			/* Unlocked of this side is done later */
1032 			if (strcmp(mynode(), sd->sd_nodes[i]) == 0)
1033 				continue;
1034 
1035 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1036 				if (rval == 0)
1037 					(void) mdstealerror(ep, &xep);
1038 				rval = -1;
1039 			}
1040 		}
1041 	}
1042 
1043 	/*
1044 	 * If we get here, we need to unlock the set before the resync
1045 	 * gets called, otherwise the "daemon" will hold the set lock
1046 	 * until the resync is done!
1047 	 */
1048 
1049 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1050 	if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1051 		if (rval == 0)
1052 			(void) mdstealerror(ep, &xep);
1053 		rval = -1;
1054 	}
1055 	cl_set_setkey(NULL);
1056 
1057 	md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1058 
1059 	/* We try to get things resync'ed, but this can fail */
1060 	mdclrerror(&xep);
1061 	if (meta_resync_all(sp, MD_DEF_RESYNC_BUF_SIZE, &xep) != 0) {
1062 		if (rval == 0)
1063 			(void) mdstealerror(ep, &xep);
1064 		rval = -1;
1065 	}
1066 
1067 	RB_TEST(7, "take", ep)
1068 
1069 	/*
1070 	 * In order to resolve the namespace major driver names and
1071 	 * to have the subdrivers attempt to re-associate devts from
1072 	 * the newly resolved replicated device ids, return a '2'.
1073 	 * This instructs metaset to release the diskset and re-take.
1074 	 *
1075 	 * Return a 2 if
1076 	 * 	- no error was detected on the take
1077 	 *	- a replicated unresolved devid was resolved during take
1078 	 *	- take isn't being called during an import
1079 	 *	- this isn't already a re-take situation
1080 	 */
1081 	if ((rval == 0) && (retake_flag == 1) &&
1082 	    ((flags & (TAKE_RETAKE | TAKE_IMP)) == 0)) {
1083 		rval = 2;
1084 	}
1085 
1086 	return (rval);
1087 
1088 out:
1089 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1090 	if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1091 		if (rval == 0)
1092 			(void) mdstealerror(ep, &xep);
1093 		rval = -1;
1094 	}
1095 	if (!(sd->sd_flags & MD_SR_MB_DEVID) && (rb_level > 2)) {
1096 		for (i = 0; i < MD_MAXSIDES; i++) {
1097 			/* Skip empty slots */
1098 			if (sd->sd_nodes[i][0] == '\0')
1099 				continue;
1100 
1101 			/* We already unlocked this side */
1102 			if (strcmp(mynode(), sd->sd_nodes[i]) == 0)
1103 				continue;
1104 
1105 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1106 				if (rval == 0)
1107 					(void) mdstealerror(ep, &xep);
1108 				rval = -1;
1109 			}
1110 		}
1111 	}
1112 	cl_set_setkey(NULL);
1113 
1114 	md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1115 
1116 	return (rval);
1117 
1118 rollback:
1119 	/* Make sure we are blocking all signals */
1120 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
1121 		mdclrerror(&xep);
1122 
1123 	rval = -1;
1124 
1125 	/* level 4 */
1126 	if (rb_level > 3) {
1127 		if (sd->sd_flags & MD_SR_MB_DEVID) {
1128 			/* update the sr_flags on all hosts */
1129 			for (i = 0; i < MD_MAXSIDES; i++) {
1130 				/* Skip empty slots */
1131 				if (sd->sd_nodes[i][0] == '\0')
1132 					continue;
1133 
1134 				if (clnt_upd_sr_flags(sd->sd_nodes[i], sp,
1135 				    (sd->sd_flags & ~MD_SR_MB_DEVID), &xep))
1136 					mdclrerror(&xep);
1137 			}
1138 		}
1139 
1140 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1141 		for (i = 0; i < MD_MAXSIDES; i++) {
1142 			/* Skip empty slots */
1143 			if (sd->sd_nodes[i][0] == '\0')
1144 				continue;
1145 
1146 			/* We will unlock this side below */
1147 			if (strcmp(mynode(), sd->sd_nodes[i]) == 0)
1148 				continue;
1149 
1150 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
1151 				mdclrerror(&xep);
1152 		}
1153 	}
1154 
1155 	/* level 3 */
1156 	if (rb_level > 2) {
1157 		if (halt_set(sp, &xep))
1158 			mdclrerror(&xep);
1159 	}
1160 
1161 	/* level 2 */
1162 	if (rb_level > 1) {
1163 		if (clnt_stimeout(mynode(), sp, &defmhiargs, &xep) == -1)
1164 			mdclrerror(&xep);
1165 	}
1166 
1167 	/* level 1 */
1168 	if (rb_level > 0) {
1169 		if (!MD_ATSET_DESC(sd)) {
1170 			if (rel_own_bydd(sp, dd, FALSE, &xep))
1171 				mdclrerror(&xep);
1172 		}
1173 	}
1174 
1175 	/* level 0 */
1176 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1177 	if (clnt_unlock_set(mynode(), cl_sk, &xep))
1178 		mdclrerror(&xep);
1179 	cl_set_setkey(NULL);
1180 
1181 	/* release signals back to what they were on entry */
1182 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1183 		mdclrerror(&xep);
1184 
1185 	md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1186 
1187 	return (rval);
1188 }
1189 
1190 int
1191 meta_set_release(
1192 	mdsetname_t		*sp,
1193 	md_error_t		*ep
1194 )
1195 {
1196 	int			rval = 0;
1197 	md_drive_desc		*dd;
1198 	mhd_mhiargs_t		mhiargs;
1199 	sigset_t		oldsigs;
1200 	md_setkey_t		*cl_sk;
1201 	int			rb_level = 0;
1202 	md_error_t		xep = mdnullerror;
1203 
1204 	/* Make sure we own the set */
1205 	if (meta_check_ownership(sp, ep) != 0)
1206 		return (-1);
1207 
1208 	/* Get the drive descriptors */
1209 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
1210 	    ep)) == NULL)
1211 		if (! mdisok(ep))
1212 			return (-1);
1213 
1214 	/* Get timeout values in case we need to roll back this release */
1215 	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
1216 	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) != 0)
1217 		return (-1);
1218 
1219 	/* END CHECK CODE */
1220 
1221 	md_rb_sig_handling_on();
1222 
1223 	/* Lock the set on our side */
1224 	if (clnt_lock_set(mynode(), sp, ep)) {
1225 		rval = -1;
1226 		goto out;
1227 	}
1228 
1229 	RB_TEST(1, "release", ep)
1230 
1231 	RB_PREEMPT;
1232 	rb_level = 1;	/* level 1 */
1233 
1234 	RB_TEST(2, "release", ep)
1235 
1236 	if (halt_set(sp, ep))
1237 		goto rollback;
1238 
1239 	RB_TEST(3, "release", ep)
1240 
1241 	RB_PREEMPT;
1242 	rb_level = 2;	/* level 2 */
1243 
1244 	RB_TEST(4, "release", ep)
1245 
1246 	if (rel_own_bydd(sp, dd, FALSE, ep))
1247 		goto rollback;
1248 
1249 	RB_TEST(5, "release", ep)
1250 
1251 	RB_PREEMPT;
1252 	rb_level = 3;	/* level 3 */
1253 
1254 	RB_TEST(6, "release", ep)
1255 
1256 	if (clnt_stimeout(mynode(), sp, &defmhiargs, ep) == -1)
1257 		goto rollback;
1258 
1259 	RB_TEST(7, "release", ep)
1260 
1261 out:
1262 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1263 	if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1264 		if (rval == 0)
1265 			(void) mdstealerror(ep, &xep);
1266 		rval = -1;
1267 	}
1268 	cl_set_setkey(NULL);
1269 
1270 	md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1271 
1272 	return (rval);
1273 
1274 rollback:
1275 	/* Make sure we are blocking all signals */
1276 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
1277 		mdclrerror(&xep);
1278 
1279 	rval = -1;
1280 
1281 	/* level 3 */
1282 	if (rb_level > 2) {
1283 		if (clnt_stimeout(mynode(), sp, &mhiargs, &xep) == -1)
1284 			mdclrerror(&xep);
1285 	}
1286 
1287 	/* level 2 */
1288 	if (rb_level > 1) {
1289 		if (tk_own_bydd(sp, dd, &mhiargs, FALSE, &xep))
1290 			mdclrerror(&xep);
1291 	}
1292 
1293 	/* level 1 */
1294 	if (rb_level > 0) {
1295 		if (setup_db_bydd(sp, dd, TRUE, &xep) == -1)
1296 			mdclrerror(&xep);
1297 
1298 		/* Snarf set of trad diskset doesn't use stale information */
1299 		if (snarf_set(sp, FALSE, &xep))
1300 			mdclrerror(&xep);
1301 	}
1302 
1303 	/* level 0 */
1304 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1305 	if (clnt_unlock_set(mynode(), cl_sk, &xep))
1306 		mdclrerror(&xep);
1307 	cl_set_setkey(NULL);
1308 
1309 	/* release signals back to what they were on entry */
1310 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1311 		mdclrerror(&xep);
1312 
1313 	md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1314 
1315 	return (rval);
1316 }
1317