xref: /onnv-gate/usr/src/lib/lvm/libmeta/common/meta_import.c (revision 127:b5442f86e50a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <assert.h>
30 #include <ctype.h>
31 #include <libdevinfo.h>
32 #include <mdiox.h>
33 #include <meta.h>
34 #include "meta_repartition.h"
35 #include "meta_set_prv.h"
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <sys/lvm/md_mddb.h>
40 #include <sys/lvm/md_names.h>
41 #include <sys/lvm/md_crc.h>
42 
43 typedef struct did_list {
44 	void		*rdid;	/* real did if replicated set */
45 	void		*did;	/* did stored in lb */
46 	char		*devname;
47 	dev_t		dev;
48 	uint_t		did_index;
49 	char		*minor_name;
50 	struct did_list	*next;
51 } did_list_t;
52 
/*
 * One entry of the global replicated disk list.  It pairs the device id a
 * disk was recorded under (old_devid) with the device id that
 * replicated_list_lookup() hands back for it (new_devid).
 */
typedef struct replicated_disk {
	void			*old_devid;	/* key compared on lookup */
	void			*new_devid;	/* value returned on lookup */
	struct replicated_disk	*next;		/* next entry in same bucket */
} replicated_disk_t;
58 
59 /*
60  * The current implementation limits the max device id length to 256 bytes.
61  * Should the max device id length be increased, this define would have to
62  * be bumped up accordingly
63  */
64 #define	MAX_DEVID_LEN		256
65 
66 /*
67  * We store a global list of all the replicated disks in the system. In
68  * order to prevent us from performing a linear search on this list, we
69  * store the disks in a two dimensional sparse array. The disks are bucketed
70  * based on the length of their device ids.
71  */
72 static replicated_disk_t *replicated_disk_list[MAX_DEVID_LEN + 1] = {NULL};
73 
74 /*
75  * The list of replicated disks is built just once and this flag is set
76  * once it's done
77  */
78 static int replicated_disk_list_built = 0;
79 
80 /*
81  * Map logical blk to physical
82  *
83  * This is based on the routine of the same name in the md kernel module (see
84  * file md_mddb.c), with the following caveats:
85  *
86  * - The kernel routine works on in core master blocks, or mddb_mb_ic_t; this
87  * routine works instead on the mddb_mb_t read directly from the disk
88  */
89 static daddr_t
90 getphysblk(
91 	mddb_block_t	blk,
92 	mddb_mb_t	*mbp
93 )
94 {
95 	/*
96 	 * Sanity check: is the block within range?  If so, we then assume
97 	 * that the block range map in the master block is valid and
98 	 * consistent with the block count.  Unfortunately, there is no
99 	 * reliable way to validate this assumption.
100 	 */
101 	if (blk >= mbp->mb_blkcnt || blk >= mbp->mb_blkmap.m_consecutive)
102 		return ((daddr_t)-1);
103 
104 	return (mbp->mb_blkmap.m_firstblk + blk);
105 }
106 
107 
108 
109 /*
110  * drive_append()
111  *
112  * Append to tail of linked list of md_im_drive_info_t.
113  *
114  * Will allocate space for new node and copy args into new space.
115  *
116  * Returns pointer to new node.
117  */
118 static md_im_drive_info_t *
119 drive_append(
120 	md_im_drive_info_t	**midpp,
121 	mddrivename_t		*dnp,
122 	void			*devid,
123 	void			*rdevid,
124 	int			devid_sz,
125 	char			*minor_name,
126 	md_timeval32_t		timestamp,
127 	md_im_replica_info_t	*mirp
128 )
129 {
130 	md_im_drive_info_t	*midp;
131 	int			o_devid_sz;
132 
133 	for (; (*midpp != NULL); midpp = &((*midpp)->mid_next))
134 		;
135 
136 	midp = *midpp = Zalloc(sizeof (md_im_drive_info_t));
137 
138 	midp->mid_dnp = dnp;
139 
140 	/*
141 	 * If rdevid is not NULL then we know we are dealing with
142 	 * replicated diskset case. 'devid_sz' will always be the
143 	 * size of a valid devid which can be 'devid' or 'rdevid'
144 	 */
145 	midp->mid_devid = (void *)Malloc(devid_sz);
146 
147 	if (rdevid) {
148 		(void) memcpy(midp->mid_devid, rdevid, devid_sz);
149 		/*
150 		 * Also need to store the 'other' devid
151 		 */
152 		o_devid_sz = devid_sizeof((ddi_devid_t)devid);
153 		midp->mid_o_devid = (void *)Malloc(o_devid_sz);
154 		(void) memcpy(midp->mid_o_devid, devid, o_devid_sz);
155 		midp->mid_o_devid_sz = o_devid_sz;
156 	} else {
157 		/*
158 		 * In the case of regular diskset, midp->mid_o_devid
159 		 * will be a NULL pointer
160 		 */
161 		(void) memcpy(midp->mid_devid, devid, devid_sz);
162 	}
163 
164 	midp->mid_devid_sz = devid_sz;
165 	midp->mid_setcreatetimestamp = timestamp;
166 	(void) strlcpy(midp->mid_minor_name, minor_name, MDDB_MINOR_NAME_MAX);
167 	midp->mid_replicas = mirp;
168 
169 	return (midp);
170 }
171 
172 
173 
174 /*
175  * drive_append_wrapper()
176  *
177  * Constant time append wrapper; the append function will always walk the list,
178  * this will take a tail argument and use the append function on just the tail
179  * node, doing the appropriate old-tail-next-pointer bookkeeping.
180  */
181 static md_im_drive_info_t **
182 drive_append_wrapper(
183 	md_im_drive_info_t	**tailpp,
184 	mddrivename_t		*dnp,
185 	void 			*devid,
186 	void			*rdevid,
187 	int			devid_sz,
188 	char			*minor_name,
189 	md_timeval32_t		timestamp,
190 	md_im_replica_info_t	*mirp
191 )
192 {
193 	(void) drive_append(tailpp, dnp, devid, rdevid, devid_sz, minor_name,
194 		timestamp, mirp);
195 
196 	if ((*tailpp)->mid_next == NULL)
197 		return (tailpp);
198 
199 	return (&((*tailpp)->mid_next));
200 }
201 
202 
203 
204 /*
205  * replica_append()
206  *
207  * Append to tail of linked list of md_im_replica_info_t.
208  *
209  * Will allocate space for new node and copy args into new space.
210  *
211  * Returns pointer to new node.
212  */
213 static md_im_replica_info_t *
214 replica_append(
215 	md_im_replica_info_t	**mirpp,
216 	int			flags,
217 	daddr32_t		offset,
218 	daddr32_t		length,
219 	md_timeval32_t		timestamp
220 )
221 {
222 	md_im_replica_info_t	*mirp;
223 
224 	for (; (*mirpp != NULL); mirpp = &((*mirpp)->mir_next))
225 		;
226 
227 	mirp = *mirpp = Zalloc(sizeof (md_im_replica_info_t));
228 
229 	mirp->mir_flags = flags;
230 	mirp->mir_offset = offset;
231 	mirp->mir_length = length;
232 	mirp->mir_timestamp = timestamp;
233 
234 	return (mirp);
235 
236 }
237 
238 
239 
240 /*
241  * replica_append_wrapper()
242  *
243  * Constant time append wrapper; the append function will always walk the list,
244  * this will take a tail argument and use the append function on just the tail
245  * node, doing the appropriate old-tail-next-pointer bookkeeping.
246  */
247 static md_im_replica_info_t **
248 replica_append_wrapper(
249 	md_im_replica_info_t	**tailpp,
250 	int			flags,
251 	daddr32_t		offset,
252 	daddr32_t		length,
253 	md_timeval32_t		timestamp
254 )
255 {
256 	(void) replica_append(tailpp, flags, offset, length, timestamp);
257 
258 	if ((*tailpp)->mir_next == NULL)
259 		return (tailpp);
260 
261 	return (&(*tailpp)->mir_next);
262 }
263 
264 /*
265  * map_replica_disk()
266  *
267  * Searches the device id list for a specific
268  * disk based on the locator block device id array index.
269  *
270  * Returns a pointer to the did_list node if a match was
271  * found or NULL otherwise.
272  */
273 static did_list_t *
274 map_replica_disk(
275 	did_list_t	*did_listp,
276 	int		did_index
277 )
278 {
279 	did_list_t	*tailp = did_listp;
280 
281 	while (tailp != NULL) {
282 		if (tailp->did_index == did_index)
283 			return (tailp);
284 		tailp = tailp->next;
285 	}
286 
287 	/* not found, return failure */
288 	return (NULL);
289 }
290 
291 /*
292  * replicated_list_lookup()
293  *
294  * looks up a replicated disk entry in the global replicated disk list
295  * based upon the length of that disk's device id. returns the new device id
296  * for the disk.
297  * If you store the returned devid you must create a local copy.
298  */
299 static void *
300 replicated_list_lookup(
301 	uint_t	devid_len,
302 	void	*old_devid
303 )
304 {
305 	replicated_disk_t *head = NULL;
306 
307 	assert(devid_len <= MAX_DEVID_LEN);
308 	head = replicated_disk_list[devid_len];
309 
310 	if (head == NULL)
311 		return (NULL);
312 
313 	do {
314 		if (devid_compare((ddi_devid_t)old_devid,
315 			(ddi_devid_t)head->old_devid) == 0)
316 			return (head->new_devid);
317 		head = head->next;
318 	} while (head != NULL);
319 
320 	return (NULL);
321 }
322 
323 /*
324  * replicated_list_insert()
325  *
326  * inserts a replicated disk entry into the global replicated disk list
327  */
328 static void
329 replicated_list_insert(
330 	size_t	old_devid_len,
331 	void	*old_devid,
332 	void	*new_devid
333 )
334 {
335 	replicated_disk_t	*repl_disk, **first_entry;
336 	void			*repl_old_devid = NULL;
337 
338 	assert(old_devid_len <= MAX_DEVID_LEN);
339 
340 	repl_disk = Zalloc(sizeof (replicated_disk_t));
341 	repl_old_devid = Zalloc(old_devid_len);
342 	(void) memcpy(repl_old_devid, (void *)old_devid, old_devid_len);
343 
344 	repl_disk->old_devid = repl_old_devid;
345 	repl_disk->new_devid = new_devid;
346 
347 	first_entry = &replicated_disk_list[old_devid_len];
348 
349 	if (*first_entry == NULL) {
350 		*first_entry = repl_disk;
351 		return;
352 	}
353 
354 	repl_disk->next = *first_entry;
355 	replicated_disk_list[old_devid_len] = repl_disk;
356 }
357 
358 /*
359  * get_replica_disks()
360  *
361  * Will step through the locator records in the supplied locator block, and add
362  * each one with an active replica to a supplied list of md_im_drive_info_t, and
363  * add the appropriate replicas to the md_im_replica_info_t contained therein.
364  */
365 static void
366 get_replica_disks(
367 	md_im_set_desc_t	*misp,
368 	did_list_t		*did_listp,
369 	mddb_mb_t		*mb,
370 	mddb_lb_t		*lbp,
371 	md_error_t		*ep,
372 	int			replicated
373 )
374 {
375 	mddrivename_t		*dnp;
376 	int			indx, on_list;
377 	mdsetname_t		*sp = metasetname(MD_LOCAL_NAME, ep);
378 	int			flags;
379 	int			devid_sz;
380 	char			*minor_name;
381 	did_list_t		*replica_disk;
382 	daddr32_t		offset;
383 	daddr32_t		length;
384 	md_timeval32_t		timestamp;
385 	md_im_replica_info_t	**mirpp = NULL;
386 	md_im_drive_info_t	**midpp = &misp->mis_drives;
387 	md_im_drive_info_t	*midp;
388 	void			*did;
389 
390 	for (indx = 0; indx < lbp->lb_loccnt; indx++) {
391 
392 		on_list = 0;
393 		if (lbp->lb_locators[indx].l_flags & MDDB_F_ACTIVE) {
394 
395 			/*
396 			 * search the device id list for a
397 			 * specific ctds based on the locator
398 			 * block device id array index.
399 			 */
400 			replica_disk = map_replica_disk(did_listp, indx);
401 
402 			assert(replica_disk != NULL);
403 
404 
405 			/*
406 			 * metadrivename() can fail for a slice name
407 			 * if there is not an existing mddrivename_t.
408 			 * So we use metadiskname() to strip the slice
409 			 * number.
410 			 */
411 			dnp = metadrivename(&sp,
412 			    metadiskname(replica_disk->devname), ep);
413 
414 			for (midp = misp->mis_drives; midp != NULL;
415 				midp = midp->mid_next) {
416 				if (dnp == midp->mid_dnp) {
417 					on_list = 1;
418 					mirpp = &midp->mid_replicas;
419 					break;
420 				}
421 			}
422 
423 			/*
424 			 * Get the correct devid_sz
425 			 */
426 			if (replicated)
427 				did = replica_disk->rdid;
428 			else
429 				did = replica_disk->did;
430 
431 			devid_sz = devid_sizeof((ddi_devid_t)did);
432 			minor_name = replica_disk->minor_name;
433 
434 			/*
435 			 * New on the list so add it
436 			 */
437 			if (!on_list) {
438 				mddb_mb_t	*mbp;
439 				uint_t		sliceno;
440 				mdname_t	*rsp;
441 				int		fd = -1;
442 
443 				mbp = Malloc(DEV_BSIZE);
444 
445 				/* determine the replica slice */
446 				if (meta_replicaslice(dnp, &sliceno,
447 				    ep) != 0) {
448 					Free(mbp);
449 					continue;
450 				}
451 
452 				/*
453 				 * if the replica slice size is zero,
454 				 * don't bother opening
455 				 */
456 				if (dnp->vtoc.parts[sliceno].size == 0) {
457 					Free(mbp);
458 					continue;
459 				}
460 
461 				if ((rsp = metaslicename(dnp, sliceno,
462 				    ep)) == NULL) {
463 					Free(mbp);
464 					continue;
465 				}
466 
467 				if ((fd = open(rsp->rname,
468 				    O_RDONLY| O_NDELAY)) < 0) {
469 					Free(mbp);
470 					continue;
471 				}
472 
473 				/*
474 				 * a drive may not have a master block
475 				 */
476 				if (read_master_block(ep, fd, mbp,
477 				    DEV_BSIZE) <= 0) {
478 					mdclrerror(ep);
479 					Free(mbp);
480 					(void) close(fd);
481 					continue;
482 				}
483 
484 				(void) close(fd);
485 				midpp = drive_append_wrapper(midpp, dnp,
486 				    replica_disk->did, replica_disk->rdid,
487 				    devid_sz, minor_name, mbp->mb_setcreatetime,
488 				    NULL);
489 				mirpp = &((*midpp)->mid_replicas);
490 				Free(mbp);
491 			}
492 
493 			/*
494 			 * For either of these assertions to fail, it implies
495 			 * a NULL return from metadrivename() above.  Since
496 			 * the args came from a presumed valid locator block,
497 			 * that's Bad.
498 			 */
499 			assert(midpp != NULL);
500 			assert(mirpp != NULL);
501 
502 			/*
503 			 * Extract the parameters describing this replica.
504 			 *
505 			 * The magic "1" in the length calculation accounts
506 			 * for the length of the master block, in addition to
507 			 * the block count it describes.  (The master block
508 			 * will always take up one block on the disk, and
509 			 * there will always only be one master block per
510 			 * replica, even though much of the code is structured
511 			 * to handle noncontiguous replicas.)
512 			 */
513 			flags = lbp->lb_locators[indx].l_flags;
514 			offset = lbp->lb_locators[indx].l_blkno;
515 			length = mb->mb_blkcnt + 1;
516 			timestamp = mb->mb_setcreatetime;
517 
518 			mirpp = replica_append_wrapper(mirpp, flags,
519 				offset, length, timestamp);
520 
521 			/*
522 			 * If we're here it means -
523 			 *
524 			 * a) we had an active copy of the replica, and
525 			 * b) we've added the disk to the list of
526 			 *    disks as well.
527 			 *
528 			 * We need to bump up the number of active
529 			 * replica count for each such replica so that it
530 			 * can be used later for replica quorum check.
531 			 */
532 			misp->mis_active_replicas++;
533 		}
534 	}
535 }
536 
537 
538 
539 /*
540  * get_nonreplica_disks()
541  *
542  * Extracts the disks without replicas from the locator name space and adds them
543  * to the supplied list of md_im_drive_info_t.
544  */
545 static void
546 get_nonreplica_disks(
547 	md_im_set_desc_t	*misp,
548 	mddb_rb_t		*did_nm,
549 	mddb_rb_t		*did_shrnm,
550 	md_error_t		*ep,
551 	int			replicated
552 )
553 {
554 	char			*search_path = "/dev";
555 	devid_nmlist_t		*nmlist;
556 	md_im_drive_info_t	*midp, **midpp = &misp->mis_drives;
557 	mddrivename_t		*dnp;
558 	mdsetname_t		*sp = metasetname(MD_LOCAL_NAME, ep);
559 	mddb_rb_t		*rbp_did = did_nm;
560 	mddb_rb_t		*rbp_did_shr = did_shrnm;
561 	int			on_list = 0;
562 	int			devid_sz;
563 	struct devid_min_rec	*did_rec;
564 	struct devid_shr_rec	*did_shr_rec;
565 	struct did_shr_name	*did;
566 	struct did_min_name	*min;
567 	void			*r_did;	/* NULL if not a replicated diskset */
568 	void			*valid_did;
569 
570 	/*
571 	 * We got a pointer to an mddb record, which we expect to contain a
572 	 * name record; extract the pointer thereto.
573 	 */
574 	/* LINTED */
575 	did_rec = (struct devid_min_rec *)((caddr_t)(&rbp_did->rb_data));
576 	/* LINTED */
577 	did_shr_rec = (struct devid_shr_rec *)
578 	    ((caddr_t)(&rbp_did_shr->rb_data));
579 
580 	/*
581 	 * Skip the nm_rec_hdr and iterate on the array of struct minor_name
582 	 * at the end of the devid_min_rec
583 	 */
584 	for (min = &did_rec->minor_name[0]; min->min_devid_key != 0;
585 	    /* LINTED */
586 	    min = (struct did_min_name *)((char *)min + DID_NAMSIZ(min))) {
587 
588 		on_list = 0;
589 		r_did = NULL;
590 
591 		/*
592 		 * For a give DID_NM key, locate the corresponding device
593 		 * id from DID_NM_SHR
594 		 */
595 		for (did = &did_shr_rec->device_id[0]; did->did_key != 0;
596 		    /* LINTED */
597 		    did = (struct did_shr_name *)
598 		    ((char *)did + DID_SHR_NAMSIZ(did))) {
599 			/*
600 			 * We got a match, this is the device id we're
601 			 * looking for
602 			 */
603 			if (min->min_devid_key == did->did_key)
604 				break;
605 		}
606 
607 		if (did->did_key == 0) {
608 			/* we didn't find a match */
609 			assert(did->did_key != 0);
610 			md_exit(NULL, 1);
611 		}
612 
613 		/*
614 		 * If replicated diskset
615 		 */
616 		if (replicated) {
617 			size_t		new_devid_len;
618 			char		*temp;
619 			/*
620 			 * In this case, did->did_devid will
621 			 * be invalid so lookup the real one
622 			 */
623 			temp = replicated_list_lookup(did->did_size,
624 			    did->did_devid);
625 			new_devid_len = devid_sizeof((ddi_devid_t)temp);
626 			r_did = Zalloc(new_devid_len);
627 			(void) memcpy(r_did, temp, new_devid_len);
628 			valid_did = r_did;
629 		} else {
630 			valid_did = did->did_devid;
631 		}
632 
633 		/* Get the ctds mapping for that device id */
634 		if (meta_deviceid_to_nmlist(search_path,
635 		    (ddi_devid_t)valid_did,
636 		    &min->min_name[0], &nmlist) == 0) {
637 
638 			assert(nmlist->devname != NULL);
639 			dnp = metadrivename(&sp,
640 			    metadiskname(nmlist->devname), ep);
641 
642 			assert(dnp != NULL);
643 			/* Is it already on the list? */
644 			for (midp = misp->mis_drives; midp != NULL;
645 			    midp = midp->mid_next) {
646 				if (midp->mid_dnp == dnp) {
647 					on_list = 1;
648 					break;
649 				}
650 			}
651 
652 			devid_sz = devid_sizeof(
653 			    (ddi_devid_t)valid_did);
654 
655 			if (!on_list) {
656 				mddb_mb_t	*mbp;
657 				uint_t		sliceno;
658 				mdname_t	*rsp;
659 				int		fd = -1;
660 
661 				mbp = Malloc(DEV_BSIZE);
662 
663 				/* determine the replica slice */
664 				if (meta_replicaslice(dnp, &sliceno,
665 				    ep) != 0) {
666 					Free(mbp);
667 					continue;
668 				}
669 
670 				/*
671 				 * if the replica slice size is zero,
672 				 * don't bother opening
673 				 */
674 				if (dnp->vtoc.parts[sliceno].size
675 				    == 0) {
676 					Free(mbp);
677 					continue;
678 				}
679 
680 				if ((rsp = metaslicename(dnp, sliceno,
681 				    ep)) == NULL) {
682 					Free(mbp);
683 					continue;
684 				}
685 
686 				if ((fd = open(rsp->rname,
687 				    O_RDONLY| O_NDELAY)) < 0) {
688 					Free(mbp);
689 					continue;
690 				}
691 
692 				/*
693 				 * a drive may not have a master block
694 				 */
695 				if (read_master_block(ep, fd, mbp,
696 				    DEV_BSIZE) <= 0) {
697 					mdclrerror(ep);
698 					Free(mbp);
699 						(void) close(fd);
700 						continue;
701 				}
702 
703 				(void) close(fd);
704 				/*
705 				 * If it is replicated diskset,
706 				 * r_did will be non-NULL and
707 				 * devid_sz will be its size
708 				 */
709 				midpp = drive_append_wrapper(midpp,
710 				    dnp, &did->did_devid, r_did,
711 				    devid_sz, &min->min_name[0],
712 				    mbp->mb_setcreatetime, NULL);
713 				Free(mbp);
714 			}
715 		devid_free_nmlist(nmlist);
716 		}
717 	}
718 }
719 
720 /*
721  * set_append()
722  *
723  * Append to tail of linked list of md_im_set_desc_t.
724  *
725  * Will allocate space for new node AND populate it by extracting disks with
726  * and without replicas from the locator blocks and locator namespace.
727  *
728  * Returns pointer to new node.
729  */
730 static md_im_set_desc_t *
731 set_append(
732 	md_im_set_desc_t	**mispp,
733 	did_list_t		*did_listp,
734 	mddb_mb_t		*mb,
735 	mddb_lb_t		*lbp,
736 	mddb_rb_t		*nm,
737 	mddb_rb_t		*did_nm,
738 	mddb_rb_t		*did_shrnm,
739 	md_error_t		*ep,
740 	int			replicated
741 )
742 {
743 	md_im_set_desc_t	*misp;
744 	set_t			setno = mb->mb_setno;
745 
746 	/* run to end of list */
747 	for (; (*mispp != NULL); mispp = &((*mispp)->mis_next))
748 		;
749 
750 	/* allocate new list element */
751 	misp = *mispp = Zalloc(sizeof (md_im_set_desc_t));
752 
753 	if (replicated)
754 		misp->mis_flags = MD_IM_SET_REPLICATED;
755 
756 	misp->mis_oldsetno = setno;
757 
758 	/* Get the disks with and without replicas */
759 	get_replica_disks(misp, did_listp, mb, lbp, ep, replicated);
760 
761 	if (nm != NULL && did_nm != NULL && did_shrnm != NULL) {
762 		get_nonreplica_disks(misp, did_nm, did_shrnm, ep, replicated);
763 	}
764 
765 	/*
766 	 * An error in this struct could come from either of the above routines;
767 	 * in both cases, we want to pass it back on up.
768 	 */
769 	return (misp);
770 }
771 
772 
773 
774 /*
775  * set_append_wrapper()
776  *
777  * Constant time append wrapper; the append function will always walk the list,
778  * this will take a tail argument and use the append function on just the tail
779  * node, doing the appropriate old-tail-next-pointer bookkeeping.
780  */
781 static md_im_set_desc_t **
782 set_append_wrapper(
783 	md_im_set_desc_t	**tailpp,
784 	did_list_t		*did_listp,
785 	mddb_mb_t		*mb,
786 	mddb_lb_t		*lbp,
787 	mddb_rb_t		*nm,
788 	mddb_rb_t		*did_nm,
789 	mddb_rb_t		*did_shrnm,
790 	md_error_t		*ep,
791 	int			replicated
792 )
793 {
794 	(void) set_append(tailpp, did_listp, mb, lbp, nm, did_nm,
795 	    did_shrnm, ep, replicated);
796 
797 	/* it's the first item in the list, return it instead of the next */
798 	return (((*tailpp)->mis_next == NULL) ? tailpp : &(*tailpp)->mis_next);
799 }
800 
801 
802 
803 /*
804  * add_disk_names()
805  *
806  * Iterator to walk the minor node tree of the device snapshot, adding only the
807  * first non-block instance of each non-cdrom minor node to a list of disks.
808  */
809 static int
810 add_disk_names(di_node_t node, di_minor_t minor, void *args)
811 {
812 	char			*search_path = "/dev";
813 	ddi_devid_t		devid = di_devid(node);
814 	devid_nmlist_t		*nm;
815 	char			*min = di_minor_name(minor);
816 	md_im_names_t		*cnames = (md_im_names_t *)args;
817 	static di_node_t	save_node = NULL;
818 
819 	/*
820 	 * skip CD devices
821 	 * If a device does not have a device id, we can't
822 	 * do anything with it so just exclude it from our
823 	 * list.
824 	 *
825 	 * This would also encompass CD devices and floppy
826 	 * devices that don't have a device id.
827 	 */
828 	if (devid == NULL) {
829 		return (DI_WALK_CONTINUE);
830 	}
831 
832 	/* char disk devices (as opposed to block) */
833 	if (di_minor_spectype(minor) == S_IFCHR) {
834 
835 		/* only first occurrence (slice 0) of each instance */
836 		if (save_node == NULL || node != save_node) {
837 			save_node = node;
838 			if (meta_deviceid_to_nmlist(search_path, devid,
839 			    min, &nm) == 0) {
840 				int	index = cnames->min_count++;
841 
842 				assert(nm->devname != NULL);
843 				cnames->min_names =
844 					Realloc(cnames->min_names,
845 						cnames->min_count *
846 						sizeof (char *));
847 
848 				assert(cnames->min_names != NULL);
849 				cnames->min_names[index] =
850 					metadiskname(nm->devname);
851 				devid_free_nmlist(nm);
852 			}
853 		}
854 	}
855 	return (DI_WALK_CONTINUE);
856 }
857 
858 
859 
860 /*
861  * meta_list_disks()
862  *
863  * Snapshots the device tree and extracts disk devices from the snapshot.
864  */
865 int
866 meta_list_disks(md_error_t *ep, md_im_names_t *cnames)
867 {
868 	di_node_t root_node;
869 
870 	assert(cnames != NULL);
871 	cnames->min_count = 0;
872 	cnames->min_names = NULL;
873 
874 	if ((root_node = di_init("/", DINFOCPYALL|DINFOFORCE))
875 	    == DI_NODE_NIL) {
876 		return (mdsyserror(ep, errno, NULL));
877 	}
878 
879 	(void) di_walk_minor(root_node, DDI_NT_BLOCK, 0, cnames,
880 	    add_disk_names);
881 
882 	di_fini(root_node);
883 	return (0);
884 }
885 
886 /*
887  * meta_imp_drvused
888  *
889  * Checks if given drive is mounted, swapped, part of disk configuration
890  * or in use by SVM.  ep also has error code set up if drive is in use.
891  *
892  * Returns 1 if drive is in use.
893  * Returns 0 if drive is not in use.
894  */
895 int
896 meta_imp_drvused(
897 	mdsetname_t		*sp,
898 	mddrivename_t		*dnp,
899 	md_error_t		*ep
900 )
901 {
902 	md_error_t		status = mdnullerror;
903 	md_error_t		*db_ep = &status;
904 
905 	/*
906 	 * We pass in db_ep to meta_setup_db_locations
907 	 * and never ever use the error contained therein
908 	 * because all we're interested in is a check to
909 	 * see whether any local metadbs are present.
910 	 */
911 	if ((meta_check_drivemounted(sp, dnp, ep) != 0) ||
912 	    (meta_check_driveswapped(sp, dnp, ep) != 0) ||
913 	    (((meta_setup_db_locations(db_ep) == 0) &&
914 	    ((meta_check_drive_inuse(sp, dnp, 1, ep) != 0) ||
915 	    (meta_check_driveinset(sp, dnp, ep) != 0))))) {
916 		return (1);
917 	} else {
918 		return (0);
919 	}
920 }
921 
922 /*
923  * meta_prune_cnames()
924  *
925  * Removes in-use disks from the list prior to further processing.
926  *
927  * Return value depends on err_on_prune flag: if set, and one or more disks
928  * are pruned, the return list will be the pruned disks.  If not set, or if no
929  * disks are pruned, the return list will be the unpruned disks.
930  */
931 mddrivenamelist_t *
932 meta_prune_cnames(
933 	md_error_t *ep,
934 	md_im_names_t *cnames,
935 	int err_on_prune
936 )
937 {
938 	int			d;
939 	int			fcount = 0;
940 	mddrivenamelist_t	*dnlp = NULL;
941 	mddrivenamelist_t	**dnlpp = &dnlp;
942 	mddrivenamelist_t	*fdnlp = NULL;
943 	mddrivenamelist_t	**fdnlpp = &fdnlp;
944 	mdsetname_t		*sp = metasetname(MD_LOCAL_NAME, ep);
945 
946 	for (d = 0; d < cnames->min_count; ++d) {
947 		mddrivename_t	*dnp;
948 
949 		dnp = metadrivename(&sp, cnames->min_names[d], ep);
950 		if (dnp == NULL) {
951 			/*
952 			 * Assuming we're interested in knowing about
953 			 * whatever error occurred, but not in stopping.
954 			 */
955 			mde_perror(ep, cnames->min_names[d]);
956 			mdclrerror(ep);
957 
958 			continue;
959 		}
960 
961 		/*
962 		 * Check if the drive is inuse.
963 		 */
964 		if (meta_imp_drvused(sp, dnp, ep)) {
965 			fdnlpp = meta_drivenamelist_append_wrapper(fdnlpp, dnp);
966 			fcount++;
967 			mdclrerror(ep);
968 		} else {
969 			dnlpp = meta_drivenamelist_append_wrapper(dnlpp, dnp);
970 		}
971 	}
972 
973 	if (fcount) {
974 		if (err_on_prune) {
975 			(void) mddserror(ep, MDE_DS_DRIVEINUSE, 0,
976 			    NULL, fdnlp->drivenamep->cname, NULL);
977 			metafreedrivenamelist(dnlp);
978 			return (fdnlp);
979 		}
980 		metafreedrivenamelist(fdnlp);
981 	}
982 
983 	return (dnlp);
984 }
985 
986 /*
987  * read_master_block()
988  *
989  * Returns:
990  *	< 0 for failure
991  *	  0 for no valid master block
992  *	  1 for valid master block
993  *
994  * The supplied buffer will be filled in for EITHER 0 or 1.
995  */
996 int
997 read_master_block(
998 	md_error_t	*ep,
999 	int		fd,
1000 	void		*bp,
1001 	int		bsize
1002 )
1003 {
1004 	mddb_mb_t	*mbp = bp;
1005 	int		rval = 1;
1006 
1007 	assert(bp != NULL);
1008 
1009 	if (lseek(fd, (off_t)dbtob(16), SEEK_SET) < 0)
1010 		return (mdsyserror(ep, errno, NULL));
1011 
1012 	if (read(fd, bp, bsize) != bsize)
1013 		return (mdsyserror(ep, errno, NULL));
1014 
1015 	/*
1016 	 * The master block magic number can either be MDDB_MAGIC_MB in
1017 	 * the case of a real master block, or, it can be MDDB_MAGIC_DU
1018 	 * in the case of a dummy master block
1019 	 */
1020 	if ((mbp->mb_magic != MDDB_MAGIC_MB) &&
1021 	    (mbp->mb_magic != MDDB_MAGIC_DU)) {
1022 		rval = 0;
1023 		(void) mdmddberror(ep, MDE_DB_MASTER, 0, 0, 0, NULL);
1024 	}
1025 
1026 	if (mbp->mb_revision != MDDB_REV_MB) {
1027 		rval = 0;
1028 	}
1029 
1030 	return (rval);
1031 }
1032 
1033 /*
1034  * read_locator_block()
1035  *
1036  * Returns:
1037  *	< 0 for failure
1038  *	  0 for no valid locator block
1039  *	  1 for valid locator block
1040  */
1041 int
1042 read_locator_block(
1043 	md_error_t	*ep,
1044 	int		fd,
1045 	mddb_mb_t	*mbp,
1046 	void		*bp,
1047 	int		bsize
1048 )
1049 {
1050 	mddb_lb_t	*lbp = bp;
1051 
1052 	assert(bp != NULL);
1053 
1054 	if (lseek(fd, (off_t)dbtob(mbp->mb_blkmap.m_firstblk), SEEK_SET) < 0)
1055 		return (mdsyserror(ep, errno, NULL));
1056 
1057 	if (read(fd, bp, bsize) != bsize)
1058 		return (mdsyserror(ep, errno, NULL));
1059 
1060 	return ((lbp->lb_magic == MDDB_MAGIC_LB) ? 1 : 0);
1061 }
1062 
1063 int
1064 phys_read(
1065 	md_error_t	*ep,
1066 	int		fd,
1067 	mddb_mb_t	*mbp,
1068 	daddr_t		blk,
1069 	void		*bp,
1070 	int		bcount
1071 )
1072 {
1073 	daddr_t		pblk;
1074 
1075 	if ((pblk = getphysblk(blk, mbp)) < 0)
1076 		return (mdmddberror(ep, MDE_DB_BLKRANGE, NODEV32,
1077 			MD_LOCAL_SET, blk, NULL));
1078 
1079 	if (lseek(fd, (off_t)dbtob(pblk), SEEK_SET) < 0)
1080 		return (mdsyserror(ep, errno, NULL));
1081 
1082 	if (read(fd, bp, bcount) != bcount)
1083 		return (mdsyserror(ep, errno, NULL));
1084 
1085 	return (bcount);
1086 }
1087 
1088 /*
1089  * read_locator_block_did()
1090  *
1091  * Returns:
1092  * 	< 0 for failure
1093  *	  0 for no valid locator name struct
1094  *	  1 for valid locator name struct
1095  */
1096 int
1097 read_locator_block_did(
1098 	md_error_t	*ep,
1099 	int		fd,
1100 	mddb_mb_t	*mbp,
1101 	mddb_lb_t	*lbp,
1102 	void		*bp,
1103 	int		bsize
1104 )
1105 {
1106 	int		lb_didfirstblk = lbp->lb_didfirstblk;
1107 	mddb_did_blk_t	*lbdidp = bp;
1108 	int		rval;
1109 
1110 	assert(bp != NULL);
1111 
1112 	if ((rval = phys_read(ep, fd, mbp, lb_didfirstblk, bp, bsize)) < 0)
1113 		return (rval);
1114 
1115 	return ((lbdidp->blk_magic == MDDB_MAGIC_DI) ? 1 : 0);
1116 }
1117 
1118 /*
1119  * read_locator_names()
1120  *
1121  * Returns:
1122  *	< 0 for failure
1123  *	  0 for no valid locator name struct
1124  *	  1 for valid locator name struct
1125  */
1126 int
1127 read_locator_names(
1128 	md_error_t	*ep,
1129 	int		fd,
1130 	mddb_mb_t	*mbp,
1131 	mddb_lb_t	*lbp,
1132 	void		*bp,
1133 	int		bsize
1134 )
1135 {
1136 	int		lnfirstblk = lbp->lb_lnfirstblk;
1137 	mddb_ln_t	*lnp = bp;
1138 	int		rval;
1139 
1140 	assert(bp != NULL);
1141 
1142 	if ((rval = phys_read(ep, fd, mbp, lnfirstblk, bp, bsize)) < 0)
1143 		return (rval);
1144 
1145 	return ((lnp->ln_magic == MDDB_MAGIC_LN) ? 1 : 0);
1146 }
1147 
1148 
1149 int
1150 read_database_block(
1151 	md_error_t	*ep,
1152 	int		fd,
1153 	mddb_mb_t	*mbp,
1154 	int		dbblk,
1155 	void		*bp,
1156 	int		bsize
1157 )
1158 {
1159 	mddb_db_t	*dbp = bp;
1160 	int		rval;
1161 
1162 	assert(bp != NULL);
1163 
1164 	if ((rval = phys_read(ep, fd, mbp, dbblk, bp, bsize)) < 0)
1165 		return (rval);
1166 
1167 	return ((dbp->db_magic == MDDB_MAGIC_DB) ? 1 : 0);
1168 }
1169 
1170 int
1171 read_loc_didblks(
1172 	md_error_t	*ep,
1173 	int		fd,
1174 	mddb_mb_t	*mbp,
1175 	int		didblk,
1176 	void		*bp,
1177 	int		bsize
1178 )
1179 {
1180 	mddb_did_blk_t	*didbp = bp;
1181 	int		rval;
1182 
1183 	assert(bp != NULL);
1184 
1185 	if ((rval = phys_read(ep, fd, mbp, didblk, bp, bsize)) < 0)
1186 		return (rval);
1187 
1188 	return ((didbp->blk_magic == MDDB_MAGIC_DI) ? 1 : 0);
1189 }
1190 
1191 
1192 int
1193 read_loc_didinfo(
1194 	md_error_t	*ep,
1195 	int		fd,
1196 	mddb_mb_t	*mbp,
1197 	int		infoblk,
1198 	void		*bp,
1199 	int		bsize
1200 )
1201 {
1202 	int		rval = 1;
1203 	mddb_did_info_t	*infop = bp;
1204 
1205 	assert(bp != NULL);
1206 
1207 	if ((rval = phys_read(ep, fd, mbp, infoblk, bp, bsize)) < 0)
1208 		return (rval);
1209 
1210 	return ((infop->info_flags & MDDB_DID_EXISTS) ? 1 : 0);
1211 }
1212 
1213 /*
1214  * meta_nm_rec()
1215  *
1216  * Return the DE corresponding to the requested namespace record type.
1217  * Modifies dbp to have a firstentry if one isn't there.
1218  */
1219 static mddb_de_t *
1220 meta_nm_rec(mddb_db_t *dbp, mddb_type_t rectype)
1221 {
1222 	mddb_de_t *dep;
1223 	int	desize;
1224 
1225 	if (dbp->db_firstentry != NULL) {
1226 		/* LINTED */
1227 		dep = (mddb_de_t *)((caddr_t)(&dbp->db_firstentry)
1228 				    + sizeof (dbp->db_firstentry));
1229 		dbp->db_firstentry = dep;
1230 		while (dep && dep->de_next) {
1231 			desize = sizeof (*dep) - sizeof (dep->de_blks) +
1232 				sizeof (daddr_t) * dep->de_blkcount;
1233 			/* LINTED */
1234 			dep->de_next = (mddb_de_t *)
1235 				((caddr_t)dep + desize);
1236 			dep = dep->de_next;
1237 		}
1238 	}
1239 
1240 	for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) {
1241 		if (dep->de_type1 == rectype)
1242 			break;
1243 	}
1244 	return (dep);
1245 }
1246 
1247 /*
1248  * read_nm_rec()
1249  *
1250  * Reads the NM, NM_DID or NM_DID_SHR record in the mddb and stores the
1251  * configuration data in the buffer 'nm'
1252  *
1253  * Returns:
1254  *	< 0 for failure
1255  *	  0 for no valid NM/DID_NM/DID_NM_SHR record
1256  *	  1 for valid NM/DID_NM/DID_NM_SHR record
1257  *
1258  */
1259 static int
1260 read_nm_rec(
1261 	md_error_t 	*ep,
1262 	int 		fd,
1263 	mddb_mb_t	*mbp,
1264 	mddb_lb_t	*lbp,
1265 	char		**nm,
1266 	mddb_type_t	rectype,
1267 	char		*diskname
1268 )
1269 {
1270 	int		cnt, dbblk, rval = 0;
1271 	char		db[DEV_BSIZE];
1272 	mddb_de_t	*dep;
1273 	/*LINTED*/
1274 	mddb_db_t	*dbp = (mddb_db_t *)&db;
1275 	char 		*tmpnm = NULL;
1276 	daddr_t		pblk;
1277 
1278 	for (dbblk = lbp->lb_dbfirstblk;
1279 	    dbblk != 0;
1280 	    dbblk = dbp->db_nextblk) {
1281 
1282 		if ((rval = read_database_block(ep, fd, mbp, dbblk, dbp,
1283 		    sizeof (db))) <= 0)
1284 			return (rval);
1285 
1286 		/*
1287 		 * Locate NM/DID_NM/DID_NM_SHR record. Normally there is
1288 		 * only one record per mddb. There is a rare case when we
1289 		 * can't expand the record. If this is the case then we
1290 		 * will have multiple NM/DID_NM/DID_NM_SHR records linked
1291 		 * with r_next_recid.
1292 		 *
1293 		 * For now assume the normal case and handle the extended
1294 		 * namespace in Phase 2.
1295 		 */
1296 		if ((dep = meta_nm_rec(dbp, rectype)) != NULL)
1297 			break;
1298 	}
1299 
1300 	/* If meta_nm_rec() never succeeded, bail out */
1301 	if (dep == NULL)
1302 		return (0);
1303 
1304 	/* Read in the appropriate record and return configurations */
1305 	tmpnm = (char *)Zalloc(dbtob(dep->de_blkcount));
1306 	*nm = tmpnm;
1307 
1308 	for (cnt = 0; cnt < dep->de_blkcount; cnt++) {
1309 		if ((pblk = getphysblk(dep->de_blks[cnt], mbp)) < 0) {
1310 			rval = mdmddberror(ep, MDE_DB_BLKRANGE,
1311 			    NODEV32, MD_LOCAL_SET,
1312 			    dep->de_blks[cnt], diskname);
1313 			return (rval);
1314 		}
1315 
1316 		if (lseek(fd, (off_t)dbtob(pblk), SEEK_SET) < 0) {
1317 			rval = mdsyserror(ep, errno, diskname);
1318 			return (rval);
1319 		}
1320 
1321 		if (read(fd, tmpnm, DEV_BSIZE) != DEV_BSIZE) {
1322 			rval = mdsyserror(ep, errno, diskname);
1323 			return (rval);
1324 		}
1325 
1326 		tmpnm += DEV_BSIZE;
1327 	}
1328 	return (1);
1329 }
1330 
1331 /*
1332  * is_replicated
1333  *
1334  * Determines whether a disk has been replicated or not. It checks to see
1335  * if the device id stored in the master block is the same as the device id
1336  * registered for that disk on the current system. If the two device ids are
1337  * different, then we know that the disk has been replicated.
1338  *
1339  * If need_devid is set and the disk is replicated, fill in the new_devid.
1340  * Also, if need_devid is set, this routine allocates memory for the device
1341  * ids; the caller of this routine is responsible for free'ing up the memory.
1342  *
1343  * Returns:
1344  * 	1	if it's a replicated disk
1345  * 	0 	if it's not a replicated disk
1346  */
1347 static int
1348 is_replicated(
1349 	int fd,
1350 	mddb_mb_t *mbp,
1351 	int need_devid,
1352 	void **new_devid
1353 )
1354 {
1355 	ddi_devid_t	current_devid;
1356 	int		retval = 0;
1357 	size_t		new_devid_len;
1358 
1359 	if (mbp->mb_devid_magic != MDDB_MAGIC_DE)
1360 		return (retval);
1361 
1362 	if (devid_get(fd, &current_devid) != 0)
1363 		return (retval);
1364 
1365 	if (devid_compare((ddi_devid_t)mbp->mb_devid, current_devid) != 0)
1366 		retval = 1;
1367 
1368 	if (retval && need_devid) {
1369 		new_devid_len = devid_sizeof(current_devid);
1370 		*new_devid = Zalloc(new_devid_len);
1371 		(void) memcpy(*new_devid, (void *)current_devid, new_devid_len);
1372 	}
1373 
1374 	devid_free(current_devid);
1375 	return (retval);
1376 }
1377 
1378 /*
1379  * free_replicated_disks_list()
1380  *
1381  * this frees up all the memory allocated by build_replicated_disks_list
1382  */
1383 static void
1384 free_replicated_disks_list()
1385 {
1386 	replicated_disk_t 	**repl_disk, *temp;
1387 	int 			index;
1388 
1389 	for (index = 0; index <= MAX_DEVID_LEN; index++) {
1390 		repl_disk = &replicated_disk_list[index];
1391 
1392 		while (*repl_disk != NULL) {
1393 			temp = *repl_disk;
1394 			*repl_disk = (*repl_disk)->next;
1395 
1396 			Free(temp->old_devid);
1397 			Free(temp->new_devid);
1398 			Free(temp);
1399 		}
1400 	}
1401 }
1402 
1403 /*
1404  * build_replicated_disks_list()
1405  *
1406  * Builds a list of disks that have been replicated using either a
1407  * remote replication or a point-in-time replication software. The
1408  * list is stored as a two dimensional sparse array.
1409  *
1410  * Returns
1411  * 	1	on success
1412  * 	0 	on failure
1413  */
1414 static int
1415 build_replicated_disks_list(
1416 	md_error_t *ep,
1417 	mddrivenamelist_t *dnlp
1418 )
1419 {
1420 	uint_t			sliceno;
1421 	int			fd = -1;
1422 	mddrivenamelist_t	*dp;
1423 	mdname_t		*rsp;
1424 	mddb_mb_t		*mbp;
1425 
1426 	mbp = Malloc(DEV_BSIZE);
1427 
1428 	for (dp = dnlp; dp != NULL; dp = dp->next) {
1429 		mddrivename_t *dnp;
1430 		void *new_devid;
1431 
1432 		dnp = dp->drivenamep;
1433 		/* determine the replica slice */
1434 		if (meta_replicaslice(dnp, &sliceno, ep) != 0)
1435 			continue;
1436 
1437 		/*
1438 		 * if the replica slice size is zero, don't bother opening
1439 		 */
1440 		if (dnp->vtoc.parts[sliceno].size == 0)
1441 			continue;
1442 
1443 		if ((rsp = metaslicename(dnp, sliceno, ep)) == NULL)
1444 			continue;
1445 
1446 		if ((fd = open(rsp->rname, O_RDONLY| O_NDELAY)) < 0)
1447 			return (mdsyserror(ep, errno, rsp->rname));
1448 
1449 		/* a drive may not have a master block so we just continue */
1450 		if (read_master_block(ep, fd, mbp, DEV_BSIZE) <= 0) {
1451 			(void) close(fd);
1452 			mdclrerror(ep);
1453 			continue;
1454 		}
1455 
1456 		if (is_replicated(fd, mbp, 1, &new_devid)) {
1457 			replicated_list_insert(mbp->mb_devid_len,
1458 			    mbp->mb_devid, new_devid);
1459 		}
1460 		(void) close(fd);
1461 	}
1462 	replicated_disk_list_built = 1;
1463 
1464 	Free(mbp);
1465 	return (1);
1466 }
1467 
1468 /*
1469  * free_did_list()
1470  *
1471  * Frees the did_list allocated as part of build_did_list
1472  */
1473 static void
1474 free_did_list(
1475 	did_list_t	*did_listp
1476 )
1477 {
1478 	did_list_t	*temp, *head;
1479 
1480 	head = did_listp;
1481 
1482 	while (head != NULL) {
1483 		temp = head;
1484 		head = head->next;
1485 		if (temp->rdid)
1486 			Free(temp->rdid);
1487 		if (temp->did)
1488 			Free(temp->did);
1489 		if (temp->devname)
1490 			Free(temp->devname);
1491 		if (temp->minor_name)
1492 			Free(temp->minor_name);
1493 		Free(temp);
1494 	}
1495 }
1496 
1497 /*
1498  * build_did_list()
1499  *
1500  * Build a list of device ids corresponding to disks in the locator block.
1501  * Memory is allocated here for the nodes in the did_list. The callers of
1502  * this routine must also call free_did_list to free up the memory after
1503  * they're done.
1504  *
1505  * Returns:
1506  *	< 0 		for failure
1507  *	  0 		for no valid locator block device id array
1508  *	  1 		for valid locator block device id array
1509  *	  ENOTSUP	partial diskset, not all disks in a diskset on the
1510  *			system where import is being executed
1511  */
1512 static int
1513 build_did_list(
1514 	md_error_t	*ep,
1515 	int		fd,
1516 	mddb_mb_t	*mb,
1517 	mddb_did_blk_t	*lbdidp,
1518 	did_list_t	**did_listp,
1519 	int		replicated
1520 )
1521 {
1522 	char 		*search_path = "/dev";
1523 	char		*minor_name;
1524 	int		rval, cnt;
1525 	devid_nmlist_t	*nm;
1526 	uint_t		did_info_length = 0;
1527 	uint_t		did_info_firstblk = 0;
1528 	did_list_t	*new, *head = NULL;
1529 	char		*bp = NULL, *temp;
1530 	mddb_did_info_t	*did_info = NULL;
1531 	void		*did = NULL;
1532 	size_t		new_devid_len;
1533 
1534 	for (cnt = 0; cnt < MDDB_NLB; cnt++) {
1535 		did_info = &lbdidp->blk_info[cnt];
1536 
1537 		if (!(did_info->info_flags & MDDB_DID_EXISTS))
1538 			continue;
1539 
1540 		new = Zalloc(sizeof (did_list_t));
1541 		new->did = Zalloc(did_info->info_length);
1542 
1543 		/*
1544 		 * If we can re-use the buffer already has been
1545 		 * read in then just use it.  Otherwise free
1546 		 * the previous one and alloc a new one
1547 		 */
1548 		if (dbtob(did_info->info_blkcnt) != did_info_length &&
1549 		    did_info->info_firstblk != did_info_firstblk) {
1550 
1551 			did_info_length = dbtob(did_info->info_blkcnt);
1552 			did_info_firstblk = did_info->info_firstblk;
1553 
1554 			if (bp)
1555 				Free(bp);
1556 			bp = temp = Zalloc(did_info_length);
1557 
1558 			if ((rval = phys_read(ep, fd, mb, did_info_firstblk,
1559 			    (void *)bp, did_info_length)) < 0)
1560 				return (rval);
1561 		} else {
1562 			temp = bp;
1563 		}
1564 
1565 		temp += did_info->info_offset;
1566 		(void) memcpy(new->did, temp, did_info->info_length);
1567 		new->did_index = cnt;
1568 		minor_name = did_info->info_minor_name;
1569 
1570 		/*
1571 		 * If we are not able to find the ctd mapping corresponding
1572 		 * to a given device id, it probably means the device id in
1573 		 * question is not registered with the system.
1574 		 *
1575 		 * Highly likely that the only time this happens, we've hit
1576 		 * a case where not all the disks that are a part of the
1577 		 * diskset were moved before importing the diskset.
1578 		 *
1579 		 * If set is a replicated diskset, then the device id we get
1580 		 * from 'lb' will be the 'other' did and we need to lookup
1581 		 * the real one before we call this routine.
1582 		 */
1583 		if (replicated) {
1584 		    temp = replicated_list_lookup(did_info->info_length,
1585 			new->did);
1586 		    new_devid_len = devid_sizeof((ddi_devid_t)temp);
1587 		    new->rdid = Zalloc(new_devid_len);
1588 		    (void) memcpy(new->rdid, temp, new_devid_len);
1589 		    did = new->rdid;
1590 		} else {
1591 		    did = new->did;
1592 		}
1593 
1594 		if (devid_valid((ddi_devid_t)(did)) == 0) {
1595 			return (-1);
1596 		}
1597 
1598 		if ((rval = meta_deviceid_to_nmlist(search_path,
1599 		    (ddi_devid_t)did, minor_name, &nm)) != 0) {
1600 			*did_listp = head;
1601 			free_did_list(*did_listp);
1602 			*did_listp = NULL;
1603 			(void) mddserror(ep, MDE_DS_PARTIALSET, MD_SET_BAD,
1604 			    mynode(), NULL, NULL);
1605 			return (ENOTSUP);
1606 		}
1607 
1608 		assert(nm->devname != NULL);
1609 		new->devname = Strdup(nm->devname);
1610 		new->dev = nm->dev;
1611 		new->minor_name = Strdup(minor_name);
1612 
1613 		devid_free_nmlist(nm);
1614 
1615 		new->next = head;
1616 		head = new;
1617 	}
1618 
1619 	/* Free the last bp */
1620 	if (bp)
1621 		Free(bp);
1622 	*did_listp = head;
1623 	return (1);
1624 }
1625 /*
1626  * check_nm_disks
1627  *	Checks the disks listed in the shared did namespace to see if they
1628  *	are accessable on the system. If not, return ENOTSUP error to
1629  *	indicate we have a partial diskset.
1630  * Returns:
1631  *	< 0 		for failure
1632  *	  0		success
1633  *	  ENOTSUP	partial diskset, not all disks in a diskset on the
1634  *			system where import is being executed
1635  */
1636 static int
1637 check_nm_disks(
1638 	md_error_t		*ep,
1639 	struct devid_min_rec	*did_nmp,
1640 	struct devid_shr_rec	*did_shrnmp
1641 )
1642 {
1643 	char 		*search_path = "/dev";
1644 	char		*minor_name = NULL;
1645 	uint_t		used_size, min_used_size;
1646 	ddi_devid_t	did;
1647 	devid_nmlist_t	*nm;
1648 	void		*did_min_namep;
1649 	void		*did_shr_namep;
1650 	size_t		did_nsize, did_shr_nsize;
1651 
1652 	used_size = did_shrnmp->did_rec_hdr.r_used_size -
1653 	    sizeof (struct nm_rec_hdr);
1654 	min_used_size = did_nmp->min_rec_hdr.r_used_size -
1655 	    sizeof (struct nm_rec_hdr);
1656 	did_shr_namep = (void *)(&did_shrnmp->device_id[0]);
1657 	while (used_size > (int)sizeof (struct did_shr_name)) {
1658 		did_min_namep = (void *)(&did_nmp->minor_name[0]);
1659 		/* grab device id and minor name from the shared spaces */
1660 		did = (ddi_devid_t)(((struct did_shr_name *)
1661 		    did_shr_namep)->did_devid);
1662 		if (devid_valid(did) == 0) {
1663 			return (-1);
1664 		}
1665 
1666 		/*
1667 		 * We need to check that the DID_NM and DID_SHR_NM are in
1668 		 * sync. It is possible that we took a panic between writing
1669 		 * the two areas to disk. This would be cleaned up on the
1670 		 * next snarf but we don't know for sure that snarf has even
1671 		 * happened since we're reading from disk.
1672 		 */
1673 		while (((struct did_shr_name *)did_shr_namep)->did_key !=
1674 		    ((struct did_min_name *)did_min_namep)->min_devid_key) {
1675 			did_nsize = DID_NAMSIZ((struct did_min_name *)
1676 			    did_min_namep);
1677 			did_min_namep = ((void *)((char *)did_min_namep +
1678 			    did_nsize));
1679 			min_used_size -= did_nsize;
1680 			if (min_used_size < (int)sizeof (struct did_min_name))
1681 				continue;
1682 		}
1683 		minor_name = ((struct did_min_name *)did_min_namep)->min_name;
1684 
1685 		/*
1686 		 * Try to find disk in the system. If we can't find the
1687 		 * disk, we have a partial diskset.
1688 		 */
1689 		if ((meta_deviceid_to_nmlist(search_path,
1690 		    did, minor_name, &nm)) != 0) {
1691 			(void) mddserror(ep, MDE_DS_PARTIALSET, MD_SET_BAD,
1692 			    mynode(), NULL, NULL);
1693 			return (ENOTSUP);
1694 		}
1695 		devid_free_nmlist(nm);
1696 		used_size -= DID_SHR_NAMSIZ((struct did_shr_name *)
1697 		    did_shr_namep);
1698 		/* increment to next item in the shared spaces */
1699 		did_shr_nsize = DID_SHR_NAMSIZ((struct did_shr_name *)
1700 		    did_shr_namep);
1701 		did_shr_namep = ((void *)((char *)did_shr_namep +
1702 		    did_shr_nsize));
1703 	}
1704 	return (0);
1705 }
1706 
1707 /*
1708  * meta_get_set_info
1709  *
1710  * Scans a given drive for set specific information. If the given drive
1711  * has a shared metadb, scans the shared metadb for information pertaining
1712  * to the set.
1713  *
1714  * Returns:
1715  * 	<0 	for failure
1716  *	0	success but no replicas were found
1717  *	1	success and a replica was found
1718  *	ENOTSUP for partial disksets detected
1719  */
1720 int
1721 meta_get_set_info(
1722 	mddrivenamelist_t *dp,
1723 	md_im_set_desc_t **mispp,
1724 	int local_mb_ok,
1725 	md_error_t *ep
1726 )
1727 {
1728 	uint_t			s;
1729 	mdname_t		*rsp;
1730 	int			fd;
1731 	char			mb[DEV_BSIZE];
1732 				/*LINTED*/
1733 	mddb_mb_t		*mbp = (mddb_mb_t *)mb;
1734 	char			lb[dbtob(MDDB_LBCNT)];
1735 				/*LINTED*/
1736 	mddb_lb_t		*lbp = (mddb_lb_t *)lb;
1737 	mddb_did_blk_t		*lbdidp = NULL;
1738 	mddb_ln_t		*lnp = NULL;
1739 	int			lnsize, lbdid_size;
1740 	int			rval = 0;
1741 	char			db[DEV_BSIZE];
1742 				/*LINTED*/
1743 	mddb_db_t		*dbp = (mddb_db_t *)db;
1744 	did_list_t		*did_listp = NULL;
1745 	mddrivenamelist_t	*dnlp;
1746 	mddrivename_t 		*dnp;
1747 	md_im_names_t		cnames = { 0, NULL};
1748 	char			*nm = NULL;
1749 	char			*did_nm = NULL, *did_shrnm = NULL;
1750 	struct nm_rec		*nmp;
1751 	struct devid_shr_rec	*did_shrnmp;
1752 	struct devid_min_rec	*did_nmp;
1753 	int			extended_namespace = 0;
1754 	int			replicated = 0;
1755 
1756 	dnp = dp->drivenamep;
1757 
1758 	/*
1759 	 * Determine and open the replica slice
1760 	 */
1761 	if (meta_replicaslice(dnp, &s, ep) != 0) {
1762 		return (-1);
1763 	}
1764 
1765 	/*
1766 	 * Test for the size of replica slice in question. If
1767 	 * the size is zero, we know that this is not a disk that was
1768 	 * part of a set and it should be silently ignored for import.
1769 	 */
1770 	if (dnp->vtoc.parts[s].size == 0)
1771 		return (0);
1772 
1773 	if ((rsp = metaslicename(dnp, s, ep)) == NULL) {
1774 		return (-1);
1775 	}
1776 
1777 	if ((fd = open(rsp->rname, O_RDONLY|O_NDELAY)) < 0)
1778 		return (mdsyserror(ep, errno, rsp->cname));
1779 
1780 	/*
1781 	 * After the open() succeeds, we should return via the "out"
1782 	 * label to clean up after ourselves.  (Up 'til now, we can
1783 	 * just return directly, because there are no resources to
1784 	 * give back.)
1785 	 */
1786 
1787 	if ((rval = read_master_block(ep, fd, mbp, sizeof (mb))) <= 0)
1788 		goto out;
1789 
1790 	replicated = is_replicated(fd, mbp, 0, NULL);
1791 
1792 	if (!local_mb_ok && mbp->mb_setno == 0) {
1793 		rval = 0;
1794 		goto out;
1795 	}
1796 
1797 	if ((rval = read_locator_block(ep, fd, mbp, lbp, sizeof (lb))) <= 0)
1798 		goto out;
1799 
1800 	/*
1801 	 * Once the locator block has been read, we need to
1802 	 * check if the locator block commit count is zero.
1803 	 * If it is zero, we know that the replica we're dealing
1804 	 * with is on a disk that was deleted from the disk set;
1805 	 * and, it potentially has stale data. We need to quit
1806 	 * in that case
1807 	 */
1808 	if (lbp->lb_commitcnt == 0) {
1809 		rval = 0;
1810 		goto out;
1811 	}
1812 
1813 	/*
1814 	 * Make sure that the disk being imported has device id
1815 	 * namespace present for disksets. If a disk doesn't have
1816 	 * device id namespace, we skip reading the replica on that disk
1817 	 */
1818 	if (!(lbp->lb_flags & MDDB_DEVID_STYLE)) {
1819 		rval = 0;
1820 		goto out;
1821 	}
1822 
1823 	/*
1824 	 * Grab the locator block device id array. Allocate memory for the
1825 	 * array first.
1826 	 */
1827 	lbdid_size = dbtob(lbp->lb_didblkcnt);
1828 	lbdidp = Zalloc(lbdid_size);
1829 
1830 	if ((rval = read_locator_block_did(ep, fd, mbp, lbp, lbdidp,
1831 	    lbdid_size)) <= 0)
1832 		goto out;
1833 
1834 	/*
1835 	 * For a disk that has not been replicated, extract the device ids
1836 	 * stored in the locator block device id array and store them in
1837 	 * a list.
1838 	 *
1839 	 * If the disk has been replicated using replication software such
1840 	 * as HDS Truecopy/ShadowImage or EMC SRDF/BCV, the device ids in
1841 	 * the locator block are invalid and we need to build a list of
1842 	 * replicated disks.
1843 	 */
1844 	if (replicated && !replicated_disk_list_built) {
1845 		/*
1846 		 * if there's a replicated diskset involved, we need to
1847 		 * scan the system one more time and build a list of all
1848 		 * candidate disks that might be part of that replicated set
1849 		 */
1850 		if (meta_list_disks(ep, &cnames) != 0) {
1851 			rval = 0;
1852 			goto out;
1853 		}
1854 		dnlp = meta_prune_cnames(ep, &cnames, 0);
1855 		rval = build_replicated_disks_list(ep, dnlp);
1856 		if (rval == 0)
1857 			goto out;
1858 	}
1859 
1860 	rval = build_did_list(ep, fd, mbp, lbdidp, &did_listp, replicated);
1861 
1862 	if ((rval <= 0) || (rval == ENOTSUP))
1863 		goto out;
1864 
1865 	/*
1866 	 * Until here, we've gotten away with fixed sizes for the
1867 	 * master block and locator block.  The locator names,
1868 	 * however, are sized (and therefore allocated) dynamically
1869 	 * according to information in the locator block.
1870 	 */
1871 	lnsize = dbtob(lbp->lb_lnblkcnt);
1872 	lnp = Zalloc(lnsize);
1873 
1874 	if ((rval = read_locator_names(ep, fd, mbp, lbp, lnp, lnsize)) <= 0)
1875 		goto out;
1876 
1877 	/*
1878 	 * Read in the NM record
1879 	 * If no NM record was found, it still is a valid configuration
1880 	 * but it also means that we won't find any corresponding DID_NM
1881 	 * or DID_SHR_NM.
1882 	 */
1883 	if ((rval = read_nm_rec(ep, fd, mbp, lbp, &nm, MDDB_NM, rsp->cname))
1884 	    < 0)
1885 		goto out;
1886 	else if (rval == 0)
1887 		goto append;
1888 
1889 	/*
1890 	 * At this point, we have read in all of the blocks that form
1891 	 * the nm_rec.  We should at least detect the corner case
1892 	 * mentioned above, in which r_next_recid links to another
1893 	 * nm_rec. Extended namespace handling is left for Phase 2.
1894 	 *
1895 	 * What this should really be is a loop, each iteration of
1896 	 * which reads in a nm_rec and calls the set_append_wrapper().
1897 	 */
1898 	/*LINTED*/
1899 	nmp = (struct nm_rec *)(nm + sizeof (mddb_rb_t));
1900 	if (nmp->r_rec_hdr.r_next_recid != (mddb_recid_t)0) {
1901 		extended_namespace = 1;
1902 		rval = 0;
1903 		goto out;
1904 	}
1905 
1906 	if ((rval = read_nm_rec(ep, fd, mbp, lbp, &did_nm,
1907 	    MDDB_DID_NM, rsp->cname)) < 0)
1908 		goto out;
1909 	else if (rval == 0)
1910 		goto append;
1911 
1912 	/*LINTED*/
1913 	did_nmp = (struct devid_min_rec *)(did_nm + sizeof (mddb_rb_t) -
1914 	    sizeof (int));
1915 	if (did_nmp->min_rec_hdr.r_next_recid != (mddb_recid_t)0) {
1916 		extended_namespace = 1;
1917 		rval = 0;
1918 		goto out;
1919 	}
1920 
1921 	if ((rval = read_nm_rec(ep, fd, mbp, lbp, &did_shrnm,
1922 	    MDDB_DID_SHR_NM, rsp->cname)) < 0)
1923 		goto out;
1924 	else if (rval == 0)
1925 		goto append;
1926 
1927 	/*LINTED*/
1928 	did_shrnmp = (struct devid_shr_rec *)(did_shrnm + sizeof (mddb_rb_t) -
1929 	    sizeof (int));
1930 	if (did_shrnmp->did_rec_hdr.r_next_recid != (mddb_recid_t)0) {
1931 		extended_namespace = 1;
1932 		rval = 0;
1933 		goto out;
1934 	}
1935 
1936 	/*
1937 	 * We need to check if all of the disks listed in the namespace
1938 	 * are actually available. If they aren't we'll return with
1939 	 * an ENOTSUP error which indicates a partial diskset.
1940 	 */
1941 	rval = check_nm_disks(ep, did_nmp, did_shrnmp);
1942 	if ((rval < 0) || (rval == ENOTSUP))
1943 		goto out;
1944 
1945 append:
1946 	/* Finally, we've got what we need to process this replica. */
1947 	mispp = set_append_wrapper(mispp, did_listp, mbp, lbp,
1948 	    /*LINTED*/
1949 	    (mddb_rb_t *)nm, (mddb_rb_t *)did_nm, (mddb_rb_t *)did_shrnm,
1950 	    ep, replicated);
1951 
1952 	/* Return the fact that we found at least one set */
1953 	rval = 1;
1954 
1955 out:
1956 	if (fd >= 0)
1957 		(void) close(fd);
1958 	if (did_listp != NULL)
1959 		free_did_list(did_listp);
1960 	if (lnp != NULL)
1961 		Free(lnp);
1962 	if (nm != NULL)
1963 		Free(nm);
1964 	if (did_nm != NULL)
1965 		Free(did_nm);
1966 	if (did_shrnm != NULL)
1967 		Free(did_shrnm);
1968 
1969 	/*
1970 	 * If we are at the end of the list, we must free up
1971 	 * the replicated list too
1972 	 */
1973 	if (dp->next == NULL)
1974 		free_replicated_disks_list();
1975 
1976 	if (extended_namespace)
1977 		return (mddserror(ep, MDE_DS_EXTENDEDNM, MD_SET_BAD,
1978 		    mynode(), NULL, NULL));
1979 
1980 	return (rval);
1981 }
1982 
1983 /*
1984  * Return the minor name associated with a given disk slice
1985  */
1986 static char *
1987 meta_getminor_name(
1988 	char *devname,
1989 	md_error_t *ep
1990 )
1991 {
1992 	int 	fd = -1;
1993 	char 	*minor_name = NULL;
1994 	char	*ret_minor_name = NULL;
1995 
1996 	if (devname == NULL)
1997 		return (NULL);
1998 
1999 	if ((fd = open(devname, O_RDONLY|O_NDELAY, 0)) < 0) {
2000 		(void) mdsyserror(ep, errno, devname);
2001 		return (NULL);
2002 	}
2003 
2004 	if (devid_get_minor_name(fd, &minor_name) == 0) {
2005 		ret_minor_name = Strdup(minor_name);
2006 		devid_str_free(minor_name);
2007 	}
2008 
2009 	(void) close(fd);
2010 	return (ret_minor_name);
2011 }
2012 
2013 static int
2014 meta_replica_quorum(
2015 	md_im_set_desc_t *misp,
2016 	md_error_t *ep
2017 )
2018 {
2019 	md_im_drive_info_t	*midp;
2020 	mddrivename_t		*dnp;
2021 	md_im_replica_info_t    *midr;
2022 	mdname_t		*np;
2023 	struct stat		st_buf;
2024 	uint_t			rep_slice;
2025 	int			replica_count = 0;
2026 
2027 	for (midp = misp->mis_drives; midp != NULL;
2028 		midp = midp->mid_next) {
2029 
2030 		dnp = midp->mid_dnp;
2031 
2032 		if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) ||
2033 			((np = metaslicename(dnp, rep_slice, ep))
2034 			== NULL)) {
2035 			mdclrerror(ep);
2036 			continue;
2037 		}
2038 
2039 		if (stat(np->bname, &st_buf) != 0)
2040 			continue;
2041 
2042 		/*
2043 		 * The drive is okay now count its replicas
2044 		 */
2045 		for (midr = midp->mid_replicas; midr != NULL;
2046 			midr = midr->mir_next) {
2047 			replica_count++;
2048 		}
2049 	}
2050 
2051 	if (replica_count < (misp->mis_active_replicas + 1)/2)
2052 		return (-1);
2053 
2054 	return (0);
2055 }
2056 
2057 static set_t
2058 meta_imp_setno(
2059 	md_error_t *ep
2060 )
2061 {
2062 	set_t	max_sets, setno;
2063 	int	bool;
2064 
2065 	if ((max_sets = get_max_sets(ep)) == 0) {
2066 		return (MD_SET_BAD);
2067 	}
2068 
2069 	/*
2070 	 * This code needs to be expanded when we run in SunCluster
2071 	 * environment SunCluster obtains setno internally
2072 	 */
2073 	for (setno = 1; setno < max_sets; setno++) {
2074 		if (clnt_setnumbusy(mynode(), setno,
2075 			&bool, ep) == -1) {
2076 			setno = MD_SET_BAD;
2077 			break;
2078 		}
2079 		/*
2080 		 * found one available
2081 		 */
2082 		if (bool == FALSE)
2083 			break;
2084 	}
2085 
2086 	if (setno == max_sets) {
2087 		setno = MD_SET_BAD;
2088 	}
2089 
2090 	return (setno);
2091 }
2092 
2093 int
2094 meta_imp_set(
2095 	md_im_set_desc_t *misp,
2096 	char		*setname,
2097 	int		force,
2098 	bool_t		dry_run,
2099 	md_error_t	*ep
2100 )
2101 {
2102 	md_timeval32_t		tp;
2103 	md_im_drive_info_t	*midp;
2104 	uint_t			rep_slice;
2105 	mddrivename_t		*dnp;
2106 	struct mddb_config	c;
2107 	mdname_t		*np;
2108 	md_im_replica_info_t	*mirp;
2109 	char			setnum_link[MAXPATHLEN];
2110 	char			setname_link[MAXPATHLEN];
2111 	char			*minor_name = NULL;
2112 
2113 	(void) memset(&c, 0, sizeof (c));
2114 	(void) strlcpy(c.c_setname, setname, sizeof (c.c_setname));
2115 	c.c_sideno = 0;
2116 	c.c_flags = MDDB_C_IMPORT;
2117 
2118 	/*
2119 	 * Check to see if the setname that the set is being imported into,
2120 	 * already exists.
2121 	 */
2122 	if (getsetbyname(c.c_setname, ep) != NULL) {
2123 		return (mddserror(ep, MDE_DS_SETNAMEBUSY, MD_SET_BAD,
2124 		    mynode(), NULL, c.c_setname));
2125 	}
2126 
2127 	/*
2128 	 * Find the next available set number
2129 	 */
2130 	if ((c.c_setno = meta_imp_setno(ep)) == MD_SET_BAD) {
2131 		return (mddserror(ep, MDE_DS_SETNOTIMP, MD_SET_BAD,
2132 		    mynode(), NULL, c.c_setname));
2133 	}
2134 
2135 	if (meta_gettimeofday(&tp) == -1) {
2136 		return (mdsyserror(ep, errno, NULL));
2137 	}
2138 	c.c_timestamp = tp;
2139 
2140 	/* Check to see if replica quorum requirement is fulfilled */
2141 	if (!force && meta_replica_quorum(misp, ep) == -1)
2142 		return (mddserror(ep, MDE_DS_INSUFQUORUM, MD_SET_BAD,
2143 		    mynode(), NULL, c.c_setname));
2144 
2145 	for (midp = misp->mis_drives; midp != NULL;
2146 		midp = midp->mid_next) {
2147 		mdcinfo_t	*cinfo;
2148 
2149 		/*
2150 		 * We pass down the list of the drives in the
2151 		 * set down to the kernel irrespective of
2152 		 * whether the drives have a replica or not.
2153 		 *
2154 		 * The kernel detects which of the drives don't
2155 		 * have a replica and accordingly does the
2156 		 * right thing.
2157 		 */
2158 		dnp = midp->mid_dnp;
2159 		if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) ||
2160 		    ((np = metaslicename(dnp, rep_slice, ep))
2161 		    == NULL)) {
2162 			mdclrerror(ep);
2163 			continue;
2164 		}
2165 
2166 		(void) strcpy(c.c_locator.l_devname, np->bname);
2167 		c.c_locator.l_dev = meta_cmpldev(np->dev);
2168 		c.c_locator.l_mnum = meta_getminor(np->dev);
2169 		c.c_locator.l_devid = (uintptr_t)Malloc(midp->mid_devid_sz);
2170 		(void) memcpy((void *)(uintptr_t)c.c_locator.l_devid,
2171 		    midp->mid_devid, midp->mid_devid_sz);
2172 		c.c_locator.l_devid_sz = midp->mid_devid_sz;
2173 		c.c_locator.l_devid_flags =
2174 		    MDDB_DEVID_VALID | MDDB_DEVID_SPACE | MDDB_DEVID_SZ;
2175 		if (midp->mid_o_devid) {
2176 			c.c_locator.l_old_devid =
2177 			    (uint64_t)(uintptr_t)Malloc(midp->mid_o_devid_sz);
2178 			(void) memcpy((void *)(uintptr_t)
2179 			    c.c_locator.l_old_devid,
2180 			    midp->mid_o_devid, midp->mid_o_devid_sz);
2181 			c.c_locator.l_old_devid_sz = midp->mid_o_devid_sz;
2182 		}
2183 		minor_name = meta_getminor_name(np->bname, ep);
2184 		(void) strncpy(c.c_locator.l_minor_name, minor_name,
2185 		    sizeof (c.c_locator.l_minor_name));
2186 
2187 		if ((cinfo = metagetcinfo(np, ep)) == NULL) {
2188 			mdclrerror(ep);
2189 			continue;
2190 		}
2191 		(void) strncpy(c.c_locator.l_driver, cinfo->dname,
2192 		    sizeof (c.c_locator.l_driver));
2193 
2194 		mirp = midp->mid_replicas;
2195 
2196 		do {
2197 			if (mirp) {
2198 				c.c_locator.l_flags = 0;
2199 				c.c_locator.l_blkno = mirp->mir_offset;
2200 				mirp = mirp->mir_next;
2201 			} else {
2202 				/*
2203 				 * Default offset for dummy is 16
2204 				 */
2205 				c.c_locator.l_blkno = 16;
2206 			}
2207 
2208 			if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) {
2209 				Free((void *)(uintptr_t)c.c_locator.l_devid);
2210 				if (c.c_locator.l_old_devid)
2211 					Free((void *)(uintptr_t)
2212 					    c.c_locator.l_old_devid);
2213 				return (mdstealerror(ep, &c.c_mde));
2214 			}
2215 		} while (mirp != NULL);
2216 	}
2217 
2218 	/*
2219 	 * If the dry run option was specified, flag success
2220 	 * and exit out
2221 	 */
2222 	if (dry_run == 1) {
2223 		md_eprintf("%s\n", dgettext(TEXT_DOMAIN,
2224 		    "import should be successful"));
2225 		Free((void *)(uintptr_t)c.c_locator.l_devid);
2226 		if (c.c_locator.l_old_devid)
2227 			Free((void *)(uintptr_t)c.c_locator.l_old_devid);
2228 		return (0);
2229 	}
2230 
2231 	/*
2232 	 * Now kernel should have all the information
2233 	 * regarding the import diskset replica.
2234 	 * Tell kernel to load them up and import the set
2235 	 */
2236 	if (metaioctl(MD_IOCIMP_LOAD, &c.c_setno, &c.c_mde, NULL) != 0) {
2237 		Free((void *)(uintptr_t)c.c_locator.l_devid);
2238 		if (c.c_locator.l_old_devid)
2239 			Free((void *)(uintptr_t)c.c_locator.l_old_devid);
2240 		return (mdstealerror(ep, &c.c_mde));
2241 	}
2242 
2243 	(void) meta_smf_enable(META_SMF_DISKSET, NULL);
2244 
2245 	/* The set has now been imported, create the appropriate symlink */
2246 	(void) snprintf(setname_link, MAXPATHLEN, "/dev/md/%s", setname);
2247 	(void) snprintf(setnum_link, MAXPATHLEN, "shared/%d", c.c_setno);
2248 
2249 	/*
2250 	 * Since we already verified that the setname was OK, make sure to
2251 	 * cleanup before proceeding.
2252 	 */
2253 	if (unlink(setname_link) == -1) {
2254 		if (errno != ENOENT)
2255 			(void) mdsyserror(ep, errno, setname_link);
2256 	}
2257 
2258 	if (symlink(setnum_link, setname_link) == -1)
2259 		(void) mdsyserror(ep, errno, setname_link);
2260 
2261 	/* resnarf the set that has just been imported */
2262 	if (clnt_resnarf_set(mynode(), c.c_setno, ep) != 0)
2263 		md_eprintf("%s\n", dgettext(TEXT_DOMAIN, "Please stop and "
2264 		    "restart rpc.metad"));
2265 
2266 	Free((void *)(uintptr_t)c.c_locator.l_devid);
2267 	if (c.c_locator.l_old_devid)
2268 		Free((void *)(uintptr_t)c.c_locator.l_old_devid);
2269 	return (0);
2270 }
2271