xref: /onnv-gate/usr/src/lib/lvm/libmeta/common/meta_metad_subr.c (revision 2063:a6ebd483c3cf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Just in case we're not in a build environment, make sure that
30  * TEXT_DOMAIN gets set to something.
31  */
32 #if !defined(TEXT_DOMAIN)
33 #define	TEXT_DOMAIN "SYS_TEST"
34 #endif
35 
36 /*
37  * interface between user land and the set records
38  */
39 
40 #include <meta.h>
41 #include <metad.h>
42 #include <sdssc.h>
43 #include <syslog.h>
44 #include <sys/cladm.h>
45 #include "meta_set_prv.h"
46 
47 #include <sys/sysevent/eventdefs.h>
48 #include <sys/sysevent/svm.h>
49 
static	md_set_record	*setrecords = NULL; /* head of cache linked list */
/*
 * Snarf state of the cache, as assigned in this file:
 *	0 - not snarfed (initial state, failed snarf, or cache emptied)
 *	1 - set_snarf() in progress / cache being populated
 *	2 - set_snarf() completed
 *	3 - sr_validate() completed
 */
static	int		setsnarfdone = 0;

/*
 * One (side, key) name-space pair; used by sr_sidenms() to remember
 * which side names are still referenced by cached drive records.
 */
typedef struct key_lst_t {
	side_t			kl_side;
	mdkey_t			kl_key;
	struct key_lst_t	*kl_next;
} key_lst_t;

/* Singly linked list of database record ids. */
typedef struct ur_recid_lst {
	mddb_recid_t		url_recid;
	struct	ur_recid_lst	*url_nx;
} ur_recid_lst_t;

/* Record ids seen in use during set_snarf() (only filled by the daemon) */
static ur_recid_lst_t		*url_used = NULL;
/* Record ids scheduled for deletion at the end of set_snarf() */
static ur_recid_lst_t		*url_tode = NULL;
66 
67 static void
68 url_addl(ur_recid_lst_t **urlpp, mddb_recid_t recid)
69 {
70 	/* Run to the end of the list */
71 	for (/* void */; (*urlpp != NULL); urlpp = &(*urlpp)->url_nx)
72 		if ((*urlpp)->url_recid == recid)
73 			return;
74 
75 	/* Add the new member */
76 	*urlpp = Zalloc(sizeof (**urlpp));
77 	if (*urlpp == NULL)
78 		return;
79 
80 	(*urlpp)->url_recid = recid;
81 }
82 
83 static int
84 url_findl(ur_recid_lst_t *urlp, mddb_recid_t recid)
85 {
86 	while (urlp != NULL) {
87 		if (urlp->url_recid == recid)
88 			return (1);
89 		urlp = urlp->url_nx;
90 	}
91 	return (0);
92 }
93 
94 static void
95 url_freel(ur_recid_lst_t **urlpp)
96 {
97 	ur_recid_lst_t	*urlp;
98 	ur_recid_lst_t	*turlp;
99 
100 	for (turlp = *urlpp; turlp != NULL; turlp = urlp) {
101 		urlp = turlp->url_nx;
102 		Free(turlp);
103 	}
104 	*urlpp = (ur_recid_lst_t *)NULL;
105 }
106 
107 static int
108 ckncvt_set_record(mddb_userreq_t *reqp, md_error_t *ep)
109 {
110 	mddb_userreq_t	req;
111 	md_set_record	*sr;
112 	int		recs[3];
113 
114 	if (reqp->ur_size == sizeof (*sr))
115 		return (0);
116 
117 	if (! md_in_daemon) {
118 		if (reqp->ur_size >= sizeof (*sr))
119 			return (0);
120 
121 		reqp->ur_data = (uintptr_t)Realloc((void *)(uintptr_t)
122 		    reqp->ur_data, sizeof (*sr));
123 		(void) memset(
124 		    ((char *)(uintptr_t)reqp->ur_data) + reqp->ur_size,
125 		    '\0', sizeof (*sr) - reqp->ur_size);
126 		reqp->ur_size = sizeof (*sr);
127 		return (0);
128 	}
129 
130 	/*
131 	 * If here, then the daemon is calling, and so the automatic
132 	 * conversion will be performed.
133 	 */
134 
135 	/* shorthand */
136 	req = *reqp;			/* structure assignment */
137 	sr = (md_set_record *)(uintptr_t)req.ur_data;
138 
139 	if (sr->sr_flags & MD_SR_CVT)
140 		return (0);
141 
142 	/* Leave multi-node set records alone */
143 	if (MD_MNSET_REC(sr)) {
144 		return (0);
145 	}
146 
147 	/* Mark the old record as converted */
148 	sr->sr_flags |= MD_SR_CVT;
149 
150 	METAD_SETUP_SR(MD_DB_SETDATA, sr->sr_selfid)
151 
152 	if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0)
153 		return (mdstealerror(ep, &req.ur_mde));
154 
155 	/* Create space for the new record */
156 	METAD_SETUP_SR(MD_DB_CREATE, 0);
157 	req.ur_size = sizeof (*sr);
158 
159 	if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0)
160 		return (mdstealerror(ep, &req.ur_mde));
161 
162 	/* Allocate the new record */
163 	sr = Zalloc(sizeof (*sr));
164 
165 	/* copy all the data from the record being converted */
166 	(void) memmove(sr, (void *)(uintptr_t)reqp->ur_data, reqp->ur_size);
167 	sr->sr_flags &= ~MD_SR_CVT;
168 
169 	/* adjust the selfid to point to the new record */
170 	sr->sr_selfid = req.ur_recid;
171 
172 	METAD_SETUP_SR(MD_DB_SETDATA, sr->sr_selfid)
173 	req.ur_size = sizeof (*sr);
174 	req.ur_data = (uintptr_t)sr;
175 
176 	if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) {
177 		Free(sr);
178 		return (mdstealerror(ep, &req.ur_mde));
179 	}
180 
181 	/* Commit the old and the new */
182 	recs[0] = ((md_set_record *)(uintptr_t)reqp->ur_data)->sr_selfid;
183 	recs[1] = sr->sr_selfid;
184 	recs[2] = 0;
185 
186 	METAD_SETUP_UR(MD_DB_COMMIT_MANY, 0, 0);
187 	req.ur_size = sizeof (recs);
188 	req.ur_data = (uintptr_t)recs;
189 
190 	if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) {
191 		Free(sr);
192 		return (mdstealerror(ep, &req.ur_mde));
193 	}
194 
195 	/* Add the the old record to the list of records to delete */
196 	url_addl(&url_tode,
197 	    ((md_set_record *)(uintptr_t)reqp->ur_data)->sr_selfid);
198 
199 	/* Free the old records space */
200 	Free((void *)(uintptr_t)reqp->ur_data);
201 
202 	/* Adjust the reqp structure to point to the new record and size */
203 	reqp->ur_recid = sr->sr_selfid;
204 	reqp->ur_size = sizeof (*sr);
205 	reqp->ur_data = (uintptr_t)sr;
206 
207 	return (0);
208 }
209 
210 mddb_userreq_t *
211 get_db_rec(
212 	md_ur_get_cmd_t	cmd,
213 	set_t		setno,
214 	mddb_type_t	type,
215 	uint_t		type2,
216 	mddb_recid_t	*idp,
217 	md_error_t	*ep
218 )
219 {
220 	mddb_userreq_t	*reqp = Zalloc(sizeof (*reqp));
221 	mdsetname_t	*sp;
222 	md_set_desc	*sd;
223 	int		ureq;
224 
225 	if ((sp = metasetnosetname(setno, ep)) == NULL) {
226 		Free(reqp);
227 		return (NULL);
228 	}
229 
230 	if (metaislocalset(sp)) {
231 		ureq = MD_DB_USERREQ;
232 	} else {
233 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
234 			Free(reqp);
235 			return (NULL);
236 		}
237 		ureq = MD_MNSET_DESC(sd) ? MD_MN_DB_USERREQ : MD_DB_USERREQ;
238 	}
239 
240 	reqp->ur_setno = setno;
241 	reqp->ur_type = type;
242 	reqp->ur_type2 = type2;
243 
244 	switch (cmd) {
245 	    case MD_UR_GET_NEXT:
246 		    reqp->ur_cmd = MD_DB_GETNEXTREC;
247 		    reqp->ur_recid = *idp;
248 		    if (metaioctl(ureq, reqp, &reqp->ur_mde, NULL)
249 			!= 0) {
250 			    (void) mdstealerror(ep, &reqp->ur_mde);
251 			    Free(reqp);
252 			    return (NULL);
253 		    }
254 		    *idp = reqp->ur_recid;
255 		    break;
256 	    case MD_UR_GET_WKEY:
257 		    reqp->ur_recid = *idp;
258 		    break;
259 	}
260 
261 	if (*idp <= 0) {
262 		Free(reqp);
263 		return (NULL);
264 	}
265 
266 	reqp->ur_cmd = MD_DB_GETSIZE;
267 	if (metaioctl(ureq, reqp, &reqp->ur_mde, NULL) != 0) {
268 		(void) mdstealerror(ep, &reqp->ur_mde);
269 		Free(reqp);
270 
271 		*idp = 0;
272 		return (NULL);
273 	}
274 
275 	reqp->ur_cmd = MD_DB_GETDATA;
276 	reqp->ur_data = (uintptr_t)Zalloc(reqp->ur_size);
277 	if (metaioctl(ureq, reqp, &reqp->ur_mde, NULL) != 0) {
278 		(void) mdstealerror(ep, &reqp->ur_mde);
279 		Free((void *)(uintptr_t)reqp->ur_data);
280 		Free(reqp);
281 		*idp = 0;
282 		return (NULL);
283 	}
284 
285 	switch (reqp->ur_type) {
286 	    case MDDB_USER:
287 		    switch (reqp->ur_type2) {
288 			case MDDB_UR_SR:
289 				if (ckncvt_set_record(reqp, ep)) {
290 					Free((void *)(uintptr_t)reqp->ur_data);
291 					Free(reqp);
292 					return (NULL);
293 				}
294 				break;
295 		    }
296 		    break;
297 	}
298 
299 	return (reqp);
300 }
301 
302 void *
303 get_ur_rec(
304 	set_t		setno,
305 	md_ur_get_cmd_t	cmd,
306 	uint_t		type2,
307 	mddb_recid_t	*idp,
308 	md_error_t	*ep
309 )
310 {
311 	mddb_userreq_t	*reqp = NULL;
312 	void		*ret_val;
313 
314 	assert(idp != NULL);
315 
316 	reqp = get_db_rec(cmd, setno, MDDB_USER, type2, idp, ep);
317 	if (reqp == NULL)
318 		return (NULL);
319 
320 	ret_val = (void *)(uintptr_t)reqp->ur_data;
321 	Free(reqp);
322 	return (ret_val);
323 }
324 
325 /*
326  * Called by rpc.metad on startup of disksets to cleanup
327  * the host entries associated with a diskset.  This is needed if
328  * a node failed or the metaset command was killed during the addition
329  * of a node to a diskset.
330  *
331  * This is called for all traditional disksets.
332  * This is only called for MNdisksets when in there is only one node
333  * in all of the MN disksets and this node is not running SunCluster.
334  * (Otherwise, the cleanup of the host entries is handled by a
335  * reconfig cycle that the SunCluster software calls).
336  */
337 static int
338 sr_hosts(md_set_record *sr)
339 {
340 	int		i,
341 			nid,
342 			self_in_set = FALSE;
343 	md_error_t	xep = mdnullerror;
344 	md_mnnode_record	*nr;
345 	md_mnset_record		*mnsr;
346 
347 	if (MD_MNSET_REC(sr)) {
348 		mnsr = (struct md_mnset_record *)sr;
349 		nr = mnsr->sr_nodechain;
350 		/*
351 		 * Already guaranteed to be only 1 node in set which
352 		 * is mynode (done in sr_validate).
353 		 * Now, check if node is in the OK state.  If not in
354 		 * the OK state, leave self_in_set FALSE so that
355 		 * set will be removed.
356 		 */
357 		if (nr->nr_flags & MD_MN_NODE_OK)
358 			self_in_set = TRUE;
359 	} else {
360 		for (i = 0; i < MD_MAXSIDES; i++) {
361 			/* Skip empty slots */
362 			if (sr->sr_nodes[i][0] == '\0')
363 				continue;
364 
365 			/* Make sure we are in the set and skip this node */
366 			if (strcmp(sr->sr_nodes[i], mynode()) == 0) {
367 				self_in_set = TRUE;
368 				break;
369 			}
370 		}
371 	}
372 
373 	if ((self_in_set == FALSE) && (!(MD_MNSET_REC(sr)))) {
374 	    if (_cladm(CL_CONFIG, CL_NODEID, &nid) == 0) {
375 
376 		/*
377 		 * See if we've got a node which has been booted in
378 		 * non-cluster mode. If true the nodeid will match
379 		 * one of the sr_nodes values because the conversion
380 		 * from nodeid to hostname failed to occur.
381 		 */
382 		for (i = 0; i < MD_MAXSIDES; i++) {
383 			if (sr->sr_nodes[i][0] == 0)
384 				continue;
385 			if (atoi(sr->sr_nodes[i]) == nid)
386 				self_in_set = TRUE;
387 		}
388 
389 		/* If we aren't in the set, delete the set */
390 		if (self_in_set == FALSE) {
391 			syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
392 			    "Removing set %s from database\n"), sr->sr_setname);
393 			s_delset(sr->sr_setname, &xep);
394 			if (! mdisok(&xep))
395 				mdclrerror(&xep);
396 			return (1);
397 		}
398 	    } else {
399 		/*
400 		 * Send a message to syslog and return without
401 		 * deleting any sets
402 		 */
403 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
404 			"Call to _cladm failed for set %s\n"),
405 			sr->sr_setname);
406 		return (1);
407 	    }
408 	}
409 	return (0);
410 }
411 
412 void
413 sr_del_drv(md_set_record *sr, mddb_recid_t recid)
414 {
415 	mddb_userreq_t		req;
416 	md_error_t		xep = mdnullerror;
417 
418 	if (!s_ownset(sr->sr_setno, &xep)) {
419 		if (! mdisok(&xep))
420 			mdclrerror(&xep);
421 		goto skip;
422 	}
423 
424 	/* delete the replicas? */
425 	/* release ownership of the drive? */
426 	/* NOTE: We may not have a name, so both of the above are ugly! */
427 
428 skip:
429 	(void) memset(&req, 0, sizeof (req));
430 	METAD_SETUP_DR(MD_DB_DELETE, recid)
431 	if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0)
432 		mdclrerror(&req.ur_mde);
433 
434 	dr_cache_del(sr, recid);
435 }
436 
437 static void
438 sr_drvs(md_set_record *sr)
439 {
440 	md_drive_record		*dr;
441 	int			i;
442 	int			modified = 0;
443 	int			sidesok;
444 	mdnm_params_t		nm;
445 	static	char		device_name[MAXPATHLEN];
446 	md_error_t		xep = mdnullerror;
447 	md_mnnode_record	*nr;
448 	md_mnset_record		*mnsr;
449 
450 	for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) {
451 		/* If we were mid-add, cleanup */
452 		if ((dr->dr_flags & MD_DR_ADD)) {
453 			sr_del_drv(sr, dr->dr_selfid);
454 			modified++;
455 			continue;
456 		}
457 
458 		sidesok = TRUE;
459 		if (MD_MNSET_REC(sr)) {
460 			mnsr = (md_mnset_record *)sr;
461 			nr = mnsr->sr_nodechain;
462 			/*
463 			 * MultiNode disksets only have entries for
464 			 * their side in the local set.  Verify
465 			 * that drive has a name associated with
466 			 * this node's side.
467 			 */
468 			while (nr) {
469 				/* Find my node */
470 				if (strcmp(mynode(), nr->nr_nodename) != 0) {
471 					nr = nr->nr_next;
472 					continue;
473 				}
474 
475 				(void) memset(&nm, '\0', sizeof (nm));
476 				nm.setno = MD_LOCAL_SET;
477 				nm.side = nr->nr_nodeid;
478 				nm.key = dr->dr_key;
479 				nm.devname = (uint64_t)device_name;
480 
481 				if (metaioctl(MD_IOCGET_NM, &nm, &nm.mde,
482 				    NULL) != 0) {
483 					if (! mdissyserror(&nm.mde, ENOENT)) {
484 						mdclrerror(&nm.mde);
485 						return;
486 					}
487 				}
488 
489 				/*
490 				 * If entry is found for this node, then
491 				 * break out of loop walking through
492 				 * node list.  For a multi-node diskset,
493 				 * there should only be an entry for
494 				 * this node.
495 				 */
496 				if (nm.key != MD_KEYWILD &&
497 				    ! mdissyserror(&nm.mde, ENOENT)) {
498 					break;
499 				}
500 
501 				/*
502 				 * If entry is not found for this node,
503 				 * then delete the drive.  No need to
504 				 * continue through the node loop since
505 				 * our node has already been found.
506 				 */
507 				sidesok = FALSE;
508 				mdclrerror(&nm.mde);
509 
510 				/* If we are missing a sidename, cleanup */
511 				sr_del_drv(sr, dr->dr_selfid);
512 				modified++;
513 
514 				break;
515 			}
516 		} else  {
517 			for (i = 0; i < MD_MAXSIDES; i++) {
518 				/* Skip empty slots */
519 				if (sr->sr_nodes[i][0] == '\0')
520 					continue;
521 
522 				(void) memset(&nm, '\0', sizeof (nm));
523 				nm.setno = MD_LOCAL_SET;
524 				nm.side = i + SKEW;
525 				nm.key = dr->dr_key;
526 				nm.devname = (uint64_t)device_name;
527 
528 				if (metaioctl(MD_IOCGET_NM, &nm, &nm.mde,
529 				    NULL) != 0) {
530 					if (! mdissyserror(&nm.mde, ENOENT)) {
531 						mdclrerror(&nm.mde);
532 						return;
533 					}
534 				}
535 
536 				if (nm.key != MD_KEYWILD &&
537 				    ! mdissyserror(&nm.mde, ENOENT))
538 					continue;
539 
540 				sidesok = FALSE;
541 				mdclrerror(&nm.mde);
542 
543 				/* If we are missing a sidename, cleanup */
544 				sr_del_drv(sr, dr->dr_selfid);
545 				modified++;
546 
547 				break;
548 			}
549 		}
550 
551 		if (sidesok == FALSE)
552 			continue;
553 
554 		/*
555 		 * If we got this far, the drive record is either in the OK
556 		 * or DEL state, if it is in the DEL state and the sidenames
557 		 * all checked out, then we will make it OK.
558 		 */
559 		if ((dr->dr_flags & MD_DR_OK))
560 			continue;
561 
562 		dr->dr_flags = MD_DR_OK;
563 
564 		modified++;
565 	}
566 
567 	if (modified) {
568 		commitset(sr, FALSE, &xep);
569 		if (! mdisok(&xep))
570 			mdclrerror(&xep);
571 	}
572 }
573 
574 static void
575 add_key_to_lst(key_lst_t **klpp, side_t side, mdkey_t key)
576 {
577 	key_lst_t	*klp;
578 
579 	assert(klpp != NULL);
580 
581 	for (/* void */; *klpp != NULL; klpp = &(*klpp)->kl_next)
582 		/* void */;
583 
584 	/* allocate new list element */
585 	klp = *klpp = Zalloc(sizeof (*klp));
586 
587 	klp->kl_side = side;
588 	klp->kl_key  = key;
589 }
590 
#ifdef DUMPKEYLST
/*
 * Debug aid: print tag followed by every (side, key) pair on the list.
 */
static void
pr_key_lst(char *tag, key_lst_t *klp)
{
	key_lst_t	*cur = klp;

	md_eprintf("Tag=%s\n", tag);
	while (cur != NULL) {
		md_eprintf("side=%d, key=%lu\n", cur->kl_side, cur->kl_key);
		cur = cur->kl_next;
	}
}
#endif	/* DUMPKEYLST */
602 
603 static int
604 key_in_key_lst(key_lst_t *klp, side_t side, mdkey_t key)
605 {
606 	key_lst_t	*tklp;
607 
608 	for (tklp = klp; tklp != NULL; tklp = tklp->kl_next)
609 		if (tklp->kl_side == side && tklp->kl_key == key)
610 			return (1);
611 
612 	return (0);
613 }
614 
615 static void
616 destroy_key_lst(key_lst_t **klpp)
617 {
618 	key_lst_t	*tklp, *klp;
619 
620 	assert(klpp != NULL);
621 
622 	tklp = klp = *klpp;
623 	while (klp != NULL) {
624 		tklp = klp;
625 		klp = klp->kl_next;
626 		Free(tklp);
627 	}
628 	*klpp = NULL;
629 }
630 
631 static void
632 sr_sidenms(void)
633 {
634 	md_drive_record		*dr;
635 	md_set_record		*sr;
636 	key_lst_t		*use = NULL;
637 	mdnm_params_t		nm;
638 	int			i;
639 	md_mnset_record		*mnsr;
640 	md_mnnode_record	*nr;
641 	side_t			myside = 0;
642 
643 	/*
644 	 * We now go through the list of set and drive records collecting
645 	 * the key/side pairs that are being used.
646 	 */
647 	for (sr = setrecords; sr != NULL; sr = sr->sr_next) {
648 		/*
649 		 * To handle the multi-node diskset case, get the sideno
650 		 * associated with this node.  This sideno will be the
651 		 * same across all multi-node disksets.
652 		 */
653 		if ((myside == 0) && (MD_MNSET_REC(sr))) {
654 			mnsr = (struct md_mnset_record *)sr;
655 			nr = mnsr->sr_nodechain;
656 			while (nr) {
657 				if (strcmp(mynode(), nr->nr_nodename) == 0) {
658 					myside = nr->nr_nodeid;
659 					break;
660 				}
661 				nr = nr->nr_next;
662 			}
663 			/*
664 			 * If this node is not in this MNset -
665 			 * then skip this set.
666 			 */
667 			if (!nr) {
668 				continue;
669 			}
670 		}
671 
672 		for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) {
673 			if (MD_MNSET_REC(sr)) {
674 				/*
675 				 * There are no non-local sidenames in the
676 				 * local set for a multi-node diskset.
677 				 */
678 				add_key_to_lst(&use, myside, dr->dr_key);
679 			} else {
680 				for (i = 0; i < MD_MAXSIDES; i++) {
681 					/* Skip empty slots */
682 					if (sr->sr_nodes[i][0] == '\0')
683 						continue;
684 
685 					add_key_to_lst(&use, i + SKEW,
686 						dr->dr_key);
687 				}
688 			}
689 		}
690 	}
691 
692 #ifdef DUMPKEYLST
693 	pr_key_lst("use", use);
694 #endif	/* DUMPKEYLST */
695 
696 	/*
697 	 * We take the list above and get all non-local sidenames, checking
698 	 * each to see if they are in use, if they are not used, we delete them.
699 	 * Do the check for myside to cover multinode disksets.
700 	 * Then do the check for MD_MAXSIDES to cover non-multinode disksets.
701 	 * If any multi-node disksets were present, myside would be non-zero.
702 	 * myside is the same for all multi-node disksets for this node.
703 	 */
704 	if (myside) {
705 		(void) memset(&nm, '\0', sizeof (nm));
706 		nm.setno = MD_LOCAL_SET;
707 		nm.side = myside;
708 		nm.key = MD_KEYWILD;
709 
710 		/*CONSTCOND*/
711 		while (1) {
712 			if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde,
713 			    NULL) != 0) {
714 				mdclrerror(&nm.mde);
715 				break;
716 			}
717 
718 			if (nm.key == MD_KEYWILD)
719 				break;
720 
721 			if (! key_in_key_lst(use, nm.side, nm.key)) {
722 				if (metaioctl(MD_IOCREM_NM, &nm, &nm.mde,
723 				    NULL) != 0) {
724 					mdclrerror(&nm.mde);
725 					continue;
726 				}
727 			}
728 		}
729 	}
730 	/* Now handle the non-multinode disksets */
731 	for (i = 0; i < MD_MAXSIDES; i++) {
732 		(void) memset(&nm, '\0', sizeof (nm));
733 		nm.setno = MD_LOCAL_SET;
734 		nm.side = i + SKEW;
735 		nm.key = MD_KEYWILD;
736 
737 		/*CONSTCOND*/
738 		while (1) {
739 			if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde,
740 			    NULL) != 0) {
741 				mdclrerror(&nm.mde);
742 				break;
743 			}
744 
745 			if (nm.key == MD_KEYWILD)
746 				break;
747 
748 			if (! key_in_key_lst(use, nm.side, nm.key)) {
749 				if (metaioctl(MD_IOCREM_NM, &nm, &nm.mde,
750 				    NULL) != 0) {
751 					mdclrerror(&nm.mde);
752 					continue;
753 				}
754 			}
755 		}
756 	}
757 
758 	/* Cleanup */
759 	destroy_key_lst(&use);
760 }
761 
762 void
763 sr_validate(void)
764 {
765 	md_set_record			*sr;
766 	md_error_t			xep = mdnullerror;
767 	int				mnset_single_node;
768 	md_mnnode_record		*nr;
769 	md_mnset_record			*mnsr;
770 
771 	assert(setsnarfdone != 0);
772 
773 	/* We have validated the records already */
774 	if (setsnarfdone == 3)
775 		return;
776 
777 	/*
778 	 * Check if we are in a single node non-SC3.x environmemnt
779 	 */
780 	mnset_single_node = meta_mn_singlenode();
781 	/*
782 	 * If a possible single_node situation, verify that all
783 	 * MN disksets have only one node (which is mynode()).
784 	 */
785 	if (mnset_single_node) {
786 		for (sr = setrecords; sr != NULL; sr = sr->sr_next) {
787 			if (MD_MNSET_REC(sr)) {
788 				mnsr = (struct md_mnset_record *)sr;
789 				nr = mnsr->sr_nodechain;
790 				/*
791 				 * If next pointer is non-null (more than
792 				 * one node in list) or if the single node
793 				 * isn't my node - reset single node flag.
794 				 */
795 				if ((nr->nr_next) ||
796 				    (strcmp(nr->nr_nodename, mynode()) != 0)) {
797 					mnset_single_node = 0;
798 					break;
799 				}
800 			}
801 		}
802 	}
803 
804 	for (sr = setrecords; sr != NULL; sr = sr->sr_next) {
805 		/*
806 		 * If a MN diskset and not in the single node
807 		 * situation, then don't validate the MN set.
808 		 * This is done during a reconfig cycle since all
809 		 * nodes must take the same action.
810 		 */
811 		if (MD_MNSET_REC(sr) && (mnset_single_node == 0))
812 			continue;
813 
814 		/* Since we do "partial" snarf's, we only check new entries */
815 		if (! (sr->sr_flags & MD_SR_CHECK))
816 			continue;
817 
818 		/* If we were mid-add, cleanup */
819 		if ((sr->sr_flags & MD_SR_ADD)) {
820 			s_delset(sr->sr_setname, &xep);
821 			if (! mdisok(&xep))
822 				mdclrerror(&xep);
823 			continue;
824 		}
825 
826 		/* Make sure we are in the set. */
827 		if (sr_hosts(sr))
828 			continue;
829 
830 		/* Check has been done, clear the flag */
831 		if ((sr->sr_flags & MD_SR_CHECK))
832 			sr->sr_flags &= ~MD_SR_CHECK;
833 
834 		/*
835 		 * If we got here, we are in the set, make sure the flags make
836 		 * sense.
837 		 */
838 		if (! (sr->sr_flags & MD_SR_OK)) {
839 			sr->sr_flags &= ~MD_SR_STATE_FLAGS;
840 			sr->sr_flags |= MD_SR_OK;
841 			commitset(sr, FALSE, &xep);
842 			if (! mdisok(&xep))
843 				mdclrerror(&xep);
844 		}
845 
846 		/* Make sure all the drives are in a stable state. */
847 		sr_drvs(sr);
848 	}
849 
850 	/* Cleanup any stray sidenames */
851 	sr_sidenms();
852 
853 	setsnarfdone = 3;
854 }
855 
856 static md_set_record *
857 sr_in_cache(mddb_recid_t recid)
858 {
859 	md_set_record *tsr;
860 
861 	for (tsr = setrecords; tsr != NULL; tsr = tsr->sr_next)
862 		if (tsr->sr_selfid == recid)
863 			return (tsr);
864 	return ((md_set_record *)NULL);
865 }
866 
867 int
868 set_snarf(md_error_t *ep)
869 {
870 	md_set_record			*sr;
871 	md_mnset_record			*mnsr;
872 	md_set_record			*tsr;
873 	md_drive_record			*dr;
874 	mddb_userreq_t			*reqp;
875 	ur_recid_lst_t			*urlp;
876 	mddb_recid_t			id;
877 	mddb_recid_t			*p;
878 	md_error_t			xep = mdnullerror;
879 	md_mnnode_record		*nr;
880 	mddb_set_node_params_t		snp;
881 	int				nodecnt;
882 	mndiskset_membershiplist_t	 *nl, *nl2;
883 
884 	/* We have done the snarf call */
885 	if (setsnarfdone != 0)
886 		return (0);
887 
888 	if (meta_setup_db_locations(ep) != 0) {
889 		if (! mdismddberror(ep, MDE_DB_STALE))
890 			return (-1);
891 		mdclrerror(ep);
892 	}
893 
894 	/*
895 	 * Get membershiplist from API routine.
896 	 * If there's an error, just use a NULL
897 	 * nodelist.
898 	 */
899 	if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
900 		nodecnt = 0;  /* no nodes are alive */
901 		nl = NULL;
902 		mdclrerror(ep);
903 	}
904 
905 	/* Let sr_cache_add and dr_cache_add know we are doing the snarf */
906 	setsnarfdone = 1;
907 
908 	/* Go get the set records */
909 	id = 0;
910 	while ((sr = get_ur_rec(MD_LOCAL_SET, MD_UR_GET_NEXT, MDDB_UR_SR,
911 							&id, ep)) != NULL) {
912 		sr->sr_next = NULL;
913 		sr->sr_drivechain = NULL;
914 
915 		/*
916 		 * Cluster nodename support
917 		 * Convert nodeid -> nodename
918 		 * Don't do this for MN disksets since we've already stored
919 		 * both the nodeid and name.
920 		 */
921 		if (!(MD_MNSET_REC(sr)))
922 			sdssc_cm_sr_nid2nm(sr);
923 
924 		/* If we were mid-cvt, cleanup */
925 		if (sr->sr_flags & MD_SR_CVT) {
926 			/* If the daemon is calling, cleanup */
927 			if (md_in_daemon)
928 				url_addl(&url_tode, sr->sr_selfid);
929 			continue;
930 		}
931 
932 		if (md_in_daemon)
933 			url_addl(&url_used, sr->sr_selfid);
934 
935 		/* Skip cached records */
936 		tsr = sr_in_cache(sr->sr_selfid);
937 		if (tsr != (md_set_record *)NULL) {
938 			if (MD_MNSET_REC(sr)) {
939 				mnsr = (struct md_mnset_record *)sr;
940 				Free(mnsr);
941 			} else {
942 				Free(sr);
943 			}
944 			if (md_in_daemon)
945 				for (dr = tsr->sr_drivechain;
946 				    dr != (md_drive_record *)NULL;
947 				    dr = dr->dr_next)
948 					url_addl(&url_used, dr->dr_selfid);
949 			continue;
950 		}
951 
952 		/* Mark the record as one to be checked */
953 		sr->sr_flags |= MD_SR_CHECK;
954 
955 		sr_cache_add(sr);
956 
957 		/* If MNdiskset, go get the node records */
958 		if (MD_MNSET_REC(sr)) {
959 			mnsr = (struct md_mnset_record *)sr;
960 			mnsr->sr_nodechain = NULL;
961 			p = &mnsr->sr_noderec;
962 			while ((nr = get_ur_rec(MD_LOCAL_SET, MD_UR_GET_WKEY,
963 					MDDB_UR_NR, p, ep)) != NULL) {
964 				nr->nr_next = NULL;
965 
966 				if (md_in_daemon)
967 					url_addl(&url_used, nr->nr_selfid);
968 
969 				/*
970 				 * Turn off ALIVE node flag based on member
971 				 * list.
972 				 * If ALIVE flag is not set, reset OWN flag.
973 				 * If this node is mynode, set the OWN flag
974 				 * to match the ownership of the diskset.
975 				 */
976 				if (md_in_daemon) {
977 					nr->nr_flags &= ~MD_MN_NODE_ALIVE;
978 					nl2 = nl;
979 					while (nl2) {
980 						/*
981 						 * If in member list,
982 						 * set alive.
983 						 */
984 						if (nl2->msl_node_id ==
985 						    nr->nr_nodeid) {
986 							nr->nr_flags |=
987 							    MD_MN_NODE_ALIVE;
988 							break;
989 						}
990 						nl2 = nl2->next;
991 					}
992 					/*
993 					 * If mynode is in member list, then
994 					 * check to see if set is snarfed.
995 					 * If set snarfed, set own flag;
996 					 * otherwise reset it.
997 					 * Don't change master even if
998 					 * node isn't an owner node, since
999 					 * node may be master, but hasn't
1000 					 * joined the set yet.
1001 					 */
1002 					if (nr->nr_flags & MD_MN_NODE_ALIVE) {
1003 					    if (strcmp(nr->nr_nodename,
1004 						mynode()) == 0) {
1005 						    if (s_ownset(
1006 							mnsr->sr_setno, ep)) {
1007 							nr->nr_flags |=
1008 							    MD_MN_NODE_OWN;
1009 						    } else {
1010 							nr->nr_flags &=
1011 							    ~MD_MN_NODE_OWN;
1012 						    }
1013 					    }
1014 					} else {
1015 					    if (strcmp(nr->nr_nodename,
1016 						mynode()) == 0) {
1017 						/*
1018 						 * If my node isn't in member
1019 						 * list then reset master.
1020 						 */
1021 						mnsr = (struct
1022 						    md_mnset_record *)sr;
1023 						mnsr->sr_master_nodeid =
1024 							MD_MN_INVALID_NID;
1025 						mnsr->sr_master_nodenm[0] =
1026 							'\0';
1027 					    }
1028 					    nr->nr_flags &= ~MD_MN_NODE_OWN;
1029 					}
1030 				}
1031 
1032 				/*
1033 				 * Must grab nr_nextrec now since
1034 				 * mnnr_cache_add may change it
1035 				 * (mnnr_cache_add is storing the nodes in
1036 				 * an ascending nodeid order list in order
1037 				 * to support reconfig).
1038 				 */
1039 				if (nr->nr_nextrec != 0)
1040 					p = &nr->nr_nextrec;
1041 				else
1042 					p = NULL;
1043 
1044 				mnnr_cache_add((struct md_mnset_record *)sr,
1045 					nr);
1046 
1047 				if ((md_in_daemon) &&
1048 				    (strcmp(nr->nr_nodename, mynode()) == 0)) {
1049 					(void) memset(&snp, 0, sizeof (snp));
1050 					snp.sn_nodeid = nr->nr_nodeid;
1051 					snp.sn_setno = mnsr->sr_setno;
1052 					if (metaioctl(MD_MN_SET_NODEID, &snp,
1053 					    &snp.sn_mde, NULL) != 0) {
1054 						(void) mdstealerror(ep,
1055 							&snp.sn_mde);
1056 					}
1057 				}
1058 
1059 				if (p == NULL)
1060 					break;
1061 			}
1062 			if (! mdisok(ep)) {
1063 				if (! mdissyserror(ep, ENOENT))
1064 					goto out;
1065 				mdclrerror(ep);
1066 			}
1067 		}
1068 
1069 		if (sr->sr_driverec == 0)
1070 			continue;
1071 
1072 		/* Go get the drive records */
1073 		p = &sr->sr_driverec;
1074 		while ((dr = get_ur_rec(MD_LOCAL_SET, MD_UR_GET_WKEY,
1075 				MDDB_UR_DR, p, ep)) != NULL) {
1076 			dr->dr_next = NULL;
1077 
1078 			if (md_in_daemon)
1079 				url_addl(&url_used, dr->dr_selfid);
1080 
1081 			dr_cache_add(sr, dr);
1082 
1083 			if (dr->dr_nextrec == 0)
1084 				break;
1085 
1086 			p = &dr->dr_nextrec;
1087 		}
1088 		if (! mdisok(ep)) {
1089 			if (! mdissyserror(ep, ENOENT))
1090 				goto out;
1091 			mdclrerror(ep);
1092 			/*
1093 			 * If dr_nextrec was not valid, or we had some
1094 			 * problem getting the record, we end up here.
1095 			 * get_ur_rec() zeroes the recid we passed in,
1096 			 * if we had a failure getting a record using a key,
1097 			 * so we simply commit the set record and valid
1098 			 * drive records, if this fails, we hand an error
1099 			 * back to the caller.
1100 			 */
1101 			commitset(sr, FALSE, ep);
1102 			if (! mdisok(ep))
1103 				goto out;
1104 		}
1105 	}
1106 	if (! mdisok(ep)) {
1107 		if (! mdissyserror(ep, ENOENT))
1108 			goto out;
1109 		mdclrerror(ep);
1110 	}
1111 
1112 	/*
1113 	 * If the daemon called, go through the USER records and cleanup
1114 	 * any that are not used by valid sets.
1115 	 */
1116 	if (md_in_daemon) {
1117 		id = 0;
1118 		/* Make a list of records to delete */
1119 		while ((reqp = get_db_rec(MD_UR_GET_NEXT, MD_LOCAL_SET,
1120 		    MDDB_USER, 0, &id, ep)) != NULL) {
1121 			if (reqp->ur_type2 != MDDB_UR_SR &&
1122 			    reqp->ur_type2 != MDDB_UR_DR) {
1123 				Free((void *)(uintptr_t)reqp->ur_data);
1124 				Free(reqp);
1125 				continue;
1126 			}
1127 			if (! url_findl(url_used, reqp->ur_recid))
1128 				url_addl(&url_tode, reqp->ur_recid);
1129 			Free((void *)(uintptr_t)reqp->ur_data);
1130 			Free(reqp);
1131 		}
1132 		if (! mdisok(ep)) {
1133 			if (! mdissyserror(ep, ENOENT))
1134 				goto out;
1135 			mdclrerror(ep);
1136 		}
1137 
1138 		/* Delete all the delete listed records */
1139 		for (urlp = url_tode; urlp != NULL; urlp = urlp->url_nx) {
1140 			s_delrec(urlp->url_recid, &xep);
1141 			if (! mdisok(&xep))
1142 				mdclrerror(&xep);
1143 		}
1144 	}
1145 
1146 	url_freel(&url_used);
1147 	url_freel(&url_tode);
1148 
1149 	if (nodecnt)
1150 		meta_free_nodelist(nl);
1151 
1152 	/* Mark the snarf complete */
1153 	setsnarfdone = 2;
1154 	return (0);
1155 
1156 out:
1157 	url_freel(&url_used);
1158 	url_freel(&url_tode);
1159 
1160 	sr_cache_flush(1);
1161 
1162 	if (nodecnt)
1163 		meta_free_nodelist(nl);
1164 
1165 	/* Snarf failed, reset state */
1166 	setsnarfdone = 0;
1167 
1168 	return (-1);
1169 }
1170 
1171 void
1172 sr_cache_add(md_set_record *sr)
1173 {
1174 	md_set_record *tsr;
1175 
1176 	assert(setsnarfdone != 0);
1177 
1178 	if (setrecords == NULL) {
1179 		setrecords = sr;
1180 		return;
1181 	}
1182 
1183 	for (tsr = setrecords; tsr->sr_next != NULL; tsr = tsr->sr_next)
1184 		/* void */;
1185 	tsr->sr_next = sr;
1186 }
1187 
1188 void
1189 sr_cache_del(mddb_recid_t recid)
1190 {
1191 	md_set_record	*sr, *tsr;
1192 	md_mnset_record	*mnsr;
1193 
1194 	assert(setsnarfdone != 0);
1195 
1196 	for (sr = tsr = setrecords; sr != NULL; tsr = sr, sr = sr->sr_next) {
1197 		if (sr->sr_selfid != recid)
1198 			continue;
1199 		if (sr == setrecords)
1200 			setrecords = sr->sr_next;
1201 		else
1202 			tsr->sr_next = sr->sr_next;
1203 		if (MD_MNSET_REC(sr)) {
1204 			mnsr = (struct md_mnset_record *)sr;
1205 			Free(mnsr);
1206 		} else {
1207 			Free(sr);
1208 		}
1209 		break;
1210 	}
1211 	if (setrecords == NULL)
1212 		setsnarfdone = 0;
1213 }
1214 
1215 void
1216 dr_cache_add(md_set_record *sr, md_drive_record *dr)
1217 {
1218 	md_drive_record	*tdr;
1219 
1220 	assert(setsnarfdone != 0);
1221 
1222 	assert(sr != NULL);
1223 
1224 	if (sr->sr_drivechain == NULL) {
1225 		sr->sr_drivechain = dr;
1226 		sr->sr_driverec = dr->dr_selfid;
1227 		return;
1228 	}
1229 
1230 	for (tdr = sr->sr_drivechain; tdr->dr_next != NULL; tdr = tdr->dr_next)
1231 		/* void */;
1232 
1233 	tdr->dr_next = dr;
1234 	tdr->dr_nextrec = dr->dr_selfid;
1235 }
1236 
1237 void
1238 dr_cache_del(md_set_record *sr, mddb_recid_t recid)
1239 {
1240 	md_drive_record *dr;
1241 	md_drive_record *tdr;
1242 
1243 	assert(setsnarfdone != 0);
1244 
1245 	assert(sr != NULL);
1246 
1247 	for (dr = tdr = sr->sr_drivechain; dr != NULL;
1248 	    tdr = dr, dr = dr->dr_next) {
1249 		if (dr->dr_selfid != recid)
1250 			continue;
1251 
1252 		if (dr == sr->sr_drivechain) {
1253 			sr->sr_drivechain = dr->dr_next;
1254 			sr->sr_driverec = dr->dr_nextrec;
1255 		} else {
1256 			tdr->dr_next = dr->dr_next;
1257 			tdr->dr_nextrec = dr->dr_nextrec;
1258 		}
1259 		Free(dr);
1260 		break;
1261 	}
1262 }
1263 
1264 /*
1265  * Nodes must be kept in ascending node id order in order to
1266  * support reconfig.
1267  *
1268  * This routine may change nr->nr_next and nr->nr_nextrec.
1269  */
1270 void
1271 mnnr_cache_add(md_mnset_record *mnsr, md_mnnode_record *nr)
1272 {
1273 	md_mnnode_record	*tnr, *tnr_prev;
1274 
1275 	assert(mnsr != NULL);
1276 
1277 	if (mnsr->sr_nodechain == NULL) {
1278 		mnsr->sr_nodechain = nr;
1279 		mnsr->sr_noderec = nr->nr_selfid;
1280 		return;
1281 	}
1282 
1283 	/*
1284 	 * If new_record->nodeid < first_record->nodeid,
1285 	 * put new_record at beginning of list.
1286 	 */
1287 	if (nr->nr_nodeid < mnsr->sr_nodechain->nr_nodeid) {
1288 		nr->nr_next = mnsr->sr_nodechain;
1289 		nr->nr_nextrec = mnsr->sr_noderec;
1290 		mnsr->sr_nodechain = nr;
1291 		mnsr->sr_noderec = nr->nr_selfid;
1292 		return;
1293 	}
1294 
1295 	/*
1296 	 * Walk list looking for place to insert record.
1297 	 */
1298 
1299 	tnr_prev = mnsr->sr_nodechain;
1300 	tnr = tnr_prev->nr_next;
1301 	while (tnr) {
1302 		/* Insert new record between tnr_prev and tnr */
1303 		if (nr->nr_nodeid < tnr->nr_nodeid) {
1304 			nr->nr_next = tnr;
1305 			nr->nr_nextrec = tnr->nr_selfid; /* tnr's recid */
1306 			tnr_prev->nr_next = nr;
1307 			tnr_prev->nr_nextrec = nr->nr_selfid;
1308 			return;
1309 		}
1310 		tnr_prev = tnr;
1311 		tnr = tnr->nr_next;
1312 	}
1313 
1314 	/*
1315 	 * Add record to end of list.
1316 	 */
1317 	tnr_prev->nr_next = nr;
1318 	tnr_prev->nr_nextrec = nr->nr_selfid;
1319 }
1320 
1321 void
1322 mnnr_cache_del(md_mnset_record *mnsr, mddb_recid_t recid)
1323 {
1324 	md_mnnode_record *nr;
1325 	md_mnnode_record *tnr;
1326 
1327 	assert(mnsr != NULL);
1328 
1329 	tnr = 0;
1330 	nr = mnsr->sr_nodechain;
1331 	while (nr) {
1332 		if (nr->nr_selfid != recid) {
1333 			tnr = nr;
1334 			nr = nr->nr_next;
1335 			continue;
1336 		}
1337 
1338 		if (nr == mnsr->sr_nodechain) {
1339 			mnsr->sr_nodechain = nr->nr_next;
1340 			mnsr->sr_noderec = nr->nr_nextrec;
1341 		} else {
1342 			tnr->nr_next = nr->nr_next;
1343 			tnr->nr_nextrec = nr->nr_nextrec;
1344 		}
1345 		Free(nr);
1346 		break;
1347 	}
1348 }
1349 
1350 int
1351 metad_isautotakebyname(char *setname)
1352 {
1353 	md_error_t	error = mdnullerror;
1354 	md_set_record	*sr;
1355 
1356 	if (md_in_daemon)
1357 	    assert(setsnarfdone != 0);
1358 	else if (set_snarf(&error)) {
1359 	    mdclrerror(&error);
1360 	    return (0);
1361 	}
1362 
1363 	for (sr = setrecords; sr != NULL; sr = sr->sr_next) {
1364 	    if (strcmp(setname, sr->sr_setname) == 0) {
1365 		if (sr->sr_flags & MD_SR_AUTO_TAKE)
1366 		    return (1);
1367 		return (0);
1368 	    }
1369 	}
1370 
1371 	return (0);
1372 }
1373 
1374 int
1375 metad_isautotakebynum(set_t setno)
1376 {
1377 	md_error_t	error = mdnullerror;
1378 	md_set_record	*sr;
1379 
1380 	if (md_in_daemon)
1381 	    assert(setsnarfdone != 0);
1382 	else if (set_snarf(&error)) {
1383 	    mdclrerror(&error);
1384 	    return (0);
1385 	}
1386 
1387 	for (sr = setrecords; sr != NULL; sr = sr->sr_next) {
1388 	    if (setno == sr->sr_setno) {
1389 		if (sr->sr_flags & MD_SR_AUTO_TAKE)
1390 		    return (1);
1391 		return (0);
1392 	    }
1393 	}
1394 
1395 	return (0);
1396 }
1397 
1398 md_set_record *
1399 metad_getsetbyname(char *setname, md_error_t *ep)
1400 {
1401 	md_set_record	*sr;
1402 	char		buf[100];
1403 
1404 	assert(setsnarfdone != 0);
1405 
1406 	for (sr = setrecords; sr != NULL; sr = sr->sr_next)
1407 		if (strcmp(setname, sr->sr_setname) == 0)
1408 			return (sr);
1409 
1410 	(void) snprintf(buf, sizeof (buf), "setname \"%s\"", setname);
1411 	(void) mderror(ep, MDE_NO_SET, buf);
1412 	return (NULL);
1413 }
1414 
1415 md_set_record *
1416 metad_getsetbynum(set_t setno, md_error_t *ep)
1417 {
1418 	md_set_record	*sr;
1419 	char		buf[100];
1420 
1421 	if (md_in_daemon)
1422 		assert(setsnarfdone != 0);
1423 	else if (set_snarf(ep))		/* BYPASS DAEMON mode */
1424 		return (NULL);
1425 
1426 	for (sr = setrecords; sr != NULL; sr = sr->sr_next)
1427 		if (setno == sr->sr_setno)
1428 			return (sr);
1429 
1430 	(void) sprintf(buf, "setno %u", setno);
1431 	(void) mderror(ep, MDE_NO_SET, buf);
1432 	return (NULL);
1433 }
1434 
1435 
1436 /*
1437  * Commit the set record and all of its associated records
1438  * (drive records, node records for a MNset) to the local mddb.
1439  */
1440 void
1441 commitset(md_set_record *sr, int inc_genid, md_error_t *ep)
1442 {
1443 	int		drc, nrc, rc;
1444 	int		*recs;
1445 	uint_t		size;
1446 	md_drive_record	*dr;
1447 	mddb_userreq_t	req;
1448 	md_mnset_record	*mnsr;
1449 	md_mnnode_record	*nr;
1450 
1451 	assert(setsnarfdone != 0);
1452 
1453 	/*
1454 	 * Cluster nodename support
1455 	 * Convert nodename -> nodeid
1456 	 * Don't do this for MN disksets since we've already stored
1457 	 * both the nodeid and name.
1458 	 */
1459 	if (!(MD_MNSET_REC(sr)))
1460 		sdssc_cm_sr_nm2nid(sr);
1461 
1462 	/* Send down to kernel the data in mddb USER set record */
1463 	if (inc_genid)
1464 		sr->sr_genid++;
1465 	(void) memset(&req, 0, sizeof (req));
1466 	METAD_SETUP_SR(MD_DB_SETDATA, sr->sr_selfid)
1467 	if (MD_MNSET_REC(sr)) {
1468 		req.ur_size = sizeof (*mnsr);
1469 	} else {
1470 		req.ur_size = sizeof (*sr);
1471 	}
1472 	req.ur_data = (uintptr_t)sr;
1473 	if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) {
1474 		(void) mdstealerror(ep, &req.ur_mde);
1475 		return;
1476 	}
1477 
1478 	/*
1479 	 * Walk through the drive records associated with this set record
1480 	 * and send down to kernel the data in mddb USER drive record.
1481 	 */
1482 	drc = 0;
1483 	dr = sr->sr_drivechain;
1484 	while (dr) {
1485 		if (inc_genid)
1486 			dr->dr_genid++;
1487 		METAD_SETUP_DR(MD_DB_SETDATA, dr->dr_selfid)
1488 		req.ur_size = sizeof (*dr);
1489 		req.ur_data = (uintptr_t)dr;
1490 		if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) {
1491 			(void) mdstealerror(ep, &req.ur_mde);
1492 			return;
1493 		}
1494 		drc++;
1495 		dr = dr->dr_next;
1496 	}
1497 
1498 
1499 	/*
1500 	 * If this set is a multi-node set -
1501 	 * walk through the node records associated with this set record
1502 	 * and send down to kernel the data in mddb USER node record.
1503 	 */
1504 	nrc = 0;
1505 	if (MD_MNSET_REC(sr)) {
1506 		mnsr = (struct md_mnset_record *)sr;
1507 		nr = mnsr->sr_nodechain;
1508 		while (nr) {
1509 			if (inc_genid)
1510 				nr->nr_genid++;
1511 			METAD_SETUP_NR(MD_DB_SETDATA, nr->nr_selfid)
1512 			req.ur_size = sizeof (*nr);
1513 			req.ur_data = (uint64_t)(uintptr_t)nr;
1514 			if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL)
1515 			    != 0) {
1516 				(void) mdstealerror(ep, &req.ur_mde);
1517 				return;
1518 			}
1519 			nrc++;
1520 			nr = nr->nr_next;
1521 		}
1522 	}
1523 
1524 	/*
1525 	 * Set up list of mddb USER recids containing set and drive records
1526 	 * and node records if a MNset.
1527 	 */
1528 	rc = 0;
1529 	size = (nrc + drc + 2) * sizeof (int);
1530 	recs = Zalloc(size);
1531 	/* First recid in list is the set record's id */
1532 	recs[rc] = sr->sr_selfid;
1533 	rc++;
1534 	dr = sr->sr_drivechain;
1535 	while (dr) {
1536 		/* Now, fill in the drive record ids */
1537 		recs[rc] = dr->dr_selfid;
1538 		dr = dr->dr_next;
1539 		rc++;
1540 	}
1541 	if (MD_MNSET_REC(sr)) {
1542 		nr = mnsr->sr_nodechain;
1543 		while (nr) {
1544 			/* If a MNset, fill in the node record ids */
1545 			recs[rc] = nr->nr_selfid;
1546 			nr = nr->nr_next;
1547 			rc++;
1548 		}
1549 	}
1550 	/* Set last record to null recid */
1551 	recs[rc] = 0;
1552 
1553 	/* Write out the set and drive and node records to the local mddb */
1554 	METAD_SETUP_UR(MD_DB_COMMIT_MANY, 0, 0);
1555 	req.ur_size = size;
1556 	req.ur_data = (uintptr_t)recs;
1557 	if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) {
1558 		(void) mdstealerror(ep, &req.ur_mde);
1559 		return;
1560 	}
1561 
1562 	/*
1563 	 * Cluster nodename support
1564 	 * Convert nodeid -> nodename
1565 	 * Don't do this for MN disksets since we've already stored
1566 	 * both the nodeid and name.
1567 	 */
1568 	if (!(MD_MNSET_REC(sr)))
1569 		sdssc_cm_sr_nid2nm(sr);
1570 
1571 	Free(recs);
1572 }
1573 
1574 /*
1575  * This routine only handles returns a md_set_record structure even
1576  * if the set record describes a MN set.  This will allow pre-MN
1577  * SVM RPC code to access a MN set record and to display it.
1578  *
1579  * The MN SVM RPC code detects if the set record returned describes
1580  * a MN set and then will copy it using mnsetdup.
1581  */
1582 md_set_record *
1583 setdup(md_set_record *sr)
1584 {
1585 	md_set_record		*tsr = NULL;
1586 	md_drive_record		**tdrpp = NULL;
1587 
1588 	if (sr && (tsr = Malloc(sizeof (*sr))) != NULL) {
1589 		(void) memmove(tsr, sr, sizeof (*sr));
1590 		tsr->sr_next = NULL;
1591 		tdrpp = &tsr->sr_drivechain;
1592 		while (*tdrpp) {
1593 			*tdrpp = drdup(*tdrpp);
1594 			tdrpp = &(*tdrpp)->dr_next;
1595 		}
1596 	}
1597 	return (tsr);
1598 }
1599 
1600 /*
1601  * This routine only copies MN set records.   If a non-MN set
1602  * record was passed in NULL pointer will be returned.
1603  */
1604 md_mnset_record *
1605 mnsetdup(md_mnset_record *mnsr)
1606 {
1607 	md_mnset_record		*tmnsr = NULL;
1608 	md_drive_record		**tdrpp = NULL;
1609 	md_mnnode_record	**tnrpp = NULL;
1610 
1611 	if (!MD_MNSET_REC(mnsr)) {
1612 		return (NULL);
1613 	}
1614 
1615 	if (mnsr && (tmnsr = Malloc(sizeof (*mnsr))) != NULL) {
1616 		(void) memmove(tmnsr, mnsr, sizeof (*mnsr));
1617 		tmnsr->sr_next = NULL;
1618 		tdrpp = &tmnsr->sr_drivechain;
1619 		while (*tdrpp) {
1620 			*tdrpp = drdup(*tdrpp);
1621 			tdrpp = &(*tdrpp)->dr_next;
1622 		}
1623 		tnrpp = &tmnsr->sr_nodechain;
1624 		while (*tnrpp) {
1625 			*tnrpp = nrdup(*tnrpp);
1626 			tnrpp = &(*tnrpp)->nr_next;
1627 		}
1628 	}
1629 	return (tmnsr);
1630 }
1631 
1632 md_drive_record *
1633 drdup(md_drive_record *dr)
1634 {
1635 	md_drive_record		*tdr = NULL;
1636 
1637 	if (dr && (tdr = Malloc(sizeof (*dr))) != NULL)
1638 		(void) memmove(tdr, dr, sizeof (*dr));
1639 	return (tdr);
1640 }
1641 
1642 md_mnnode_record *
1643 nrdup(md_mnnode_record *nr)
1644 {
1645 	md_mnnode_record	*tnr = NULL;
1646 
1647 	if (nr && (tnr = Malloc(sizeof (*nr))) != NULL)
1648 		(void) memmove(tnr, nr, sizeof (*nr));
1649 	return (tnr);
1650 }
1651 
1652 /*
1653  * Duplicate parts of the drive decriptor list for this node.
1654  * Only duplicate the drive name string in the mddrivename structure, don't
1655  * need to copy any other pointers since only interested in the flags and
1656  * the drive name (i.e. other pointers will be set to NULL).
1657  *	Returns NULL if failure due to Malloc failure.
1658  *	Returns pointer (non-NULL) to dup'd list if successful.
1659  */
1660 md_drive_desc *
1661 dd_list_dup(md_drive_desc *dd)
1662 {
1663 	md_drive_desc	*orig_dd;
1664 	md_drive_desc	*copy_dd = NULL, *copy_dd_prev = NULL;
1665 	md_drive_desc	*copy_dd_head = NULL;
1666 	mddrivename_t	*copy_dnp;
1667 	char		*copy_cname;
1668 	char		*copy_devid;
1669 
1670 	if (dd == NULL)
1671 		return (NULL);
1672 
1673 	orig_dd = dd;
1674 
1675 	while (orig_dd) {
1676 		copy_dd = Zalloc(sizeof (*copy_dd));
1677 		copy_dnp = Zalloc(sizeof (mddrivename_t));
1678 		copy_cname = Zalloc(sizeof (orig_dd->dd_dnp->cname));
1679 		if (orig_dd->dd_dnp->devid) {
1680 			copy_devid = Zalloc(sizeof (orig_dd->dd_dnp->devid));
1681 		} else {
1682 			copy_devid = NULL;
1683 		}
1684 		copy_dd->dd_next = NULL;
1685 		if ((copy_dd == NULL) || (copy_dnp == NULL) ||
1686 		    (copy_cname == NULL)) {
1687 			while (copy_dd_head) {
1688 				copy_dd = copy_dd_head->dd_next;
1689 				Free(copy_dd_head);
1690 				copy_dd_head = copy_dd;
1691 			}
1692 			if (copy_dnp)
1693 				Free(copy_dnp);
1694 			if (copy_dd)
1695 				Free(copy_dd);
1696 			if (copy_cname)
1697 				Free(copy_cname);
1698 			if (copy_devid)
1699 				Free(copy_devid);
1700 			return (NULL);
1701 		}
1702 		(void) memmove(copy_dd, orig_dd, sizeof (*orig_dd));
1703 		(void) strlcpy(copy_cname, orig_dd->dd_dnp->cname,
1704 		    sizeof (orig_dd->dd_dnp->cname));
1705 		copy_dd->dd_next = NULL;
1706 		copy_dd->dd_dnp = copy_dnp;
1707 		copy_dd->dd_dnp->cname = copy_cname;
1708 		if (copy_devid) {
1709 			(void) strlcpy(copy_devid, orig_dd->dd_dnp->devid,
1710 			    sizeof (orig_dd->dd_dnp->devid));
1711 		}
1712 
1713 		if (copy_dd_prev == NULL) {
1714 			copy_dd_head = copy_dd;
1715 			copy_dd_prev = copy_dd;
1716 		} else {
1717 			copy_dd_prev->dd_next = copy_dd;
1718 			copy_dd_prev = copy_dd;
1719 		}
1720 		orig_dd = orig_dd->dd_next;
1721 	}
1722 	copy_dd->dd_next = NULL;
1723 	return (copy_dd_head);
1724 }
1725 
1726 void
1727 sr_cache_flush(int flushnames)
1728 {
1729 	md_set_record	*sr, *tsr;
1730 	md_mnset_record	*mnsr;
1731 	md_drive_record *dr, *tdr;
1732 	md_mnnode_record *nr, *tnr;
1733 
1734 	sr = tsr = setrecords;
1735 	while (sr != NULL) {
1736 		dr = tdr = sr->sr_drivechain;
1737 		while (dr != NULL) {
1738 			tdr = dr;
1739 			dr = dr->dr_next;
1740 			Free(tdr);
1741 		}
1742 		tsr = sr;
1743 		sr = sr->sr_next;
1744 		if (MD_MNSET_REC(tsr)) {
1745 			mnsr = (struct md_mnset_record *)tsr;
1746 			nr = tnr = mnsr->sr_nodechain;
1747 			while (nr != NULL) {
1748 				tnr = nr;
1749 				nr = nr->nr_next;
1750 				Free(tnr);
1751 			}
1752 			Free(mnsr);
1753 		} else {
1754 			Free(tsr);
1755 		}
1756 	}
1757 
1758 	setrecords = NULL;
1759 
1760 	setsnarfdone = 0;
1761 
1762 	/* This will cause the other caches to be cleared */
1763 	if (flushnames)
1764 		metaflushnames(0);
1765 }
1766 
1767 void
1768 sr_cache_flush_setno(set_t setno)
1769 {
1770 	md_set_record	*sr, *tsr;
1771 	md_mnset_record	*mnsr;
1772 	md_drive_record *dr, *tdr;
1773 
1774 	assert(setsnarfdone != 0);
1775 
1776 	for (sr = tsr = setrecords; sr; tsr = sr, sr = sr->sr_next) {
1777 		if (sr->sr_setno != setno)
1778 			continue;
1779 
1780 		dr = tdr = sr->sr_drivechain;
1781 		while (dr != NULL) {
1782 			tdr = dr;
1783 			dr = dr->dr_next;
1784 			Free(tdr);
1785 		}
1786 		if (sr == setrecords)
1787 			setrecords = sr->sr_next;
1788 		else
1789 			tsr->sr_next = sr->sr_next;
1790 		if (MD_MNSET_REC(sr)) {
1791 			mnsr = (struct md_mnset_record *)sr;
1792 			Free(mnsr);
1793 		} else {
1794 			Free(sr);
1795 		}
1796 		break;
1797 	}
1798 
1799 	setsnarfdone = 0;
1800 
1801 	/* This will cause the other caches to be cleared */
1802 	metaflushnames(0);
1803 }
1804 
1805 int
1806 s_ownset(set_t setno, md_error_t *ep)
1807 {
1808 	mddb_ownset_t		ownset_arg;
1809 
1810 	ownset_arg.setno = setno;
1811 	ownset_arg.owns_set = MD_SETOWNER_NONE;
1812 
1813 	if (metaioctl(MD_DB_OWNSET, &ownset_arg, ep, NULL) != 0)
1814 		return (0);
1815 
1816 	return (ownset_arg.owns_set);
1817 }
1818 
/*
 * Delete the set named setname from this host.
 *
 * In order, this routine:
 *   - looks the set record up with getsetbyname() (returns with ep set
 *     if that fails);
 *   - for a MN set owned by this node, halts the set (errors ignored);
 *   - unlinks /dev/md/<setname> and the set's lock file;
 *   - issues MD_DB_DELETE for every node record (MN sets), every drive
 *     record and finally the set record, posting SE_NOTIFY events and
 *     dropping the matching cache entries as it goes;
 *   - calls meta_smf_disable() for the diskset services when no other
 *     (MN) set records remain in the cache.
 *
 * On any failing MD_DB_DELETE request the error is stolen into ep, the
 * set record obtained above is released with free_sr() and the routine
 * returns, leaving the remaining records in place.
 */
void
s_delset(char *setname, md_error_t *ep)
{
	md_set_record		*sr;
	md_set_record		*tsr;
	md_drive_record		*dr;
	md_drive_record		*tdr;
	md_mnnode_record	*nr, *tnr;
	mddb_userreq_t		req;
	char			stringbuf[100];
	int			i;
	mdsetname_t		*sp = NULL;
	mddrivename_t		*dn = NULL;
	mdname_t		*np = NULL;
	md_dev64_t		dev;
	side_t			myside = MD_SIDEWILD;
	md_error_t		xep = mdnullerror;
	md_mnset_record		*mnsr;
	int			num_sets = 0;
	int			num_mn_sets = 0;

	(void) memset(&req, 0, sizeof (mddb_userreq_t));

	if ((sr = getsetbyname(setname, ep)) == NULL)
		return;

	/* sp may stay NULL; it is checked before use further down. */
	sp = metasetnosetname(sr->sr_setno, &xep);
	mdclrerror(&xep);

	if (MD_MNSET_REC(sr)) {
		/*
		 * If this node is a set owner, halt the set before
		 * deleting the set records.  Ignore any errors since
		 * s_ownset and halt_set could fail if panic had occurred
		 * during the add/delete of a node.
		 */
		if (s_ownset(sr->sr_setno, &xep)) {
			mdclrerror(&xep);
			if (halt_set(sp, &xep))
				mdclrerror(&xep);
		}
	}

	/* Remove the set's /dev/md entry and lock file; failures ignored. */
	(void) snprintf(stringbuf, sizeof (stringbuf), "/dev/md/%s", setname);
	(void) unlink(stringbuf);
	(void) unlink(meta_lock_name(sr->sr_setno));

	if (MD_MNSET_REC(sr)) {
		/* MN set: delete each node record from the mddb. */
		mnsr = (struct md_mnset_record *)sr;
		nr = mnsr->sr_nodechain;
		while (nr) {
			/* Setting myside for later use */
			if (strcmp(mynode(), nr->nr_nodename) == 0)
				myside = nr->nr_nodeid;

			(void) memset(&req, 0, sizeof (req));
			METAD_SETUP_NR(MD_DB_DELETE, nr->nr_selfid)
			if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde,
			    NULL) != 0) {
				(void) mdstealerror(ep, &req.ur_mde);
				free_sr(sr);
				return;
			}
			/* Advance first: mnnr_cache_del() frees tnr below. */
			tnr = nr;
			nr = nr->nr_next;

			SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE, SVM_TAG_HOST,
			    sr->sr_setno, tnr->nr_nodeid);

			mnnr_cache_del((struct md_mnset_record *)sr,
			    tnr->nr_selfid);
		}
	} else {
		/* Traditional set: hosts live in the sr_nodes[] slots. */
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sr->sr_nodes[i][0] == '\0')
				continue;

			if (strcmp(mynode(), sr->sr_nodes[i]) == 0)
				myside = i;

			SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE, SVM_TAG_HOST,
			    sr->sr_setno, i);
		}
	}

	/* Delete each drive record from the mddb. */
	dr = sr->sr_drivechain;
	while (dr) {
		(void) memset(&req, 0, sizeof (req));
		METAD_SETUP_DR(MD_DB_DELETE, dr->dr_selfid)
		if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) {
			(void) mdstealerror(ep, &req.ur_mde);
			free_sr(sr);
			return;
		}
		/* Advance first: dr_cache_del() frees tdr below. */
		tdr = dr;
		dr = dr->dr_next;

		/*
		 * Work out the device to report in the events; it stays
		 * NODEV64 when this host's side or the drive's replica
		 * slice cannot be determined.
		 */
		dev = NODEV64;
		if (myside != MD_SIDEWILD && sp != NULL) {
			dn = metadrivename_withdrkey(sp, myside,
			    tdr->dr_key, MD_BASICNAME_OK, &xep);
			if (dn != NULL) {
				uint_t	rep_slice;

				np = NULL;
				if (meta_replicaslice(dn, &rep_slice,
				    &xep) == 0) {
					np = metaslicename(dn, rep_slice, &xep);
				}

				if (np != NULL)
					dev = np->dev;
				else
					mdclrerror(&xep);
			} else
				mdclrerror(&xep);
		} else
			mdclrerror(&xep);

		/* Drive leaves this set and is reported for the local set. */
		SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE, SVM_TAG_DRIVE,
		    sr->sr_setno, dev);
		SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_ADD, SVM_TAG_DRIVE,
		    MD_LOCAL_SET, dev);

		dr_cache_del(sr, tdr->dr_selfid);

	}

	/* Finally delete the set record itself from the mddb. */
	(void) memset(&req, 0, sizeof (req));
	METAD_SETUP_SR(MD_DB_DELETE, sr->sr_selfid)
	if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) {
		(void) mdstealerror(ep, &req.ur_mde);
		free_sr(sr);
		return;
	}

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_SET, sr->sr_setno,
	    NODEV64);

	/* Count the cached set records other than sr. */
	for (tsr = setrecords; tsr; tsr = tsr->sr_next) {
		if (tsr == sr)
			continue;

		num_sets++;
		if (MD_MNSET_REC(tsr))
			num_mn_sets++;
	}

	/* No MN sets remain */
	if (num_mn_sets == 0)
		(void) meta_smf_disable(META_SMF_MN_DISKSET, NULL);

	/* The set we just deleted is the only one left */
	if (num_sets == 0)
		(void) meta_smf_disable(META_SMF_DISKSET, NULL);

	sr_cache_del(sr->sr_selfid);
	free_sr(sr);

}
1979 
1980 void
1981 s_delrec(mddb_recid_t recid, md_error_t *ep)
1982 {
1983 	mddb_userreq_t		req;
1984 
1985 	(void) memset(&req, 0, sizeof (req));
1986 
1987 	METAD_SETUP_SR(MD_DB_DELETE, recid)
1988 
1989 	if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0)
1990 		(void) mdstealerror(ep, &req.ur_mde);
1991 }
1992 
1993 /*
1994  * resnarf the imported set
1995  */
1996 int
1997 resnarf_set(
1998 	set_t			setno,
1999 	md_error_t		*ep
2000 )
2001 {
2002 	md_set_record	*sr;
2003 	md_drive_record	*dr;
2004 	mddb_recid_t	id, *p;
2005 
2006 	if (meta_setup_db_locations(ep) != 0) {
2007 		if (! mdismddberror(ep, MDE_DB_STALE))
2008 			return (-1);
2009 		mdclrerror(ep);
2010 	}
2011 
2012 	setsnarfdone = 1;
2013 
2014 	id = 0;
2015 	while ((sr = get_ur_rec(MD_LOCAL_SET, MD_UR_GET_NEXT, MDDB_UR_SR, &id,
2016 	    ep)) != NULL) {
2017 
2018 		if (sr->sr_setno != setno)
2019 			continue;
2020 
2021 		/* Don't allow resnarf of a multi-node diskset */
2022 		if (MD_MNSET_REC(sr))
2023 			goto out;
2024 
2025 		sr->sr_next = NULL;
2026 		sr->sr_drivechain = NULL;
2027 
2028 		if (md_in_daemon)
2029 			url_addl(&url_used, sr->sr_selfid);
2030 
2031 		sr->sr_flags |= MD_SR_CHECK;
2032 
2033 		sr_cache_add(sr);
2034 
2035 		if (sr->sr_driverec == 0)
2036 			break;
2037 
2038 		p = &sr->sr_driverec;
2039 		while ((dr = get_ur_rec(MD_LOCAL_SET, MD_UR_GET_WKEY,
2040 		    MDDB_UR_DR, p, ep)) != NULL) {
2041 			dr->dr_next = NULL;
2042 
2043 			if (md_in_daemon)
2044 				url_addl(&url_used, dr->dr_selfid);
2045 
2046 			dr_cache_add(sr, dr);
2047 
2048 			if (dr->dr_nextrec == 0)
2049 				break;
2050 
2051 			p = &dr->dr_nextrec;
2052 		}
2053 		if (! mdisok(ep)) {
2054 			if (! mdissyserror(ep, ENOENT))
2055 				goto out;
2056 			mdclrerror(ep);
2057 			commitset(sr, FALSE, ep);
2058 			if (! mdisok(ep))
2059 				goto out;
2060 		}
2061 	}
2062 	if (! mdisok(ep)) {
2063 		if (! mdissyserror(ep, ENOENT))
2064 			goto out;
2065 		mdclrerror(ep);
2066 	}
2067 
2068 	setsnarfdone = 2;
2069 
2070 	url_freel(&url_used);
2071 	url_freel(&url_tode);
2072 	return (0);
2073 
2074 out:
2075 	url_freel(&url_used);
2076 	url_freel(&url_tode);
2077 
2078 	sr_cache_flush(1);
2079 
2080 	setsnarfdone = 0;
2081 
2082 	return (-1);
2083 }
2084