xref: /onnv-gate/usr/src/uts/common/io/lvm/md/md_mddb.c (revision 86:fa9ea5e7dbe5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/conf.h>
31 #include <sys/time.h>
32 #include <sys/uio.h>
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/systeminfo.h>
36 #include <sys/sysmacros.h>
37 #include <sys/buf.h>
38 #include <sys/kmem.h>
39 #include <sys/file.h>
40 #include <sys/open.h>
41 #include <sys/debug.h>
42 #include <sys/stat.h>
43 #include <sys/lvm/mdvar.h>
44 #include <sys/lvm/md_crc.h>
45 #include <sys/lvm/md_convert.h>
46 #include <sys/types.h>
47 #include <sys/kmem.h>
48 #include <sys/lvm/mdmn_commd.h>
49 #include <sys/cladm.h>
50 
51 mhd_mhiargs_t	defmhiargs = {
52 	1000,
53 	{ 6000, 6000, 30000 }
54 };
55 
56 #define	MDDB
57 
58 #include <sys/lvm/mdvar.h>
59 #include <sys/lvm/mdmed.h>
60 #include <sys/lvm/md_names.h>
61 #include <sys/cred.h>
62 #include <sys/ddi.h>
63 #include <sys/sunddi.h>
64 #include <sys/esunddi.h>
65 
66 #include <sys/sysevent/eventdefs.h>
67 #include <sys/sysevent/svm.h>
68 
69 extern char svm_bootpath[];
70 
71 int			md_maxbootlist = MAXBOOTLIST;
72 static ulong_t		mddb_maxblocks = 0;	/* tune for small records */
73 static int		mddb_maxbufheaders = 50;
74 static uint_t		mddb_maxcopies = MDDB_NLB;
75 
76 /*
77  * If this is set, more detailed messages about DB init will be given, instead
78  * of just the MDE_DB_NODB.
79  */
80 static int		mddb_db_err_detail = 0;
81 
82 /*
83  * This lock is used to single-thread load/unload of all sets
84  */
85 static kmutex_t		mddb_lock;
86 
87 /*
88  * You really do NOT want to change this boolean.
89  * It can be VERY dangerous to do so.  Loss of
90  * data may occur. USE AT YOUR OWN RISK!!!!
91  */
92 static int		mddb_allow_half = 0;
93 /*
94  * For mirrored root allow reboot with only half the replicas available
95  * Flag inserted for Santa Fe project.
96  */
97 int mirrored_root_flag;
98 
99 #define	ISWHITE(c)	(((c) == ' ') || ((c) == '\t') || \
100 			    ((c) == '\r') || ((c) == '\n'))
101 #define	ISNUM(c)	(((c) >= '0') && ((c) <= '9'))
102 
103 #define	SETMUTEX(setno)	(&md_set[setno].s_dbmx)
104 
105 extern md_krwlock_t	md_unit_array_rw;	/* md.c */
106 extern set_t		md_nsets;		/* md.c */
107 extern int		md_nmedh;		/* md.c */
108 extern md_set_t		md_set[];		/* md.c */
109 extern int		(*mdv_strategy_tstpnt)(buf_t *, int, void*);
110 extern dev_info_t	*md_devinfo;
111 extern int		md_init_debug;
112 extern int		md_status;
113 extern md_ops_t		*md_opslist;
114 extern md_krwlock_t	nm_lock;
115 
116 static int 		update_locatorblock(mddb_set_t *s, md_dev64_t dev,
117 				ddi_devid_t didptr);
118 
119 /*
120  * Defines for crc calculation for records
121  * rec_crcgen generates a crc checksum for a record block
122  * rec_crcchk checks the crc checksum for a record block
123  */
124 #define	REC_CRCGEN	0
125 #define	REC_CRCCHK	1
126 #define	rec_crcgen(s, dep, rbp) \
127 	(void) rec_crcfunc(s, dep, rbp, REC_CRCGEN)
128 #define	rec_crcchk(s, dep, rbp) \
129 	rec_crcfunc(s, dep, rbp, REC_CRCCHK)
130 
131 /*
132  * During upgrade, SVM basically runs with the devt from the target
133  * being upgraded.  Translations are made from the target devt to the
134  * miniroot devt when writing data out to the disk.  This is done by
135  * the following routines:
136  *	wrtblklst
137  *	writeblks
138  *	readblklst
139  *	readblks
140  *	dt_read
141  *
142  * The following routines are used by the routines listed above and
143  * expect a translated (aka miniroot) devt:
144  *	getblks
145  * 	getmasters
146  *
147  * Also, when calling any system routines, such as ddi_lyr_get_devid,
148  * the translated (aka miniroot) devt must be used.
149  *
150  * By the same token, the major number and major name conversion operations
151  * need to use the name_to_major file from the target system instead
152  * of the name_to_major file on the miniroot.  So, calls to
153  * ddi_name_to_major must be replaced with calls to md_targ_name_to_major
154  * when running on an upgrade.  Same is true with calls to
155  * ddi_major_to_name.
156  */
157 
158 
159 #ifndef MDDB_FAKE
160 
161 static int
162 mddb_rwdata(
163 	mddb_set_t	*s,	/* incore db set structure */
164 	int		flag,	/* B_ASYNC or 0 passed in here */
165 	buf_t		*bp
166 )
167 {
168 	int		err = 0;
169 
170 	bp->b_flags = (flag | B_BUSY) & (~B_ASYNC);
171 
172 	mutex_exit(SETMUTEX(s->s_setno));
173 	if (mdv_strategy_tstpnt == NULL ||
174 	    (*mdv_strategy_tstpnt)(bp, 0, NULL) == 0)
175 		(void) bdev_strategy(bp);
176 
177 	if (flag & B_ASYNC) {
178 		mutex_enter(SETMUTEX(s->s_setno));
179 		return (0);
180 	}
181 
182 	err = biowait(bp);
183 	mutex_enter(SETMUTEX(s->s_setno));
184 	return (err);
185 }
186 
187 static void
188 setidentifier(
189 	mddb_set_t	*s,
190 	identifier_t	*ident
191 )
192 {
193 	if (s->s_setno == MD_LOCAL_SET)
194 		(void) strcpy(&ident->serial[0], s->s_ident.serial);
195 	else
196 		ident->createtime = s->s_ident.createtime;
197 }
198 
199 static int
200 cmpidentifier(
201 	mddb_set_t	*s,
202 	identifier_t	*ident
203 )
204 {
205 	if (s->s_setno == MD_LOCAL_SET)
206 		return (strcmp(ident->serial, s->s_ident.serial));
207 	else
208 		return (timercmp(&ident->createtime,
209 		    /*CSTYLED*/
210 		    &s->s_ident.createtime, !=));
211 }
212 
213 static int
214 mddb_devopen(
215 	md_dev64_t	dev
216 )
217 {
218 	dev_t		ddi_dev = md_dev64_to_dev(dev);
219 
220 	if (dev_lopen(&ddi_dev, FREAD|FWRITE, OTYP_LYR, kcred) == 0)
221 		return (0);
222 	return (1);
223 }
224 
225 static void
226 mddb_devclose(
227 	md_dev64_t	dev
228 )
229 {
230 	(void) dev_lclose(md_dev64_to_dev(dev), FREAD|FWRITE, OTYP_LYR, kcred);
231 }
232 
233 /*
234  * stripe_skip_ts
235  *
236  * Returns a list of fields to be skipped in the stripe record structure.
237  * These fields are ms_timestamp in the component structure.
238  * Used to skip these fields when calculating the checksum.
239  */
240 static crc_skip_t *
241 stripe_skip_ts(void *un, uint_t revision)
242 {
243 	struct ms_row32_od	*small_mdr;
244 	struct ms_row		*big_mdr;
245 	uint_t			row, comp, ncomps, compoff;
246 	crc_skip_t		*skip;
247 	crc_skip_t		*skip_prev;
248 	crc_skip_t		skip_start = {0, 0, 0};
249 	ms_unit_t		*big_un;
250 	ms_unit32_od_t		*small_un;
251 	uint_t			rb_off = offsetof(mddb_rb32_t, rb_data[0]);
252 
253 	if (revision == MDDB_REV_RB) {
254 		small_un = (ms_unit32_od_t *)un;
255 		skip_prev = &skip_start;
256 
257 		if (small_un->un_nrows == 0)
258 			return (NULL);
259 		/*
260 		 * walk through all rows to find the total number
261 		 * of components
262 		 */
263 		small_mdr   = &small_un->un_row[0];
264 		ncomps = 0;
265 		for (row = 0; (row < small_un->un_nrows); row++) {
266 			ncomps += small_mdr[row].un_ncomp;
267 		}
268 
269 		/* Now walk through the components */
270 		compoff = small_un->un_ocomp + rb_off;
271 		for (comp = 0; (comp < ncomps); ++comp) {
272 			uint_t	mdcp = compoff +
273 			    (comp * sizeof (ms_comp32_od_t));
274 			skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
275 			    KM_SLEEP);
276 			skip->skip_offset = mdcp +
277 			    offsetof(ms_comp32_od_t, un_mirror.ms_timestamp);
278 			skip->skip_size = sizeof (md_timeval32_t);
279 			skip_prev->skip_next = skip;
280 			skip_prev = skip;
281 		}
282 	} else {
283 		big_un = (ms_unit_t *)un;
284 		skip_prev = &skip_start;
285 
286 		if (big_un->un_nrows == 0)
287 			return (NULL);
288 		/*
289 		 * walk through all rows to find the total number
290 		 * of components
291 		 */
292 		big_mdr   = &big_un->un_row[0];
293 		ncomps = 0;
294 		for (row = 0; (row < big_un->un_nrows); row++) {
295 			ncomps += big_mdr[row].un_ncomp;
296 		}
297 
298 		/* Now walk through the components */
299 		compoff = big_un->un_ocomp + rb_off;
300 		for (comp = 0; (comp < ncomps); ++comp) {
301 			uint_t	mdcp = compoff +
302 			    (comp * sizeof (ms_comp_t));
303 			skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
304 			    KM_SLEEP);
305 			skip->skip_offset = mdcp +
306 			    offsetof(ms_comp_t, un_mirror.ms_timestamp);
307 			skip->skip_size = sizeof (md_timeval32_t);
308 			skip_prev->skip_next = skip;
309 			skip_prev = skip;
310 		}
311 	}
312 	/* Return the start of the list of fields to skip */
313 	return (skip_start.skip_next);
314 }
315 
316 /*
317  * mirror_skip_ts
318  *
319  * Returns a list of fields to be skipped in the mirror record structure.
320  * This includes un_last_read and sm_timestamp for each submirror
321  * Used to skip these fields when calculating the checksum.
322  */
323 static crc_skip_t *
324 mirror_skip_ts(uint_t revision)
325 {
326 	int		i;
327 	crc_skip_t	*skip;
328 	crc_skip_t	*skip_prev;
329 	crc_skip_t	skip_start = {0, 0, 0};
330 	uint_t		rb_off = offsetof(mddb_rb32_t, rb_data[0]);
331 
332 	skip_prev = &skip_start;
333 
334 	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
335 	if (revision == MDDB_REV_RB) {
336 		skip->skip_offset = offsetof(mm_unit32_od_t,
337 		    un_last_read) + rb_off;
338 	} else {
339 		skip->skip_offset = offsetof(mm_unit_t,
340 		    un_last_read) + rb_off;
341 	}
342 	skip->skip_size = sizeof (int);
343 	skip_prev->skip_next = skip;
344 	skip_prev = skip;
345 
346 	for (i = 0; i < NMIRROR; i++) {
347 		skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
348 		if (revision == MDDB_REV_RB) {
349 			skip->skip_offset = offsetof(mm_unit32_od_t,
350 			    un_sm[i].sm_timestamp) + rb_off;
351 		} else {
352 			skip->skip_offset = offsetof(mm_unit_t,
353 			    un_sm[i].sm_timestamp) + rb_off;
354 		}
355 		skip->skip_size = sizeof (md_timeval32_t);
356 		skip_prev->skip_next = skip;
357 		skip_prev = skip;
358 	}
359 	/* Return the start of the list of fields to skip */
360 	return (skip_start.skip_next);
361 }
362 
363 /*
364  * hotspare_skip_ts
365  *
366  * Returns a list of the timestamp fields in the hotspare record structure.
367  * Used to skip these fields when calculating the checksum.
368  */
369 static crc_skip_t *
370 hotspare_skip_ts(uint_t revision)
371 {
372 	crc_skip_t	*skip;
373 	uint_t		rb_off = offsetof(mddb_rb32_t, rb_data[0]);
374 
375 	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
376 	if (revision == MDDB_REV_RB) {
377 		skip->skip_offset = offsetof(hot_spare32_od_t, hs_timestamp) +
378 		    rb_off;
379 	} else {
380 		skip->skip_offset = offsetof(hot_spare_t, hs_timestamp) +
381 		    rb_off;
382 	}
383 	skip->skip_size = sizeof (md_timeval32_t);
384 	return (skip);
385 }
386 
387 /*
388  * rec_crcfunc
389  *
390  * Calculate or check the checksum for a record
391  * Calculate the crc if check == 0, Check the crc if check == 1
392  *
393  * Record block may be written by different nodes in a multi-owner diskset
394  * (in case of master change), the function rec_crcchk excludes timestamp
395  * fields in crc computation of record data.
396  * Otherwise, timestamp fields will cause each node to have a different
397  * checksum for same record block causing the exclusive-or of all record block
398  * checksums and data block record sums to be non-zero after new master writes
399  * at least one record block.
400  */
401 static uint_t
402 rec_crcfunc(
403 	mddb_set_t	*s,
404 	mddb_de_ic_t	*dep,
405 	mddb_rb32_t	*rbp,
406 	int		check
407 )
408 {
409 	crc_skip_t	*skip;
410 	crc_skip_t	*skip_tail;
411 	mddb_type_t	type = dep->de_type1;
412 	uint_t		ret;
413 
414 	/*
415 	 * Generate a list of the areas to be skipped when calculating
416 	 * the checksum.
417 	 * First skip rb_checksum, rb_private and rb_userdata.
418 	 */
419 	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
420 	skip->skip_offset = offsetof(mddb_rb32_t, rb_checksum_fiddle);
421 	skip->skip_size = 3 * sizeof (uint_t);
422 	skip_tail = skip;
423 	if (MD_MNSET_SETNO(s->s_setno)) {
424 		/* For a MN set, skip rb_timestamp */
425 		skip_tail = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
426 		    KM_SLEEP);
427 		skip_tail->skip_offset = offsetof(mddb_rb32_t, rb_timestamp);
428 		skip_tail->skip_size = sizeof (md_timeval32_t);
429 		skip->skip_next = skip_tail;
430 
431 		/* Now add a list of timestamps to be skipped */
432 		if (type >= MDDB_FIRST_MODID) {
433 			switch (dep->de_flags) {
434 				case MDDB_F_STRIPE:
435 					skip_tail->skip_next =
436 					    stripe_skip_ts((void *)rbp->rb_data,
437 					    rbp->rb_revision);
438 					break;
439 				case MDDB_F_MIRROR:
440 					skip_tail->skip_next =
441 					    mirror_skip_ts(rbp->rb_revision);
442 					break;
443 				case MDDB_F_HOTSPARE:
444 					skip_tail->skip_next =
445 					    hotspare_skip_ts(rbp->rb_revision);
446 					break;
447 				default:
448 					break;
449 			}
450 		}
451 	}
452 
453 	if (check) {
454 		ret = crcchk(rbp, &rbp->rb_checksum, dep->de_recsize, skip);
455 	} else {
456 		crcgen(rbp, &rbp->rb_checksum, dep->de_recsize, skip);
457 		ret = rbp->rb_checksum;
458 	}
459 	while (skip) {
460 		crc_skip_t	*skip_save = skip;
461 
462 		skip = skip->skip_next;
463 		kmem_free(skip_save, sizeof (crc_skip_t));
464 	}
465 	return (ret);
466 }
467 
468 static mddb_bf_t *
469 allocbuffer(
470 	mddb_set_t	*s,
471 	int		sleepflag
472 )
473 {
474 	mddb_bf_t	*bfp;
475 
476 	while ((bfp = s->s_freebufhead) == NULL) {
477 		if (sleepflag == MDDB_NOSLEEP)
478 			return ((mddb_bf_t *)NULL);
479 		++s->s_bufmisses;
480 #ifdef	DEBUG
481 		if (s->s_bufmisses == 1)
482 			cmn_err(CE_NOTE,
483 			    "md: mddb: set %u sleeping for buffer", s->s_setno);
484 #endif
485 		s->s_bufwakeup = 1;
486 		cv_wait(&s->s_buf_cv, SETMUTEX(s->s_setno));
487 	}
488 	s->s_freebufhead = bfp->bf_next;
489 	bzero((caddr_t)bfp, sizeof (*bfp));
490 	bfp->bf_buf.b_back = bfp->bf_buf.b_forw = &bfp->bf_buf;
491 	bfp->bf_buf.b_flags = B_BUSY;	/* initialize flags */
492 	return (bfp);
493 }
494 
495 static void
496 freebuffer(
497 	mddb_set_t		*s,
498 	mddb_bf_t	*bfp
499 )
500 {
501 	bfp->bf_next = s->s_freebufhead;
502 	s->s_freebufhead = bfp;
503 	if (s->s_bufwakeup) {
504 		cv_broadcast(&s->s_buf_cv);
505 		s->s_bufwakeup = 0;
506 	}
507 }
508 
509 int
510 revchk(
511 	uint_t	mine,
512 	uint_t	data
513 )
514 {
515 	if ((MDDB_REV_MAJOR & mine) != (MDDB_REV_MAJOR & data))
516 		return (1);
517 	if ((MDDB_REV_MINOR & mine) < (MDDB_REV_MINOR & data))
518 		return (1);
519 	return (0);
520 }
521 
522 static void
523 blkbusy(
524 	mddb_set_t	*s,
525 	mddb_block_t	blk
526 )
527 {
528 	int		bit, byte;
529 
530 	s->s_freeblkcnt--;
531 	byte = blk / 8;
532 	bit = 1 << (blk & 7);
533 	ASSERT(! (s->s_freebitmap[byte] & bit));
534 	s->s_freebitmap[byte] |= bit;
535 }
536 
537 static void
538 blkfree(
539 	mddb_set_t	*s,
540 	mddb_block_t	blk
541 )
542 {
543 	int		bit, byte;
544 
545 	s->s_freeblkcnt++;
546 	byte = blk / 8;
547 	bit = 1 << (blk & 7);
548 	ASSERT(s->s_freebitmap[byte] & bit);
549 	s->s_freebitmap[byte] &= ~bit;
550 }
551 
552 static int
553 blkcheck(
554 	mddb_set_t	*s,
555 	mddb_block_t	blk
556 )
557 {
558 	int		bit, byte;
559 
560 	byte = blk / 8;
561 	bit = 1 << (blk & 7);
562 	return (s->s_freebitmap[byte] & bit);
563 }
564 
565 /*
566  * not fast but simple
567  */
568 static mddb_block_t
569 getfreeblks(
570 	mddb_set_t	*s,
571 	size_t		count
572 )
573 {
574 	int		i;
575 	size_t		contig;
576 
577 	contig = 0;
578 	for (i = 0; i < s->s_totalblkcnt; i++) {
579 		if (blkcheck(s, i)) {
580 			contig = 0;
581 		} else {
582 			contig++;
583 			if (contig == count) {
584 				contig = i - count + 1;
585 				for (i = (int)contig; i < contig + count; i++)
586 					blkbusy(s, i);
587 				return ((mddb_block_t)contig);
588 			}
589 		}
590 	}
591 	return (0);
592 }
593 
594 static void
595 computefreeblks(
596 	mddb_set_t	*s
597 )
598 {
599 	mddb_db_t	*dbp;
600 	mddb_de_ic_t	*dep;
601 	int		i;
602 	int		minblks;
603 	int		freeblks;
604 	mddb_mb_ic_t	*mbip;
605 	mddb_lb_t	*lbp;
606 	mddb_block_t	maxblk;
607 	mddb_did_db_t	*did_dbp;
608 	int		nblks;
609 
610 	minblks = 0;
611 	lbp = s->s_lbp;
612 	maxblk = 0;
613 
614 	/*
615 	 * Determine the max number of blocks.
616 	 */
617 	nblks = (lbp->lb_flags & MDDB_MNSET) ? MDDB_MN_MAXBLKS : MDDB_MAXBLKS;
618 	/*
619 	 * go through and find highest logical block
620 	 */
621 	for (dbp = s->s_dbp; dbp != 0;	dbp = dbp->db_next) {
622 		if (dbp->db_blknum > maxblk)
623 			maxblk = dbp->db_blknum;
624 		for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
625 			for (i = 0; i < dep->de_blkcount; i++)
626 				if (dep->de_blks[i] > maxblk)
627 					maxblk = dep->de_blks[i];
628 	}
629 
630 	for (i = 0; i < lbp->lb_loccnt; i++) {
631 		mddb_locator_t	*lp = &lbp->lb_locators[i];
632 
633 		if ((lp->l_flags & MDDB_F_DELETED) ||
634 		    (lp->l_flags & MDDB_F_EMASTER))
635 			continue;
636 
637 		freeblks = 0;
638 		for (mbip = s->s_mbiarray[i]; mbip != NULL;
639 					mbip = mbip->mbi_next) {
640 			freeblks += mbip->mbi_mddb_mb.mb_blkcnt;
641 		}
642 		if (freeblks == 0)	/* this happen when there is no */
643 			continue;	/*	master blk		*/
644 
645 		if (freeblks <= maxblk) {
646 			lp->l_flags |= MDDB_F_TOOSMALL;
647 			lp->l_flags &= ~MDDB_F_ACTIVE;
648 		}
649 
650 		if (freeblks < minblks || minblks == 0)
651 			minblks = freeblks;
652 	}
653 	/*
654 	 * set up reasonable freespace if no
655 	 * data bases exist
656 	 */
657 	if (minblks == 0)
658 		minblks = 100;
659 	if (minblks > nblks)
660 		minblks = nblks;
661 	s->s_freeblkcnt = minblks;
662 	s->s_totalblkcnt = minblks;
663 	if (! s->s_freebitmapsize) {
664 		s->s_freebitmapsize = nblks / 8;
665 		s->s_freebitmap = (uchar_t *)kmem_zalloc(s->s_freebitmapsize,
666 		    KM_SLEEP);
667 	}
668 	bzero((caddr_t)s->s_freebitmap, s->s_freebitmapsize);
669 
670 	/* locator block sectors */
671 	for (i = 0; i < s->s_lbp->lb_blkcnt; i++)
672 		blkbusy(s, i);
673 
674 	/* locator name sectors */
675 	for (i = 0; i < s->s_lbp->lb_lnblkcnt; i++)
676 		blkbusy(s, (s->s_lbp->lb_lnfirstblk + i));
677 
678 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
679 		/* locator block device id information */
680 		for (i = 0; i < s->s_lbp->lb_didblkcnt; i++)
681 			blkbusy(s, (s->s_lbp->lb_didfirstblk + i));
682 
683 		/* disk blocks containing actual device ids */
684 		did_dbp = s->s_did_icp->did_ic_dbp;
685 		while (did_dbp) {
686 			for (i = 0; i < did_dbp->db_blkcnt; i++) {
687 				blkbusy(s, did_dbp->db_firstblk + i);
688 			}
689 			did_dbp = did_dbp->db_next;
690 		}
691 	}
692 
693 	/* Only use data tags if not a MN set */
694 	if (!(lbp->lb_flags & MDDB_MNSET)) {
695 		/* Found a bad tag, do NOT mark the data tag blks busy here */
696 		if (! (md_get_setstatus(s->s_setno) & MD_SET_BADTAG)) {
697 			for (i = 0; i < s->s_lbp->lb_dtblkcnt; i++)
698 				blkbusy(s, (s->s_lbp->lb_dtfirstblk + i));
699 		}
700 	}
701 
702 	/* directory block/entry sectors */
703 	for (dbp = s->s_dbp; dbp != 0;	dbp = dbp->db_next) {
704 		blkbusy(s, dbp->db_blknum);
705 		for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
706 			for (i = 0; i < dep->de_blkcount; i++)
707 				blkbusy(s, dep->de_blks[i]);
708 	}
709 }
710 
711 /*
712  * Add free space to the device id incore free list.
713  * Called:
714  *    - During startup when all devid blocks are temporarily placed on the
715  *       free list
716  *    - After a devid has been deleted via the metadb command.
717  *    - When mddb_devid_free_get adds unused space from a disk block
718  *       to free list
719  */
720 static int
721 mddb_devid_free_add(
722 	mddb_set_t *s,
723 	uint_t firstblk,
724 	uint_t offset,
725 	uint_t length
726 )
727 {
728 	mddb_did_free_t	*did_freep;
729 
730 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
731 		return (0);
732 	}
733 
734 	did_freep = (mddb_did_free_t *)kmem_zalloc(sizeof (mddb_did_free_t),
735 	    KM_SLEEP);
736 	did_freep->free_blk = firstblk;
737 	did_freep->free_offset = offset;
738 	did_freep->free_length = length;
739 	did_freep->free_next = s->s_did_icp->did_ic_freep;
740 	s->s_did_icp->did_ic_freep = did_freep;
741 
742 	return (0);
743 }
744 
745 /*
746  * Remove specific free space from the device id incore free list.
747  * Called at startup (after all devid blocks have been placed on
748  * free list) in order to remove the free space from the list that
749  * contains actual devids.
750  * Returns 0 if area successfully removed.
751  * Returns 1 if no matching area is found - so nothing removed.
752  */
753 static int
754 mddb_devid_free_delete(
755 	mddb_set_t *s,
756 	uint_t firstblk,
757 	uint_t offset,
758 	uint_t length
759 )
760 {
761 	int		block_found = 0;
762 	mddb_did_free_t	*did_freep1;		/* next free block */
763 	mddb_did_free_t	*did_freep2 = 0;	/* previous free block */
764 	mddb_did_free_t *did_freep_before;	/* area before offset, len */
765 	mddb_did_free_t	*did_freep_after;	/* area after offset, len */
766 	uint_t		old_length;
767 
768 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
769 		return (1);
770 	}
771 
772 	/* find free block for this devid */
773 	did_freep1 = s->s_did_icp->did_ic_freep;
774 	while (did_freep1) {
775 		/*
776 		 * Look through free list of <block, offset, length> to
777 		 * find our entry in the free list.  Our entry should
778 		 * exist since the entire devid block was placed into
779 		 * this free list at startup.  This code is just removing
780 		 * the non-free (in-use) portions of the devid block so
781 		 * that the remaining linked list does indeed just
782 		 * contain a free list.
783 		 *
784 		 * Our entry has been found if
785 		 *   - the blocks match,
786 		 *   - the offset (starting address) in the free list is
787 		 *	less than the offset of our entry and
788 		 *   - the length+offset (ending address) in the free list is
789 		 *	greater than the length+offset of our entry.
790 		 */
791 		if ((did_freep1->free_blk == firstblk) &&
792 		    (did_freep1->free_offset <= offset) &&
793 		    ((did_freep1->free_length + did_freep1->free_offset) >=
794 			(length + offset))) {
795 			/* Have found our entry - remove from list */
796 			block_found = 1;
797 			did_freep_before = did_freep1;
798 			old_length = did_freep1->free_length;
799 			/* did_freep1 - pts to next free block */
800 			did_freep1 = did_freep1->free_next;
801 			if (did_freep2) {
802 				did_freep2->free_next = did_freep1;
803 			} else {
804 				s->s_did_icp->did_ic_freep = did_freep1;
805 			}
806 
807 			/*
808 			 * did_freep_before points to area in block before
809 			 * offset, length.
810 			 */
811 			did_freep_before->free_length = offset -
812 				did_freep_before->free_offset;
813 			/*
814 			 * did_freep_after points to area in block after
815 			 * offset, length.
816 			 */
817 			did_freep_after = (mddb_did_free_t *)kmem_zalloc
818 					(sizeof (mddb_did_free_t), KM_SLEEP);
819 			did_freep_after->free_blk = did_freep_before->free_blk;
820 			did_freep_after->free_offset = offset + length;
821 			did_freep_after->free_length = old_length - length -
822 				did_freep_before->free_length;
823 			/*
824 			 * Add before and after areas to free list
825 			 * If area before or after offset, length has length
826 			 * of 0, that entry is not added.
827 			 */
828 			if (did_freep_after->free_length) {
829 				did_freep_after->free_next = did_freep1;
830 				if (did_freep2) {
831 				    did_freep2->free_next = did_freep_after;
832 				} else {
833 				    s->s_did_icp->did_ic_freep =
834 					did_freep_after;
835 				}
836 				did_freep1 = did_freep_after;
837 			} else {
838 				kmem_free(did_freep_after,
839 					sizeof (mddb_did_free_t));
840 			}
841 
842 			if (did_freep_before->free_length) {
843 				did_freep_before->free_next = did_freep1;
844 				if (did_freep2) {
845 				    did_freep2->free_next = did_freep_before;
846 				} else {
847 				    s->s_did_icp->did_ic_freep =
848 					did_freep_before;
849 				}
850 			} else {
851 				kmem_free(did_freep_before,
852 					sizeof (mddb_did_free_t));
853 			}
854 			break;
855 		} else {
856 			did_freep2 = did_freep1;
857 			did_freep1 = did_freep1->free_next;
858 		}
859 	}
860 	if (block_found == 0) {
861 		return (1);
862 	} else {
863 		return (0);
864 	}
865 }
866 
867 /*
868  * Find free space of devid length and remove free space from list.
869  * Return a pointer to the previously free area.
870  *
871  * If there's not enough free space on the free list, get an empty
872  * disk block, put the empty disk block on the did_ic_dbp linked list,
873  * and add the disk block space not used for devid to the free list.
874  *
875  * Return pointer to address (inside disk block) of free area for devid.
876  * Return 0 if error.
877  */
878 static caddr_t
879 mddb_devid_free_get(
880 	mddb_set_t *s,
881 	uint_t len,
882 	uint_t *blk,
883 	uint_t *cnt,
884 	uint_t *offset
885 )
886 {
887 	mddb_did_free_t	*freep, *freep2;
888 	mddb_did_db_t	*dbp;
889 	uint_t		blk_cnt, blk_num;
890 	ddi_devid_t	devid_ptr = NULL;
891 
892 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
893 		return (0);
894 	}
895 
896 	freep = s->s_did_icp->did_ic_freep;
897 	freep2 = (mddb_did_free_t *)NULL;
898 	while (freep) {
899 		/* found a free area - remove from free list */
900 		if (len <= freep->free_length) {
901 			*blk = freep->free_blk;
902 			*offset = freep->free_offset;
903 			/* find disk block pointer that contains free area */
904 			dbp = s->s_did_icp->did_ic_dbp;
905 			while (dbp) {
906 				if (dbp->db_firstblk == *blk)
907 					break;
908 				else
909 					dbp = dbp->db_next;
910 			}
911 			/*
912 			 * If a disk block pointer can't be found - something
913 			 * is wrong, so don't use this free space.
914 			 */
915 			if (dbp == NULL) {
916 				freep2 = freep;
917 				freep = freep->free_next;
918 				continue;
919 			}
920 
921 			devid_ptr = (ddi_devid_t)(dbp->db_ptr + *offset);
922 			*cnt = dbp->db_blkcnt;
923 
924 			/* Update free list information */
925 			freep->free_offset += len;
926 			freep->free_length -= len;
927 			if (freep->free_length == 0) {
928 				if (freep2) {
929 					freep2->free_next =
930 					freep->free_next;
931 				} else {
932 					s->s_did_icp->did_ic_freep =
933 					freep->free_next;
934 				}
935 				kmem_free(freep, sizeof (mddb_did_free_t));
936 			}
937 			break;
938 		}
939 		freep2 = freep;
940 		freep = freep->free_next;
941 	}
942 
943 	/* Didn't find a free spot */
944 	if (freep == NULL) {
945 		/* get free logical disk blk in replica */
946 		blk_cnt = btodb(len + (MDDB_BSIZE - 1));
947 		blk_num = getfreeblks(s, blk_cnt);
948 		if (blk_num == 0)
949 			return (0);
950 
951 		/* Add disk block to disk block linked list */
952 		dbp = kmem_zalloc(sizeof (mddb_did_db_t), KM_SLEEP);
953 		dbp->db_firstblk = blk_num;
954 		dbp->db_blkcnt = blk_cnt;
955 		dbp->db_ptr = (caddr_t)kmem_zalloc(dbtob(blk_cnt), KM_SLEEP);
956 		dbp->db_next = s->s_did_icp->did_ic_dbp;
957 		s->s_did_icp->did_ic_dbp = dbp;
958 		devid_ptr = (ddi_devid_t)dbp->db_ptr;
959 
960 		/* Update return values */
961 		*blk = blk_num;
962 		*offset = 0;
963 		*cnt = blk_cnt;
964 
965 		/* Add unused part of block to free list */
966 		(void) mddb_devid_free_add(s, blk_num,
967 			len, (dbtob(blk_cnt) - len));
968 	}
969 
970 	return ((caddr_t)devid_ptr);
971 }
972 
973 /*
974  * Add device id information for locator index to device id area in set.
975  * Get free area to store device id from free list.   Update checksum
976  * for mddb_did_blk.
977  *
978  * This routine does not write any data out to disk.
979  * After this routine has been called, the routine, writelocall, should
980  * be called to write both the locator block and device id area out
981  * to disk.
982  */
983 static int
984 mddb_devid_add(
985 	mddb_set_t	*s,
986 	uint_t		index,
987 	ddi_devid_t	devid,
988 	char		*minor_name
989 )
990 {
991 	uint_t		devid_len;
992 	uint_t		blk, offset;
993 	ddi_devid_t	devid_ptr;
994 	mddb_did_info_t	*did_info;
995 	uint_t		blkcnt, i;
996 	mddb_did_blk_t	*did_blk;
997 
998 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
999 		return (1);
1000 	}
1001 	if (strlen(minor_name) > (MDDB_MINOR_NAME_MAX - 1))
1002 		return (1);
1003 
1004 	/* Check if device id has already been added */
1005 	did_blk = s->s_did_icp->did_ic_blkp;
1006 	did_info = &(did_blk->blk_info[index]);
1007 	if (did_info->info_flags & MDDB_DID_EXISTS)
1008 		return (0);
1009 
1010 	devid_len = ddi_devid_sizeof(devid);
1011 	devid_ptr = (ddi_devid_t)
1012 			mddb_devid_free_get(s, devid_len, &blk, &blkcnt,
1013 				&offset);
1014 	if (devid_ptr == NULL) {
1015 		return (1);
1016 	}
1017 
1018 	/* Copy devid into devid free area */
1019 	for (i = 0; i < devid_len; i++)
1020 		((char *)devid_ptr)[i] = ((char *)devid)[i];
1021 
1022 	/* Update mddb_did_info area for new device id */
1023 	did_info->info_flags = MDDB_DID_EXISTS | MDDB_DID_VALID |
1024 				MDDB_DID_UPDATED;
1025 	did_info->info_firstblk = blk;
1026 	did_info->info_blkcnt = blkcnt;
1027 	did_info->info_offset = offset;
1028 	did_info->info_length = devid_len;
1029 	(void) strcpy(did_info->info_minor_name, minor_name);
1030 	crcgen(devid_ptr, &did_info->info_checksum, devid_len, NULL);
1031 
1032 	/* Add device id pointer to did_ic_devid array */
1033 	s->s_did_icp->did_ic_devid[index] = devid_ptr;
1034 
1035 	return (0);
1036 }
1037 
1038 
1039 /*
1040  * Delete device id information for locator index from device id area in set.
1041  * Add device id space to free area.
1042  *
1043  * This routine does not write any data out to disk.
1044  * After this routine has been called, the routine, writelocall, should
1045  * be called to write both the locator block and device id area out
1046  * to disk.
1047  */
1048 static int
1049 mddb_devid_delete(mddb_set_t *s, uint_t index)
1050 {
1051 	mddb_did_info_t	*did_info;
1052 	mddb_did_blk_t	*did_blk;
1053 
1054 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
1055 		return (1);
1056 	}
1057 
1058 	/* Get device id information from mddb_did_blk */
1059 	did_blk = s->s_did_icp->did_ic_blkp;
1060 	did_info = &(did_blk->blk_info[index]);
1061 
1062 	/*
1063 	 * Ensure that the underlying device supports device ids
1064 	 * before arbitrarily removing them.
1065 	 */
1066 	if (!(did_info->info_flags & MDDB_DID_EXISTS)) {
1067 		return (1);
1068 	}
1069 
1070 	/* Remove device id information from mddb_did_blk */
1071 	did_info->info_flags = 0;
1072 
1073 	/* Remove device id from incore area */
1074 	s->s_did_icp->did_ic_devid[index] = (ddi_devid_t)NULL;
1075 
1076 	/* Add new free space in disk block to free list */
1077 	(void) mddb_devid_free_add(s, did_info->info_firstblk,
1078 		did_info->info_offset, did_info->info_length);
1079 
1080 	return (0);
1081 }
1082 
1083 /*
1084  * Check if there is a device id for a locator index.
1085  *
1086  * Caller of this routine should not free devid or minor_name since
1087  * these will point to internal data structures that should not
1088  * be freed.
1089  */
1090 static int
1091 mddb_devid_get(
1092 	mddb_set_t *s,
1093 	uint_t index,
1094 	ddi_devid_t *devid,
1095 	char **minor_name
1096 )
1097 {
1098 	mddb_did_info_t	*did_info;
1099 
1100 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
1101 		return (0);
1102 	}
1103 	did_info = &(s->s_did_icp->did_ic_blkp->blk_info[index]);
1104 
1105 	if (did_info->info_flags & MDDB_DID_EXISTS) {
1106 		*devid = s->s_did_icp->did_ic_devid[index];
1107 		*minor_name =
1108 		    s->s_did_icp->did_ic_blkp->blk_info[index].info_minor_name;
1109 		return (1);
1110 	} else
1111 		return (0);
1112 
1113 
1114 }
1115 
1116 /*
1117  * Check if device id is valid on current system.
1118  * Needs devid, previously known dev_t and current minor_name.
1119  *
1120  * Success:
1121  * 	Returns 0 if valid device id is found and updates
1122  * 	dev_t if the dev_t associated with the device id is
1123  *	different than dev_t.
1124  * Failure:
1125  * 	Returns 1 if device id not valid on current system.
1126  */
1127 static int
1128 mddb_devid_validate(ddi_devid_t devid, md_dev64_t *dev, char *minor_name)
1129 {
1130 	int		retndevs;
1131 	dev_t		*ddi_devs;
1132 	int		devid_flag = 0;
1133 	int 		cnt;
1134 
1135 	if (dev == 0)
1136 		return (1);
1137 	/*
1138 	 * See if devid is valid in the current system.
1139 	 * If so, set dev to match the devid.
1140 	 */
1141 	if (ddi_lyr_devid_to_devlist(devid, minor_name,
1142 	    &retndevs, &ddi_devs) == DDI_SUCCESS) {
1143 		if (retndevs > 0) {
1144 			/* devid is valid to use */
1145 			devid_flag = 1;
1146 			/* does dev_t in list match dev */
1147 			cnt = 0;
1148 			while (cnt < retndevs) {
1149 				if (*dev == md_expldev(ddi_devs[cnt]))
1150 					break;
1151 				cnt++;
1152 			}
1153 			/*
1154 			 * If a different dev_t, then setup
1155 			 * new dev and new major name
1156 			 */
1157 			if (cnt == retndevs) {
1158 				*dev = md_expldev(ddi_devs[0]);
1159 			}
1160 			ddi_lyr_free_devlist(ddi_devs, retndevs);
1161 		}
1162 	}
1163 	if (devid_flag)
1164 		return (0);
1165 	else
1166 		return (1);
1167 }
1168 
1169 
1170 /*
1171  * Free the devid incore data areas
1172  */
1173 static void
1174 mddb_devid_icp_free(mddb_did_ic_t **did_icp, mddb_lb_t *lbp)
1175 {
1176 	mddb_did_free_t	*did_freep1, *did_freep2;
1177 	mddb_did_db_t	*did_dbp1, *did_dbp2;
1178 	mddb_did_ic_t	*icp = *did_icp;
1179 
1180 	if (icp) {
1181 		if (icp->did_ic_blkp) {
1182 			kmem_free((caddr_t)icp->did_ic_blkp,
1183 			    dbtob(lbp->lb_didblkcnt));
1184 			icp->did_ic_blkp = (mddb_did_blk_t *)NULL;
1185 		}
1186 
1187 		if (icp->did_ic_dbp) {
1188 			did_dbp1 = icp->did_ic_dbp;
1189 			while (did_dbp1) {
1190 				did_dbp2 = did_dbp1->db_next;
1191 				kmem_free((caddr_t)did_dbp1->db_ptr,
1192 				    dbtob(did_dbp1->db_blkcnt));
1193 				kmem_free((caddr_t)did_dbp1,
1194 				    sizeof (mddb_did_db_t));
1195 				did_dbp1 = did_dbp2;
1196 			}
1197 		}
1198 
1199 		if (icp->did_ic_freep) {
1200 			did_freep1 = icp->did_ic_freep;
1201 			while (did_freep1) {
1202 				did_freep2 = did_freep1->free_next;
1203 				kmem_free((caddr_t)did_freep1,
1204 				    sizeof (mddb_did_free_t));
1205 				did_freep1 = did_freep2;
1206 			}
1207 		}
1208 
1209 		kmem_free((caddr_t)icp, sizeof (mddb_did_ic_t));
1210 		*did_icp = (mddb_did_ic_t *)NULL;
1211 	}
1212 
1213 }
1214 
1215 static daddr_t
1216 getphysblk(
1217 	mddb_block_t		blk,
1218 	mddb_mb_ic_t		*mbip
1219 )
1220 {
1221 	mddb_mb_t	*mbp = &(mbip->mbi_mddb_mb);
1222 
1223 	while (blk >= mbp->mb_blkcnt) {
1224 		if (! mbip->mbi_next)
1225 			return ((daddr_t)-1);	/* no such block */
1226 		blk -= mbp->mb_blkcnt;
1227 		mbip = mbip->mbi_next;
1228 		mbp = &(mbip->mbi_mddb_mb);
1229 	}
1230 
1231 	if (blk >= mbp->mb_blkmap.m_consecutive)
1232 		return ((daddr_t)-1);	/* no such block */
1233 
1234 	return ((daddr_t)(mbp->mb_blkmap.m_firstblk + blk));
1235 }
1236 
1237 /*
1238  * when a buf header is passed in the new buffer must be
1239  * put on the front of the chain. writerec counts on it
1240  */
1241 static int
1242 putblks(
1243 	mddb_set_t	*s,		/* incore db set structure */
1244 	caddr_t		buffer,		/* adr of buffer to be written */
1245 	daddr_t		blk,		/* block number for first block */
1246 	int		cnt,		/* number of blocks to be written */
1247 	md_dev64_t	device,		/* device to be written to */
1248 	mddb_bf_t	**bufhead	/* if non-zero then ASYNC I/O */
1249 					/*    and put buf address here */
1250 )
1251 {
1252 	buf_t		*bp;
1253 	mddb_bf_t	*bfp;
1254 	int		err = 0;
1255 
1256 	bfp = allocbuffer(s, MDDB_SLEEPOK);
1257 	bp = &bfp->bf_buf;
1258 	bp->b_bcount = MDDB_BSIZE * cnt;
1259 	bp->b_un.b_addr = buffer;
1260 	bp->b_blkno = blk;
1261 	bp->b_edev = md_dev64_to_dev(device);
1262 	/*
1263 	 * if a header for a buf chain is passed in this is async io.
1264 	 * currently only done for optimize  records
1265 	 */
1266 	if (bufhead) {
1267 		bfp->bf_next = *bufhead;
1268 		*bufhead = bfp;
1269 		(void) mddb_rwdata(s, B_WRITE|B_ASYNC, bp);
1270 		return (0);
1271 	}
1272 	err = mddb_rwdata(s, B_WRITE, bp);
1273 	freebuffer(s, bfp);
1274 	if (err) {
1275 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
1276 		    s->s_setno, device);
1277 		return (MDDB_F_EWRITE);
1278 	}
1279 	return (0);
1280 }
1281 
1282 /*
1283  * wrtblklst - takes an array of logical block numbers
1284  *		and writes the buffer to those blocks (scatter).
1285  * If called during upgrade, this routine expects a
1286  * non-translated (aka target) dev.
1287  */
1288 static int
1289 wrtblklst(
1290 	mddb_set_t	*s,		/* incore set structure */
1291 	caddr_t		buffer,		/* buffer to be written (record blk) */
1292 	mddb_block_t	blka[],		/* list of logical blks for record */
1293 	daddr_t		cnt,		/* number of logical blks */
1294 	const int	li,		/* locator index */
1295 	mddb_bf_t	**bufhead,	/* if non-zero then ASYNC I/O */
1296 					/*    and put buf address here */
1297 	int		master_only	/* allow only master node to write */
1298 )
1299 {
1300 	daddr_t		blk;
1301 	daddr_t		blk1;
1302 	int		err = 0;
1303 	int		cons;
1304 	mddb_lb_t	*lbp = s->s_lbp;
1305 	mddb_locator_t	*lp = &lbp->lb_locators[li];
1306 	md_dev64_t	dev;
1307 	mddb_mb_ic_t	*mbip = s->s_mbiarray[li];
1308 
1309 	/*
1310 	 * If a MN diskset and only the master can write,
1311 	 * then a non-master node will just return success.
1312 	 */
1313 	if ((lbp->lb_flags & MDDB_MNSET) &&
1314 	    (master_only == MDDB_WR_ONLY_MASTER)) {
1315 
1316 		/* return successfully if we aren't the master */
1317 		if (!(md_set[s->s_setno].s_am_i_master)) {
1318 			return (0);
1319 		}
1320 	}
1321 
1322 	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
1323 	if (dev == NODEV64) {
1324 		return (1);
1325 	}
1326 
1327 	blk = getphysblk(blka[0], mbip);
1328 	ASSERT(blk >= 0);
1329 
1330 	cons = 1;
1331 	while (cnt) {
1332 		if (cons != cnt) {
1333 			blk1 = getphysblk(blka[cons], mbip);
1334 			ASSERT(blk1 >= 0);
1335 			if ((blk + cons) == blk1) {
1336 				cons++;
1337 				continue;
1338 			}
1339 		}
1340 		if (err = putblks(s, buffer, blk, cons, dev, bufhead)) {
1341 			/*
1342 			 * If an MN diskset and any_node_can_write
1343 			 * then this request is coming from writeoptrecord
1344 			 * and l_flags field should not be updated.
1345 			 * l_flags will be updated as a result of sending
1346 			 * a class1 message to the master.  Setting l_flags
1347 			 * here will cause slave to be out of sync with
1348 			 * master.
1349 			 *
1350 			 * Otherwise, set the error in l_flags
1351 			 * (this occurs if this is not a MN diskset or
1352 			 * only_master_can_write is set).
1353 			 */
1354 			if ((!(lbp->lb_flags & MDDB_MNSET)) ||
1355 			    (master_only == MDDB_WR_ONLY_MASTER)) {
1356 				lp->l_flags |= MDDB_F_EWRITE;
1357 			}
1358 			return (err);
1359 		}
1360 		if (bufhead)
1361 			(*bufhead)->bf_locator = lp;
1362 
1363 		buffer += MDDB_BSIZE * cons;
1364 		cnt -= cons;
1365 		blka += cons;
1366 		if (cnt) {
1367 			blk = getphysblk(blka[0], mbip);
1368 			ASSERT(blk >= 0);
1369 		}
1370 		cons = 1;
1371 	}
1372 
1373 	return (0);
1374 }
1375 
1376 /*
1377  * writeblks - takes a logical block number/block count pair
1378  * 		and writes the buffer to those contiguous logical blocks.
1379  * If called during upgrade, this routine expects a non-translated
1380  * (aka target) dev.
1381  */
1382 static int
1383 writeblks(
1384 	mddb_set_t	*s,		/* incore set structure */
1385 	caddr_t		buffer,		/* buffer to be written */
1386 	mddb_block_t	blk,		/* starting logical block number */
1387 	int		cnt,		/* number of log blocks to be written */
1388 	const int	li,		/* locator index */
1389 	int		master_only	/* allow only master node to write */
1390 )
1391 {
1392 	daddr_t		physblk;
1393 	int		err = 0;
1394 	int		i;
1395 	mddb_lb_t	*lbp = s->s_lbp;
1396 	mddb_locator_t	*lp = &lbp->lb_locators[li];
1397 	md_dev64_t	dev;
1398 	mddb_block_t	*blkarray;
1399 	int		size;
1400 	int		ret;
1401 
1402 	/*
1403 	 * If a MN diskset and only the master can write,
1404 	 * then a non-master node will just return success.
1405 	 */
1406 	if ((lbp->lb_flags & MDDB_MNSET) &&
1407 	    (master_only == MDDB_WR_ONLY_MASTER)) {
1408 		/* return successfully if we aren't the master */
1409 		if (!(md_set[s->s_setno].s_am_i_master)) {
1410 			return (0);
1411 		}
1412 	}
1413 
1414 	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
1415 	if (dev == NODEV64) {
1416 		return (1);
1417 	}
1418 
1419 	if (cnt > 1) {
1420 		size = sizeof (mddb_block_t) * cnt;
1421 		blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
1422 		for (i = 0; i < cnt; i++)
1423 			blkarray[i] = blk + i;
1424 		ret = wrtblklst(s, buffer, blkarray, cnt,
1425 			li, 0, MDDB_WR_ONLY_MASTER);
1426 		kmem_free(blkarray, size);
1427 		return (ret);
1428 	}
1429 	physblk = getphysblk(blk, s->s_mbiarray[li]);
1430 	ASSERT(physblk > 0);
1431 	if (err = putblks(s, buffer, physblk, 1, dev, (mddb_bf_t **)0)) {
1432 		lp->l_flags |= MDDB_F_EWRITE;
1433 		return (err);
1434 	}
1435 	return (0);
1436 }
1437 
1438 /*
1439  * writeall - will write the buffer to all ACTIVE/NON-ERRORED replicas.
1440  */
1441 static int
1442 writeall(
1443 	mddb_set_t	*s,		/* incore set structure */
1444 	caddr_t		buffer,		/* buffer to be written */
1445 	mddb_block_t	block,		/* starting logical block number */
1446 	int		cnt,		/* number of log blocks to be written */
1447 	int		master_only	/* allow only master node to write */
1448 )
1449 {
1450 	int		li;
1451 	int		err = 0;
1452 	mddb_lb_t	*lbp = s->s_lbp;
1453 
1454 	for (li = 0; li < lbp->lb_loccnt; li++) {
1455 		mddb_locator_t	*lp = &lbp->lb_locators[li];
1456 
1457 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
1458 		    (lp->l_flags & MDDB_F_EWRITE))
1459 			continue;
1460 
1461 		err |= writeblks(s, buffer, block, cnt, li, master_only);
1462 	}
1463 
1464 	return (err);
1465 }
1466 
1467 /*
1468  * writelocall - write the locator block and device id information (if
1469  * replica is in device id format) to all ACTIVE/NON-ERRORER replicas.
1470  *
1471  * Increments the locator block's commitcnt.  Updates the device id area's
1472  * commitcnt if the replica is in device id format.  Regenerates the
1473  * checksums after updating the commitcnt(s).
1474  */
1475 static int
1476 writelocall(
1477 	mddb_set_t	*s	/* incore set structure */
1478 )
1479 {
1480 	int		li;
1481 	int		err = 0;
1482 	mddb_lb_t	*lbp = s->s_lbp;
1483 	mddb_did_blk_t	*did_blk;
1484 	mddb_did_db_t	*did_dbp;
1485 
1486 	s->s_lbp->lb_commitcnt++;
1487 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
1488 		did_blk = s->s_did_icp->did_ic_blkp;
1489 		did_blk->blk_commitcnt = s->s_lbp->lb_commitcnt;
1490 		crcgen(did_blk, &did_blk->blk_checksum,
1491 			dbtob(lbp->lb_didblkcnt), NULL);
1492 	}
1493 	crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
1494 
1495 	for (li = 0; li < lbp->lb_loccnt; li++) {
1496 		mddb_locator_t	*lp = &lbp->lb_locators[li];
1497 
1498 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
1499 		    (lp->l_flags & MDDB_F_EWRITE))
1500 			continue;
1501 
1502 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
1503 			/* write out blocks containing actual device ids */
1504 			did_dbp = s->s_did_icp->did_ic_dbp;
1505 			while (did_dbp) {
1506 				err |= writeblks(s, (caddr_t)did_dbp->db_ptr,
1507 					did_dbp->db_firstblk,
1508 					did_dbp->db_blkcnt, li,
1509 					MDDB_WR_ONLY_MASTER);
1510 				did_dbp = did_dbp->db_next;
1511 			}
1512 
1513 			/* write out device id area block */
1514 			err |= writeblks(s, (caddr_t)did_blk,
1515 				lbp->lb_didfirstblk, lbp->lb_didblkcnt, li,
1516 				MDDB_WR_ONLY_MASTER);
1517 		}
1518 		/* write out locator block */
1519 		err |= writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
1520 			MDDB_WR_ONLY_MASTER);
1521 	}
1522 
1523 	/*
1524 	 * If a MN diskset and this is the master, set the PARSE_LOCBLK flag
1525 	 * in the mddb_set structure to show that the locator block has
1526 	 * been changed.
1527 	 */
1528 
1529 	if ((lbp->lb_flags & MDDB_MNSET) &&
1530 	    (md_set[s->s_setno].s_am_i_master)) {
1531 		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
1532 	}
1533 	return (err);
1534 }
1535 
1536 /*
1537  * If called during upgrade, this routine expects a translated
1538  * (aka miniroot) dev.
1539  */
1540 static int
1541 getblks(
1542 	mddb_set_t	*s,	/* incore db set structure */
1543 	caddr_t		buffer,	/* buffer to read data into */
1544 	md_dev64_t	device,	/* device to read from */
1545 	daddr_t		blk,	/* physical block number to read */
1546 	int		cnt	/* number of blocks to read */
1547 )
1548 {
1549 	buf_t		*bp;
1550 	mddb_bf_t	*bfp;
1551 	int		err = 0;
1552 
1553 	bfp = allocbuffer(s, MDDB_SLEEPOK);	/* this will never sleep */
1554 	bp = &bfp->bf_buf;
1555 	bp->b_bcount = MDDB_BSIZE * cnt;
1556 	bp->b_un.b_addr = buffer;
1557 	bp->b_blkno = blk;
1558 	bp->b_edev = md_dev64_to_dev(device);
1559 	err = mddb_rwdata(s, B_READ, bp);
1560 	freebuffer(s, bfp);
1561 	if (err) {
1562 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
1563 		    s->s_setno, device);
1564 		return (MDDB_F_EREAD);
1565 	}
1566 	return (0);
1567 }
1568 
1569 /*
1570  * readblklst - takes an array of logical block numbers
1571  * 		and reads those blocks (gather) into the buffer.
1572  * If called during upgrade, this routine expects a non-translated
1573  * (aka target) dev.
1574  */
1575 static int
1576 readblklst(
1577 	mddb_set_t	*s,	/* incore set structure */
1578 	caddr_t		buffer,	/* buffer to be read (record block) */
1579 	mddb_block_t	blka[],	/* list of logical blocks to be read */
1580 	daddr_t		cnt,	/* number of logical blocks */
1581 	int		li	/* locator index */
1582 )
1583 {
1584 	daddr_t		blk;
1585 	daddr_t		blk1;
1586 	int		err = 0;
1587 	int		cons;
1588 	md_dev64_t	dev;
1589 	mddb_mb_ic_t	*mbip;
1590 
1591 	mbip = s->s_mbiarray[li];
1592 	dev = md_expldev(s->s_lbp->lb_locators[li].l_dev);
1593 	dev = md_xlate_targ_2_mini(dev);
1594 	if (dev == NODEV64) {
1595 		return (1);
1596 	}
1597 
1598 	blk = getphysblk(blka[0], mbip);
1599 	ASSERT(blk >= 0);
1600 
1601 	cons = 1;
1602 	while (cnt) {
1603 		if (cons != cnt) {
1604 			blk1 = getphysblk(blka[cons], mbip);
1605 			ASSERT(blk1 >= 0);
1606 			if ((blk + cons) == blk1) {
1607 				cons++;
1608 				continue;
1609 			}
1610 		}
1611 		if (err = getblks(s, buffer, dev, blk, cons))
1612 			return (err);
1613 		buffer += MDDB_BSIZE * cons;
1614 		cnt -= cons;
1615 		blka += cons;
1616 		if (cnt) {
1617 			blk = getphysblk(blka[0], mbip);
1618 			ASSERT(blk >= 0);
1619 		}
1620 		cons = 1;
1621 	}
1622 	return (0);
1623 }
1624 
1625 /*
1626  * readblks - takes a logical block number/block count pair
1627  * 		and reads those contiguous logical blocks into the buffer.
1628  * If called during upgrade, this routine expects a non-translated
1629  * (aka target) dev.
1630  */
1631 static int
1632 readblks(
1633 	mddb_set_t	*s,	/* incore set structure */
1634 	caddr_t		buffer,	/* buffer to be read into */
1635 	mddb_block_t	blk,	/* logical block number to be read */
1636 	int		cnt,	/* number of logical blocks to be read */
1637 	int		li	/* locator index */
1638 )
1639 {
1640 	daddr_t		physblk;
1641 	md_dev64_t	device;
1642 	int		i;
1643 	mddb_block_t	*blkarray;
1644 	int		size;
1645 	int		ret;
1646 
1647 	if (cnt > 1) {
1648 		size = sizeof (mddb_block_t) * cnt;
1649 		blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
1650 		for (i = 0; i < cnt; i++)
1651 			blkarray[i] = blk + i;
1652 		ret = readblklst(s, buffer, blkarray, cnt, li);
1653 		kmem_free(blkarray, size);
1654 		return (ret);
1655 	}
1656 	physblk = getphysblk(blk, s->s_mbiarray[li]);
1657 	ASSERT(physblk > 0);
1658 	device = md_expldev(s->s_lbp->lb_locators[li].l_dev);
1659 	device = md_xlate_targ_2_mini(device);
1660 	if (device == NODEV64) {
1661 		return (1);
1662 	}
1663 	return (getblks(s, buffer, device, physblk, 1));
1664 }
1665 
1666 static void
1667 single_thread_start(
1668 	mddb_set_t	*s
1669 )
1670 {
1671 	while (s->s_singlelockgotten) {
1672 		s->s_singlelockwanted++;
1673 		cv_wait(&s->s_single_thread_cv, SETMUTEX(s->s_setno));
1674 	}
1675 	s->s_singlelockgotten++;
1676 }
1677 
1678 static void
1679 single_thread_end(
1680 	mddb_set_t	*s
1681 )
1682 {
1683 	ASSERT(s->s_singlelockgotten);
1684 	s->s_singlelockgotten = 0;
1685 	if (s->s_singlelockwanted) {
1686 		s->s_singlelockwanted = 0;
1687 		cv_broadcast(&s->s_single_thread_cv);
1688 	}
1689 }
1690 
1691 static size_t
1692 sizeofde(
1693 	mddb_de_ic_t	*dep
1694 )
1695 {
1696 	size_t		size;
1697 
1698 	size = sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) +
1699 		    sizeof (mddb_block_t) * dep->de_blkcount;
1700 	return (size);
1701 }
1702 
1703 static size_t
1704 sizeofde32(
1705 	mddb_de32_t	*dep
1706 )
1707 {
1708 	size_t		size;
1709 
1710 	size = sizeof (*dep) - sizeof (dep->de32_blks) +
1711 		    sizeof (mddb_block_t) * dep->de32_blkcount;
1712 	return (size);
1713 }
1714 
1715 static mddb_de32_t *
1716 nextentry(
1717 	mddb_de32_t	*dep
1718 )
1719 {
1720 	mddb_de32_t	*ret;
1721 
1722 	ret = (mddb_de32_t *)((void *)((caddr_t)dep + sizeofde32(dep)));
1723 	return (ret);
1724 }
1725 
1726 static void
1727 create_db32rec(
1728 	mddb_db32_t *db32p,
1729 	mddb_db_t *dbp
1730 )
1731 {
1732 	mddb_de_ic_t *dep;
1733 	mddb_de32_t *de32p;
1734 
1735 #if defined(_ILP32) && !defined(lint)
1736 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
1737 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
1738 #endif
1739 
1740 	dbtodb32(dbp, db32p);
1741 	if ((dbp->db_firstentry != NULL) && (db32p->db32_firstentry == 0))
1742 		db32p->db32_firstentry = 0x4;
1743 	de32p = (mddb_de32_t *)((void *) ((caddr_t)(&db32p->db32_firstentry)
1744 		+ sizeof (db32p->db32_firstentry)));
1745 	for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
1746 		detode32(dep, de32p);
1747 		if ((dep->de_next != NULL) && (de32p->de32_next == 0))
1748 			de32p->de32_next = 0x4;
1749 		de32p = nextentry(de32p);
1750 	}
1751 	ASSERT((uintptr_t)de32p <= (uintptr_t)de32p + MDDB_BSIZE);
1752 }
1753 
1754 /*
1755  * If called during upgrade, this routine expects a translated
1756  * (aka miniroot) dev.
1757  * If master blocks are found, set the mn_set parameter to 1 if the
1758  * the master block revision number is MDDB_REV_MNMB; otherwise,
1759  * set it to 0.
1760  * If master blocks are not found, do not change the mnset parameter.
1761  */
1762 static mddb_mb_ic_t *
1763 getmasters(
1764 	mddb_set_t	*s,
1765 	md_dev64_t	dev,
1766 	daddr_t		blkno,
1767 	uint_t		*flag,
1768 	int		*mn_set
1769 )
1770 {
1771 	mddb_mb_ic_t	*mbi = NULL;
1772 	mddb_mb_t	*mb;
1773 	int		error = 0;
1774 	ddi_devid_t	devid;
1775 
1776 
1777 	if (mddb_devopen(dev)) {
1778 		if (flag)
1779 			*flag |= MDDB_F_EMASTER;
1780 		return ((mddb_mb_ic_t *)NULL);
1781 	}
1782 
1783 
1784 	mbi = (mddb_mb_ic_t *)kmem_zalloc(MDDB_IC_BSIZE, KM_SLEEP);
1785 	mb = &(mbi->mbi_mddb_mb);
1786 	if (error = getblks(s, (caddr_t)mb, dev, blkno,
1787 	    btodb(MDDB_BSIZE))) {
1788 		error |= MDDB_F_EMASTER;
1789 	}
1790 	if (mb->mb_magic != MDDB_MAGIC_MB) {
1791 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1792 	}
1793 	/* Check for MDDB_REV_MNMB and lower */
1794 	if (revchk(MDDB_REV_MNMB, mb->mb_revision)) {
1795 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1796 	}
1797 	if (crcchk(mb, &mb->mb_checksum, MDDB_BSIZE, NULL)) {
1798 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1799 	}
1800 	if (!(md_get_setstatus(s->s_setno) & MD_SET_IMPORT) &&
1801 		(mb->mb_setno != s->s_setno)) {
1802 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1803 	}
1804 	if (mb->mb_blkno != blkno) {
1805 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1806 	}
1807 	mb->mb_next = NULL;
1808 	mbi->mbi_next = NULL;
1809 
1810 	if (error)
1811 		goto out;
1812 
1813 	/*
1814 	 * Check the md_devid_destroy and md_keep_repl_state flags
1815 	 * to see if we need to regen the devid or not.
1816 	 *
1817 	 * Don't care about devid in local set since it is not used
1818 	 * and this should not be part of set importing
1819 	 */
1820 	if ((s->s_setno != MD_LOCAL_SET) && !(md_get_setstatus(s->s_setno) &
1821 		MD_SET_IMPORT)) {
1822 		/*
1823 		 * Now check the destroy flag. We also need to handle
1824 		 * the case where the destroy flag is reset after the
1825 		 * destroy
1826 		 */
1827 		if (md_devid_destroy || (mb->mb_devid_len == 0)) {
1828 
1829 			if (md_devid_destroy) {
1830 				bzero(mb->mb_devid, mb->mb_devid_len);
1831 				mb->mb_devid_len = 0;
1832 			}
1833 
1834 			/*
1835 			 * Try to regenerate it if the 'keep' flag is not set
1836 			 */
1837 			if (!md_keep_repl_state) {
1838 				if (ddi_lyr_get_devid(md_dev64_to_dev(dev),
1839 				    &devid) == DDI_SUCCESS) {
1840 					mb->mb_devid_len =
1841 					    ddi_devid_sizeof(devid);
1842 					bcopy(devid, mb->mb_devid,
1843 					    mb->mb_devid_len);
1844 					ddi_devid_free(devid);
1845 				} else {
1846 					error = MDDB_F_EFMT | MDDB_F_EMASTER;
1847 				}
1848 			}
1849 
1850 			crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL);
1851 
1852 			/*
1853 			 * Push
1854 			 */
1855 			if (putblks(s, (caddr_t)mb, blkno, 1, dev, 0) != 0) {
1856 				error = MDDB_F_EFMT | MDDB_F_EMASTER;
1857 			}
1858 		}
1859 	}
1860 
1861 	if (! error) {
1862 		/* Set mn_set parameter to 1 if a MN set */
1863 		if (mb->mb_revision == MDDB_REV_MNMB)
1864 			*mn_set = 1;
1865 		else
1866 			*mn_set = 0;
1867 		return (mbi);
1868 	}
1869 
1870 out:
1871 	/* Error Out */
1872 	if (flag)
1873 		*flag |= error;
1874 
1875 	kmem_free((caddr_t)mbi, MDDB_IC_BSIZE);
1876 	mddb_devclose(dev);
1877 	return ((mddb_mb_ic_t *)NULL);
1878 }
1879 
1880 static int
1881 getrecord(
1882 	mddb_set_t	*s,
1883 	mddb_de_ic_t	*dep,
1884 	int		li
1885 )
1886 {
1887 	int		err = 0;
1888 	mddb_rb32_t	*rbp;
1889 
1890 #if defined(_ILP32) && !defined(lint)
1891 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
1892 #endif
1893 
1894 
1895 	dep->de_rb = (mddb_rb32_t *)kmem_zalloc(dep->de_recsize, KM_SLEEP);
1896 	rbp = dep->de_rb;
1897 
1898 	err = readblklst(s, (caddr_t)rbp, dep->de_blks, dep->de_blkcount, li);
1899 	if (err) {
1900 		return (MDDB_F_EDATA | err);
1901 	}
1902 	if (rbp->rb_magic != MDDB_MAGIC_RB) {
1903 		return (MDDB_F_EFMT | MDDB_F_EDATA);
1904 	}
1905 	if ((revchk(MDDB_REV_RB, rbp->rb_revision) != 0) &&
1906 	    (revchk(MDDB_REV_RB64, rbp->rb_revision) != 0)) {
1907 		return (MDDB_F_EFMT | MDDB_F_EDATA);
1908 	}
1909 	/* Check crc for this record */
1910 	if (rec_crcchk(s, dep, rbp)) {
1911 		return (MDDB_F_EFMT | MDDB_F_EDATA);
1912 	}
1913 	return (0);
1914 }
1915 
1916 /*
1917  * Code to read in the locator name information
1918  */
1919 static int
1920 readlocnames(
1921 	mddb_set_t	*s,
1922 	int		li
1923 )
1924 {
1925 	mddb_ln_t	*lnp;
1926 	int		err = 0;
1927 	mddb_block_t	ln_blkcnt, ln_blkno;
1928 
1929 	/*
1930 	 * read in the locator name blocks
1931 	 */
1932 	s->s_lnp = NULL;
1933 
1934 	ln_blkno = s->s_lbp->lb_lnfirstblk;
1935 	ln_blkcnt = s->s_lbp->lb_lnblkcnt;
1936 	lnp = (mddb_ln_t *)kmem_zalloc(dbtob(ln_blkcnt), KM_SLEEP);
1937 
1938 	err = readblks(s, (caddr_t)lnp, ln_blkno, ln_blkcnt, li);
1939 	if (err) {
1940 		err |= MDDB_F_EDATA;
1941 		goto out;
1942 	}
1943 	if (lnp->ln_magic != MDDB_MAGIC_LN) {
1944 		err = MDDB_F_EDATA | MDDB_F_EFMT;
1945 		goto out;
1946 	}
1947 	if (s->s_lbp->lb_flags & MDDB_MNSET) {
1948 		if (revchk(MDDB_REV_MNLN, lnp->ln_revision)) {
1949 			err = MDDB_F_EDATA | MDDB_F_EFMT;
1950 			goto out;
1951 		}
1952 	} else {
1953 		if (revchk(MDDB_REV_LN, lnp->ln_revision)) {
1954 			err = MDDB_F_EDATA | MDDB_F_EFMT;
1955 			goto out;
1956 		}
1957 	}
1958 	if (crcchk(lnp, &lnp->ln_checksum, dbtob(ln_blkcnt), NULL)) {
1959 		err = MDDB_F_EDATA | MDDB_F_EFMT;
1960 		goto out;
1961 	}
1962 out:
1963 	/*
1964 	 *	if error occurred in locator name blocks free them
1965 	 *	and return
1966 	 */
1967 	if (err) {
1968 		kmem_free((caddr_t)lnp, dbtob(ln_blkcnt));
1969 		return (err);
1970 	}
1971 	s->s_lnp = lnp;
1972 	return (0);
1973 }
1974 
1975 /*
1976  * code to read in a copy of the database.
1977  */
1978 
1979 static int
1980 readcopy(
1981 	mddb_set_t	*s,
1982 	int		li
1983 )
1984 {
1985 	uint_t		blk;
1986 	mddb_db_t	*dbp, *dbp1, *dbhp;
1987 	mddb_db32_t	*db32p;
1988 	mddb_de_ic_t	*dep, *dep2;
1989 	mddb_de32_t	*de32p, *de32p2;
1990 	int		err = 0;
1991 	uint_t		checksum;
1992 
1993 
1994 #if defined(_ILP32) && !defined(lint)
1995 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
1996 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
1997 #endif
1998 
1999 	dbp = NULL;
2000 	dbhp = NULL;
2001 	/*
2002 	 *	read in all the directory blocks
2003 	 */
2004 	blk = s->s_lbp->lb_dbfirstblk;
2005 	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
2006 
2007 	for (; blk != 0; blk = dbp->db_nextblk) {
2008 		dbp1 = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
2009 		if (! dbhp) {
2010 			dbhp = dbp1;
2011 		} else {
2012 			dbp->db_next = dbp1;
2013 		}
2014 		dbp = dbp1;
2015 
2016 		err = readblks(s, (caddr_t)db32p, blk, 1, li);
2017 		if (err) {
2018 			err |= MDDB_F_EDATA;
2019 			break;
2020 		}
2021 		db32todb(db32p, dbp);
2022 		if (db32p->db32_magic != MDDB_MAGIC_DB) {
2023 			err = MDDB_F_EDATA | MDDB_F_EFMT;
2024 			break;
2025 		}
2026 		if (revchk(MDDB_REV_DB, db32p->db32_revision)) {
2027 			err = MDDB_F_EDATA | MDDB_F_EFMT;
2028 			break;
2029 		}
2030 		if (crcchk(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL)) {
2031 			err = MDDB_F_EDATA | MDDB_F_EFMT;
2032 			break;
2033 		}
2034 		/*
2035 		 * first go through and fix up all de_next pointers
2036 		 */
2037 		if (dbp->db_firstentry) {
2038 
2039 			de32p = (mddb_de32_t *)
2040 			    ((void *) ((caddr_t)(&db32p->db32_firstentry)
2041 			    + sizeof (db32p->db32_firstentry)));
2042 
2043 			dep = (mddb_de_ic_t *)
2044 			    kmem_zalloc(sizeof (mddb_de_ic_t) -
2045 				sizeof (mddb_block_t) +
2046 				sizeof (mddb_block_t) * de32p->de32_blkcount,
2047 				KM_SLEEP);
2048 			de32tode(de32p, dep);
2049 
2050 			dbp->db_firstentry = dep;
2051 			while (de32p && de32p->de32_next) {
2052 
2053 				de32p2 = nextentry(de32p);
2054 
2055 				dep2 = (mddb_de_ic_t *)kmem_zalloc(
2056 					sizeof (mddb_de_ic_t) -
2057 					sizeof (mddb_block_t) +
2058 					sizeof (mddb_block_t) *
2059 					de32p2->de32_blkcount, KM_SLEEP);
2060 
2061 				de32tode(de32p2, dep2);
2062 
2063 				dep->de_next = dep2;
2064 				dep = dep2;
2065 				de32p = de32p2;
2066 			}
2067 		}
2068 		/*
2069 		 * go through and make all of the pointer to record blocks
2070 		 * are null;
2071 		 */
2072 		for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
2073 			dep->de_rb = NULL;
2074 	}
2075 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
2076 	dbp->db_next = NULL;
2077 	/*
2078 	 *	if error occurred in directory blocks free them
2079 	 *	and return
2080 	 */
2081 	if (err) {
2082 		dbp = dbhp;
2083 		while (dbp) {
2084 			dep = dbp->db_firstentry;
2085 			while (dep) {
2086 				/* No mddb_rb32_t structures yet */
2087 				dep2 = dep->de_next;
2088 				kmem_free((caddr_t)dep, sizeofde(dep));
2089 				dep = dep2;
2090 			}
2091 			dbp1 = dbp->db_next;
2092 			kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
2093 			dbp = dbp1;
2094 		}
2095 		s->s_dbp = NULL;
2096 		return (err);
2097 
2098 	}
2099 	/*
2100 	 */
2101 	err = 0;
2102 	checksum = MDDB_GLOBAL_XOR;
2103 	for (dbp = dbhp; dbp != NULL; dbp = dbp->db_next) {
2104 		checksum ^= dbp->db_recsum;
2105 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2106 			if (dep->de_flags & MDDB_F_OPT)
2107 				continue;
2108 			err = getrecord(s, dep, li);
2109 			if (err)
2110 				break;
2111 			/* Don't include CHANGELOG in big XOR */
2112 			if (dep->de_flags & MDDB_F_CHANGELOG)
2113 				continue;
2114 			checksum ^= dep->de_rb->rb_checksum;
2115 			checksum ^= dep->de_rb->rb_checksum_fiddle;
2116 		}
2117 		if (err)
2118 			break;
2119 	}
2120 	if (checksum) {
2121 		if (! err)
2122 			err = MDDB_F_EDATA | MDDB_F_EFMT;
2123 	}
2124 	if (err) {
2125 		dbp = dbhp;
2126 		dbhp = NULL;
2127 		while (dbp) {
2128 			dep = dbp->db_firstentry;
2129 			while (dep) {
2130 				if (dep->de_rb)
2131 					kmem_free((caddr_t)dep->de_rb,
2132 					    dep->de_recsize);
2133 				dep2 = dep->de_next;
2134 				kmem_free((caddr_t)dep, sizeofde(dep));
2135 				dep = dep2;
2136 			}
2137 			dbp1 = dbp->db_next;
2138 			kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
2139 			dbp = dbp1;
2140 		}
2141 	}
2142 	s->s_dbp = dbhp;
2143 	return (err);
2144 }
2145 
2146 static int
2147 getoptcnt(
2148 	mddb_set_t	*s,
2149 	int		li)
2150 {
2151 	int		result;
2152 	mddb_de_ic_t	*dep;
2153 	mddb_db_t	*dbp;
2154 
2155 #if defined(_ILP32) && !defined(lint)
2156 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
2157 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2158 #endif
2159 
2160 	result = 0;
2161 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2162 		dep = dbp->db_firstentry;
2163 		for (; dep != NULL; dep = dep->de_next) {
2164 			if (! (dep->de_flags & MDDB_F_OPT))
2165 				continue;
2166 			if (((dep->de_optinfo[0].o_flags & MDDB_F_ACTIVE) &&
2167 			    (li == dep->de_optinfo[0].o_li)) ||
2168 			    ((dep->de_optinfo[1].o_flags & MDDB_F_ACTIVE) &&
2169 			    (li == dep->de_optinfo[1].o_li)))
2170 			result++;
2171 		}
2172 	}
2173 	return (result);
2174 }
2175 
2176 static void
2177 getoptdev(
2178 	mddb_set_t	*s,
2179 	mddb_de_ic_t	*rdep,
2180 	int		opti
2181 )
2182 {
2183 	mddb_lb_t	*lbp;
2184 	mddb_locator_t	*lp;
2185 	mddb_optinfo_t	*otherop;
2186 	mddb_optinfo_t	*resultop;
2187 	int		li;
2188 	dev_t		otherdev;
2189 	int		blkonly = 0;
2190 	int		mincnt;
2191 	int		thiscnt;
2192 
2193 	lbp = s->s_lbp;
2194 
2195 	resultop = &rdep->de_optinfo[opti];
2196 	otherop = &rdep->de_optinfo[1-opti];
2197 
2198 	resultop->o_flags = 0;
2199 
2200 	/*
2201 	 * scan through and see if data bases have to vary by only device
2202 	 */
2203 
2204 	if (otherop->o_flags & MDDB_F_ACTIVE) {
2205 		blkonly = 1;
2206 		otherdev = expldev(lbp->lb_locators[otherop->o_li].l_dev);
2207 		for (li = 0; li < lbp->lb_loccnt; li++) {
2208 			lp = &lbp->lb_locators[li];
2209 			if (! (lp->l_flags & MDDB_F_ACTIVE))
2210 				continue;
2211 			if (expldev(lp->l_dev) != otherdev) {
2212 				blkonly = 0;
2213 				break;
2214 			}
2215 		}
2216 	}
2217 
2218 	mincnt = 999999;
2219 	for (li = 0; li < lbp->lb_loccnt; li++) {
2220 		dev_info_t	*devi;
2221 		int		removable = 0;
2222 
2223 		lp = &lbp->lb_locators[li];
2224 		if (! (lp->l_flags & MDDB_F_ACTIVE))
2225 			continue;
2226 		if (otherop->o_flags & MDDB_F_ACTIVE) {
2227 			if (blkonly) {
2228 				if (otherop->o_li == li)
2229 					continue;
2230 			} else {
2231 				if (otherdev == expldev(lp->l_dev))
2232 					continue;
2233 			}
2234 		}
2235 
2236 		/*
2237 		 * Check if this is a removable device.  If it is we
2238 		 * assume it is something like a USB flash disk, a zip disk
2239 		 * or even a floppy that is being used to help maintain
2240 		 * mddb quorum.  We don't want to put any optimized resync
2241 		 * records on these kinds of disks since they are usually
2242 		 * slower or don't have the same read/write lifetimes as
2243 		 * a regular fixed disk.
2244 		 */
2245 		if ((devi = e_ddi_hold_devi_by_dev(lp->l_dev, 0)) != NULL) {
2246 			int		error;
2247 			struct cb_ops	*cb;
2248 			ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
2249 			int		propvalue = 0;
2250 			int		proplength = sizeof (int);
2251 
2252 			if ((cb = devopsp[getmajor(lp->l_dev)]->devo_cb_ops)
2253 			    != NULL) {
2254 				error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi,
2255 					prop_op,
2256 					DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
2257 					"removable-media",
2258 					(caddr_t)&propvalue, &proplength);
2259 
2260 				if (error == DDI_PROP_SUCCESS)
2261 					removable = 1;
2262 			}
2263 
2264 			ddi_release_devi(devi);
2265 		}
2266 
2267 		if (removable)
2268 			continue;
2269 
2270 		thiscnt = getoptcnt(s, li);
2271 		if (thiscnt < mincnt) {
2272 			resultop->o_li  = li;
2273 			mincnt = thiscnt;
2274 			resultop->o_flags = MDDB_F_ACTIVE;
2275 		}
2276 	}
2277 }
2278 
2279 static void
2280 allocuserdata(
2281 	mddb_de_ic_t	*dep
2282 )
2283 {
2284 	mddb_rb32_t	*rbp;
2285 
2286 #if defined(_ILP32) && !defined(lint)
2287 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2288 #endif
2289 
2290 	rbp = dep->de_rb;
2291 	rbp->rb_private = 0;
2292 	dep->de_rb_userdata = kmem_zalloc(dep->de_reqsize, KM_SLEEP);
2293 	rbp->rb_userdata = 0x4;	/* Make sure this is non-zero */
2294 	bcopy((caddr_t)rbp->rb_data, dep->de_rb_userdata, dep->de_reqsize);
2295 }
2296 
2297 
2298 static void
2299 getuserdata(
2300 	set_t		setno,
2301 	mddb_de_ic_t	*dep
2302 )
2303 {
2304 	mddb_rb32_t	 *rbp;
2305 
2306 
2307 	mddb_type_t	type = dep->de_type1;
2308 	caddr_t		data, udata;
2309 
2310 #if defined(_ILP32) && !defined(lint)
2311 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2312 #endif
2313 	rbp = dep->de_rb;
2314 	data = (caddr_t)rbp->rb_data;
2315 	udata = (caddr_t)dep->de_rb_userdata;
2316 
2317 	/*
2318 	 * If it's a driver record, and an old style record, and not a DRL
2319 	 * record, we must convert it because it was incore as a 64 bit
2320 	 * structure but its on disk layout has only 32 bit for block sizes
2321 	 */
2322 	if (!(md_get_setstatus(setno) & MD_SET_IMPORT) &&
2323 	    (type >= MDDB_FIRST_MODID) &&
2324 	    (rbp->rb_revision == MDDB_REV_RB)) {
2325 
2326 		switch (dep->de_flags) {
2327 
2328 			case MDDB_F_STRIPE:
2329 				stripe_convert(data, udata, BIG_2_SMALL);
2330 				break;
2331 
2332 			case MDDB_F_MIRROR:
2333 				mirror_convert(data, udata, BIG_2_SMALL);
2334 				break;
2335 
2336 			case MDDB_F_RAID:
2337 				raid_convert(data, udata, BIG_2_SMALL);
2338 				break;
2339 
2340 			case MDDB_F_SOFTPART:
2341 				softpart_convert(data, udata, BIG_2_SMALL);
2342 				break;
2343 
2344 			case MDDB_F_TRANS_MASTER:
2345 				trans_master_convert(data, udata, BIG_2_SMALL);
2346 				break;
2347 
2348 			case MDDB_F_TRANS_LOG:
2349 				trans_log_convert(data, udata, BIG_2_SMALL);
2350 				break;
2351 
2352 			case MDDB_F_HOTSPARE:
2353 				hs_convert(data, udata, BIG_2_SMALL);
2354 				break;
2355 
2356 			case MDDB_F_OPT:
2357 			default:
2358 				bcopy(udata, data, dep->de_reqsize);
2359 		}
2360 	} else {
2361 		bcopy(udata, data, dep->de_reqsize);
2362 	}
2363 }
2364 
2365 static void
2366 getoptrecord(
2367 	mddb_set_t	*s,
2368 	mddb_de_ic_t	*dep
2369 )
2370 {
2371 	mddb_lb_t	*lbp;
2372 	mddb_locator_t	*lp;
2373 	mddb_rb32_t	*rbp, *crbp;
2374 	int		li;
2375 	int		i;
2376 	int		err = 0;
2377 	size_t		recsize;
2378 
2379 #if defined(_ILP32) && !defined(lint)
2380 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2381 #endif
2382 
2383 	lbp = s->s_lbp;
2384 
2385 	recsize = dep->de_recsize;
2386 	dep->de_rb = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
2387 	rbp = dep->de_rb;
2388 	crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
2389 
2390 	dep->de_optinfo[0].o_flags |= MDDB_F_EDATA;
2391 	dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
2392 
2393 	for (i = 0; i < 2; i++) {
2394 		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
2395 			continue;
2396 		li = dep->de_optinfo[i].o_li;
2397 		lp = &lbp->lb_locators[li];
2398 
2399 		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
2400 		    (lp->l_flags & MDDB_F_EMASTER))
2401 			continue;
2402 
2403 		err = readblklst(s, (caddr_t)rbp, dep->de_blks,
2404 		    dep->de_blkcount, li);
2405 
2406 		if (err)
2407 			continue;
2408 
2409 		if (rbp->rb_magic != MDDB_MAGIC_RB)
2410 			continue;
2411 
2412 		if (revchk(MDDB_REV_RB, rbp->rb_revision))
2413 			continue;
2414 
2415 		/* Check the crc for this record */
2416 		if (rec_crcchk(s, dep, rbp)) {
2417 			continue;
2418 		}
2419 
2420 		dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE;
2421 
2422 		if (rbp == crbp) {
2423 			if (rbp->rb_checksum != crbp->rb_checksum)
2424 				dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
2425 			break;
2426 		}
2427 		rbp = crbp;
2428 	}
2429 
2430 	if (rbp == crbp) {
2431 		rbp->rb_private = 0;
2432 		kmem_free((caddr_t)crbp, recsize);
2433 		return;
2434 	}
2435 	bzero((caddr_t)rbp, recsize);
2436 	rbp->rb_magic = MDDB_MAGIC_RB;
2437 	rbp->rb_revision = MDDB_REV_RB;
2438 	uniqtime32(&rbp->rb_timestamp);
2439 	/* Generate the crc for this record */
2440 	rec_crcgen(s, dep, rbp);
2441 	kmem_free((caddr_t)crbp, recsize);
2442 }
2443 
2444 /*
2445  * writeoptrecord writes out an optimized record.
2446  */
2447 static int
2448 writeoptrecord(
2449 	mddb_set_t	*s,
2450 	mddb_de_ic_t	*dep
2451 )
2452 {
2453 	mddb_rb32_t	*rbp;
2454 	int		li;
2455 	int		err = 0, wrt_err = 0;
2456 	mddb_bf_t	*bufhead, *bfp;
2457 	mddb_lb_t	*lbp = s->s_lbp;
2458 	mddb_locator_t	*lp;
2459 	int		i;
2460 
2461 #if defined(_ILP32) && !defined(lint)
2462 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2463 #endif
2464 
2465 	bufhead = NULL;
2466 	err = 0;
2467 
2468 	while (s->s_opthavequeuinglck) {
2469 		s->s_optwantqueuinglck++;
2470 		cv_wait(&s->s_optqueuing_cv, SETMUTEX(s->s_setno));
2471 	}
2472 	s->s_opthavequeuinglck++;
2473 	rbp = dep->de_rb;
2474 	for (i = 0; i < 2; i++) {
2475 		/*
2476 		 * only possible error is xlate. This can
2477 		 * occur if a replica was off line and came
2478 		 * back. During the mean time the database grew
2479 		 * large than the now on line replica can store
2480 		 */
2481 		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
2482 			continue;
2483 		li = dep->de_optinfo[i].o_li;
2484 		/*
2485 		 * In a MN diskset, any node can write optimized record(s).
2486 		 */
2487 		wrt_err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
2488 			dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE);
2489 		/*
2490 		 * For MN diskset, set error in optinfo structure so
2491 		 * that mddb_commitrec knows which replica failed.
2492 		 */
2493 		if ((MD_MNSET_SETNO(s->s_setno)) &&
2494 		    (wrt_err & MDDB_F_EWRITE)) {
2495 			dep->de_optinfo[i].o_flags |= MDDB_F_EWRITE;
2496 		}
2497 		err |= wrt_err;
2498 	}
2499 	s->s_opthavequeuinglck = 0;
2500 	if (s->s_optwantqueuinglck) {
2501 		s->s_optwantqueuinglck = 0;
2502 		cv_broadcast(&s->s_optqueuing_cv);
2503 	}
2504 	for (bfp = bufhead; bfp; bfp = bufhead) {
2505 		mutex_exit(SETMUTEX(s->s_setno));
2506 		(void) biowait(&bfp->bf_buf);
2507 		mutex_enter(SETMUTEX(s->s_setno));
2508 		if (bfp->bf_buf.b_flags & B_ERROR) {
2509 			/*
2510 			 * If an MN diskset, don't set replica
2511 			 * in error since this hasn't been set in master.
2512 			 * Setting replica in error before master could
2513 			 * leave the nodes with different views of the
2514 			 * world since a class 1 configuration change
2515 			 * could occur in mddb_commitrec as soon as
2516 			 * all locks are dropped.  Must keep this
2517 			 * node the same as master and can't afford a
2518 			 * failure from the class 1 config change
2519 			 * if master succeeded.
2520 			 */
2521 			if (!(MD_MNSET_SETNO(s->s_setno))) {
2522 				bfp->bf_locator->l_flags |= MDDB_F_EWRITE;
2523 			} else {
2524 				/*
2525 				 * Find which de_optinfo (which replica)
2526 				 * had a failure and set the failure in
2527 				 * the o_flags field.
2528 				 */
2529 				lp = &lbp->lb_locators[dep->de_optinfo[0].o_li];
2530 				if (lp == bfp->bf_locator) {
2531 					dep->de_optinfo[0].o_flags |=
2532 						MDDB_F_EWRITE;
2533 				} else {
2534 					dep->de_optinfo[1].o_flags |=
2535 						MDDB_F_EWRITE;
2536 				}
2537 			}
2538 			err |= MDDB_F_EWRITE;
2539 		}
2540 		bufhead = bfp->bf_next;
2541 		freebuffer(s, bfp);
2542 	}
2543 	return (err);
2544 }
2545 
2546 /*
2547  * Fix up the optimized resync record.  Used in the traditional and local
2548  * disksets to move an optimized record from a failed or deleted mddb
2549  * to an active one.
2550  *
2551  * In a MN diskset, the fixing of the optimized record is split between
2552  * the master and slave nodes.  If the master node moves the optimized
2553  * resync record, then the master node will send a MDDB_PARSE_OPTRECS
2554  * message to the slave nodes causing the slave nodes to reget the
2555  * directory entry containing the location of the optimized resync record.
2556  * After the record is reread from disk, then writeoptrecord is called
2557  * if the location of the optimized resync record or flags have changed.
2558  * When writeoptrecord is called, the node that is the owner of this record
2559  * will write the optimized record to the location specified in the directory
2560  * entry.  Since the master node uses the highest class message (PARSE)
2561  * the record owner node is guaranteed to already have an updated
2562  * directory entry incore.
2563  *
2564  * The other difference between the traditional/local set and MN diskset
2565  * is that the directory entry can be written to disk before the optimized
2566  * record in a MN diskset if the record is owned by a slave node.  So,
2567  * the users of an optimized record must handle the failure case when no
2568  * data is available from an optimized record since the master node could
2569  * have failed during the relocation of the optimized record to another mddb.
2570  */
2571 static int
2572 fixoptrecord(
2573 	mddb_set_t	*s,
2574 	mddb_de_ic_t	*dep,
2575 	mddb_db_t	*dbp
2576 )
2577 {
2578 	int		changed;
2579 	int		writedata;
2580 	int		err = 0;
2581 	int		i;
2582 	mddb_lb_t	*lbp;
2583 	mddb_optinfo_t	*op;
2584 	mddb_db32_t	*db32p;
2585 	int		rec_owner;	/* Is node owner of record? */
2586 
2587 #if defined(_ILP32) && !defined(lint)
2588 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2589 #endif
2590 
2591 	lbp = s->s_lbp;
2592 	changed = 0;
2593 	writedata = 0;
2594 	for (i = 0; i < 2; i++) {
2595 		op = &dep->de_optinfo[i];
2596 
2597 		if (! (lbp->lb_locators[op->o_li].l_flags & MDDB_F_ACTIVE))
2598 			op->o_flags = 0;
2599 
2600 		/*
2601 		 * If optimized record has seen a replica failure,
2602 		 * assign new replica to record and re-write data
2603 		 * to new record.
2604 		 */
2605 		if (! (op->o_flags & MDDB_F_ACTIVE)) {
2606 			getoptdev(s, dep, i);
2607 			writedata++;
2608 			changed++;
2609 			/* Set flag for slaves to reread dep and write rec */
2610 			if (lbp->lb_flags & MDDB_MNSET) {
2611 				s->s_mn_parseflags |= MDDB_PARSE_OPTRECS;
2612 			}
2613 		}
2614 
2615 		/*
2616 		 * If just an error in the data was seen, set
2617 		 * the optimized record's replica flag to active (ok)
2618 		 * and try again.
2619 		 */
2620 		if (op->o_flags & MDDB_F_EDATA) {
2621 			dep->de_optinfo[0].o_flags = MDDB_F_ACTIVE;
2622 			writedata++;
2623 		}
2624 	}
2625 
2626 	rec_owner = 0;
2627 	if (lbp->lb_flags & MDDB_MNSET) {
2628 		/*
2629 		 * If a MN diskset then check the owner of optimized record.
2630 		 * If the master node owns the record or if there is
2631 		 * no owner of the record, then the master can write the
2632 		 * optimized record to disk.
2633 		 * Master node can write the optimized record now, but
2634 		 * slave nodes write their records during handling of
2635 		 * the MDDB_PARSE_OPTRECS message.
2636 		 */
2637 		if ((dep->de_owner_nodeid == MD_MN_INVALID_NID) ||
2638 		    (dep->de_owner_nodeid == md_set[s->s_setno].s_nodeid)) {
2639 			rec_owner = 1;
2640 		}
2641 	} else {
2642 		/*
2643 		 * In traditional diskset and local set, this node
2644 		 * is always the record owner and always the master.
2645 		 */
2646 		rec_owner = 1;
2647 	}
2648 
2649 	/*
2650 	 * If this node is the record owner, write out record.
2651 	 */
2652 	if ((writedata) && (rec_owner)) {
2653 		if (err = writeoptrecord(s, dep)) {
2654 			return (err);
2655 		}
2656 	}
2657 	if (! changed)
2658 		return (0);
2659 	uniqtime32(&dbp->db_timestamp);
2660 	dbp->db_revision = MDDB_REV_DB;
2661 	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
2662 	create_db32rec(db32p, dbp);
2663 	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
2664 	err = writeall(s, (caddr_t)db32p, db32p->db32_blknum,
2665 		1, MDDB_WR_ONLY_MASTER);
2666 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
2667 	return (err);
2668 }
2669 
2670 static int
2671 fixoptrecords(
2672 	mddb_set_t		*s
2673 )
2674 {
2675 	mddb_de_ic_t	*dep;
2676 	mddb_db_t	*dbp;
2677 	int		err = 0;
2678 	set_t		setno;
2679 
2680 	/*
2681 	 * In a MN diskset, the master node is the only node that runs
2682 	 * fixoptrecords.  If the master node changes anything, then the
2683 	 * master node sends PARSE message to the slave nodes.  The slave
2684 	 * nodes will then re-read in the locator block or re-read in the
2685 	 * directory blocks and re-write the optimized resync records.
2686 	 */
2687 	setno = s->s_setno;
2688 	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
2689 	    (md_set[setno].s_am_i_master == 0)) {
2690 		return (0);
2691 	}
2692 
2693 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2694 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2695 			if (! (dep->de_flags & MDDB_F_OPT))
2696 				continue;
2697 			err = fixoptrecord(s, dep, dbp);
2698 			if (err != 0)
2699 				return (err);
2700 		}
2701 	}
2702 	return (0);
2703 }
2704 
2705 /*
2706  * Checks incore version of mddb data to mddb data ondisk.
2707  *
2708  * Returns:
2709  *	- 0 if the data was successfully read and is good.
2710  *	- MDDB_F_EREAD if a read error occurred.
2711  *	- 1 if the data read is bad (checksum failed, etc)
2712  */
2713 static int
2714 checkcopy
2715 (
2716 	mddb_set_t	*s,
2717 	int		li
2718 )
2719 {
2720 	mddb_db_t	*dbp;
2721 	mddb_db32_t	*cdb32p;
2722 	mddb_de_ic_t	*dep;
2723 	mddb_de32_t	*cde32p;
2724 	mddb_rb32_t	*rbp, *crbp;
2725 	size_t		size;
2726 	int		i;
2727 	int		retval = 1;
2728 
2729 #if defined(_ILP32) && !defined(lint)
2730 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
2731 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2732 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2733 #endif
2734 
2735 	if (s->s_databuffer_size == 0) {
2736 		size_t maxrecsize = MDDB_BSIZE;
2737 
2738 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
2739 			for (dep = dbp->db_firstentry; dep; dep = dep->de_next)
2740 				if (! (dep->de_flags & MDDB_F_OPT) &&
2741 				    dep->de_recsize > maxrecsize)
2742 					maxrecsize = dep->de_recsize;
2743 
2744 		s->s_databuffer = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
2745 		s->s_databuffer_size = maxrecsize;
2746 	}
2747 
2748 	cdb32p = (mddb_db32_t *)s->s_databuffer;
2749 
2750 	/*
2751 	 * first go through and make sure all directory stuff
2752 	 * is the same
2753 	 */
2754 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2755 		if (readblks(s, (caddr_t)cdb32p, dbp->db_blknum, 1, li)) {
2756 			retval = MDDB_F_EREAD;
2757 			goto err;
2758 		}
2759 		if (cdb32p->db32_magic != MDDB_MAGIC_DB)
2760 			goto err;
2761 		if (revchk(MDDB_REV_DB, cdb32p->db32_revision))
2762 			goto err;
2763 		if (crcchk(cdb32p, &cdb32p->db32_checksum, MDDB_BSIZE, NULL))
2764 			goto err;
2765 		if (cdb32p->db32_nextblk != dbp->db_nextblk)
2766 			goto err;
2767 		if (cdb32p->db32_recsum != dbp->db_recsum)
2768 			goto err;
2769 		if (cdb32p->db32_firstentry) {
2770 			cde32p = (mddb_de32_t *)
2771 			    ((void *)((caddr_t)(&cdb32p->db32_firstentry)
2772 			    + sizeof (cdb32p->db32_firstentry)));
2773 		} else
2774 			cde32p = NULL;
2775 
2776 		dep = dbp->db_firstentry;
2777 		/*
2778 		 * check if all directory entries are identical
2779 		 */
2780 		while (dep && cde32p) {
2781 			if (dep->de_recid != cde32p->de32_recid)
2782 				goto err;
2783 			if (dep->de_type1 != cde32p->de32_type1)
2784 				goto err;
2785 			if (dep->de_type2 != cde32p->de32_type2)
2786 				goto err;
2787 			if (dep->de_reqsize != cde32p->de32_reqsize)
2788 				goto err;
2789 			if (dep->de_flags != cde32p->de32_flags)
2790 				goto err;
2791 
2792 			for (i = 0; i < 2; i++) {
2793 				if (dep->de_optinfo[i].o_li !=
2794 				    cde32p->de32_optinfo[i].o_li)
2795 					break;
2796 			}
2797 			if (i != 2)
2798 				goto err;
2799 			size = sizeof (mddb_block_t) * dep->de_blkcount;
2800 			if (bcmp((caddr_t)dep->de_blks,
2801 			    (caddr_t)cde32p->de32_blks, size))
2802 				goto err;
2803 			dep = dep->de_next;
2804 			if (cde32p->de32_next)
2805 				cde32p = nextentry(cde32p);
2806 			else
2807 				cde32p = NULL;
2808 		}
2809 		if (dep || cde32p)
2810 			goto err;
2811 	}
2812 	/*
2813 	 * If here, all directories are functionally identical
2814 	 * check to make sure all records are identical
2815 	 * the reason the records are not just bcmped is that the
2816 	 * lock flag does not want to be compared.
2817 	 */
2818 	crbp = (mddb_rb32_t *)cdb32p;
2819 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2820 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2821 			if ((dep->de_flags & MDDB_F_OPT) ||
2822 			    (dep->de_flags & MDDB_F_CHANGELOG))
2823 				continue;
2824 			rbp = (mddb_rb32_t *)dep->de_rb;
2825 			if (readblklst(s, (caddr_t)crbp, dep->de_blks,
2826 			    dep->de_blkcount, li)) {
2827 				retval = MDDB_F_EREAD;
2828 				goto err;
2829 			}
2830 			/* Check the crc for this record */
2831 			if (rec_crcchk(s, dep, crbp))
2832 				goto err;
2833 
2834 			if (rbp->rb_checksum != crbp->rb_checksum ||
2835 			    rbp->rb_checksum_fiddle != crbp->rb_checksum_fiddle)
2836 				goto err;
2837 		}
2838 	}
2839 	return (0);
2840 err:
2841 	return (retval);
2842 }
2843 
2844 /*
2845  * Determine if the location information for two mddbs is the same.
2846  * The device slice and block offset should match.  If both have devids then
2847  * use that for the comparison, otherwise we compare the dev_ts.
2848  * Comparing with the devid allows us to handle the case where a mddb was
2849  * relocated to a dead mddbs dev_t.  The live mddb will have the dev_t of
2850  * the dead mddb but the devid comparison will catch this and not match.
2851  *
2852  * Return 1 if the location of the two mddbs match, 0 if not.
2853  */
2854 static int
2855 match_mddb(mddb_ri_t *rip, ddi_devid_t devid, char *minor, md_dev64_t dev,
2856 	daddr32_t blkno)
2857 {
2858 	if (rip->ri_flags & MDDB_F_EMASTER) {
2859 		/*
2860 		 * If this element is errored then we don't try to match on it.
2861 		 * If we try to match we could erroneously match on the dev_t
2862 		 * of a relocated disk.
2863 		 */
2864 		return (0);
2865 	}
2866 
2867 	if (rip->ri_devid && devid && minor) {
2868 		if (ddi_devid_compare(rip->ri_devid, devid) != 0 ||
2869 		    strcmp(rip->ri_minor_name, minor) != 0)
2870 			return (0);
2871 	} else {
2872 		if (rip->ri_dev != dev)
2873 			return (0);
2874 	}
2875 
2876 	if (rip->ri_blkno != blkno)
2877 		return (0);
2878 
2879 	return (1);
2880 }
2881 
2882 static int
2883 ridev(
2884 	mddb_ri_t	**rip,
2885 	mddb_cfg_loc_t	*clp,
2886 	dev32_t		*dev_2b_fixed,
2887 	int		flag)
2888 {
2889 	mddb_ri_t	*r, *r1;
2890 	md_dev64_t	ldev, ndev;
2891 	major_t		majordev;
2892 	int		sz;
2893 
2894 	if (MD_UPGRADE) {
2895 		ldev = md_makedevice(md_targ_name_to_major(clp->l_driver),
2896 			clp->l_mnum);
2897 	} else {
2898 		if (ddi_name_to_major(clp->l_driver) == (major_t)-1)
2899 			return (EINVAL);
2900 
2901 		ldev = md_makedevice(ddi_name_to_major(clp->l_driver),
2902 			clp->l_mnum);
2903 	}
2904 
2905 	if (clp->l_devid != 0) {
2906 		/*
2907 		 * Get dev associated with device id and minor name.
2908 		 * Setup correct driver name if dev is now different.
2909 		 * Don't change driver name if during upgrade.
2910 		 */
2911 		ndev = ldev;
2912 		if (!mddb_devid_validate((ddi_devid_t)(uintptr_t)clp->l_devid,
2913 		    &ndev, clp->l_minor_name)) {
2914 			if ((ndev != ldev) && (!(MD_UPGRADE))) {
2915 				majordev = md_getmajor(ndev);
2916 				(void) strcpy(clp->l_driver,
2917 				    ddi_major_to_name(majordev));
2918 				clp->l_mnum = md_getminor(ndev);
2919 				clp->l_devid_flags |= MDDB_DEVID_VALID;
2920 				ldev = ndev;
2921 			}
2922 		} else {
2923 			/* Mark as invalid */
2924 			clp->l_devid_flags &= ~MDDB_DEVID_VALID;
2925 		}
2926 	}
2927 
2928 	clp->l_dev = md_cmpldev(ldev);
2929 	if (dev_2b_fixed)
2930 		*dev_2b_fixed = clp->l_dev;
2931 	r = *rip;
2932 
2933 	while (r) {
2934 		if (match_mddb(r, (ddi_devid_t)(uintptr_t)clp->l_devid,
2935 		    clp->l_minor_name, ldev, clp->l_blkno)) {
2936 			if ((clp->l_devid != 0) &&
2937 			    !(clp->l_devid_flags & MDDB_DEVID_VALID)) {
2938 				r->ri_flags |= MDDB_F_EMASTER;
2939 			} else {
2940 				r->ri_flags |= flag;
2941 			}
2942 			return (0);	/* already entered return success */
2943 		}
2944 		r = r->ri_next;
2945 	}
2946 
2947 	/*
2948 	 * This replica not represented in the current rip list,
2949 	 * so add it to the list.
2950 	 */
2951 	r = (mddb_ri_t *)kmem_zalloc(sizeof (**rip), KM_SLEEP);
2952 	r->ri_dev = ldev;
2953 	r->ri_blkno = clp->l_blkno;
2954 	(void) strncpy(r->ri_driver, clp->l_driver, MD_MAXDRVNM);
2955 	if (strlen(clp->l_driver) >= MD_MAXDRVNM) {
2956 		r->ri_driver[(MD_MAXDRVNM -1)] = '\0';
2957 	}
2958 	if (clp->l_devname != NULL) {
2959 		(void) strcpy(r->ri_devname, clp->l_devname);
2960 	}
2961 	r->ri_flags |= flag;
2962 	if (clp->l_devid != 0) {
2963 		sz = clp->l_devid_sz;
2964 		r->ri_devid = (ddi_devid_t)kmem_zalloc(sz, KM_SLEEP);
2965 		bcopy((void *)(uintptr_t)clp->l_devid, (char *)r->ri_devid, sz);
2966 
2967 		if (clp->l_old_devid != NULL) {
2968 			sz = clp->l_old_devid_sz;
2969 			r->ri_old_devid = (ddi_devid_t)kmem_zalloc(sz,
2970 			    KM_SLEEP);
2971 			bcopy((char *)(uintptr_t)clp->l_old_devid,
2972 			    (char *)r->ri_old_devid, sz);
2973 		} else {
2974 			r->ri_old_devid = 0;
2975 		}
2976 		if (strlen(clp->l_minor_name) < MDDB_MINOR_NAME_MAX)
2977 			(void) strcpy(r->ri_minor_name, clp->l_minor_name);
2978 
2979 		if (!(clp->l_devid_flags & MDDB_DEVID_VALID)) {
2980 			/*
2981 			 * Devid is present, but not valid.  This could
2982 			 * happen if device has been powered off or if
2983 			 * the device has been removed.  Mark the device in
2984 			 * error.  Don't allow any writes to this device
2985 			 * based on the dev_t since another device could
2986 			 * have been placed in its spot and be responding to
2987 			 * the dev_t accesses.
2988 			 */
2989 			r->ri_flags |= MDDB_F_EMASTER;
2990 		}
2991 	} else {
2992 		r->ri_devid = 0;
2993 		r->ri_old_devid = 0;
2994 	}
2995 
2996 	/*
2997 	 * If the rip list is empty then this entry
2998 	 * is the list.
2999 	 */
3000 	if (*rip == NULL) {
3001 		*rip = r;
3002 		return (0);
3003 	}
3004 
3005 	/*
3006 	 * Add this entry to the end of the rip list
3007 	 */
3008 	r1 = *rip;
3009 	while (r1->ri_next)
3010 		r1 = r1->ri_next;
3011 	r1->ri_next = r;
3012 	return (0);
3013 }
3014 
3015 /*
3016  * writecopy writes the incore data blocks out to all of the replicas.
3017  * This is called from writestart
3018  *	- when a diskset is started or
3019  *	- when an error has been enountered during the write to a mddb.
3020  * and from newdev when a new mddb is being added.
3021  *
3022  * flag can be 2 values:
3023  *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
3024  *		always used for traditional and local disksets.
3025  *		For MN diskset:
3026  *			All nodes can call writecopy, but only the
3027  *			master node actually writes data to the disk
3028  *			except for optimized resync records.
3029  *			An optimized resync record can only be written to
3030  *			by the record owner.
3031  *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
3032  *		master has been chosen, the new master may need to
3033  * 		write its incore mddb to disk (this is the case where the
3034  *		old master had executed a message but hadn't relayed it
3035  *		to this slave yet).  New master should not write the
3036  *		change log records since new master would be overwriting
3037  *		valuable data.  Only used during a reconfig cycle.
3038  */
3039 static int
3040 writecopy(
3041 	mddb_set_t	*s,
3042 	int		li,
3043 	int		flag
3044 )
3045 {
3046 	mddb_db_t	*dbp;
3047 	mddb_db32_t	*db32p;
3048 	mddb_de_ic_t	*dep;
3049 	mddb_rb32_t	*rbp;
3050 	uint_t		checksum;
3051 	int		err = 0;
3052 
3053 #if defined(_ILP32) && !defined(lint)
3054 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
3055 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
3056 #endif
3057 
3058 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
3059 		db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
3060 		create_db32rec(db32p, dbp);
3061 		crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
3062 		err = writeblks(s, (caddr_t)db32p, dbp->db_blknum, 1, li,
3063 			MDDB_WR_ONLY_MASTER);
3064 		kmem_free((caddr_t)db32p, MDDB_BSIZE);
3065 		if (err)
3066 			return (err);
3067 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
3068 			/*
3069 			 * In a multinode diskset, when a new master is
3070 			 * chosen the new master may need to write its
3071 			 * incore copy of the mddb to disk.  In this case,
3072 			 * don't want to overwrite the change log records
3073 			 * so new master sets flag to MDDB_WRITECOPY_SYNC.
3074 			 */
3075 			if (flag == MDDB_WRITECOPY_SYNC) {
3076 				if (dep->de_flags & MDDB_F_CHANGELOG)
3077 					continue;
3078 			}
3079 			/*
3080 			 * In a multinode diskset, don't write out optimized
3081 			 * resync resyncs since only the mirror owner node
3082 			 * will have the correct data.  If writecopy is
3083 			 * being called from writestart as a result of
3084 			 * an mddb failure, then writestart will handle
3085 			 * the optimized records when it calls fixoptrecords.
3086 			 */
3087 			if ((MD_MNSET_SETNO(s->s_setno)) &&
3088 			    (dep->de_flags & MDDB_F_OPT)) {
3089 				continue;
3090 			}
3091 
3092 			rbp = dep->de_rb;
3093 			checksum = rbp->rb_checksum_fiddle;
3094 			checksum ^= rbp->rb_checksum;
3095 			/* Generate the crc for this record */
3096 			rec_crcgen(s, dep, rbp);
3097 			checksum ^= rbp->rb_checksum;
3098 			rbp->rb_checksum_fiddle = checksum;
3099 			if (err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
3100 			    dep->de_blkcount, li, (mddb_bf_t **)0,
3101 			    MDDB_WR_ONLY_MASTER))
3102 				return (err);
3103 		}
3104 	}
3105 	return (0);
3106 }
3107 
3108 static int
3109 upd_med(
3110 	mddb_set_t	*s,
3111 	char		*tag
3112 )
3113 {
3114 	med_data_t	meddb;
3115 	int		medok;
3116 	mddb_lb_t	*lbp = s->s_lbp;
3117 	set_t		setno = s->s_setno;
3118 	int		li;
3119 	int		alc;
3120 	int		lc;
3121 
3122 
3123 	/* If no mediator hosts, nothing to do */
3124 	if (s->s_med.n_cnt == 0)
3125 		return (0);
3126 
3127 	/*
3128 	 * If this is a MN set and we are not the master, then don't
3129 	 * update mediator hosts or mark mediator as golden since
3130 	 * only master node should do that.
3131 	 */
3132 	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
3133 	    (md_set[setno].s_am_i_master == 0)) {
3134 		return (0);
3135 	}
3136 
3137 	bzero((char *)&meddb, sizeof (med_data_t));
3138 	meddb.med_dat_mag = MED_DATA_MAGIC;
3139 	meddb.med_dat_rev = MED_DATA_REV;
3140 	meddb.med_dat_fl = 0;
3141 	meddb.med_dat_sn = setno;
3142 	meddb.med_dat_cc = lbp->lb_commitcnt;
3143 	TIMEVAL32_TO_TIMEVAL(&meddb.med_dat_id, &lbp->lb_ident.createtime);
3144 	crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);
3145 
3146 	/* count accessible mediators */
3147 	medok = upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);
3148 
3149 	/* count accessible and existing replicas */
3150 	for (li = 0, alc = 0, lc = 0; li < lbp->lb_loccnt; li++) {
3151 		mddb_locator_t	*lp = &lbp->lb_locators[li];
3152 
3153 		if (lp->l_flags & MDDB_F_DELETED)
3154 			continue;
3155 
3156 		lc++;
3157 
3158 		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
3159 		    (lp->l_flags & MDDB_F_EMASTER) ||
3160 		    (lp->l_flags & MDDB_F_EWRITE))
3161 			continue;
3162 
3163 		alc++;
3164 	}
3165 
3166 	/*
3167 	 * Mediator update quorum is >= 50%: check for less than
3168 	 * "mediator update" quorum.
3169 	 */
3170 	if ((medok * 2) < s->s_med.n_cnt) {
3171 		/* panic if <= 50% of all replicas are accessible */
3172 		if ((lc > 0) && ((alc * 2) <= lc)) {
3173 			cmn_err(CE_PANIC,
3174 			    "md: Update of 50%% of the mediator hosts failed");
3175 			/* NOTREACHED */
3176 		}
3177 
3178 		cmn_err(CE_WARN,
3179 		    "md: Update of 50%% of the mediator hosts failed");
3180 	}
3181 
3182 	/*
3183 	 * If we have mediator update quorum and exactly 50% of the replicas
3184 	 * are accessible then mark the mediator as golden.
3185 	 */
3186 	if (((medok * 2) >= (s->s_med.n_cnt + 1)) && (lc > 0) &&
3187 	    ((alc * 2) == lc)) {
3188 		meddb.med_dat_fl = MED_DFL_GOLDEN;
3189 		crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);
3190 		(void) upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);
3191 	}
3192 
3193 	return (0);
3194 }
3195 
3196 static int
3197 push_lb(mddb_set_t *s)
3198 {
3199 	mddb_lb_t	*lbp = s->s_lbp;
3200 
3201 	/* push the change to all the replicas */
3202 	uniqtime32(&lbp->lb_timestamp);
3203 	if (MD_MNSET_SETNO(s->s_setno)) {
3204 		lbp->lb_revision = MDDB_REV_MNLB;
3205 	} else {
3206 		lbp->lb_revision = MDDB_REV_LB;
3207 	}
3208 	return (writelocall(s));
3209 }
3210 
3211 /* Should not call for MN diskset since data tags are not supported */
3212 static int
3213 dtl_cmp(const mddb_dtag_t *odtp, const mddb_dtag_t *ndtp)
3214 {
3215 	int 		diff = 0;
3216 
3217 	diff = (int)(odtp->dt_setno - ndtp->dt_setno);
3218 	if (diff)
3219 		return (diff);
3220 
3221 	diff = strncmp(odtp->dt_sn, ndtp->dt_sn, MDDB_SN_LEN);
3222 	if (diff)
3223 		return (diff);
3224 
3225 	diff = strncmp(odtp->dt_hn, ndtp->dt_hn, MD_MAX_NODENAME_PLUS_1);
3226 	if (diff)
3227 		return (diff);
3228 
3229 	/*CSTYLED*/
3230 	return (timercmp(&odtp->dt_tv, &ndtp->dt_tv, !=));
3231 }
3232 
3233 /* Should not call for MN diskset since data tags are not supported */
3234 static int
3235 dtl_addl(mddb_set_t *s, const mddb_dtag_t *ndtp)
3236 {
3237 	int		nextid = 0;
3238 	mddb_dtag_lst_t **dtlpp = &s->s_dtlp;
3239 
3240 	/* Run to the end of the list */
3241 	for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) {
3242 		if (dtl_cmp(&(*dtlpp)->dtl_dt, ndtp) == 0)
3243 			return (0);
3244 		nextid++;
3245 	}
3246 
3247 	/* Add the new member */
3248 	*dtlpp = kmem_zalloc(sizeof (**dtlpp), KM_SLEEP);
3249 
3250 	/* Update the dtag portion of the list */
3251 	bcopy((caddr_t)ndtp, (caddr_t)&((*dtlpp)->dtl_dt),
3252 	    sizeof (mddb_dtag_t));
3253 
3254 	/* Fix up the id value */
3255 	(*dtlpp)->dtl_dt.dt_id = ++nextid;
3256 
3257 	return (0);
3258 }
3259 
3260 /*
3261  * Even though data tags are not supported in MN disksets, dt_cntl may
3262  * be called for a MN diskset since this routine is called even before
3263  * it is known the kind of diskset being read in from disk.
3264  * For a MNdiskset, s_dtlp is 0 so a count of 0 is returned.
3265  */
3266 static int
3267 dtl_cntl(mddb_set_t *s)
3268 {
3269 	mddb_dtag_lst_t	*dtlp = s->s_dtlp;
3270 	int		ndt = 0;
3271 
3272 	while (dtlp != NULL) {
3273 		ndt++;
3274 		dtlp = dtlp->dtl_nx;
3275 	}
3276 
3277 	return (ndt);
3278 }
3279 
3280 /*
3281  * Even though data tags are not supported in MN disksets, dt_cntl may
3282  * be called for a MN diskset since this routine is called even before
3283  * it is known the kind of diskset being read in from disk.
3284  * For a MNdiskset, s_dtlp is 0 so a 0 is returned.
3285  */
3286 static mddb_dtag_t *
3287 dtl_findl(mddb_set_t *s, int id)
3288 {
3289 	mddb_dtag_lst_t	*dtlp = s->s_dtlp;
3290 
3291 	while (dtlp != NULL) {
3292 		if (dtlp->dtl_dt.dt_id == id)
3293 			return (&dtlp->dtl_dt);
3294 		dtlp = dtlp->dtl_nx;
3295 	}
3296 	return ((mddb_dtag_t *)NULL);
3297 }
3298 
3299 /* Should not call for MN diskset since data tags are not supported */
3300 static void
3301 dtl_freel(mddb_dtag_lst_t **dtlpp)
3302 {
3303 	mddb_dtag_lst_t	*dtlp;
3304 	mddb_dtag_lst_t	*tdtlp;
3305 
3306 
3307 	for (tdtlp = *dtlpp; tdtlp != NULL; tdtlp = dtlp) {
3308 		dtlp = tdtlp->dtl_nx;
3309 		kmem_free(tdtlp, sizeof (mddb_dtag_lst_t));
3310 	}
3311 	*dtlpp = (mddb_dtag_lst_t *)NULL;
3312 }
3313 
3314 /*
3315  * Even though data tags are not supported in MN disksets, dt_setup will
3316  * be called for a MN diskset since this routine is called even before
3317  * it is known the kind of diskset being read in from disk.
3318  * Once this set is known as a MN diskset, the dtp area will be freed.
3319  */
3320 static void
3321 dt_setup(mddb_set_t *s, const mddb_dtag_t *dtagp)
3322 {
3323 	mddb_dt_t	*dtp;
3324 	set_t		setno = s->s_setno;
3325 
3326 
3327 	if (md_set[setno].s_dtp == (mddb_dt_t *)NULL)
3328 		md_set[setno].s_dtp = kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);
3329 	else if (dtagp == (mddb_dtag_t *)NULL)
3330 		bzero((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
3331 
3332 	/* shorthand */
3333 	dtp = (mddb_dt_t *)md_set[setno].s_dtp;
3334 
3335 	dtp->dt_mag = MDDB_MAGIC_DT;
3336 	dtp->dt_rev = MDDB_REV_DT;
3337 
3338 	if (dtagp != NULL)
3339 		dtp->dt_dtag = *dtagp;		/* structure assignment */
3340 
3341 	/* Initialize the setno */
3342 	dtp->dt_dtag.dt_setno = setno;
3343 
3344 	/* Clear the id and flags, this is only used in user land */
3345 	dtp->dt_dtag.dt_id = 0;
3346 
3347 	/* Checksum it */
3348 	crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);
3349 }
3350 
3351 /* Should not call for MN diskset since data tags are not supported */
3352 static int
3353 set_dtag(mddb_set_t *s, md_error_t *ep)
3354 {
3355 	mddb_lb_t	*lbp = s->s_lbp;
3356 	mddb_dtag_t	tag;
3357 
3358 	if (lbp->lb_dtblkcnt == 0) {
3359 		/* Data tags not used in a MN set - so no failure returned */
3360 		if (lbp->lb_flags & MDDB_MNSET)
3361 			return (0);
3362 
3363 		cmn_err(CE_WARN,
3364 		    "No tag record allocated, unable to tag data");
3365 		(void) mdmddberror(ep, MDE_DB_NOTAGREC, NODEV32, s->s_setno);
3366 		return (1);
3367 	}
3368 
3369 	/* Clear the stack variable */
3370 	bzero((caddr_t)&tag, sizeof (mddb_dtag_t));
3371 
3372 	/* Get the HW serial number for this host */
3373 	(void) strncpy(tag.dt_sn, hw_serial, MDDB_SN_LEN);
3374 	tag.dt_sn[MDDB_SN_LEN - 1] = '\0';
3375 
3376 	/* Get the nodename that this host goes by */
3377 	(void) strncpy(tag.dt_hn, utsname.nodename, MD_MAX_NODENAME);
3378 	tag.dt_hn[MD_MAX_NODENAME] = '\0';
3379 
3380 	/* Get a time stamp for NOW */
3381 	uniqtime32(&tag.dt_tv);
3382 
3383 	/* Setup the data tag record */
3384 	dt_setup(s, &tag);
3385 
3386 	/* Free any list of tags if they exist */
3387 	dtl_freel(&s->s_dtlp);
3388 
3389 	/* Put the new tag onto the tag list */
3390 	(void) dtl_addl(s, &tag);
3391 
3392 	return (0);
3393 }
3394 
3395 /*
3396  * If called during upgrade, this routine expects a non-translated
3397  * (aka target) dev.
3398  * Should not call for MN diskset since data tags are not supported.
3399  */
3400 static int
3401 dt_read(mddb_set_t *s, mddb_lb_t *lbp, mddb_ri_t *rip)
3402 {
3403 	int		err = 0;
3404 	md_dev64_t	dev;
3405 	caddr_t		tbuf;
3406 	daddr_t		physblk;
3407 	mddb_block_t	blk;
3408 	mddb_dt_t	*dtp;
3409 	mddb_dtag_t	*dtagp;
3410 	set_t		setno = s->s_setno;
3411 
3412 	/* If have not allocated a data tag record, there is nothing to do */
3413 	if (lbp->lb_dtblkcnt == 0)
3414 		return (1);
3415 
3416 	dtp = rip->ri_dtp = (mddb_dt_t *)kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);
3417 
3418 	if (dtp == (mddb_dt_t *)NULL)
3419 		return (1);
3420 
3421 	/* shorthand */
3422 	dev = md_xlate_targ_2_mini(rip->ri_dev);
3423 	if (dev == NODEV64) {
3424 		return (1);
3425 	}
3426 
3427 	tbuf = (caddr_t)rip->ri_dtp;
3428 
3429 	for (blk = 0; blk < lbp->lb_dtblkcnt; blk++) {
3430 		physblk = getphysblk((blk + lbp->lb_dtfirstblk), rip->ri_mbip);
3431 		err = getblks(s, tbuf, dev, physblk, btodb(MDDB_BSIZE));
3432 		/* error reading the tag */
3433 		if (err) {
3434 			err = 1;
3435 			goto out;
3436 		}
3437 		tbuf += MDDB_BSIZE;
3438 	}
3439 
3440 	/* magic is valid? */
3441 	if (dtp->dt_mag != MDDB_MAGIC_DT) {
3442 		err = 1;
3443 		goto out;
3444 	}
3445 
3446 	/* revision is valid? */
3447 	if (revchk(MDDB_REV_DT, dtp->dt_rev)) {
3448 		err = 1;
3449 		goto out;
3450 	}
3451 
3452 	/* crc is valid? */
3453 	if (crcchk(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL)) {
3454 		err = 1;
3455 		goto out;
3456 	}
3457 
3458 	/* shorthand */
3459 	dtagp = &dtp->dt_dtag;
3460 
3461 	/* set number match? */
3462 	if (dtagp->dt_setno != setno) {
3463 		err = 1;
3464 		goto out;
3465 	}
3466 
3467 	/* tag is not empty? */
3468 	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
3469 	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
3470 	    dtagp->dt_id == 0) {
3471 		err = 2;
3472 		goto out;
3473 	}
3474 
3475 	/* Mark the locator as having tagged data */
3476 	rip->ri_flags |= MDDB_F_TAGDATA;
3477 
3478 out:
3479 	if (err) {
3480 		if (err == 1) {
3481 			md_set_setstatus(setno, MD_SET_BADTAG);
3482 			rip->ri_flags |= MDDB_F_BADTAG;
3483 		}
3484 		if (dtp != NULL) {
3485 			kmem_free(dtp, MDDB_DT_BYTES);
3486 			rip->ri_dtp = (mddb_dt_t *)NULL;
3487 		}
3488 	}
3489 
3490 	return (err);
3491 }
3492 
3493 /* Should not call for MN diskset since data tags are not supported */
3494 static int
3495 dt_write(mddb_set_t *s)
3496 {
3497 	int		li;
3498 	int		err = 0;
3499 	int		werr;
3500 	int		empty_tag = 0;
3501 	mddb_dtag_t	*dtagp;
3502 	mddb_dt_t	*dtp;
3503 	mddb_lb_t	*lbp = s->s_lbp;
3504 	set_t		setno = s->s_setno;
3505 	uint_t		set_status = md_get_setstatus(setno);
3506 
3507 
3508 	ASSERT(md_set[setno].s_dtp != NULL);
3509 
3510 	/* Nowhere to write to */
3511 	if (lbp->lb_dtblkcnt == 0)
3512 		return (err);
3513 
3514 	if (set_status & MD_SET_BADTAG)
3515 		return (err);
3516 
3517 	/* shorthand */
3518 	dtp = (mddb_dt_t *)md_set[setno].s_dtp;
3519 	dtagp = &dtp->dt_dtag;
3520 
3521 	/* See if the tag is empty. */
3522 	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
3523 	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
3524 	    dtagp->dt_id == 0)
3525 		empty_tag = 1;
3526 
3527 	/* Write the tag to the locators and reset appropriate flags. */
3528 	for (li = 0; li < lbp->lb_loccnt; li++) {
3529 		mddb_locator_t	*lp = &lbp->lb_locators[li];
3530 
3531 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3532 		    (lp->l_flags & MDDB_F_DELETED) ||
3533 		    (lp->l_flags & MDDB_F_EWRITE))
3534 			continue;
3535 
3536 		werr = writeblks(s, (caddr_t)dtp, lbp->lb_dtfirstblk,
3537 		    MDDB_DT_BLOCKS, li, MDDB_WR_ONLY_MASTER);
3538 
3539 		if (werr) {
3540 			err |= werr;
3541 			continue;
3542 		}
3543 
3544 		if (empty_tag)
3545 			lp->l_flags &= ~(MDDB_F_BADTAG | MDDB_F_TAGDATA);
3546 		else {
3547 			lp->l_flags |= MDDB_F_TAGDATA;
3548 			lp->l_flags &= ~MDDB_F_BADTAG;
3549 		}
3550 	}
3551 
3552 	if (err)
3553 		return (err);
3554 
3555 
3556 	/* If the tags were written, check to see if any tags remain. */
3557 	for (li = 0; li < lbp->lb_loccnt; li++) {
3558 		mddb_locator_t	*lp = &lbp->lb_locators[li];
3559 
3560 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3561 		    (lp->l_flags & MDDB_F_DELETED) ||
3562 		    (lp->l_flags & MDDB_F_EWRITE))
3563 			continue;
3564 
3565 		if (lp->l_flags & MDDB_F_TAGDATA)
3566 			break;
3567 	}
3568 
3569 	/* If there are no tags, then clear CLRTAG and TAGDATA */
3570 	if (li == lbp->lb_loccnt) {
3571 		md_clr_setstatus(setno, MD_SET_CLRTAG);
3572 		md_clr_setstatus(setno, MD_SET_TAGDATA);
3573 	}
3574 
3575 	return (err);
3576 }
3577 
3578 /* Should not call for MN diskset since data tags are not supported */
3579 static int
3580 dt_alloc_if_needed(mddb_set_t *s)
3581 {
3582 	int		i;
3583 	int		li;
3584 	int		moveit = 0;
3585 	mddb_lb_t	*lbp = s->s_lbp;
3586 	mddb_block_t	blkcnt = lbp->lb_dtblkcnt;
3587 	set_t		setno = s->s_setno;
3588 	uint_t		set_status = md_get_setstatus(setno);
3589 
3590 	/*
3591 	 * If the data tag record is allocated (blkcnt != 0) and a bad tag was
3592 	 * not detected, there is nothing to do.
3593 	 */
3594 	if (blkcnt != 0 && ! (set_status & MD_SET_BADTAG))
3595 		return (0);
3596 
3597 	/* Bitmap not setup, checks can't be done */
3598 	if (s->s_totalblkcnt == 0)
3599 		return (0);
3600 
3601 	/* While reading the tag(s) an invalid tag data record was seen */
3602 	if (set_status & MD_SET_BADTAG)
3603 		/* See if the invalid tag needs to be moved */
3604 		for (i = 0; i < MDDB_DT_BLOCKS; i++)
3605 			if (blkcheck(s, (i + lbp->lb_dtfirstblk))) {
3606 				moveit = 1;
3607 				break;
3608 			}
3609 
3610 	/* Need to move or allocate the tag data record */
3611 	if (moveit || blkcnt == 0) {
3612 		lbp->lb_dtfirstblk = getfreeblks(s, MDDB_DT_BLOCKS);
3613 		if (lbp->lb_dtfirstblk == 0) {
3614 			cmn_err(CE_WARN,
3615 			    "Unable to allocate data tag record");
3616 			return (0);
3617 		}
3618 		lbp->lb_dtblkcnt = MDDB_DT_BLOCKS;
3619 
3620 		/* Mark the locators so that they get written to disk. */
3621 		for (li = 0; li < lbp->lb_loccnt; li++) {
3622 			mddb_locator_t	*lp = &lbp->lb_locators[li];
3623 
3624 			if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3625 			    (lp->l_flags & MDDB_F_DELETED) ||
3626 			    (lp->l_flags & MDDB_F_EWRITE))
3627 				continue;
3628 
3629 			lp->l_flags |= MDDB_F_BADTAG;
3630 		}
3631 		return (1);
3632 	}
3633 
3634 	/*
3635 	 * Make sure the blocks are owned, since the calculation in
3636 	 * computefreeblks() is bypassed when MD_SET_BADTAG is set.
3637 	 */
3638 	for (i = 0; i < MDDB_DT_BLOCKS; i++)
3639 		blkbusy(s, (i + lbp->lb_dtfirstblk));
3640 
3641 	return (1);
3642 }
3643 
/*
 * Writestart writes the incore mddb out to all of the replicas.
 * This is called when a diskset is started and when an error has
 * been encountered during the write to a mddb.
 *
 * flag can be 2 values:
 *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
 *		always used for traditional and local disksets.
 *		This is the normal path for MN disksets since the slave
 *		nodes aren't actually allowed to write to disk.
 *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
 *		master has been chosen, the new master may need to
 *		write its incore mddb to disk (this is the case where the
 *		old master had executed a message but hadn't relayed it
 *		to this slave yet).  New master should not write the
 *		change log records since new master would be overwriting
 *		valuable data.  Only used during a reconfig cycle.
 *
 * Returns:
 *	0 - everything that needed writing was written; MD_SET_STALE is
 *	    cleared on the set.
 *	1 - writecopy() or fixoptrecords() failed.
 *	other non-zero - the error returned by push_lb() or writeall().
 */
static int
writestart(
	mddb_set_t	*s,
	int		flag
)
{
	int		li;
	mddb_locator_t	*lp;
	mddb_lb_t	*lbp;
	mddb_ln_t	*lnp;
	int		err = 0;
	uint_t		set_status;

	lbp = s->s_lbp;

	/*
	 * Pass 1: active replicas flagged SUSPECT get a full copy written
	 * without being checked first.
	 */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (! (lp->l_flags & MDDB_F_ACTIVE))
			continue;
		if (! (lp->l_flags & MDDB_F_SUSPECT))
			continue;
		if (writecopy(s, li, flag))
			return (1);
		lp->l_flags |= MDDB_F_UP2DATE;
	}

	/*
	 * Pass 2: remaining active replicas that are not yet UP2DATE are
	 * only rewritten when checkcopy() returns non-zero.  The value
	 * writecopy() returns is stored in err, but 1 is what is returned.
	 */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (! (lp->l_flags & MDDB_F_ACTIVE))
			continue;
		if ((lp->l_flags & MDDB_F_UP2DATE))
			continue;
		if (checkcopy(s, li))
			if (err = writecopy(s, li, flag))
				return (1);
		lp->l_flags |= MDDB_F_UP2DATE;
	}

	/*
	 * Call fixoptrecord even during a reconfig cycle since a replica
	 * failure may force the master to re-assign the optimized
	 * resync record to another replica.
	 */
	if (fixoptrecords(s))
		return (1);

	set_status = md_get_setstatus(s->s_setno);

	/*
	 * See if any (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT).
	 * While tag data is in use or being cleared for the set, a replica
	 * flagged TAGDATA or BADTAG also stops the scan.
	 */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];

		if (lp->l_flags & MDDB_F_DELETED)
			continue;

		if (((lp->l_flags & MDDB_F_ACTIVE) != 0 &&
		    (lp->l_flags & MDDB_F_OLDACT) == 0) ||
		    ((lp->l_flags & MDDB_F_ACTIVE) == 0 &&
		    (lp->l_flags & MDDB_F_OLDACT) != 0))
			break;

		if ((set_status & MD_SET_TAGDATA) ||
		    (set_status & MD_SET_CLRTAG))
			if ((lp->l_flags & MDDB_F_TAGDATA) ||
			    (lp->l_flags & MDDB_F_BADTAG))
				break;
	}

	/*
	 * If we found (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT)
	 * the lbp identifier and the set identifier doesn't match.
	 * The scan above stopped early (li != lb_loccnt) in that case; the
	 * locator block is also rewritten when cmpidentifier() itself
	 * reports a mismatch.
	 */
	if (li != lbp->lb_loccnt || cmpidentifier(s, &lbp->lb_ident)) {

		/* Only call for traditional and local sets */
		if (!(lbp->lb_flags & MDDB_MNSET))
			(void) dt_write(s);

		setidentifier(s, &lbp->lb_ident);

		/*
		 * The locator block is pushed twice, each push followed by
		 * a call to upd_med() whose result is ignored.
		 */
		if (err = push_lb(s))
			return (err);

		(void) upd_med(s, "writestart(0)");

		if (err = push_lb(s))
			return (err);

		(void) upd_med(s, "writestart(1)");

		/* Restamp, re-checksum and write out the locator names */
		lnp = s->s_lnp;
		uniqtime32(&lnp->ln_timestamp);
		if (lbp->lb_flags & MDDB_MNSET)
			lnp->ln_revision = MDDB_REV_MNLN;
		else
			lnp->ln_revision = MDDB_REV_LN;
		crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
		err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
			lbp->lb_lnblkcnt, 0);
		/*
		 * If a MN diskset and this is the master, set the PARSE_LOCNM
		 * flag in the mddb_set structure to show that the locator
		 * names have changed.
		 * Don't set parseflags as a result of a new master sync
		 * during reconfig cycle since slaves nodes are already
		 * in-sync with the new master.
		 */

		if ((lbp->lb_flags & MDDB_MNSET) &&
		    (md_set[s->s_setno].s_am_i_master) &&
		    (flag != MDDB_WRITECOPY_SYNC)) {
			s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
		}

		/* The parse flag is set even when writeall() failed */
		if (err)
			return (err);
	}

	/* Record the current ACTIVE state as OLDACT for the next call */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (lp->l_flags & MDDB_F_DELETED)
			continue;
		if (lp->l_flags & MDDB_F_ACTIVE) {
			lp->l_flags |= MDDB_F_OLDACT;
		} else {
			lp->l_flags &= ~MDDB_F_OLDACT;
		}
	}

	md_clr_setstatus(s->s_setno, MD_SET_STALE);

	return (0);
}
3795 
/*
 * selectreplicas selects the working replicas and may write the incore
 * version of the mddb out to the replicas ondisk.
 *
 * flag can be 3 values:
 *	MDDB_RETRYSCAN - quick scan to see if there is an error.
 *			If no new error, returns without writing mddb
 *			to disks.  If a new error is seen, writes out
 *			mddb to disks.
 *	MDDB_SCANALL  - lengthy scan to check out mddbs and always writes
 *			out mddb to the replica ondisk.  Calls writecopy
 *			with MDDB_WRITECOPY_ALL flag which writes out
 *			all records to the replicas ondisk.
 *	MDDB_SCANALLSYNC - called during reconfig cycle to sync up incore
 *			and ondisk mddbs by writing incore values to disk.
 *			Calls writecopy with MDDB_WRITECOPY_SYNC flag so
 *			that change log records are not written out.
 *			Only used by MN disksets.
 *
 * Returns:
 *	0 - Successful
 *	1 - Unable to write incore mddb data to disk since < 50% replicas.
 *	    Also returned, without writing anything, when the set is
 *	    already marked MD_SET_STALE.
 *	For MDDB_RETRYSCAN with no new write error the current
 *	MD_SET_TOOFEW status bit is returned (zero or non-zero, not
 *	necessarily 1).
 */
int
selectreplicas(
	mddb_set_t	*s,
	int		flag
)
{
	int		li;
	int		alc;
	int		lc;
	mddb_locator_t	*lp;
	mddb_lb_t	*lbp = s->s_lbp;
	set_t		setno = s->s_setno;
	int		wc_flag;

	/*
	 * can never transition from stale to not stale
	 *
	 * For a stale set only the ACTIVE flags are refreshed: every
	 * non-deleted locator is active unless it has EMASTER set.
	 */
	if (md_get_setstatus(setno) & MD_SET_STALE) {
		for (li = 0; li < lbp->lb_loccnt; li++) {
			lp = &lbp->lb_locators[li];
			if (lp->l_flags & MDDB_F_DELETED)
				continue;
			if (! (lp->l_flags & MDDB_F_EMASTER)) {
				lp->l_flags |= MDDB_F_ACTIVE;
			} else {
				lp->l_flags &= ~MDDB_F_ACTIVE;
			}
		}
		return (1);
	}

	if ((flag == MDDB_SCANALL) || (flag == MDDB_SCANALLSYNC)) {
		for (li = 0; li < lbp->lb_loccnt; li++) {
			lp = &lbp->lb_locators[li];
			if (lp->l_flags & MDDB_F_DELETED)
				continue;
			/*
			 * Remember the previous ACTIVE state in OLDACT; a
			 * replica that was not active is SUSPECT.
			 */
			if (lp->l_flags & MDDB_F_ACTIVE) {
				lp->l_flags |= MDDB_F_OLDACT;
				lp->l_flags &= ~MDDB_F_SUSPECT;
			} else {
				lp->l_flags |= MDDB_F_SUSPECT;
				lp->l_flags &= ~MDDB_F_OLDACT;
			}

			/*
			 * Every replica without EMASTER is given a fresh
			 * chance: active again, write and size errors
			 * forgotten.
			 */
			if (! (lp->l_flags & MDDB_F_EMASTER)) {
				lp->l_flags |= MDDB_F_ACTIVE;
				lp->l_flags &= ~MDDB_F_EWRITE;
				lp->l_flags &= ~MDDB_F_TOOSMALL;
			} else {
				lp->l_flags &= ~MDDB_F_ACTIVE;
			}
		}
		computefreeblks(s); /* set up free block bits */
	} else {
		/* Look for the first active replica with a write error */
		for (li = 0; li < lbp->lb_loccnt; li++) {
			lp = &lbp->lb_locators[li];
			if (! (lp->l_flags & MDDB_F_ACTIVE))
				continue;
			if (lp->l_flags & MDDB_F_EWRITE)
				break;
		}

		/*
		 * If no active replica has a write error, the error has
		 * already been processed; return the current state.
		 */
		if (li == lbp->lb_loccnt)
			return (md_get_setstatus(setno) & MD_SET_TOOFEW);

		/*
		 * lp is the failing replica found above.  Deactivate it and
		 * clear UP2DATE on it and on every locator after it.
		 */
		lp->l_flags &= ~MDDB_F_ACTIVE;
		do {
			lp = &lbp->lb_locators[li];
			lp->l_flags &= ~MDDB_F_UP2DATE;
		} while (++li < lbp->lb_loccnt);
	}

	/* Count non-deleted locators (lc) and the active ones (alc) */
	alc = 0;
	lc = 0;
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (lp->l_flags & MDDB_F_DELETED)
			continue;
		lc++;
		if (! (lp->l_flags & MDDB_F_ACTIVE))
			continue;
		alc++;
	}

	/* Fewer than half (rounded up) active: give up without writing */
	if (alc < ((lc + 1) / 2)) {
		md_set_setstatus(setno, MD_SET_TOOFEW);
		return (1);
	}

	/* Set wc_flag based on flag passed in. */
	if (flag == MDDB_SCANALLSYNC)
		wc_flag = MDDB_WRITECOPY_SYNC;
	else
		wc_flag = MDDB_WRITECOPY_ALL;

	/*
	 * Keep calling writestart() until it succeeds.  After each failure
	 * the replicas that picked up a write error are deactivated and the
	 * survivors recounted; the loop ends once fewer than half (rounded
	 * up) of the non-deleted locators counted above remain.
	 */
	do {
		if (! writestart(s, wc_flag)) {
			md_clr_setstatus(setno, MD_SET_TOOFEW);
			return (0);
		}
		alc  = 0;
		for (li = 0; li < lbp->lb_loccnt; li++) {
			lp = &lbp->lb_locators[li];
			if ((lp->l_flags & MDDB_F_DELETED) ||
			    (lp->l_flags & MDDB_F_EMASTER))
				continue;

			if (lp->l_flags & MDDB_F_EWRITE) {
				lp->l_flags &= ~MDDB_F_ACTIVE;
				lp->l_flags &= ~MDDB_F_UP2DATE;
				continue;
			}
			alc++;
		}
	} while (alc >= ((lc + 1) / 2));
	md_set_setstatus(setno, MD_SET_TOOFEW);
	return (1);
}
3941 
3942 static int
3943 checkstate(
3944 	mddb_set_t	*s,
3945 	int		probe
3946 )
3947 {
3948 	int		error;
3949 	uint_t		set_status = md_get_setstatus(s->s_setno);
3950 
3951 	ASSERT(s != NULL);
3952 
3953 	if (! (set_status & MD_SET_STALE) && ! (set_status & MD_SET_TOOFEW))
3954 		return (0);
3955 
3956 	if (probe == MDDB_NOPROBE)
3957 		return (1);
3958 
3959 	single_thread_start(s);
3960 	error = selectreplicas(s, MDDB_SCANALL);
3961 	single_thread_end(s);
3962 
3963 	if (error == 0 && s->s_zombie != 0) {
3964 		mutex_exit(SETMUTEX(s->s_setno));
3965 		error = mddb_deleterec(s->s_zombie);
3966 		mutex_enter(SETMUTEX(s->s_setno));
3967 		if (error == 0)
3968 			s->s_zombie = 0;
3969 	}
3970 	return (error);
3971 }
3972 
3973 static int
3974 writeretry(
3975 	mddb_set_t	*s
3976 )
3977 {
3978 	if (selectreplicas(s, MDDB_RETRYSCAN))
3979 		if (selectreplicas(s, MDDB_SCANALL))
3980 			return (1);
3981 	return (0);
3982 }
3983 
3984 static void
3985 free_mbipp(mddb_mb_ic_t **mbipp)
3986 {
3987 	mddb_mb_ic_t	*mbip1, *mbip2;
3988 
3989 	for (mbip1 = *mbipp; mbip1 != NULL; mbip1 = mbip2) {
3990 		mbip2 = mbip1->mbi_next;
3991 		kmem_free((caddr_t)mbip1, MDDB_IC_BSIZE);
3992 	}
3993 	*mbipp = (mddb_mb_ic_t *)NULL;
3994 }
3995 
3996 static mddb_ri_t *
3997 save_rip(mddb_set_t *s)
3998 {
3999 	mddb_ri_t	*trip = s->s_rip;
4000 	mddb_ri_t	*nrip = NULL;
4001 	mddb_ri_t	**nripp = &nrip;
4002 	mddb_ri_t	*rip;
4003 
4004 	while (trip) {
4005 		/* Run to the end of the list */
4006 		for (/* void */; (*nripp != NULL); nripp = &(*nripp)->ri_next)
4007 			/* void */;
4008 
4009 		/* Add the new member */
4010 		*nripp = kmem_zalloc(sizeof (**nripp), KM_SLEEP);
4011 
4012 		ASSERT(*nripp != NULL);
4013 
4014 		/* shorthand */
4015 		rip = *nripp;
4016 
4017 		*rip = *trip;			/* structure assignment */
4018 
4019 		/* Clear the stuff that is not needed for hints */
4020 		rip->ri_flags = 0;
4021 		rip->ri_commitcnt = 0;
4022 		rip->ri_transplant = 0;
4023 		rip->ri_mbip = (mddb_mb_ic_t *)NULL;
4024 		rip->ri_dtp = (mddb_dt_t *)NULL;
4025 		rip->ri_lbp = (mddb_lb_t *)NULL;
4026 		rip->ri_did_icp = (mddb_did_ic_t *)NULL;
4027 		rip->ri_devid = (ddi_devid_t)NULL;
4028 		rip->ri_old_devid = (ddi_devid_t)NULL;
4029 		rip->ri_next = (mddb_ri_t *)NULL;
4030 
4031 		trip = trip->ri_next;
4032 	}
4033 	return (nrip);
4034 }
4035 
4036 static void
4037 free_rip(mddb_ri_t **ripp)
4038 {
4039 	mddb_ri_t	*rip;
4040 	mddb_ri_t	*arip;
4041 
4042 	for (rip = *ripp; rip != (mddb_ri_t *)NULL; rip = arip) {
4043 		arip = rip->ri_next;
4044 		if (rip->ri_devid != (ddi_devid_t)NULL) {
4045 			ddi_devid_free(rip->ri_devid);
4046 			rip->ri_devid = (ddi_devid_t)NULL;
4047 		}
4048 		if (rip->ri_old_devid != (ddi_devid_t)NULL) {
4049 			ddi_devid_free(rip->ri_old_devid);
4050 			rip->ri_old_devid = (ddi_devid_t)NULL;
4051 		}
4052 		kmem_free((caddr_t)rip, sizeof (*rip));
4053 	}
4054 	*ripp = (mddb_ri_t *)NULL;
4055 }
4056 
4057 /*
4058  * this routine selects the correct replica to use
4059  * the rules are as follows
4060  *	1.	if all replica has same init time select highest commit count
4061  *	2.	if some but not all replicas are from another hostid discard
4062  *		them.
4063  *	3.	find which init time is present is most replicas
4064  *	4.	discard all replicas which do not match most init times
4065  *	5.	select replica with highest commit count
4066  */
4067 
4068 static mddb_lb_t *
4069 selectlocator(
4070 	mddb_set_t	*s
4071 )
4072 {
4073 	mddb_ri_t	*rip = s->s_rip;
4074 	mddb_ri_t	*r, *r1;
4075 	mddb_lb_t	*lbp;
4076 	struct timeval32 *tp = (struct timeval32 *)NULL;
4077 	int		different;
4078 	int		same;
4079 	int		count;
4080 	int		maxcount;
4081 	set_t		setno = s->s_setno;
4082 	size_t		sz;
4083 	int		mn_set = 0;
4084 
4085 	/* Clear the ri_transplant flag on all the rip entries. */
4086 	/* Set ri_commitcnt to locator's commitcnt - if available */
4087 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4088 		r->ri_transplant = 0;
4089 		if (r->ri_lbp != (mddb_lb_t *)NULL) {
4090 			r->ri_commitcnt = r->ri_lbp->lb_commitcnt;
4091 			/* If any locators have MN bit set, set flag */
4092 			if (r->ri_lbp->lb_flags & MDDB_MNSET)
4093 				mn_set = 1;
4094 		}
4095 	}
4096 
4097 	/*
4098 	 * A data tag is being used, so use it to limit the selection first.
4099 	 * Data tags not used in MN diskset.
4100 	 */
4101 	if ((mn_set == 0) && (md_get_setstatus(setno) & MD_SET_USETAG)) {
4102 		mddb_dt_t	*dtp = (mddb_dt_t *)md_set[setno].s_dtp;
4103 
4104 		/*
4105 		 * now toss any locators that have a different data tag
4106 		 */
4107 		for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4108 			if (r->ri_lbp == (mddb_lb_t *)NULL)
4109 				continue;
4110 
4111 			if (r->ri_dtp != (mddb_dt_t *)NULL) {
4112 				/* If same tag, keep it */
4113 				if (dtl_cmp(&dtp->dt_dtag,
4114 				    &r->ri_dtp->dt_dtag) == 0)
4115 					continue;
4116 			}
4117 
4118 			if (r->ri_dtp != (mddb_dt_t *)NULL) {
4119 				kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4120 				r->ri_dtp = (mddb_dt_t *)NULL;
4121 			}
4122 
4123 			mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4124 			if (!(md_get_setstatus(setno) &
4125 			    MD_SET_REPLICATED_IMPORT)) {
4126 				if (r->ri_old_devid != (ddi_devid_t)NULL) {
4127 					sz = ddi_devid_sizeof(r->ri_old_devid);
4128 					kmem_free((caddr_t)r->ri_old_devid, sz);
4129 					r->ri_old_devid = (ddi_devid_t)NULL;
4130 				}
4131 			}
4132 
4133 			kmem_free((caddr_t)r->ri_lbp,
4134 			    dbtob(r->ri_lbp->lb_blkcnt));
4135 			r->ri_lbp = (mddb_lb_t *)NULL;
4136 
4137 			r->ri_transplant = 1;
4138 		}
4139 
4140 		/* Tag used, clear the bit */
4141 		md_clr_setstatus(s->s_setno, MD_SET_USETAG);
4142 
4143 		if (md_get_setstatus(s->s_setno) & MD_SET_TAGDATA) {
4144 			/*
4145 			 * Get rid of the list of tags.
4146 			 */
4147 			dtl_freel(&s->s_dtlp);
4148 
4149 			/*
4150 			 * Re-create the list with the tag used.
4151 			 */
4152 			(void) dtl_addl(s, &dtp->dt_dtag);
4153 		}
4154 	}
4155 
4156 	/*
4157 	 * scan to see if all replicas have same time
4158 	 */
4159 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4160 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4161 			continue;
4162 		if (tp == NULL) {
4163 			tp = &r->ri_lbp->lb_inittime;
4164 			continue;
4165 		}
4166 		/* CSTYLED */
4167 		if (timercmp(tp, &r->ri_lbp->lb_inittime, !=))
4168 			break;
4169 	}
4170 
4171 	/*
4172 	 * if r == NULL then they were all them same. Choose highest
4173 	 * commit count
4174 	 */
4175 	if (r == (mddb_ri_t *)NULL)
4176 		goto out;
4177 
4178 	/*
4179 	 * If here, a bogus replica is present and at least 1 lb_inittime
4180 	 * did not match.
4181 	 */
4182 
4183 	/*
4184 	 * look and see if any but not all are from different id
4185 	 */
4186 
4187 	different = 0;
4188 	same = 0;
4189 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4190 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4191 			continue;
4192 		if (cmpidentifier(s, &r->ri_lbp->lb_ident))
4193 			different = 1;
4194 		else
4195 			same = 1;
4196 	}
4197 
4198 	/*
4199 	 * now go through and throw out different if there are some
4200 	 * that are the same
4201 	 */
4202 	if (different != 0 && same != 0) {
4203 		for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4204 			if (r->ri_lbp == (mddb_lb_t *)NULL)
4205 				continue;
4206 
4207 			if (cmpidentifier(s, &r->ri_lbp->lb_ident))
4208 				continue;
4209 
4210 			if (r->ri_dtp != (mddb_dt_t *)NULL) {
4211 				kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4212 				r->ri_dtp = (mddb_dt_t *)NULL;
4213 			}
4214 
4215 			mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4216 			if (!(md_get_setstatus(setno) &
4217 			    MD_SET_REPLICATED_IMPORT)) {
4218 				if (r->ri_old_devid != (ddi_devid_t)NULL) {
4219 					sz = ddi_devid_sizeof(r->ri_old_devid);
4220 					kmem_free((caddr_t)r->ri_old_devid, sz);
4221 					r->ri_old_devid = (ddi_devid_t)NULL;
4222 				}
4223 			}
4224 
4225 			kmem_free((caddr_t)r->ri_lbp,
4226 			    dbtob(r->ri_lbp->lb_blkcnt));
4227 			r->ri_lbp = (mddb_lb_t *)NULL;
4228 
4229 			r->ri_transplant = 1;
4230 		}
4231 	}
4232 
4233 	/*
4234 	 * go through and pick highest. Use n square because it is
4235 	 * simple and 40 some is max possible
4236 	 */
4237 	maxcount = 0;
4238 	lbp = (mddb_lb_t *)NULL;
4239 	for (r1 = rip; r1 != (mddb_ri_t *)NULL; r1 = r1->ri_next) {
4240 		if (r1->ri_lbp == (mddb_lb_t *)NULL)
4241 			continue;
4242 		count = 0;
4243 		for (r = r1; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4244 			if (r->ri_lbp == (mddb_lb_t *)NULL)
4245 				continue;
4246 			if (timercmp(&r1->ri_lbp->lb_inittime, /* CSTYLED */
4247 			    &r->ri_lbp->lb_inittime, ==))
4248 				count++;
4249 		}
4250 		if (count > maxcount) {
4251 			maxcount = count;
4252 			lbp = r1->ri_lbp;
4253 		}
4254 	}
4255 
4256 	/*
4257 	 * now go though and toss any that are of a different time stamp
4258 	 */
4259 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4260 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4261 			continue;
4262 		if (timercmp(&lbp->lb_inittime, /* CSTYLED */
4263 		    &r->ri_lbp->lb_inittime, ==))
4264 			continue;
4265 
4266 		if (r->ri_dtp != (mddb_dt_t *)NULL) {
4267 			kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4268 			r->ri_dtp = (mddb_dt_t *)NULL;
4269 		}
4270 
4271 		mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4272 		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
4273 			if (r->ri_old_devid != (ddi_devid_t)NULL) {
4274 				sz = ddi_devid_sizeof(r->ri_old_devid);
4275 				kmem_free((caddr_t)r->ri_old_devid, sz);
4276 				r->ri_old_devid = (ddi_devid_t)NULL;
4277 			}
4278 		}
4279 
4280 		kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
4281 		r->ri_lbp = (mddb_lb_t *)NULL;
4282 
4283 		r->ri_transplant = 1;
4284 	}
4285 
4286 out:
4287 	/*
4288 	 * Find the locator with the highest commit count, and make it the
4289 	 * "chosen" one.
4290 	 */
4291 	lbp = (mddb_lb_t *)NULL;
4292 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4293 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4294 			continue;
4295 
4296 		if (lbp == NULL) {
4297 			lbp = r->ri_lbp;
4298 			continue;
4299 		}
4300 
4301 		if (r->ri_lbp->lb_commitcnt > lbp->lb_commitcnt)
4302 			lbp = r->ri_lbp;
4303 	}
4304 
4305 	/* Toss all locator blocks, except the "chosen" one. */
4306 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4307 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4308 			continue;
4309 
4310 		/* Get rid of all dtp's */
4311 		if (r->ri_dtp != (mddb_dt_t *)NULL) {
4312 			kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4313 			r->ri_dtp = (mddb_dt_t *)NULL;
4314 		}
4315 
4316 		if (r->ri_lbp == lbp)
4317 			continue;
4318 
4319 		/* Get rid of extra locator devid block info */
4320 		mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4321 		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
4322 			if (r->ri_old_devid != (ddi_devid_t)NULL) {
4323 				sz = ddi_devid_sizeof(r->ri_old_devid);
4324 				kmem_free((caddr_t)r->ri_old_devid, sz);
4325 				r->ri_old_devid = (ddi_devid_t)NULL;
4326 			}
4327 		}
4328 
4329 		/* Get rid of extra locators */
4330 		kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
4331 		r->ri_lbp = (mddb_lb_t *)NULL;
4332 	}
4333 	return (lbp);
4334 }
4335 
4336 static void
4337 locator2cfgloc(
4338 	mddb_lb_t		*lbp,
4339 	mddb_cfg_loc_t		*clp,
4340 	int			li,
4341 	side_t			sideno,
4342 	mddb_did_ic_t		*did_icp
4343 )
4344 {
4345 	mddb_drvnm_t		*dn;
4346 	mddb_locator_t		*lp = &lbp->lb_locators[li];
4347 	mddb_sidelocator_t	*slp;
4348 	mddb_mnsidelocator_t	*mnslp;
4349 	mddb_did_info_t		*did_info;
4350 	int 			i, sz, szalloc;
4351 	int			mn_set = 0;
4352 	mddb_mnlb_t		*mnlbp;
4353 
4354 	if (lbp->lb_flags & MDDB_MNSET) {
4355 		mn_set = 1;
4356 		mnlbp = (mddb_mnlb_t *)lbp;
4357 		for (i = 0; i < MD_MNMAXSIDES; i++) {
4358 			mnslp = &mnlbp->lb_mnsidelocators[i][li];
4359 			if (mnslp->mnl_sideno == sideno)
4360 				break;
4361 		}
4362 		if (i == MD_MNMAXSIDES)
4363 			return;
4364 	} else {
4365 		slp = &lbp->lb_sidelocators[sideno][li];
4366 	}
4367 
4368 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4369 	    did_info = &(did_icp->did_ic_blkp->blk_info[li]);
4370 	    if (did_info->info_flags & MDDB_DID_EXISTS) {
4371 		sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]);
4372 		if (clp->l_devid_flags & MDDB_DEVID_SPACE) {
4373 			/* copy device id from mddb to cfg_loc structure */
4374 			szalloc = clp->l_devid_sz;
4375 			if (sz <= szalloc) {
4376 				for (i = 0; i < sz; i++) {
4377 					((char *)(uintptr_t)clp->l_devid)[i] =
4378 					((char *)did_icp->did_ic_devid[li])[i];
4379 				}
4380 				clp->l_devid_flags |= MDDB_DEVID_VALID;
4381 				(void) strcpy(clp->l_minor_name,
4382 					did_info->info_minor_name);
4383 			} else {
4384 				clp->l_devid_flags |= MDDB_DEVID_NOSPACE;
4385 			}
4386 		} else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) {
4387 			clp->l_devid_flags = MDDB_DEVID_SZ;
4388 			clp->l_devid_sz = sz;
4389 		}
4390 	    }
4391 	}
4392 
4393 	/*
4394 	 * Even if a devid exists, use the dev, drvnm and mnum in the locators
4395 	 * and sidelocators.  During startup, the dev, drvnm and mnum in
4396 	 * these structures may not match the devid (the locators and
4397 	 * sidelocators will be updated to match the devid by the routine
4398 	 * load_old_replicas).  Using out-of-sync values won't cause any
4399 	 * problems since ridev will re-derive these from the devid and mnum.
4400 	 * After startup, the dev, drvnm and mnum in these structures have
4401 	 * been updated and can be used.
4402 	 */
4403 
4404 	clp->l_blkno = lp->l_blkno;
4405 	clp->l_flags = lp->l_flags;
4406 	clp->l_dev = lp->l_dev;
4407 
4408 	if (mn_set) {
4409 		dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
4410 		clp->l_mnum = mnslp->mnl_mnum;
4411 	} else {
4412 		dn = &lbp->lb_drvnm[slp->l_drvnm_index];
4413 		clp->l_mnum = slp->l_mnum;
4414 	}
4415 	(void) strncpy(clp->l_driver, dn->dn_data, MD_MAXDRVNM);
4416 }
4417 
4418 /*
4419  * Find the index into the mnsidelocator where entry will go.
4420  * Then index can be fed into both splitname2locatorblocks and
4421  * cfgloc2locator so that those entries can be kept in sync.
4422  *
4423  * Returns:
4424  *	-1 if failed to find unused slot or if a traditional diskset
4425  *	index, if successful  (0 <= index <= MD_MNMAXSIDES)
4426  */
4427 static int
4428 checklocator(
4429 	mddb_lb_t		*lbp,
4430 	int			li,
4431 	side_t			sideno
4432 )
4433 {
4434 	uchar_t			i;
4435 	mddb_mnsidelocator_t	*mnslp;
4436 	mddb_mnlb_t		*mnlbp;
4437 	int			index = -1;
4438 
4439 	if (lbp->lb_flags & MDDB_MNSET) {
4440 		/*
4441 		 * Checking side locator structure.  First, check if
4442 		 * there is already an entry for this side.  If so,
4443 		 * then use that entry.  Otherwise, find an entry
4444 		 * that has a sideno of 0.
4445 		 */
4446 		mnlbp = (mddb_mnlb_t *)lbp;
4447 		for (i = 0; i < MD_MNMAXSIDES; i++) {
4448 			mnslp = &mnlbp->lb_mnsidelocators[i][li];
4449 			if (mnslp->mnl_sideno == sideno) {
4450 				/* Found a match - stop looking */
4451 				index = i;
4452 				break;
4453 			} else if ((mnslp->mnl_sideno == 0) && (index == -1)) {
4454 				/* Set first empty slot, but keep looking */
4455 				index = i;
4456 			}
4457 		}
4458 		/* Didn't find empty slot or previously used slot */
4459 		if ((i == MD_MNMAXSIDES) && (index == -1)) {
4460 			return (-1);
4461 		}
4462 		return (index);
4463 	} else
4464 		return (0);
4465 }
4466 
4467 /*
4468  * Takes locator information (driver name, minor number, sideno) and
4469  * stores it in the locator block.
4470  * For traditional diskset, the sideno is the index into the sidelocator
4471  * array in the locator block.
4472  * For the MN diskset, the sideno is the nodeid which can be any number,
4473  * so the index passed in is the index into the mnsidelocator array
4474  * in the locator block.
4475  */
4476 static int
4477 cfgloc2locator(
4478 	mddb_lb_t		*lbp,
4479 	mddb_cfg_loc_t		*clp,
4480 	int			li,
4481 	side_t			sideno,
4482 	int			index	/* Only useful in MNsets when > 1 */
4483 )
4484 {
4485 	uchar_t			i;
4486 	mddb_sidelocator_t	*slp;
4487 	mddb_mnsidelocator_t	*mnslp;
4488 	mddb_set_t		*s;
4489 	int			mn_set = 0;
4490 	mddb_mnlb_t		*mnlbp;
4491 
4492 	if (lbp->lb_flags & MDDB_MNSET) {
4493 		mnlbp = (mddb_mnlb_t *)lbp;
4494 		mn_set = 1;
4495 		/*
4496 		 * Index will be the slot that has the given sideno or
4497 		 * the first empty slot if no match is found.
4498 		 * This was pre-checked out in check locator.
4499 		 */
4500 		mnslp = &mnlbp->lb_mnsidelocators[index][li];
4501 	} else {
4502 		slp = &lbp->lb_sidelocators[sideno][li];
4503 	}
4504 
4505 	/*
4506 	 * Look for the driver name
4507 	 */
4508 	for (i = 0; i < MDDB_DRVNMCNT; i++) {
4509 		if (lbp->lb_drvnm[i].dn_len == 0)
4510 			continue;
4511 		if (strncmp(lbp->lb_drvnm[i].dn_data, clp->l_driver,
4512 		    MD_MAXDRVNM) == 0)
4513 			break;
4514 	}
4515 
4516 	/*
4517 	 * Didn't find one, add a new one
4518 	 */
4519 	if (i == MDDB_DRVNMCNT) {
4520 		for (i = 0; i < MDDB_DRVNMCNT; i++) {
4521 			if (lbp->lb_drvnm[i].dn_len == 0)
4522 				break;
4523 		}
4524 		if (i == MDDB_DRVNMCNT)
4525 			return (1);
4526 		(void) strncpy(lbp->lb_drvnm[i].dn_data, clp->l_driver,
4527 		    MD_MAXDRVNM);
4528 		lbp->lb_drvnm[i].dn_len = (uchar_t)strlen(clp->l_driver);
4529 	}
4530 
4531 	/* Fill in the drvnm index */
4532 	if (mn_set) {
4533 		mnslp->mnl_drvnm_index = i;
4534 		mnslp->mnl_mnum = clp->l_mnum;
4535 		mnslp->mnl_sideno = sideno;
4536 	} else {
4537 		slp->l_drvnm_index = i;
4538 		slp->l_mnum = clp->l_mnum;
4539 	}
4540 
4541 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4542 		/*
4543 		 * This device id could already be associated with this index
4544 		 * if this is not the first side added to the set.
4545 		 * If device id is 0, there is no device id for this device.
4546 		 */
4547 		if ((ddi_devid_t)(uintptr_t)clp->l_devid == 0)
4548 			return (0);
4549 		s = (mddb_set_t *)md_set[lbp->lb_setno].s_db;
4550 		if (mddb_devid_add(s, li, (ddi_devid_t)(uintptr_t)clp->l_devid,
4551 		    clp->l_minor_name)) {
4552 			return (1);
4553 		}
4554 	}
4555 
4556 	return (0);
4557 }
4558 
4559 /*
4560  * See if there are mediator hosts and try to use the data.
4561  */
4562 static int
4563 mediate(
4564 	mddb_set_t	*s
4565 )
4566 {
4567 	mddb_lb_t	*lbp = s->s_lbp;
4568 	med_data_lst_t	*meddlp = NULL;
4569 	med_data_lst_t	*tmeddlp = NULL;
4570 	med_data_t	*meddp;
4571 	int		medok = 0;
4572 	int		medacc = 0;
4573 	uint_t		maxcc;
4574 	int		golden = 0;
4575 	int		err = 1;
4576 	set_t		setno = s->s_setno;
4577 
4578 	/* Do not have a mediator, then the state is stale */
4579 	if (s->s_med.n_cnt == 0)
4580 		return (err);
4581 
4582 	/* Contact the mediator hosts for the data */
4583 	meddlp = get_med_host_data(&s->s_med, s->s_setname, setno);
4584 
4585 	/* No mediator data, stale */
4586 	if (meddlp == NULL)
4587 		return (err);
4588 
4589 	/* Mark all the mediator data that is not for this set as errored */
4590 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4591 		struct timeval32 tmptime;
4592 		meddp = tmeddlp->mdl_med;
4593 
4594 		/* Count the number of mediators contacted */
4595 		medacc++;
4596 
4597 		/* Paranoid check */
4598 		if (meddp->med_dat_sn != setno)
4599 			meddp->med_dat_fl |= MED_DFL_ERROR;
4600 
4601 		TIMEVAL_TO_TIMEVAL32(&tmptime, &meddp->med_dat_id);
4602 
4603 		/*CSTYLED*/
4604 		if (timercmp(&tmptime, &lbp->lb_ident.createtime, !=))
4605 			meddp->med_dat_fl |= MED_DFL_ERROR;
4606 	}
4607 
4608 	/* Get the max commitcount */
4609 	maxcc = 0;
4610 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4611 		meddp = tmeddlp->mdl_med;
4612 		if (meddp->med_dat_fl & MED_DFL_ERROR)
4613 			continue;
4614 		if (meddp->med_dat_cc > maxcc)
4615 			maxcc = meddp->med_dat_cc;
4616 	}
4617 
4618 	/* Now mark the records that don't have the highest cc as errored */
4619 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4620 		meddp = tmeddlp->mdl_med;
4621 		if (meddp->med_dat_fl & MED_DFL_ERROR)
4622 			continue;
4623 		if (meddp->med_dat_cc != maxcc)
4624 			meddp->med_dat_fl |= MED_DFL_ERROR;
4625 	}
4626 
4627 	/* Now mark the records that don't match the lb commitcnt as errored */
4628 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4629 		meddp = tmeddlp->mdl_med;
4630 		if (meddp->med_dat_fl & MED_DFL_ERROR)
4631 			continue;
4632 		if (meddp->med_dat_cc != lbp->lb_commitcnt)
4633 			meddp->med_dat_fl |= MED_DFL_ERROR;
4634 	}
4635 
4636 	/* Is there a "golden" copy and how many valid mediators */
4637 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4638 		meddp = tmeddlp->mdl_med;
4639 		if (meddp->med_dat_fl & MED_DFL_ERROR)
4640 			continue;
4641 
4642 		if (meddp->med_dat_fl & MED_DFL_GOLDEN)
4643 			golden++;
4644 
4645 		medok++;
4646 	}
4647 
4648 	/* No survivors, stale */
4649 	if (medok == 0)
4650 		goto out;
4651 
4652 	/* No mediator quorum and no golden copies, stale */
4653 	if (medacc < ((s->s_med.n_cnt / 2) + 1) && ! golden) {
4654 		/* Skip odd numbers, no exact 50% */
4655 		if (s->s_med.n_cnt & 1)
4656 			goto out;
4657 		/* Have 50%, allow an accept */
4658 		if (medacc == (s->s_med.n_cnt / 2))
4659 			md_set_setstatus(setno, MD_SET_ACCOK);
4660 		goto out;
4661 	}
4662 
4663 	/* We either have a quorum or a golden copy, or both */
4664 	err = 0;
4665 
4666 out:
4667 	if (meddlp) {
4668 		for (/* void */; meddlp != NULL; meddlp = tmeddlp) {
4669 			tmeddlp = meddlp->mdl_nx;
4670 			kmem_free(meddlp->mdl_med, sizeof (med_data_t));
4671 			kmem_free(meddlp, sizeof (med_data_lst_t));
4672 		}
4673 	}
4674 
4675 	return (err);
4676 }
4677 
4678 /*
4679  *	1. read masterblks and locator blocks for all know database locations
4680  *		a. keep track of which have good master blks
4681  *		b. keep track of which have good locators
4682  *
4683  */
4684 static int
4685 get_mbs_n_lbs(
4686 	mddb_set_t	*s,
4687 	int		*write_lb
4688 )
4689 {
4690 	mddb_lb_t	*lbp = NULL;		/* pointer to locator block */
4691 						/* May be cast to mddb_mnlb_t */
4692 						/* if accessing sidenames in */
4693 						/* MN set */
4694 	mddb_did_ic_t	*did_icp = NULL;	/* ptr to Device ID incore */
4695 	mddb_did_blk_t	*did_blkp = 0;
4696 	int		did_blkp_sz = 0;
4697 	mddb_did_db_t	*did_dbp;
4698 	mddb_did_info_t	*did_info;
4699 	caddr_t		did_block;
4700 	mddb_ri_t	*rip;
4701 	mddb_dtag_lst_t	*dtlp;
4702 	mddb_locator_t	*lp;
4703 	daddr_t		physblk;
4704 	int		li;
4705 	uint_t		blk;
4706 	md_dev64_t	dev;
4707 	caddr_t		buffer;
4708 	uint_t		lb_blkcnt;
4709 	int		retval = 0;
4710 	int		err = 0;
4711 	int		lb_ok = 0;
4712 	int		lb_total = 0;
4713 	int		lb_tagged = 0;
4714 	int		lb_tags;
4715 	set_t		setno = s->s_setno;
4716 	int		cont_flag, i;
4717 	mddb_did_db_t	*did_dbp1, *did_dbp2;
4718 	int		mn_set = 0;
4719 	mddb_cfg_loc_t	*cl;
4720 
4721 	/*
4722 	 * read in master blocks and locator block for all known locators.
4723 	 * lb_blkcnt will be set correctly for MN set later once getmasters
4724 	 * has determined that the set is a MN set.
4725 	 */
4726 	lb_blkcnt = ((setno == MD_LOCAL_SET) ?
4727 			MDDB_LOCAL_LBCNT : MDDB_LBCNT);
4728 
4729 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
4730 		rip->ri_flags &= (MDDB_F_PTCHED | MDDB_F_IOCTL |
4731 		    MDDB_F_EMASTER);
4732 		rip->ri_lbp = (mddb_lb_t *)NULL;
4733 		rip->ri_did_icp = (mddb_did_ic_t *)NULL;
4734 
4735 		/*
4736 		 * Translated dev is only used in calls to getmasters and
4737 		 * getblks which expect a translated (aka miniroot) dev.
4738 		 */
4739 		dev = md_xlate_targ_2_mini(rip->ri_dev);
4740 		if (dev == NODEV64) {
4741 			/* Set error flag that getmasters would have set */
4742 			/* if getmasters had been allowed to fail */
4743 			rip->ri_flags |= MDDB_F_EMASTER;
4744 		}
4745 
4746 		/*
4747 		 * Invalid device id on system (due to failed or
4748 		 * removed device) or invalid devt during upgrade
4749 		 * (due to powered off device) will cause this
4750 		 * replica to be marked in error and not used.
4751 		 */
4752 		if (rip->ri_flags & MDDB_F_EMASTER)
4753 			continue;
4754 
4755 		/* get all master blocks, does mddb_devopen() */
4756 		rip->ri_mbip = getmasters(s, dev, rip->ri_blkno,
4757 		    &rip->ri_flags, &mn_set);
4758 
4759 		/* if invalid master block - try next replica */
4760 		if (! rip->ri_mbip)
4761 			continue;
4762 
4763 		/*
4764 		 * If lbp alloc'd to wrong size - reset it.
4765 		 * If MN set, lb_blkcnt must be MDDB_MNLBCNT.
4766 		 * If a traditional set, lb_blkcnt must NOT be MDDB_MNLBCNT.
4767 		 */
4768 		if (lbp) {
4769 			if (((mn_set) && (lb_blkcnt != MDDB_MNLBCNT)) ||
4770 			    ((!mn_set) && (lb_blkcnt == MDDB_MNLBCNT))) {
4771 				kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
4772 				lbp = (mddb_lb_t *)NULL;
4773 			}
4774 		}
4775 
4776 		if (lbp == (mddb_lb_t *)NULL) {
4777 			/* If a MN set, set lb_blkcnt for MN loc blk size */
4778 			if (mn_set)
4779 				lb_blkcnt = MDDB_MNLBCNT;
4780 			lbp = (mddb_lb_t *)kmem_zalloc(dbtob(lb_blkcnt),
4781 			    KM_SLEEP);
4782 		}
4783 
4784 		/*
4785 		 * Read in all the sectors for the locator block
4786 		 * NOTE: Need to use getblks, rather than readblklst.
4787 		 *	because it is too early and things are
4788 		 *	NOT set up yet for read*()'s
4789 		 */
4790 		buffer = (caddr_t)lbp;
4791 		for (blk = 0; blk < lb_blkcnt; blk++) {
4792 			physblk = getphysblk(blk, rip->ri_mbip);
4793 			err = getblks(s, buffer, dev, physblk,
4794 			    btodb(MDDB_BSIZE));
4795 			if (err) {
4796 				rip->ri_flags |= err;
4797 				break;
4798 			}
4799 			buffer += MDDB_BSIZE;
4800 		}
4801 
4802 		if (err)
4803 			continue;
4804 
4805 		/* Verify the locator block */
4806 		if (blk != lb_blkcnt)
4807 			continue;
4808 		if (lbp->lb_magic != MDDB_MAGIC_LB)
4809 			continue;
4810 		if (lbp->lb_blkcnt != lb_blkcnt)
4811 			continue;
4812 		if (mn_set) {
4813 			/* If a MN set, check for MNLB revision in lb. */
4814 			if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
4815 				continue;
4816 		} else {
4817 			/* If not a MN set, check for LB revision in lb. */
4818 			if (revchk(MDDB_REV_LB, lbp->lb_revision))
4819 				continue;
4820 		}
4821 		if (crcchk(lbp, &lbp->lb_checksum, dbtob(lb_blkcnt), NULL))
4822 			continue;
4823 
4824 		/*
4825 		 * With the addition of MultiNode Disksets, we must make sure
4826 		 * to verify that this is the correct set.  A node could
4827 		 * have been out of the config for awhile and this disk could
4828 		 * have been moved to a different diskset and we don't want
4829 		 * to accidentally start the wrong set.
4830 		 *
4831 		 * We don't do this check if we're in the middle of
4832 		 * importing a set.
4833 		 */
4834 		if (!(md_get_setstatus(s->s_setno) & MD_SET_IMPORT) &&
4835 		    (lbp->lb_setno != s->s_setno))
4836 			continue;
4837 
4838 		rip->ri_flags |= MDDB_F_LOCACC;
4839 
4840 		/*
4841 		 * a commit count of zero means this locator has been deleted
4842 		 */
4843 		if (lbp->lb_commitcnt == 0)
4844 			continue;
4845 
4846 		/*
4847 		 * If replica is in the device ID style and md_devid_destroy
4848 		 * flag is set, turn off device id style.  This is only to be
4849 		 * used in a catastrophic failure case.  Examples would be
4850 		 * where the device id of all drives in the system
4851 		 * (especially the mirror'd root drives) had been changed
4852 		 * by firmware upgrade or by a patch to an existing disk
4853 		 * driver.  Another example would be in the case of non-unique
4854 		 * device ids due to a bug.  The device id would be valid on
4855 		 * the system, but would return the wrong dev_t.
4856 		 */
4857 		if ((lbp->lb_flags & MDDB_DEVID_STYLE) && md_devid_destroy) {
4858 			lbp->lb_flags &= ~MDDB_DEVID_STYLE;
4859 			lbp->lb_didfirstblk = 0;
4860 			lbp->lb_didblkcnt = 0;
4861 			*write_lb = 1;
4862 		}
4863 
4864 
4865 		/*
4866 		 * If replica is in device ID style, read in device ID
4867 		 * block and verify device ID block information.
4868 		 */
4869 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4870 
4871 			/* Read in device ID block */
4872 			if (did_icp == NULL) {
4873 				did_icp = (mddb_did_ic_t *)
4874 					kmem_zalloc(sizeof (mddb_did_ic_t),
4875 					    KM_SLEEP);
4876 			} else {
4877 				/* Reuse did_icp, but clear out data */
4878 				if (did_icp->did_ic_blkp !=
4879 				    (mddb_did_blk_t *)NULL) {
4880 					kmem_free((caddr_t)did_icp->did_ic_blkp,
4881 					    did_blkp_sz);
4882 					did_blkp = (mddb_did_blk_t *)NULL;
4883 					did_icp->did_ic_blkp =
4884 					    (mddb_did_blk_t *)NULL;
4885 				}
4886 				if (did_icp->did_ic_dbp !=
4887 					(mddb_did_db_t *)NULL) {
4888 					did_dbp1 = did_icp->did_ic_dbp;
4889 					while (did_dbp1) {
4890 					    did_dbp2 = did_dbp1->db_next;
4891 					    kmem_free((caddr_t)did_dbp1->db_ptr,
4892 						dbtob(did_dbp1->db_blkcnt));
4893 					    kmem_free((caddr_t)did_dbp1,
4894 						sizeof (mddb_did_db_t));
4895 					    did_dbp1 = did_dbp2;
4896 					}
4897 					did_icp->did_ic_dbp =
4898 						(mddb_did_db_t *)NULL;
4899 				}
4900 				for (i = 0; i < MDDB_NLB; i++) {
4901 					did_icp->did_ic_devid[i] =
4902 						(ddi_devid_t)NULL;
4903 				}
4904 			}
4905 
4906 			/* Can't reuse blkp since size could be different */
4907 			if (did_blkp != (mddb_did_blk_t *)NULL) {
4908 				kmem_free(did_blkp, did_blkp_sz);
4909 			}
4910 			did_blkp_sz = (int)dbtob(lbp->lb_didblkcnt);
4911 			did_blkp = (mddb_did_blk_t *)kmem_zalloc(did_blkp_sz,
4912 			    KM_SLEEP);
4913 			did_icp->did_ic_blkp = did_blkp;
4914 			buffer = (caddr_t)did_blkp;
4915 			for (blk = lbp->lb_didfirstblk;
4916 			    blk < (lbp->lb_didblkcnt + lbp->lb_didfirstblk);
4917 			    blk++) {
4918 				physblk = getphysblk(blk, rip->ri_mbip);
4919 				err = getblks(s, buffer, dev, physblk,
4920 				    btodb(MDDB_BSIZE));
4921 				if (err) {
4922 					rip->ri_flags |= err;
4923 					break;
4924 				}
4925 				buffer += MDDB_BSIZE;
4926 			}
4927 			if (err)
4928 				continue;
4929 
4930 			/* Verify the Device ID block */
4931 			if (blk != (lbp->lb_didblkcnt + lbp->lb_didfirstblk))
4932 				continue;
4933 			if (did_blkp->blk_magic != MDDB_MAGIC_DI)
4934 				continue;
4935 			if (lbp->lb_didblkcnt != MDDB_DID_BLOCKS)
4936 				continue;
4937 			if (revchk(MDDB_REV_DI, did_blkp->blk_revision))
4938 				continue;
4939 			if (crcchk(did_blkp, &did_blkp->blk_checksum,
4940 				dbtob(lbp->lb_didblkcnt), NULL))
4941 				continue;
4942 
4943 			/*
4944 			 * Check if device ID block is out of sync with the
4945 			 * Locator Block by checking if the locator block
4946 			 * commitcnt does not match the device id block
4947 			 * commitcnt.  If an 'out of sync' condition
4948 			 * exists, discard this replica since it has
4949 			 * inconsistent data and can't be used in
4950 			 * determining the best replica.
4951 			 *
4952 			 * An 'out of sync' condition could happen if old
4953 			 * SDS code was running with new devid style replicas
4954 			 * or if a failure occurred between the writing of
4955 			 * the locator block's commitcnt and the device
4956 			 * id block's commitcnt.
4957 			 *
4958 			 * If old SDS code had been running, the upgrade
4959 			 * process should detect this situation and
4960 			 * have removed all of the device id information
4961 			 * via the md_devid_destroy flag in md.conf.
4962 			 */
4963 			if (did_blkp->blk_commitcnt !=
4964 			    lbp->lb_commitcnt) {
4965 				continue;
4966 			}
4967 		}
4968 
4969 
4970 		/*
4971 		 * If replica is still in device ID style, read in all
4972 		 * of the device IDs, verify the checksum of the device IDs.
4973 		 */
4974 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4975 			/*
4976 			 * Reset valid bit in device id info block flags. This
4977 			 * flag is stored on disk, but the valid bit is reset
4978 			 * when reading in the replica.  If the corresponding
4979 			 * device id is valid (aka meaning that the system
4980 			 * knows about this device id), the valid bit will
4981 			 * be set at a later time.  The valid bit for this
4982 			 * replica's device ID will be set in this routine.
4983 			 * The valid bits for the rest of the device id's
4984 			 * will be set after the 'best' replica has
4985 			 * been selected in routine load_old_replicas.
4986 			 * Reset updated bit in device id info block flags.
4987 			 * This flag is also stored on disk, reset when read
4988 			 * in and set when the locators and side locators
4989 			 * have been updated to match this valid device
4990 			 * id information.
4991 			 */
4992 		    for (li = 0; li < lbp->lb_loccnt; li++) {
4993 			did_info = &did_blkp->blk_info[li];
4994 			if (did_info->info_flags & MDDB_DID_EXISTS)
4995 				did_info->info_flags &=
4996 					~(MDDB_DID_VALID | MDDB_DID_UPDATED);
4997 		    }
4998 
4999 		    cont_flag = 0;
5000 		    for (li = 0; li < lbp->lb_loccnt; li++) {
5001 			did_info = &did_blkp->blk_info[li];
5002 			did_block = (caddr_t)NULL;
5003 			if (did_info->info_flags & MDDB_DID_EXISTS) {
5004 			    /* Check if block has already been read in */
5005 			    did_dbp = did_icp->did_ic_dbp;
5006 			    while (did_dbp != 0) {
5007 				if (did_dbp->db_firstblk ==
5008 				    did_info->info_firstblk)
5009 					break;
5010 				else
5011 					did_dbp = did_dbp->db_next;
5012 			    }
5013 			    /* if block not found, read it in */
5014 			    if (did_dbp == NULL) {
5015 				did_block = (caddr_t)(kmem_zalloc(dbtob
5016 					    (did_info->info_blkcnt), KM_SLEEP));
5017 				buffer = (caddr_t)did_block;
5018 				for (blk = did_info->info_firstblk;
5019 				    blk < (did_info->info_firstblk +
5020 				    did_info->info_blkcnt); blk++) {
5021 					physblk = getphysblk(blk, rip->ri_mbip);
5022 					err = getblks(s, buffer, dev, physblk,
5023 					    btodb(MDDB_BSIZE));
5024 					if (err) {
5025 						rip->ri_flags |= err;
5026 						break;
5027 					}
5028 					buffer += MDDB_BSIZE;
5029 				}
5030 				if (err) {
5031 				    kmem_free(did_block,
5032 					dbtob(did_info->info_blkcnt));
5033 					did_block = (caddr_t)NULL;
5034 				    cont_flag = 1;
5035 				    break;
5036 				}
5037 
5038 				/*
5039 				 * Block read in - alloc Disk Block area
5040 				 */
5041 				did_dbp = (mddb_did_db_t *)kmem_zalloc(
5042 				    sizeof (mddb_did_db_t), KM_SLEEP);
5043 				did_dbp->db_ptr = did_block;
5044 				did_dbp->db_firstblk = did_info->info_firstblk;
5045 				did_dbp->db_blkcnt = did_info->info_blkcnt;
5046 
5047 				/* Add to front of dbp list */
5048 				did_dbp->db_next = did_icp->did_ic_dbp;
5049 				did_icp->did_ic_dbp = did_dbp;
5050 			    }
5051 			    /* Check validity of devid in block */
5052 			    if (crcchk(((char *)did_dbp->db_ptr +
5053 				did_info->info_offset),
5054 				&did_info->info_checksum,
5055 				did_info->info_length, NULL)) {
5056 				    cont_flag = 1;
5057 				    break;
5058 			    }
5059 
5060 			    /* Block now pointed to by did_dbp */
5061 			    did_icp->did_ic_devid[li] = (ddi_devid_t)
5062 				((char *)did_dbp->db_ptr +
5063 				did_info->info_offset);
5064 			}
5065 		    }
5066 		    if (cont_flag)
5067 			continue;
5068 		}
5069 
5070 		/*
5071 		 * All blocks containing devids are now in core.
5072 		 */
5073 
5074 		/*
5075 		 * If we're doing a replicated import (also known as
5076 		 * remote copy import), the device id in the locator
5077 		 * block is incorrect and we need to fix it up here
5078 		 * alongwith the l_dev otherwise we run into lots of
5079 		 * trouble later on.
5080 		 */
5081 		if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
5082 			mddb_ri_t	*trip;
5083 			for (li = 0; li < lbp->lb_loccnt; li++) {
5084 				did_info = &did_blkp->blk_info[li];
5085 				lp = &lbp->lb_locators[li];
5086 
5087 				if (lp->l_flags & MDDB_F_DELETED)
5088 					continue;
5089 
5090 				if (!(did_info->info_flags & MDDB_DID_EXISTS))
5091 					continue;
5092 
5093 				if (rip->ri_old_devid == NULL)
5094 					continue;
5095 
5096 				if (did_icp->did_ic_devid[li] == NULL)
5097 					continue;
5098 
5099 				for (trip = s->s_rip; trip != NULL;
5100 				    trip = trip->ri_next) {
5101 					if (ddi_devid_compare(
5102 					    trip->ri_old_devid,
5103 					    did_icp->did_ic_devid[li]) != 0) {
5104 						continue;
5105 					}
5106 
5107 					/* update l_dev */
5108 					lp->l_dev = md_cmpldev(trip->ri_dev);
5109 				}
5110 			}
5111 		}
5112 
5113 
5114 		/*
5115 		 * If there is a valid devid, verify that this locator
5116 		 * block has information about itself by checking the
5117 		 * device ID, minor_name and block
5118 		 * number from this replica's incore data structure
5119 		 * against the locator block information that has just
5120 		 * been read in from disk.
5121 		 *
5122 		 * If not a valid devid, verify that this locator block
5123 		 * has information about itself by checking the minor
5124 		 * number, block number and driver name from this
5125 		 * replica's incore data structure against the locator
5126 		 * block information that has just been read in from disk.
5127 		 */
5128 		if ((rip->ri_devid != NULL) &&
5129 		    (lbp->lb_flags & MDDB_DEVID_STYLE)) {
5130 			/*
5131 			 * This locator block MUST have locator (replica)
5132 			 * information about itself.  Check against devid,
5133 			 * slice part of minor number, and block number.
5134 			 */
5135 			for (li = 0; li < lbp->lb_loccnt; li++) {
5136 				did_info = &did_blkp->blk_info[li];
5137 				lp = &lbp->lb_locators[li];
5138 				if (lp->l_flags & MDDB_F_DELETED)
5139 					continue;
5140 
5141 				if (!(did_info->info_flags & MDDB_DID_EXISTS))
5142 					continue;
5143 
5144 				if ((md_get_setstatus(setno) &
5145 				    MD_SET_REPLICATED_IMPORT)) {
5146 					if (ddi_devid_compare(rip->ri_old_devid,
5147 					    did_icp->did_ic_devid[li]) != 0)
5148 					    continue;
5149 				} else {
5150 					if (ddi_devid_compare(rip->ri_devid,
5151 					    did_icp->did_ic_devid[li]) != 0)
5152 					    continue;
5153 				}
5154 
5155 				if (strcmp(rip->ri_minor_name,
5156 				    did_info->info_minor_name) != 0)
5157 					continue;
5158 
5159 				if (lp->l_blkno == rip->ri_blkno)
5160 					break;
5161 			}
5162 		} else {
5163 			/*
5164 			 * This locator block MUST have locator (replica)
5165 			 * information about itself.
5166 			 */
5167 			if (!mn_set) {
5168 			    for (li = 0; li < lbp->lb_loccnt; li++) {
5169 				mddb_drvnm_t		*dn;
5170 				mddb_sidelocator_t	*slp;
5171 
5172 				lp = &lbp->lb_locators[li];
5173 				slp = &lbp->lb_sidelocators[s->s_sideno][li];
5174 				if (lp->l_flags & MDDB_F_DELETED)
5175 					continue;
5176 				if (slp->l_mnum != md_getminor(rip->ri_dev))
5177 					continue;
5178 				if (lp->l_blkno != rip->ri_blkno)
5179 					continue;
5180 				dn = &lbp->lb_drvnm[slp->l_drvnm_index];
5181 				if (strncmp(dn->dn_data, rip->ri_driver,
5182 				    MD_MAXDRVNM) == 0)
5183 				break;
5184 			    }
5185 			} else {
5186 			    for (li = 0; li < lbp->lb_loccnt; li++) {
5187 				mddb_drvnm_t		*dn;
5188 				mddb_mnsidelocator_t	*mnslp;
5189 				mddb_mnlb_t		*mnlbp;
5190 				int			i;
5191 
5192 				/*
5193 				 * Check all possible locators locking for
5194 				 * match to the currently read-in locator,
5195 				 * must match on:
5196 				 *	- blkno
5197 				 *	- side locator for this node's side
5198 				 *	- side locator minor number
5199 				 *	- side locator driver name
5200 				 */
5201 
5202 				/* Looking at sidelocs - cast lbp -> mnlbp */
5203 				mnlbp = (mddb_mnlb_t *)lbp;
5204 				lp = &mnlbp->lb_locators[li];
5205 				if (lp->l_flags & MDDB_F_DELETED)
5206 					continue;
5207 				if (lp->l_blkno != rip->ri_blkno)
5208 					continue;
5209 
5210 				for (i = 0; i < MD_MNMAXSIDES; i++) {
5211 				    mnslp = &mnlbp->lb_mnsidelocators[i][li];
5212 				    if (mnslp->mnl_sideno == s->s_sideno) {
5213 					break;
5214 				    }
5215 				}
5216 				/* No matching side found */
5217 				if (i == MD_MNMAXSIDES)
5218 					continue;
5219 				if (mnslp->mnl_mnum != md_getminor(rip->ri_dev))
5220 					continue;
5221 				dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
5222 				if (strncmp(dn->dn_data, rip->ri_driver,
5223 				    MD_MAXDRVNM) == 0)
5224 					break;
5225 			    }
5226 			}
5227 		}
5228 
5229 		/*
5230 		 * Didn't find ourself in this locator block it means
5231 		 * the locator block is a stale transplant. Probably from
5232 		 * a user doing a dd.
5233 		 */
5234 		if (li == lbp->lb_loccnt)
5235 			continue;
5236 
5237 		/*
5238 		 * Keep track of the number of accessed and valid
5239 		 * locator blocks.
5240 		 */
5241 		lb_ok++;
5242 
5243 		/*
5244 		 * Read the tag in, skips invalid or blank tags.
5245 		 * Only valid tags allocate storage
5246 		 * Data tags are not used in MN disksets.
5247 		 */
5248 		if ((!mn_set) && (! dt_read(s, lbp, rip))) {
5249 			/*
5250 			 * Keep track of the number of tagged
5251 			 * locator blocks.
5252 			 */
5253 			lb_tagged++;
5254 
5255 			/* Keep a list of unique tags. */
5256 			(void) dtl_addl(s, &rip->ri_dtp->dt_dtag);
5257 		}
5258 
5259 		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
5260 			/*
5261 			 * go through locator block and add any other
5262 			 * locations of the data base.
5263 			 * For the replicated import case, this was done earlier
5264 			 * and we really don't need or want to do so again
5265 			 */
5266 			cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
5267 			for (li = 0; li < lbp->lb_loccnt; li++) {
5268 				lp = &lbp->lb_locators[li];
5269 				if (lp->l_flags & MDDB_F_DELETED)
5270 					continue;
5271 
5272 				cl->l_devid_flags = MDDB_DEVID_GETSZ;
5273 				cl->l_devid = (uint64_t)0;
5274 				cl->l_devid_sz = 0;
5275 				cl->l_old_devid = (uint64_t)0;
5276 				cl->l_old_devid_sz = 0;
5277 				cl->l_minor_name[0] = '\0';
5278 				locator2cfgloc(lbp, cl, li, s->s_sideno,
5279 				    did_icp);
5280 
5281 				if (cl->l_devid_flags & MDDB_DEVID_SZ) {
5282 					if ((cl->l_devid = (uintptr_t)kmem_alloc
5283 					    (cl->l_devid_sz, KM_SLEEP))
5284 					    == NULL) {
5285 						continue;
5286 					} else {
5287 						cl->l_devid_flags =
5288 						    MDDB_DEVID_SPACE;
5289 					}
5290 				}
5291 				locator2cfgloc(lbp, cl, li, s->s_sideno,
5292 				    did_icp);
5293 
5294 				(void) ridev(&s->s_rip, cl, &lp->l_dev, 0);
5295 
5296 				if (cl->l_devid_flags & MDDB_DEVID_SPACE)
5297 					kmem_free((caddr_t)(uintptr_t)
5298 					    cl->l_devid, cl->l_devid_sz);
5299 			}
5300 			kmem_free(cl, sizeof (mddb_cfg_loc_t));
5301 		}
5302 
5303 		/* Save LB for later */
5304 		rip->ri_lbp = lbp;
5305 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5306 			rip->ri_did_icp = did_icp;
5307 			did_icp = (mddb_did_ic_t *)NULL;
5308 			did_blkp = (mddb_did_blk_t *)NULL;
5309 		} else
5310 			rip->ri_did_icp = NULL;
5311 		lbp = (mddb_lb_t *)NULL;
5312 	}
5313 
5314 	if (lbp != (mddb_lb_t *)NULL)
5315 		kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
5316 
5317 	if (did_icp != (mddb_did_ic_t *)NULL) {
5318 		if (did_icp->did_ic_blkp != (mddb_did_blk_t *)NULL) {
5319 			kmem_free((caddr_t)did_icp->did_ic_blkp, did_blkp_sz);
5320 			did_blkp = (mddb_did_blk_t *)NULL;
5321 		}
5322 		if (did_icp->did_ic_dbp != (mddb_did_db_t *)NULL) {
5323 			mddb_did_db_t	*did_dbp1, *did_dbp2;
5324 
5325 			did_dbp1 = did_icp->did_ic_dbp;
5326 			while (did_dbp1) {
5327 				did_dbp2 = did_dbp1->db_next;
5328 				kmem_free((caddr_t)did_dbp1->db_ptr,
5329 				    dbtob(did_dbp1->db_blkcnt));
5330 				kmem_free((caddr_t)did_dbp1,
5331 				    sizeof (mddb_did_db_t));
5332 				did_dbp1 = did_dbp2;
5333 			}
5334 		}
5335 		kmem_free((caddr_t)did_icp, sizeof (mddb_did_ic_t));
5336 	}
5337 
5338 	if (did_blkp != (mddb_did_blk_t *)NULL) {
5339 		kmem_free((caddr_t)did_blkp, did_blkp_sz);
5340 	}
5341 
5342 	/* No locator blocks were ok */
5343 	if (lb_ok == 0)
5344 		goto out;
5345 
5346 	/* No tagged data was found - will be 0 for MN diskset */
5347 	if (lb_tagged == 0)
5348 		goto out;
5349 
5350 	/* Find the highest non-deleted replica count */
5351 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5352 		int		lb_tot = 0;
5353 
5354 		if (rip->ri_mbip == (mddb_mb_ic_t *)NULL)
5355 			continue;
5356 
5357 		if (rip->ri_lbp == (mddb_lb_t *)NULL)
5358 			continue;
5359 
5360 		for (li = 0; li < rip->ri_lbp->lb_loccnt; li++) {
5361 			lp = &rip->ri_lbp->lb_locators[li];
5362 			if (lp->l_flags & MDDB_F_DELETED)
5363 				continue;
5364 			lb_tot++;
5365 		}
5366 
5367 		if (lb_tot > lb_total)
5368 			lb_total = lb_tot;
5369 	}
5370 
5371 	/* Count the number of unique tags */
5372 	for (lb_tags = 0, dtlp = s->s_dtlp; dtlp != NULL; dtlp = dtlp->dtl_nx)
5373 		lb_tags++;
5374 
5375 	/* Should have at least one tag at this point */
5376 	ASSERT(lb_tags > 0);
5377 
5378 
5379 	/*
5380 	 * If the number of tagged locators is not the same as the number of
5381 	 * OK locators OR more than one tag exists, then make sure the
5382 	 * selected tag will be written out later.
5383 	 */
5384 	if ((lb_tagged - lb_ok) != 0 || lb_tags > 1)
5385 		md_set_setstatus(setno, MD_SET_TAGDATA);
5386 
5387 	/* Only a single tag, take the tagged data */
5388 	if (lb_tags == 1) {
5389 		dt_setup(s, &s->s_dtlp->dtl_dt);
5390 		md_set_setstatus(setno, MD_SET_USETAG);
5391 		goto out;
5392 	}
5393 
5394 	/* Multiple tags, not selecting a tag, tag mode is on */
5395 	if (! (md_get_setstatus(setno) & MD_SET_USETAG))
5396 		retval = MDDB_E_TAGDATA;
5397 
5398 out:
5399 
5400 	return (retval);
5401 }
5402 
5403 /*
5404  *	1. Select a locator.
5405  *	2. check if enough locators now have current copies
5406  *	3. read in database from one of latest
5407  *	4. if known to have latest make all database the same
5408  *	5. if configuration has changed rewrite locators
5409  *
5410  * Parameters:
5411  * 	s - pointer to mddb_set structure
5412  *	flag - used in MN disksets to tell if this node is being joined to
5413  *		a diskset that is in the STALE state.  If the flag is
5414  *		MDDB_MN_STALE, then this node should be marked in the STALE
5415  *		state even if > 50% mddbs are available.  (The diskset can
5416  *		only change from STALE->OK if all nodes withdraw from the
5417  *		MN diskset and then rejoin).
5418  */
5419 static int
5420 load_old_replicas(
5421 	mddb_set_t	*s,
5422 	int		flag
5423 )
5424 {
5425 	mddb_lb_t	*lbp = NULL;
5426 	mddb_mnlb_t	*mnlbp = NULL;
5427 	mddb_ri_t	*rip;
5428 	mddb_locator_t	*lp;
5429 	mddb_db_t	*dbp;
5430 	mddb_de_ic_t	*dep;
5431 	int		li;
5432 	int		alc;
5433 	int		lc;
5434 	int		tlc;
5435 	int		retval = 0;
5436 	caddr_t		p;
5437 	size_t		maxrecsize;
5438 	set_t		setno = s->s_setno;
5439 	mddb_did_db_t	*did_dbp1;
5440 	mddb_did_info_t	*did_info;
5441 	mddb_did_ic_t	*did_icp = NULL;
5442 	md_dev64_t	*newdev;
5443 	mddb_sidelocator_t	*slp = 0;
5444 	mddb_mnsidelocator_t	*mnslp = 0;
5445 	uchar_t		i;
5446 	char		*name;
5447 	ddi_devid_t	ret_devid;
5448 	md_dev64_t	dev;
5449 	uint_t		len, sz;
5450 	char		*minor_name;
5451 	int		write_lb = 0;
5452 
5453 	/* The only error path out of get_mbs_n_lbs() is MDDB_E_TAGDATA */
5454 	if (retval = get_mbs_n_lbs(s, &write_lb))
5455 		goto errout;
5456 
5457 	if ((lbp = s->s_lbp = selectlocator(s)) == NULL) {
5458 		retval = MDDB_E_NOLOCBLK;
5459 		goto errout;
5460 	}
5461 
5462 	/* If a multi-node set, then set md_set.s_status flag */
5463 	if (lbp->lb_flags & MDDB_MNSET) {
5464 		md_set_setstatus(setno, MD_SET_MNSET);
5465 		/*
5466 		 * If data tag area had been allocated before set type was
5467 		 * known - free it now.
5468 		 */
5469 		if (md_set[setno].s_dtp) {
5470 			kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
5471 			md_set[setno].s_dtp = NULL;
5472 		}
5473 	}
5474 
5475 	/*
5476 	 * If the replica is in devid format, setup the devid incore ptr.
5477 	 */
5478 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5479 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5480 			if (rip->ri_lbp == s->s_lbp) {
5481 				did_icp = s->s_did_icp = rip->ri_did_icp;
5482 				break;
5483 			}
5484 		}
5485 		/*
5486 		 * If no devid incore info found - something has gone
5487 		 * wrong so errout.
5488 		 */
5489 		if (rip == NULL) {
5490 			retval = MDDB_E_NODEVID;
5491 			goto errout;
5492 		}
5493 
5494 		/*
5495 		 * Add all blocks containing devids to free list.
5496 		 * Then remove addresses that actually contain devids.
5497 		 */
5498 		did_dbp1 = did_icp->did_ic_dbp;
5499 		while (did_dbp1) {
5500 			if (mddb_devid_free_add(s, did_dbp1->db_firstblk,
5501 				0, dbtob(did_dbp1->db_blkcnt))) {
5502 				retval = MDDB_E_NOSPACE;
5503 				goto errout;
5504 			}
5505 
5506 			did_dbp1 = did_dbp1->db_next;
5507 		}
5508 		for (li = 0; li < lbp->lb_loccnt; li++) {
5509 			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5510 			if (!(did_info->info_flags & MDDB_DID_EXISTS))
5511 				continue;
5512 
5513 			if (mddb_devid_free_delete(s, did_info->info_firstblk,
5514 			    did_info->info_offset, did_info->info_length)) {
5515 				/* unable to find disk block */
5516 				retval = MDDB_E_NODEVID;
5517 				goto errout;
5518 			}
5519 		}
5520 	}
5521 
5522 	/*
5523 	 * create mddb_mbaray, count all locators and active locators.
5524 	 */
5525 	alc = 0;
5526 	lc = 0;
5527 	for (li = 0; li < lbp->lb_loccnt; li++) {
5528 		ddi_devid_t	li_devid;
5529 
5530 		lp = &lbp->lb_locators[li];
5531 
5532 		if (lp->l_flags & MDDB_F_DELETED)
5533 			continue;
5534 
5535 		/* Count non-deleted replicas */
5536 		lc++;
5537 
5538 		/*
5539 		 * Use the devid of this locator to compare with the rip
5540 		 * list.  The scenario to watch out for here is that this
5541 		 * locator could be on a disk that is dead and there could
5542 		 * be a valid entry in the rip list for a different disk
5543 		 * that has been moved to the dead disks dev_t.  We don't
5544 		 * want to match with the moved disk.
5545 		 */
5546 		li_devid = NULL;
5547 		(void) mddb_devid_get(s, li, &li_devid, &minor_name);
5548 
5549 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5550 			if (match_mddb(rip, li_devid, minor_name,
5551 			    md_expldev(lp->l_dev), lp->l_blkno)) {
5552 				break;
5553 			}
5554 		}
5555 		if (rip == NULL) {
5556 			/*
5557 			 * If rip not found, then mark error in master block
5558 			 * so that no writes are later attempted to this
5559 			 * replica.  rip may not be setup if ridev
5560 			 * failed due to un-found driver name.
5561 			 */
5562 			lp->l_flags |= MDDB_F_EMASTER;
5563 			continue;
5564 		}
5565 
5566 		s->s_mbiarray[li] = rip->ri_mbip;
5567 
5568 		lp->l_flags &= MDDB_F_ACTIVE;
5569 		lp->l_flags |= (int)rip->ri_flags;
5570 
5571 		if (rip->ri_transplant)
5572 			lp->l_flags &= ~MDDB_F_ACTIVE;
5573 
5574 		if (lp->l_flags & MDDB_F_LOCACC)
5575 			alc++;
5576 	}
5577 
5578 	/* Save on a divide - calculate 50% + 1 up front */
5579 	tlc = ((lc + 1) / 2);
5580 
5581 	if (alc > tlc) {		/* alc > tlc		- OK */
5582 		md_clr_setstatus(setno, MD_SET_STALE);
5583 	} else if (alc < tlc) {		/* alc < tlc		- stale */
5584 		md_set_setstatus(setno, MD_SET_STALE);
5585 	} else if (lc & 1) {		/* alc == tlc && odd	- OK */
5586 		md_clr_setstatus(setno, MD_SET_STALE);
5587 	} else {			/* alc == tlc && even	- ? */
5588 		/* Can do an accept, and are */
5589 		if (md_get_setstatus(setno) & (MD_SET_ACCOK | MD_SET_ACCEPT)) {
5590 			md_clr_setstatus(setno, MD_SET_STALE);
5591 		} else {		/* possibly has a mediator */
5592 			if (mediate(s)) {
5593 				md_set_setstatus(setno, MD_SET_STALE);
5594 			} else {
5595 				md_clr_setstatus(setno, MD_SET_STALE);
5596 			}
5597 		}
5598 
5599 		/*
5600 		 * The mirrored_root_flag allows the sysadmin to decide to
5601 		 * start the local set in a read/write (non-stale) mode
5602 		 * when there are only 50% available mddbs on the system and
5603 		 * when the root file system is on a mirror.  This is useful
5604 		 * in a 2 disk system where 1 disk failure would cause an mddb
5605 		 * quorum failure and subsequent boot failures since the root
5606 		 * filesystem would be in a read-only state.
5607 		 */
5608 		if (mirrored_root_flag == 1 && setno == 0 &&
5609 		    svm_bootpath[0] != 0) {
5610 			md_clr_setstatus(setno, MD_SET_STALE);
5611 		} else {
5612 			if (md_get_setstatus(setno) & MD_SET_STALE) {
5613 				/* Allow half mode - CAREFUL! */
5614 				if (mddb_allow_half)
5615 					md_clr_setstatus(setno, MD_SET_STALE);
5616 			}
5617 		}
5618 
5619 		/*
5620 		 * In a MN diskset,
5621 		 *	- if 50% mddbs are unavailable and this
5622 		 *		has been marked STALE above
5623 		 * 	- master node isn't in the STALE state
5624 		 *	- this node isn't the master node (this node
5625 		 *		isn't the first node to join the set)
5626 		 * then clear the STALE state and set TOOFEW.
5627 		 *
5628 		 * If this node is the master node and set was marked STALE,
5629 		 * then the set stays STALE.
5630 		 *
5631 		 * If this node is not the master and this node's state is
5632 		 * STALE and the master node is not marked STALE,
5633 		 * then master node must be in the TOOFEW state or the
5634 		 * master is panic'ing.  A MN diskset can only be placed into
5635 		 * the STALE state by having the first node join the set
5636 		 * with <= 50% mddbs.  There's no way for a MN diskset to
5637 		 * transition between STALE and not-STALE states unless all
5638 		 * nodes are withdrawn from the diskset or all nodes in the
5639 		 * diskset are rebooted at the same time.
5640 		 *
5641 		 * So, mark this node's state as TOOFEW instead of STALE.
5642 		 */
5643 		if (((md_get_setstatus(setno) & (MD_SET_MNSET | MD_SET_STALE))
5644 		    == (MD_SET_MNSET | MD_SET_STALE)) &&
5645 		    ((flag & MDDB_MN_STALE) == 0) &&
5646 		    (!(md_set[setno].s_am_i_master))) {
5647 			md_clr_setstatus(setno, MD_SET_STALE);
5648 			md_set_setstatus(setno, MD_SET_TOOFEW);
5649 		}
5650 	}
5651 
5652 	/*
5653 	 * If a MN set is marked STALE on the other nodes,
5654 	 * mark it stale here.  Override all other considerations
5655 	 * such as a mediator or > 50% mddbs available.
5656 	 */
5657 	if (md_get_setstatus(setno) & MD_SET_MNSET) {
5658 		if (flag & MDDB_MN_STALE)
5659 			md_set_setstatus(setno, MD_SET_STALE);
5660 	}
5661 
5662 	/*
5663 	 * read a good copy of the locator names
5664 	 * if an error occurs reading what is suppose
5665 	 * to be a good copy continue looking for another
5666 	 * good copy
5667 	 */
5668 	s->s_lnp = NULL;
5669 	for (li = 0; li < lbp->lb_loccnt; li++) {
5670 		lp = &lbp->lb_locators[li];
5671 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
5672 		    (lp->l_flags & MDDB_F_EMASTER))
5673 			continue;
5674 
5675 		/* Find rip entry for this locator if one exists */
5676 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5677 			if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
5678 			    lp->l_blkno))
5679 				break;
5680 		}
5681 
5682 		if (rip == NULL) {
5683 			continue;
5684 		}
5685 		if (rip->ri_lbp == (mddb_lb_t *)NULL) {
5686 			continue;
5687 		}
5688 		if (rip->ri_lbp->lb_commitcnt != lbp->lb_commitcnt) {
5689 			continue;
5690 		}
5691 
5692 		/*
5693 		 * Now have a copy of the database that is equivalent
5694 		 * to the chosen locator block with respect to
5695 		 * inittime, identifier and commitcnt.   Trying the
5696 		 * equivalent databases in the order that they were
5697 		 * written will provide the most up to date data.
5698 		 */
5699 		lp->l_flags |= readlocnames(s, li);
5700 		if (s->s_lnp)
5701 			break;
5702 	}
5703 
5704 	if (s->s_lnp == NULL) {
5705 		retval = MDDB_E_NOLOCNMS;
5706 		goto errout;
5707 	}
5708 
5709 	/*
5710 	 * read a good copy of the data base
5711 	 * if an error occurs reading what is suppose
5712 	 * to be a good copy continue looking for another
5713 	 * good copy
5714 	 */
5715 
5716 	s->s_dbp = NULL;
5717 	for (li = 0; li < lbp->lb_loccnt; li++) {
5718 		lp = &lbp->lb_locators[li];
5719 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
5720 		    (lp->l_flags & MDDB_F_EMASTER))
5721 			continue;
5722 
5723 		/* Find rip entry for this locator if one exists */
5724 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5725 			if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
5726 			    lp->l_blkno))
5727 				break;
5728 		}
5729 
5730 		if (rip == NULL) {
5731 			continue;
5732 		}
5733 		if (rip->ri_lbp == (mddb_lb_t *)NULL) {
5734 			continue;
5735 		}
5736 		if (rip->ri_lbp->lb_commitcnt != lbp->lb_commitcnt) {
5737 			continue;
5738 		}
5739 
5740 		/*
5741 		 * Now have a copy of the database that is equivalent
5742 		 * to the chosen locator block with respect to
5743 		 * inittime, identifier and commitcnt.   Trying the
5744 		 * equivalent databases in the order that they were
5745 		 * written will provide the most up to date data.
5746 		 */
5747 		lp->l_flags |= readcopy(s, li);
5748 
5749 		if (s->s_dbp)
5750 			break;
5751 	}
5752 
5753 	if (s->s_dbp == NULL) {
5754 		retval = MDDB_E_NODIRBLK;
5755 		goto errout;
5756 	}
5757 
5758 	lp->l_flags |= MDDB_F_MASTER;
5759 	lp->l_flags |= MDDB_F_UP2DATE;
5760 
5761 	/*
5762 	 * go through and find largest record;
5763 	 * Also fixup the user data area's
5764 	 */
5765 	maxrecsize = MAX(MDDB_BSIZE, s->s_databuffer_size);
5766 
5767 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
5768 		for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
5769 			if (dep->de_flags & MDDB_F_OPT)
5770 				getoptrecord(s, dep);
5771 			else {
5772 				allocuserdata(dep);
5773 				maxrecsize = MAX(dep->de_recsize, maxrecsize);
5774 			}
5775 
5776 	if (maxrecsize > s->s_databuffer_size) {
5777 		p = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
5778 		if (s->s_databuffer_size)
5779 			kmem_free(s->s_databuffer, s->s_databuffer_size);
5780 		s->s_databuffer = p;
5781 		s->s_databuffer_size = maxrecsize;
5782 	}
5783 
5784 	/* If we can clear the tag data record, do it now. */
5785 	/* Data tags not supported on MN sets */
5786 	if ((md_get_setstatus(setno) & MD_SET_CLRTAG) &&
5787 	    (!(md_get_setstatus(setno) & MD_SET_MNSET)))
5788 		dt_setup(s, NULL);
5789 
5790 	/* This will return non-zero if STALE or TOOFEW */
5791 	/* This will write out chosen replica image to all replicas */
5792 	if (selectreplicas(s, MDDB_SCANALL))
5793 		goto errout;
5794 
5795 	if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
5796 		ddi_devid_t	devidptr;
5797 
5798 		lbp = s->s_lbp;
5799 		for (li = 0; li < lbp->lb_loccnt; li++) {
5800 			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5801 
5802 			if (did_info->info_flags & MDDB_DID_EXISTS) {
5803 				devidptr = s->s_did_icp->did_ic_devid[li];
5804 				lp = &lbp->lb_locators[li];
5805 				for (rip = s->s_rip; rip != NULL;
5806 				    rip = rip->ri_next) {
5807 					if (rip->ri_old_devid == 0)
5808 						continue;
5809 					if (ddi_devid_compare(rip->ri_old_devid,
5810 					    devidptr) != 0) {
5811 						continue;
5812 					}
5813 					if (update_locatorblock(s,
5814 					    md_expldev(lp->l_dev),
5815 					    rip->ri_devid)) {
5816 						goto errout;
5817 					}
5818 				}
5819 			}
5820 		}
5821 	}
5822 	/*
5823 	 * If the replica is in device id style - validate the device id's,
5824 	 * if present, in the locator block devid area.
5825 	 */
5826 	newdev = kmem_zalloc(sizeof (md_dev64_t) * MDDB_NLB, KM_SLEEP);
5827 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5828 		for (li = 0; li < lbp->lb_loccnt; li++) {
5829 			newdev[li] = 0;
5830 			lp = &lbp->lb_locators[li];
5831 			if (lp->l_flags & MDDB_F_DELETED)
5832 				continue;
5833 			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5834 			dev = md_expldev(lp->l_dev);
5835 			if (did_info->info_flags & MDDB_DID_EXISTS) {
5836 				/* Validate device id on current system */
5837 				newdev[li] = dev;
5838 				if (mddb_devid_validate(
5839 					did_icp->did_ic_devid[li],
5840 					&(newdev[li]),
5841 					did_info->info_minor_name) == 0) {
5842 					/* Set valid flag */
5843 					did_info->info_flags |= MDDB_DID_VALID;
5844 				} else {
5845 					lp->l_flags |= MDDB_F_EMASTER;
5846 				}
5847 			} else if (!(MD_UPGRADE)) {
5848 				/*
5849 				 * If a device doesn't have a device id,
5850 				 * check if there is now a device ID
5851 				 * associated with device.  If one exists,
5852 				 * add it to the locator block devid area.
5853 				 * If there's not enough space to add it,
5854 				 * print a warning.
5855 				 * Don't do this during upgrade.
5856 				 */
5857 				dev_t ddi_dev = md_dev64_to_dev(dev);
5858 				if (ddi_lyr_get_devid(ddi_dev, &ret_devid) ==
5859 				    DDI_SUCCESS) {
5860 					if (ddi_lyr_get_minor_name(ddi_dev,
5861 					    S_IFBLK, &minor_name)
5862 					    == DDI_SUCCESS) {
5863 						if (mddb_devid_add(s, li,
5864 						    ret_devid, minor_name)) {
5865 							cmn_err(CE_WARN,
5866 							"Not enough space in"
5867 							" metadevice state"
5868 							" database\n");
5869 							cmn_err(CE_WARN,
5870 							"to add relocation"
5871 							" information for"
5872 							" device:\n");
5873 							cmn_err(CE_WARN,
5874 							" major = %d, "
5875 							" minor = %d\n",
5876 							getmajor(ddi_dev),
5877 							getminor(ddi_dev));
5878 						} else {
5879 						    write_lb = 1;
5880 						}
5881 						kmem_free(minor_name,
5882 						    strlen(minor_name) + 1);
5883 					}
5884 					ddi_devid_free(ret_devid);
5885 				}
5886 			}
5887 		}
5888 
5889 		/*
5890 		 * If a device has a valid device id and if the dev_t
5891 		 * associated with the device id has changed, update the
5892 		 * driver name, minor num and dev_t in the local and side
5893 		 * locators to match the dev_t that the system currently
5894 		 * associates with the device id.
5895 		 *
5896 		 * Don't do this during upgrade.
5897 		 */
5898 		if (!(MD_UPGRADE)) {
5899 		    for (li = 0; li < lbp->lb_loccnt; li++) {
5900 			lp = &lbp->lb_locators[li];
5901 			if (lp->l_flags & MDDB_F_DELETED)
5902 				continue;
5903 			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5904 			if ((did_info->info_flags & MDDB_DID_VALID) &&
5905 			    !(did_info->info_flags & MDDB_DID_UPDATED)) {
5906 				if (lbp->lb_flags & MDDB_MNSET) {
5907 					int 	j;
5908 					int	index = -1;
5909 					mnlbp = (mddb_mnlb_t *)lbp;
5910 					for (j = 0; j < MD_MNMAXSIDES; j++) {
5911 					    mnslp = &mnlbp->
5912 						lb_mnsidelocators[j][li];
5913 					    if (mnslp->mnl_sideno ==
5914 						s->s_sideno)
5915 						break;
5916 					    if (mnslp->mnl_sideno == 0)
5917 						index = j;
5918 					}
5919 					if (j == MD_MNMAXSIDES) {
5920 					    /* No match found; take empty */
5921 					    mnslp = &mnlbp->
5922 						lb_mnsidelocators[index][li];
5923 					    write_lb = 1;
5924 					    mnslp->mnl_mnum =
5925 						md_getminor(newdev[li]);
5926 					} else if (mnslp->mnl_mnum !=
5927 					    md_getminor(newdev[li])) {
5928 						write_lb = 1;
5929 						mnslp->mnl_mnum =
5930 						    md_getminor(newdev[li]);
5931 					}
5932 				} else {
5933 					slp = &lbp->
5934 					    lb_sidelocators[s->s_sideno][li];
5935 					if (slp->l_mnum !=
5936 					    md_getminor(newdev[li])) {
5937 						write_lb = 1;
5938 						slp->l_mnum =
5939 						    md_getminor(newdev[li]);
5940 					}
5941 				}
5942 				name = ddi_major_to_name(
5943 						md_getmajor(newdev[li]));
5944 				if (lbp->lb_flags & MDDB_MNSET) {
5945 					i = mnslp->mnl_drvnm_index;
5946 				} else {
5947 					i = slp->l_drvnm_index;
5948 				}
5949 				if (strncmp(lbp->lb_drvnm[i].dn_data, name,
5950 					lbp->lb_drvnm[i].dn_len) != 0) {
5951 					/* Driver name has changed */
5952 					len = strlen(name);
5953 					/* Look for the driver name */
5954 					for (i = 0; i < MDDB_DRVNMCNT; i++) {
5955 						if (lbp->lb_drvnm[i].dn_len
5956 						    != len)
5957 							continue;
5958 						if (strncmp(
5959 						    lbp->lb_drvnm[i].dn_data,
5960 						    name, len) == 0)
5961 							break;
5962 					}
5963 					/* Didn't find one, add it */
5964 					if (i == MDDB_DRVNMCNT) {
5965 					    for (i = 0; i < MDDB_DRVNMCNT;
5966 						i++) {
5967 						if (lbp->lb_drvnm[i].dn_len
5968 						    == 0)
5969 							break;
5970 					    }
5971 					    if (i == MDDB_DRVNMCNT) {
5972 						cmn_err(CE_WARN,
5973 						    "Unable to update driver"
5974 						    " name for dev:  "
5975 						    "major = %d, "
5976 						    "minor = %d\n",
5977 						    md_getmajor(newdev[li]),
5978 						    md_getminor(newdev[li]));
5979 						continue;
5980 					    }
5981 					    (void) strncpy(
5982 						lbp->lb_drvnm[i].dn_data,
5983 						name, MD_MAXDRVNM);
5984 					    lbp->lb_drvnm[i].dn_len =
5985 						(uchar_t)strlen(name);
5986 					}
5987 					/* Fill in the drvnm index */
5988 					if (lbp->lb_flags & MDDB_MNSET) {
5989 						mnslp->mnl_drvnm_index = i;
5990 					} else {
5991 						slp->l_drvnm_index = i;
5992 					}
5993 					write_lb = 1;
5994 				}
5995 				did_info->info_flags |= MDDB_DID_UPDATED;
5996 			}
5997 		}
5998 	    }
5999 	}
6000 	kmem_free(newdev, sizeof (md_dev64_t) * MDDB_NLB);
6001 
6002 	/*
6003 	 * If locator block has been changed by get_mbs_n_lbs,
6004 	 * by addition of new device id, by updated minor name or
6005 	 * by updated driver name - write out locator block.
6006 	 */
6007 	if (write_lb) {
6008 		if (push_lb(s))
6009 			goto errout;
6010 	}
6011 
6012 	/*
6013 	 * If the tag was moved, allocated, or a BADTAG was seen for some other
6014 	 * reason, then make sure tags are written to all the replicas.
6015 	 * Data tags not supported on MN sets.
6016 	 */
6017 	if (!(md_get_setstatus(setno) & MD_SET_MNSET)) {
6018 		if (! (lc = dt_alloc_if_needed(s))) {
6019 			for (li = 0; li < lbp->lb_loccnt; li++) {
6020 				lp = &lbp->lb_locators[li];
6021 
6022 				if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
6023 				    (lp->l_flags & MDDB_F_EMASTER))
6024 					continue;
6025 
6026 				if (lp->l_flags & MDDB_F_BADTAG) {
6027 					lc = 1;
6028 					break;
6029 				}
6030 			}
6031 		}
6032 
6033 		if (lc) {
6034 			md_set_setstatus(setno, MD_SET_TAGDATA);
6035 			md_clr_setstatus(setno, MD_SET_BADTAG);
6036 			(void) selectreplicas(s, MDDB_SCANALL);
6037 		}
6038 	}
6039 
6040 errout:
6041 
6042 	/* Free extraneous rip components. */
6043 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
6044 		/* Get rid of lbp's and dtp's */
6045 
6046 		if (rip->ri_lbp != lbp) {
6047 			if (rip->ri_dtp != (mddb_dt_t *)NULL) {
6048 				kmem_free((caddr_t)rip->ri_dtp, MDDB_DT_BYTES);
6049 				rip->ri_dtp = (mddb_dt_t *)NULL;
6050 			}
6051 
6052 			if (rip->ri_devid != (ddi_devid_t)NULL) {
6053 				sz = (int)ddi_devid_sizeof(rip->ri_devid);
6054 				kmem_free((caddr_t)rip->ri_devid, sz);
6055 				rip->ri_devid = (ddi_devid_t)NULL;
6056 			}
6057 			if (rip->ri_old_devid != (ddi_devid_t)NULL) {
6058 				sz = (int)ddi_devid_sizeof(rip->ri_old_devid);
6059 				kmem_free((caddr_t)rip->ri_old_devid, sz);
6060 				rip->ri_old_devid = (ddi_devid_t)NULL;
6061 			}
6062 
6063 			if (rip->ri_lbp != (mddb_lb_t *)NULL) {
6064 				mddb_devid_icp_free(&rip->ri_did_icp,
6065 				    rip->ri_lbp);
6066 
6067 				kmem_free((caddr_t)rip->ri_lbp,
6068 				    dbtob(rip->ri_lbp->lb_blkcnt));
6069 				rip->ri_lbp = (mddb_lb_t *)NULL;
6070 			}
6071 		}
6072 
6073 		if (lbp != NULL) {
6074 			for (li = 0; li < lbp->lb_loccnt; li++) {
6075 				lp = &lbp->lb_locators[li];
6076 				if (lp->l_flags & MDDB_F_DELETED)
6077 					continue;
6078 				if (rip->ri_dev == md_expldev(lp->l_dev) &&
6079 				    rip->ri_blkno == lp->l_blkno)
6080 					break;
6081 			}
6082 			if (li < lbp->lb_loccnt)
6083 				continue;
6084 		}
6085 
6086 		/*
6087 		 * Get rid of mbp's:
6088 		 *	if lbp, those out of lb_loccnt bounds
6089 		 *	if !lbp,  all of them.
6090 		 */
6091 		if (rip->ri_mbip) {
6092 			md_dev64_t dev64 = md_xlate_targ_2_mini(rip->ri_dev);
6093 			if (dev64 != NODEV64) {
6094 				mddb_devclose(dev64);
6095 				free_mbipp(&rip->ri_mbip);
6096 			}
6097 		}
6098 		/*
6099 		 * Turn off MDDB_F_EMASTER flag in a diskset since diskset
6100 		 * code always ends up calling ridev for all replicas
6101 		 * before calling load_old_replicas.  ridev will reset
6102 		 * MDDB_F_EMASTER flag if flag was due to unresolved devid.
6103 		 */
6104 		if (setno != MD_LOCAL_SET)
6105 			rip->ri_flags &= ~MDDB_F_EMASTER;
6106 	}
6107 	return (retval);
6108 }
6109 
6110 /*
6111  * Given the devt from the md.conf info, get the devid for the device.
6112  */
6113 static void
6114 lookup_db_devid(mddb_cfg_loc_t *cl)
6115 {
6116 	dev_t		ldev;
6117 	ddi_devid_t	devid;
6118 	char		*minor;
6119 
6120 	if (ddi_name_to_major(cl->l_driver) == (major_t)-1) {
6121 		cmn_err(CE_NOTE, "mddb: unknown major name '%s'", cl->l_driver);
6122 		return;
6123 	}
6124 
6125 	ldev = makedevice(ddi_name_to_major(cl->l_driver), cl->l_mnum);
6126 	if (ddi_lyr_get_devid(ldev, &devid) != DDI_SUCCESS) {
6127 		cmn_err(CE_NOTE, "mddb: unable to get devid for '%s', 0x%x",
6128 		    cl->l_driver, cl->l_mnum);
6129 		return;
6130 	}
6131 
6132 	if (ddi_lyr_get_minor_name(ldev, S_IFBLK, &minor) != DDI_SUCCESS) {
6133 		cmn_err(CE_NOTE, "mddb: unable to get minor name 0x%x",
6134 		    cl->l_mnum);
6135 		return;
6136 	}
6137 
6138 	cl->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | MDDB_DEVID_SZ;
6139 	cl->l_devid_sz = (int)ddi_devid_sizeof(devid);
6140 	cl->l_devid = (uint64_t)(uintptr_t)devid;
6141 	(void) strlcpy(cl->l_minor_name, minor, MDDB_MINOR_NAME_MAX);
6142 
6143 	kmem_free(minor, strlen(minor) + 1);
6144 }
6145 
6146 /*
6147  * grab driver name, minor, block and devid out of
6148  * strings like "driver:minor:block:devid"
6149  */
6150 static int
6151 parse_db_loc(
6152 	char		*str,
6153 	mddb_cfg_loc_t	*clp
6154 )
6155 {
6156 	char		*p, *e;
6157 	char		*minor_name;
6158 	ddi_devid_t	ret_devid;
6159 
6160 	clp->l_dev = 0;
6161 	p = clp->l_driver;
6162 	e = p + sizeof (clp->l_driver) - 1;
6163 	while ((*str != ':') && (*str != '\0') && (p < e))
6164 		*p++ = *str++;
6165 	*p = '\0';
6166 	if (*str++ != ':')
6167 		return (-1);
6168 	clp->l_mnum = 0;
6169 	while (ISNUM(*str)) {
6170 		clp->l_mnum *= 10;
6171 		clp->l_mnum += *str++ - '0';
6172 	}
6173 	if (*str++ != ':')
6174 		return (-1);
6175 	clp->l_blkno = 0;
6176 	while (ISNUM(*str)) {
6177 		clp->l_blkno *= 10;
6178 		clp->l_blkno += *str++ - '0';
6179 	}
6180 	if (*str++ != ':')
6181 		return (-1);
6182 
6183 	/*
6184 	 * If the md_devid_destroy flag is set, ignore the device ids.
6185 	 * This is only to used in a catastrophic failure case.  Examples
6186 	 * would be where the device id of all drives in the system
6187 	 * (especially the mirror'd root drives) had been changed
6188 	 * by firmware upgrade or by a patch to an existing disk
6189 	 * driver.  Another example would be in the case of non-unique
6190 	 * device ids due to a bug.  The device id would be valid on
6191 	 * the system, but would return the wrong dev_t.
6192 	 */
6193 	if (md_devid_destroy) {
6194 		clp->l_devid_flags = 0;
6195 		clp->l_devid = (uint64_t)NULL;
6196 		clp->l_devid_sz = 0;
6197 		clp->l_old_devid = (uint64_t)NULL;
6198 		clp->l_old_devid_sz = 0;
6199 		clp->l_minor_name[0] = '\0';
6200 		return (0);
6201 	}
6202 
6203 	if (ddi_devid_str_decode(str,
6204 	    (ddi_devid_t *)&ret_devid, &minor_name) == DDI_FAILURE)
6205 		return (-1);
6206 
6207 	clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
6208 	clp->l_devid_flags = 0;
6209 	clp->l_old_devid = (uint64_t)NULL;
6210 	clp->l_old_devid_sz = 0;
6211 
6212 	/* If no device id associated with device, just return */
6213 	if ((ddi_devid_t)(uintptr_t)clp->l_devid == (ddi_devid_t)NULL) {
6214 		clp->l_devid_sz = 0;
6215 		clp->l_minor_name[0] = '\0';
6216 		if (strcmp(str, "id0") == 0 && md_devid_destroy == 0 &&
6217 		    md_keep_repl_state == 0) {
6218 			/*
6219 			 * No devid in md.conf; we're in recovery mode so
6220 			 * lookup the devid for the device as specified by
6221 			 * the devt in md.conf.
6222 			 */
6223 			lookup_db_devid(clp);
6224 		}
6225 		return (0);
6226 	}
6227 
6228 	clp->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID |
6229 	    MDDB_DEVID_SZ;
6230 	clp->l_devid_sz = (int)ddi_devid_sizeof(
6231 	    (ddi_devid_t)(uintptr_t)clp->l_devid);
6232 	(void) strcpy(clp->l_minor_name, minor_name);
6233 	kmem_free(minor_name, strlen(minor_name) + 1);
6234 
6235 	return (0);
6236 }
6237 
6238 /*
6239  * grab driver name, minor, and block out of
6240  * strings like "driver:minor:block:devid driver:minor:block:devid ..."
6241  */
6242 static void
6243 parse_db_string(
6244 	char		*str
6245 )
6246 {
6247 	char		*p, *e;
6248 	mddb_cfg_loc_t	*cl;
6249 	char		restore_space;
6250 
6251 	/* CSTYLED */
6252 	cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
6253 	for (p = str; (*p != '\0'); ) {
6254 		for (; ((*p != '\0') && (ISWHITE(*p))); ++p)
6255 			;
6256 		if (*p == '\0')
6257 			break;
6258 		for (e = p; ((*e != '\0') && (! ISWHITE(*e))); ++e)
6259 			;
6260 		/*
6261 		 * Only give parse_db_loc 1 entry, so stuff a null into
6262 		 * the string if we're not at the end.  We need to save this
6263 		 * char and restore it after call.
6264 		 */
6265 		restore_space = '\0';
6266 		if (*e != '\0') {
6267 			restore_space = *e;
6268 			*e = '\0';
6269 		}
6270 		if (parse_db_loc(p, cl) != 0) {
6271 			cmn_err(CE_NOTE, "mddb: parsing error on '%s'", p);
6272 		} else {
6273 			(void) ridev(
6274 			    &((mddb_set_t *)md_set[MD_LOCAL_SET].s_db)->s_rip,
6275 			    cl, NULL, MDDB_F_PTCHED);
6276 			if (cl->l_devid_flags & MDDB_DEVID_SPACE) {
6277 				kmem_free((caddr_t)(uintptr_t)cl->l_devid,
6278 				    cl->l_devid_sz);
6279 			}
6280 		}
6281 		if (restore_space != '\0') {
6282 			*e = restore_space;
6283 		}
6284 		p = e;
6285 	}
6286 	kmem_free(cl, sizeof (mddb_cfg_loc_t));
6287 }
6288 
6289 /*
6290  * grab database locations supplied by md.conf as properties
6291  */
6292 static void
6293 parse_db_strings(void)
6294 {
6295 	int		bootlist_id;
6296 	int		proplen;
6297 	/*
6298 	 * size of _bootlist_name should match uses of line and entry in
6299 	 * libmeta meta_systemfile_append_mddb routine (meta_systemfile.c)
6300 	 */
6301 	char 		_bootlist_name[MDDB_BOOTLIST_MAX_LEN];
6302 	char		*bootlist_name;
6303 	caddr_t		prop;
6304 
6305 /*
6306  * Step through the bootlist properties one at a time by forming the
6307  * correct name, fetching the property, parsing the property and
6308  * then freeing the memory.  If a property does not exist or returns
6309  * some form of error just ignore it.  There is no guarantee that
6310  * the properties will always exist in sequence, for example
6311  * mddb_bootlist1 may exist and mddb_bootlist2 may not exist with
6312  * mddb_bootlist3 existing.
6313  */
6314 	bootlist_name = &_bootlist_name[0];
6315 	for (bootlist_id = 0; bootlist_id < md_maxbootlist; bootlist_id++) {
6316 
6317 		proplen = 0;
6318 		(void) sprintf(bootlist_name, "mddb_bootlist%d", bootlist_id);
6319 
6320 		if (ddi_getlongprop(DDI_DEV_T_ANY, md_devinfo,
6321 		    DDI_PROP_CANSLEEP, bootlist_name, (caddr_t)&prop,
6322 		    &proplen) != DDI_PROP_SUCCESS)
6323 			continue;
6324 
6325 		if (proplen <= 0)
6326 			continue;
6327 
6328 		if (md_init_debug)
6329 			cmn_err(CE_NOTE, "%s is %s", bootlist_name, prop);
6330 
6331 		parse_db_string(prop);
6332 		kmem_free(prop, proplen);
6333 	}
6334 }
6335 
6336 static int
6337 initit(
6338 	set_t		setno,
6339 	int		flag
6340 )
6341 {
6342 	int		i;
6343 	mddb_set_t	*s;
6344 	mddb_lb_t	*lbp;		/* pointer to locator block */
6345 	mddb_ln_t	*lnp;		/* pointer to locator names */
6346 	mddb_db_t	*dbp;		/* pointer to directory block */
6347 	mddb_did_blk_t	*did_blkp;	/* pointer to Device ID block */
6348 	mddb_did_ic_t	*did_icp;	/* pointer to Device ID incore area */
6349 	mddb_bf_t	*bfp;
6350 	side_t		sideno;
6351 	side_t		maxsides;
6352 	mddb_block_t	lb_blkcnt;
6353 	int		retval = 0;
6354 	md_dev64_t	dev;
6355 	mddb_mnlb_t	*mnlbp;
6356 	int		devid_flag;
6357 
6358 	/* single thread's all loads/unloads of set's */
6359 	mutex_enter(&mddb_lock);
6360 	mutex_enter(SETMUTEX(setno));
6361 
6362 	if (((mddb_set_t *)md_set[setno].s_db) == NULL) {
6363 		mutex_exit(SETMUTEX(setno));
6364 		mutex_exit(&mddb_lock);
6365 		return (MDDB_E_NOTNOW);
6366 	}
6367 
6368 	s = (mddb_set_t *)md_set[setno].s_db;
6369 
6370 	single_thread_start(s);
6371 
6372 	/*
6373 	 * init is already underway, block. Return success.
6374 	 */
6375 	if (s->s_lbp) {
6376 		single_thread_end(s);
6377 		mutex_exit(SETMUTEX(setno));
6378 		mutex_exit(&mddb_lock);
6379 		return (0);
6380 	}
6381 
6382 	uniqtime32(&s->s_inittime);
6383 
6384 	/* grab database locations patched by /etc/system */
6385 	if (setno == MD_LOCAL_SET)
6386 		parse_db_strings();
6387 
6388 	s->s_mbiarray = (mddb_mb_ic_t **)kmem_zalloc(
6389 	    sizeof (mddb_mb_ic_t *) * mddb_maxcopies, KM_SLEEP);
6390 
6391 	s->s_zombie = 0;
6392 	s->s_staledeletes = 0;
6393 	s->s_optcmtcnt = 0;
6394 	s->s_opthavelck = 0;
6395 	s->s_optwantlck = 0;
6396 	s->s_optwaiterr = 0;
6397 	s->s_opthungerr = 0;
6398 
6399 	/*
6400 	 * KEEPTAG can never be set for a MN diskset since no tags are
6401 	 * allowed to be stored in a MN diskset.  No way to check
6402 	 * if this is a MN diskset or not at this point since the mddb
6403 	 * hasn't been read in from disk yet.  (flag will only have
6404 	 * MUTLINODE bit set if a new set is being created.)
6405 	 */
6406 	if (! (md_get_setstatus(s->s_setno) & MD_SET_KEEPTAG))
6407 		dt_setup(s, NULL);
6408 
6409 	md_clr_setstatus(s->s_setno, MD_SET_TOOFEW);
6410 
6411 	for (i = 0; i <	mddb_maxbufheaders; i++) {
6412 		bfp = (mddb_bf_t *)kmem_zalloc(sizeof (*bfp), KM_SLEEP);
6413 		sema_init(&bfp->bf_buf.b_io, 0, NULL,
6414 		    SEMA_DEFAULT, NULL);
6415 		sema_init(&bfp->bf_buf.b_sem, 0, NULL,
6416 		    SEMA_DEFAULT, NULL);
6417 		bfp->bf_buf.b_offset = -1;
6418 		freebuffer(s, bfp);
6419 	}
6420 
6421 	retval = load_old_replicas(s, flag);
6422 	/* If 0 return value - success */
6423 	if (! retval) {
6424 		single_thread_end(s);
6425 		mutex_exit(SETMUTEX(setno));
6426 		mutex_exit(&mddb_lock);
6427 		return (0);
6428 	}
6429 
6430 	/*
6431 	 * If here, then the load_old_replicas() failed
6432 	 */
6433 
6434 
6435 	/* If the database was supposed to exist. */
6436 	if (flag & MDDB_MUSTEXIST) {
6437 		if (s->s_mbiarray != (mddb_mb_ic_t **)NULL) {
6438 			for (i = 0; i < mddb_maxcopies;	 i++) {
6439 				if (! s->s_mbiarray[i])
6440 					continue;
6441 				dev = md_expldev(
6442 					s->s_lbp->lb_locators[i].l_dev);
6443 				dev = md_xlate_targ_2_mini(dev);
6444 				if (dev != NODEV64) {
6445 					mddb_devclose(dev);
6446 					free_mbipp(&s->s_mbiarray[i]);
6447 				}
6448 			}
6449 
6450 			kmem_free((caddr_t)s->s_mbiarray,
6451 				sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
6452 			s->s_mbiarray = NULL;
6453 		}
6454 
6455 		if (s->s_lnp != (mddb_ln_t *)NULL) {
6456 			kmem_free((caddr_t)s->s_lnp,
6457 			    dbtob(s->s_lbp->lb_lnblkcnt));
6458 			s->s_lnp = (mddb_ln_t *)NULL;
6459 		}
6460 
6461 		mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);
6462 
6463 		if (s->s_lbp != (mddb_lb_t *)NULL) {
6464 			kmem_free((caddr_t)s->s_lbp,
6465 			    dbtob(s->s_lbp->lb_blkcnt));
6466 			s->s_lbp = (mddb_lb_t *)NULL;
6467 		}
6468 
6469 		while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
6470 			kmem_free((caddr_t)bfp, sizeof (*bfp));
6471 
6472 		single_thread_end(s);
6473 		mutex_exit(SETMUTEX(setno));
6474 		mutex_exit(&mddb_lock);
6475 
6476 		if (retval == MDDB_E_TAGDATA)
6477 			return (retval);
6478 
6479 		/* Want a bit more detailed error messages */
6480 		if (mddb_db_err_detail)
6481 			return (retval);
6482 
6483 		return (MDDB_E_NODB);
6484 	}
6485 
6486 
6487 	/*
6488 	 * MDDB_NOOLDOK set - Creating a new database, so do
6489 	 * more initialization.
6490 	 */
6491 
6492 	lb_blkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
6493 				MDDB_LOCAL_LBCNT : MDDB_LBCNT);
6494 	if (flag & MDDB_MULTINODE) {
6495 		lb_blkcnt = MDDB_MNLBCNT;
6496 	}
6497 
6498 	if (s->s_lbp == NULL)
6499 		s->s_lbp = (mddb_lb_t *)kmem_alloc(dbtob(lb_blkcnt), KM_SLEEP);
6500 	lbp = s->s_lbp;
6501 
6502 	bzero((caddr_t)lbp, dbtob(lb_blkcnt));
6503 	lbp->lb_setno = setno;
6504 	lbp->lb_magic = MDDB_MAGIC_LB;
6505 	if (flag & MDDB_MULTINODE) {
6506 		lbp->lb_revision = MDDB_REV_MNLB;
6507 	} else {
6508 		lbp->lb_revision = MDDB_REV_LB;
6509 	}
6510 	lbp->lb_inittime = s->s_inittime;
6511 	if (flag & MDDB_MULTINODE) {
6512 		mnlbp = (mddb_mnlb_t *)lbp;
6513 		for (i = 0; i < MDDB_NLB; i++) {
6514 			for (sideno = 0; sideno < MD_MNMAXSIDES; sideno++) {
6515 				mddb_mnsidelocator_t	*mnslp;
6516 				mnslp = &mnlbp->lb_mnsidelocators[sideno][i];
6517 				mnslp->mnl_mnum = NODEV32;
6518 				mnslp->mnl_sideno = 0;
6519 				mnslp->mnl_drvnm_index = 0;
6520 			}
6521 		}
6522 	} else {
6523 		maxsides = ((setno == MD_LOCAL_SET) ? 1 : MD_MAXSIDES);
6524 		for (i = 0; i < MDDB_NLB; i++) {
6525 			for (sideno = 0; sideno < maxsides; sideno++) {
6526 				mddb_sidelocator_t	*slp;
6527 				slp = &lbp->lb_sidelocators[sideno][i];
6528 				slp->l_mnum = NODEV32;
6529 			}
6530 		}
6531 	}
6532 	lbp->lb_blkcnt = lb_blkcnt;
6533 
6534 	/* lb starts on block 0 */
6535 	/* locator names starts after locator block */
6536 	lbp->lb_lnfirstblk = lb_blkcnt;
6537 	if (flag & MDDB_MULTINODE) {
6538 		lbp->lb_lnblkcnt = (mddb_block_t)MDDB_MNLNCNT;
6539 	} else {
6540 		lbp->lb_lnblkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
6541 		    MDDB_LOCAL_LNCNT : MDDB_LNCNT);
6542 	}
6543 
6544 	if (flag & MDDB_MULTINODE) {
6545 		/* Creating a multinode diskset */
6546 		md_set_setstatus(setno, MD_SET_MNSET);
6547 		lbp->lb_flags |= MDDB_MNSET;
6548 	}
6549 
6550 	/* Data portion of mddb located after locator names */
6551 	lbp->lb_dbfirstblk = lbp->lb_lnfirstblk + lbp->lb_lnblkcnt;
6552 
6553 	/* the btodb that follows is converting the directory block size */
6554 	/* Data tag part of mddb located after first block of mddb data */
6555 	lbp->lb_dtfirstblk = (mddb_block_t)(lbp->lb_dbfirstblk +
6556 						btodb(MDDB_BSIZE));
6557 	/* Data tags are not used in MN diskset - so set count to 0 */
6558 	if (flag & MDDB_MULTINODE)
6559 		lbp->lb_dtblkcnt = (mddb_block_t)0;
6560 	else
6561 		lbp->lb_dtblkcnt = (mddb_block_t)MDDB_DT_BLOCKS;
6562 
6563 
6564 	lnp = (mddb_ln_t *)kmem_zalloc(dbtob(lbp->lb_lnblkcnt), KM_SLEEP);
6565 	lnp->ln_magic = MDDB_MAGIC_LN;
6566 	if (flag & MDDB_MULTINODE) {
6567 		lnp->ln_revision = MDDB_REV_MNLN;
6568 	} else {
6569 		lnp->ln_revision = MDDB_REV_LN;
6570 	}
6571 	s->s_lnp = lnp;
6572 
6573 	/*
6574 	 * Set up Device ID portion of Locator Block.
6575 	 * Do not set locator to device id style if
6576 	 * md_devid_destroy is 1 and md_keep_repl_state is 1
6577 	 * (destroy all device id data and keep replica in
6578 	 * non device id mode).
6579 	 *
6580 	 * This is logically equivalent to set locator to
6581 	 * device id style if md_devid_destroy is 0 or
6582 	 * md_keep_repl_state is 0.
6583 	 *
6584 	 * In SunCluster environment, device id mode is disabled
6585 	 * which means diskset will be run in non-devid mode.  For
6586 	 * localset, the behavior will remain intact and run in
6587 	 * device id mode.
6588 	 *
6589 	 * In multinode diskset devids are turned off.
6590 	 */
6591 	devid_flag = 1;
6592 	if (cluster_bootflags & CLUSTER_CONFIGURED)
6593 		if (setno != MD_LOCAL_SET)
6594 			devid_flag = 0;
6595 	if (flag & MDDB_MULTINODE)
6596 		devid_flag = 0;
6597 	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
6598 		devid_flag = 0;
6599 	/*
6600 	 * if we weren't devid style before and md_keep_repl_state=1
6601 	 * we need to stay non-devid
6602 	 */
6603 	if (((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) &&
6604 	    (md_keep_repl_state == 1))
6605 		devid_flag = 0;
6606 	if (devid_flag) {
6607 		lbp->lb_didfirstblk = lbp->lb_dtfirstblk +
6608 			lbp->lb_dtblkcnt;
6609 		lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
6610 		lbp->lb_flags |= MDDB_DEVID_STYLE;
6611 
6612 		did_icp = (mddb_did_ic_t *)kmem_zalloc
6613 			(sizeof (mddb_did_ic_t), KM_SLEEP);
6614 		did_blkp = (mddb_did_blk_t *)
6615 			kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP);
6616 		did_blkp->blk_magic = MDDB_MAGIC_DI;
6617 		did_blkp->blk_revision = MDDB_REV_DI;
6618 		did_icp->did_ic_blkp = did_blkp;
6619 		s->s_did_icp = did_icp;
6620 	}
6621 
6622 	setidentifier(s, &lbp->lb_ident);
6623 	uniqtime32(&lbp->lb_timestamp);
6624 	dbp = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
6625 	dbp->db_magic = MDDB_MAGIC_DB;
6626 	dbp->db_revision = MDDB_REV_DB;
6627 	uniqtime32(&dbp->db_timestamp);
6628 	dbp->db_nextblk = 0;
6629 	dbp->db_firstentry = NULL;
6630 	dbp->db_blknum = lbp->lb_dbfirstblk;
6631 	dbp->db_recsum = MDDB_GLOBAL_XOR;
6632 	s->s_dbp = dbp;
6633 	single_thread_end(s);
6634 	mutex_exit(SETMUTEX(setno));
6635 	mutex_exit(&mddb_lock);
6636 	return (0);
6637 }
6638 
6639 mddb_set_t *
6640 mddb_setenter(
6641 	set_t		setno,
6642 	int		flag,
6643 	int		*errorcodep
6644 )
6645 {
6646 	mddb_set_t	*s;
6647 	int		err = 0;
6648 	size_t		sz = sizeof (void *) * MD_MAXUNITS;
6649 
6650 	mutex_enter(SETMUTEX(setno));
6651 	if (! md_set[setno].s_db) {
6652 		mutex_exit(SETMUTEX(setno));
6653 		if (errorcodep != NULL)
6654 			*errorcodep = MDDB_E_NOTOWNER;
6655 		return (NULL);
6656 	}
6657 
6658 	/* Allocate s_un and s_ui arrays if not already present. */
6659 	if (md_set[setno].s_un == NULL) {
6660 		md_set[setno].s_un = kmem_zalloc(sz, KM_NOSLEEP);
6661 		if (md_set[setno].s_un == NULL) {
6662 			mutex_exit(SETMUTEX(setno));
6663 			if (errorcodep != NULL)
6664 				*errorcodep = MDDB_E_NOTOWNER;
6665 			return (NULL);
6666 		}
6667 	}
6668 	if (md_set[setno].s_ui == NULL) {
6669 		md_set[setno].s_ui = kmem_zalloc(sz, KM_NOSLEEP);
6670 		if (md_set[setno].s_ui == NULL) {
6671 			mutex_exit(&md_set[setno].s_dbmx);
6672 			kmem_free(md_set[setno].s_un, sz);
6673 			md_set[setno].s_un = NULL;
6674 			if (errorcodep != NULL)
6675 				*errorcodep = MDDB_E_NOTOWNER;
6676 			return (NULL);
6677 		}
6678 	}
6679 	s = (mddb_set_t *)md_set[setno].s_db;
6680 	if (s->s_lbp)
6681 		return (s);
6682 
6683 	if (flag & MDDB_NOINIT)
6684 		return (s);
6685 
6686 	/*
6687 	 * Release the set mutex - it will be acquired and released in
6688 	 * initit after acquiring the mddb_lock.  This is done to assure
6689 	 * that mutexes are always acquired in the same order to prevent
6690 	 * possible deadlock
6691 	 */
6692 	mutex_exit(SETMUTEX(setno));
6693 
6694 	if ((err = initit(setno, flag)) != 0) {
6695 		if (errorcodep != NULL)
6696 			*errorcodep = err;
6697 		return (NULL);
6698 	}
6699 
6700 	mutex_enter(SETMUTEX(setno));
6701 	return ((mddb_set_t *)md_set[setno].s_db);
6702 }
6703 
6704 /*
6705  * Release the set lock for a given set.
6706  *
6707  * In a MN diskset, this routine may send messages to the rpc.mdcommd
6708  * in order to have the slave nodes re-parse parts of the mddb.
6709  * Messages are only sent if the global ioctl lock is not held.
6710  *
6711  * With the introduction of multi-threaded ioctls, there is no way
6712  * to determine which thread(s) are holding the ioctl lock.  So, if
6713  * the ioctl lock is held (by process X) process X will send the
6714  * messages to the slave nodes when process X releases the ioctl lock.
6715  */
6716 void
6717 mddb_setexit(
6718 	mddb_set_t	*s
6719 )
6720 {
6721 	md_mn_msg_mddb_parse_t		*mddb_parse_msg;
6722 	md_mn_kresult_t			*kresult;
6723 	mddb_lb_t			*lbp = s->s_lbp;
6724 	int				i;
6725 	int				rval = 1;
6726 
6727 	/*
6728 	 * If not a MN diskset OR
6729 	 * a MN diskset but this node isn't master,
6730 	 * then release the mutex.
6731 	 */
6732 	if (!(MD_MNSET_SETNO(s->s_setno)) ||
6733 	    ((MD_MNSET_SETNO(s->s_setno)) &&
6734 	    (!md_set[s->s_setno].s_am_i_master))) {
6735 		mutex_exit(SETMUTEX(s->s_setno));
6736 		return;
6737 	}
6738 
6739 	/*
6740 	 * If global ioctl lock is held, then send no messages,
6741 	 * just release mutex and return.
6742 	 *
6743 	 */
6744 	if (md_status & MD_GBL_IOCTL_LOCK) {
6745 		mutex_exit(SETMUTEX(s->s_setno));
6746 		return;
6747 	}
6748 
6749 	/*
6750 	 * This thread is not holding the ioctl lock, so drop the set
6751 	 * lock, send messages to slave nodes to reparse portions
6752 	 * of the mddb and return.
6753 	 *
6754 	 * If the block parse flag is set, do not send parse messages.
6755 	 * This flag is set when master is adding a new mddb that would
6756 	 * cause parse messages to be sent to the slaves, but the slaves
6757 	 * don't have knowledge of the new mddb yet since the mddb add
6758 	 * operation hasn't been run on the slave nodes yet.  When the
6759 	 * master unblocks the parse flag, the parse messages will be
6760 	 * generated.
6761 	 *
6762 	 * If s_mn_parseflags_sending is non-zero, then another thread
6763 	 * is already currently sending a parse message, so just release
6764 	 * the mutex and return.  If an mddb change occurred that results
6765 	 * in a parse message to be generated, the thread that is currently
6766 	 * sending a parse message would generate the additional parse message.
6767 	 *
6768 	 * If s_mn_parseflags_sending is zero and parsing is not blocked,
6769 	 * then loop until s_mn_parseflags is 0 (until there are no more
6770 	 * messages to send).
6771 	 * While s_mn_parseflags is non-zero,
6772 	 * 	put snapshot of parse_flags in s_mn_parseflags_sending
6773 	 * 	set s_mn_parseflags to zero
6774 	 *	release mutex
6775 	 *	send message
6776 	 *	re-grab mutex
6777 	 *	set s_mn_parseflags_sending to zero
6778 	 */
6779 	mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t),
6780 		KM_SLEEP);
6781 	while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
6782 	    (s->s_mn_parseflags & MDDB_PARSE_MASK) &&
6783 	    (!(md_get_setstatus(s->s_setno) & MD_SET_MNPARSE_BLK))) {
6784 		/* Grab snapshot of parse flags */
6785 		s->s_mn_parseflags_sending = s->s_mn_parseflags;
6786 		s->s_mn_parseflags = 0;
6787 
6788 		mutex_exit(SETMUTEX(s->s_setno));
6789 
6790 		/*
6791 		 * Send the message to the slaves to re-parse
6792 		 * the indicated portions of the mddb. Send the status
6793 		 * of the 50 mddbs in this set so that slaves know which
6794 		 * mddbs that the master node thinks are 'good'.
6795 		 * Otherwise, slave may reparse, but from wrong replica.
6796 		 */
6797 		mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending;
6798 		for (i = 0; i < MDDB_NLB; i++) {
6799 			mddb_parse_msg->msg_lb_flags[i] =
6800 				lbp->lb_locators[i].l_flags;
6801 		}
6802 		kresult = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
6803 		while (rval != 0) {
6804 			rval = mdmn_ksend_message(s->s_setno,
6805 				MD_MN_MSG_MDDB_PARSE, 0,
6806 				(char *)mddb_parse_msg,
6807 				sizeof (mddb_parse_msg), kresult);
6808 			if (rval != 0)
6809 				cmn_err(CE_WARN, "mddb_setexit: Unable to send "
6810 					"mddb update message to other nodes in "
6811 					"diskset %s\n", s->s_setname);
6812 		}
6813 		kmem_free(kresult, sizeof (md_mn_kresult_t));
6814 
6815 		/*
6816 		 * Re-grab mutex to clear sending field and to
6817 		 * see if another parse message needs to be generated.
6818 		 */
6819 		mutex_enter(SETMUTEX(s->s_setno));
6820 		s->s_mn_parseflags_sending = 0;
6821 	}
6822 	kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
6823 	mutex_exit(SETMUTEX(s->s_setno));
6824 }
6825 
6826 static void
6827 mddb_setexit_no_parse(
6828 	mddb_set_t	*s
6829 )
6830 {
6831 	mutex_exit(SETMUTEX(s->s_setno));
6832 }
6833 
6834 uint_t
6835 mddb_lb_did_convert(mddb_set_t *s, uint_t doit, uint_t *blk_cnt)
6836 {
6837 	uint_t			li;
6838 	mddb_lb_t		*lbp = s->s_lbp;
6839 	mddb_locator_t		*lp;
6840 	ddi_devid_t		ret_devid;
6841 	uint_t			devid_len;
6842 	dev_t			ddi_dev;
6843 	mddb_did_ic_t		*did_icp;
6844 	mddb_did_blk_t		*did_blkp;
6845 	char			*minor_name;
6846 	size_t			sz;
6847 	int			retval;
6848 	int			err;
6849 	md_dev64_t		dev64; /* tmp var to make code look better */
6850 
6851 
6852 	/* Need disk block(s) to hold mddb_did_blk_t */
6853 	*blk_cnt = MDDB_DID_BLOCKS;
6854 
6855 	if (doit) {
6856 		/*
6857 		 * Alloc mddb_did_blk_t disk block and fill in header area.
6858 		 * Don't fill in did magic number until end of routine so
6859 		 * if machine panics in the middle of conversion, the
6860 		 * device id information will be thrown away at the
6861 		 * next snarfing of this set.
6862 		 * Need to set DEVID_STYLE so that mddb_devid_add will
6863 		 * function properly.
6864 		 */
6865 		/* grab the mutex */
6866 		if ((mddb_setenter(s->s_setno, MDDB_NOINIT, &err)) == NULL) {
6867 			return (1);
6868 		}
6869 		single_thread_start(s);
6870 		lbp->lb_didfirstblk = getfreeblks(s, MDDB_DID_BLOCKS);
6871 		if (lbp->lb_didfirstblk == 0) {
6872 			single_thread_end(s);
6873 			mddb_setexit(s);
6874 			return (1);
6875 		}
6876 		lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
6877 		did_icp = (mddb_did_ic_t *)kmem_zalloc(sizeof (mddb_did_ic_t),
6878 		    KM_SLEEP);
6879 		did_blkp = (mddb_did_blk_t *)kmem_zalloc(MDDB_DID_BYTES,
6880 		    KM_SLEEP);
6881 
6882 		did_blkp->blk_revision = MDDB_REV_DI;
6883 		did_icp->did_ic_blkp = did_blkp;
6884 		s->s_did_icp = did_icp;
6885 		lbp->lb_flags |= MDDB_DEVID_STYLE;
6886 	}
6887 
6888 	/* Fill in information in mddb_did_info_t array */
6889 	for (li = 0; li < lbp->lb_loccnt; li++) {
6890 		lp = &lbp->lb_locators[li];
6891 		if (lp->l_flags & MDDB_F_DELETED)
6892 			continue;
6893 
6894 		dev64 = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
6895 		ddi_dev = md_dev64_to_dev(dev64);
6896 		if (ddi_dev == NODEV) {
6897 			/*
6898 			 * No translation available for replica.
6899 			 * Could fail conversion to device id replica,
6900 			 * but instead will just continue with next
6901 			 * replica in list.
6902 			 */
6903 			continue;
6904 		}
6905 		if (ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) {
6906 			/*
6907 			 * Just count each devid as at least 1 block.  This
6908 			 * is conservative since several device id's may fit
6909 			 * into 1 disk block, but it's better to overestimate
6910 			 * the number of blocks needed than to underestimate.
6911 			 */
6912 			devid_len = (int)ddi_devid_sizeof(ret_devid);
6913 			*blk_cnt += btodb(devid_len + (MDDB_BSIZE - 1));
6914 			if (doit) {
6915 				if (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK,
6916 				    &minor_name) == DDI_SUCCESS) {
6917 					if (mddb_devid_add(s, li, ret_devid,
6918 					    minor_name)) {
6919 						cmn_err(CE_WARN,
6920 						"Not enough space in metadb"
6921 						" to add device id for"
6922 						"  dev: major = %d, "
6923 						"minor = %d\n",
6924 						getmajor(ddi_dev),
6925 						getminor(ddi_dev));
6926 					}
6927 					sz = strlen(minor_name) + 1;
6928 					kmem_free(minor_name, sz);
6929 				}
6930 			}
6931 			ddi_devid_free(ret_devid);
6932 		}
6933 	}
6934 
6935 	if (doit) {
6936 		did_blkp->blk_magic = MDDB_MAGIC_DI;
6937 		retval = push_lb(s);
6938 		single_thread_end(s);
6939 		mddb_setexit(s);
6940 		if (retval != 0)
6941 			return (1);
6942 	}
6943 
6944 	return (0);
6945 }
6946 
6947 static mddb_set_t *
6948 init_set(
6949 	mddb_config_t	*cp,
6950 	int		flag,
6951 	int		*errp
6952 )
6953 {
6954 	mddb_set_t	*s;
6955 	char		*setname = NULL;
6956 	set_t		setno = MD_LOCAL_SET;
6957 	side_t		sideno = 0;
6958 	struct timeval32 *created = NULL;
6959 
6960 	if (cp != NULL) {
6961 		setname = cp->c_setname;
6962 		setno = cp->c_setno;
6963 		sideno = cp->c_sideno;
6964 		created = &cp->c_timestamp;
6965 	}
6966 
6967 	if (setno >= MD_MAXSETS)
6968 		return ((mddb_set_t *)NULL);
6969 
6970 	if (md_set[setno].s_db)
6971 		return (mddb_setenter(setno, flag, errp));
6972 
6973 	s = (mddb_set_t *)kmem_zalloc(sizeof (*s), KM_SLEEP);
6974 
6975 	cv_init(&s->s_buf_cv, NULL, CV_DEFAULT, NULL);
6976 	cv_init(&s->s_single_thread_cv, NULL, CV_DEFAULT, NULL);
6977 	cv_init(&s->s_optqueuing_cv, NULL, CV_DEFAULT, NULL);
6978 	cv_init(&s->s_opthungerr_cv, NULL, CV_DEFAULT, NULL);
6979 	cv_init(&s->s_optwantlck_cv, NULL, CV_DEFAULT, NULL);
6980 
6981 	s->s_setno = setno;
6982 	s->s_sideno = sideno;
6983 	if (setno == MD_LOCAL_SET) {
6984 		(void) strcpy(s->s_ident.serial, hw_serial);
6985 	} else {
6986 		s->s_ident.createtime = *created;
6987 		s->s_setname = (char *)kmem_alloc(strlen(setname) + 1,
6988 		    KM_SLEEP);
6989 		(void) strcpy(s->s_setname, setname);
6990 	}
6991 
6992 	/* have a config struct,  copy mediator information */
6993 	if (cp != NULL)
6994 		s->s_med = cp->c_med;		/* structure assignment */
6995 
6996 	md_set[setno].s_db = (void *) s;
6997 
6998 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_TAKEOVER, SVM_TAG_SET, setno, NODEV64);
6999 
7000 	return (mddb_setenter(setno, flag, errp));
7001 }
7002 
7003 void
7004 mddb_unload_set(
7005 	set_t		setno
7006 )
7007 {
7008 
7009 	mddb_set_t	*s;
7010 	mddb_db_t	*dbp, *adbp = NULL;
7011 	mddb_de_ic_t	*dep, *dep2;
7012 	mddb_bf_t	*bfp;
7013 	int		i;
7014 	md_dev64_t	dev;
7015 
7016 	if ((s = mddb_setenter(setno, MDDB_NOINIT, NULL)) == NULL)
7017 		return;
7018 
7019 	single_thread_start(s);
7020 
7021 	s->s_opthavequeuinglck = 0;
7022 	s->s_optwantqueuinglck = 0;
7023 
7024 	for (dbp = s->s_dbp; dbp != 0; dbp = adbp) {
7025 		for (dep = dbp->db_firstentry; dep != NULL; dep = dep2) {
7026 			if (dep->de_rb_userdata != NULL) {
7027 				if (dep->de_icreqsize)
7028 					kmem_free(dep->de_rb_userdata_ic,
7029 					    dep->de_icreqsize);
7030 				else
7031 					kmem_free(dep->de_rb_userdata,
7032 					    dep->de_reqsize);
7033 			}
7034 			kmem_free((caddr_t)dep->de_rb, dep->de_recsize);
7035 			dep2 = dep->de_next;
7036 			kmem_free((caddr_t)dep, sizeofde(dep));
7037 		}
7038 		adbp = dbp->db_next;
7039 		kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
7040 	}
7041 	s->s_dbp = (mddb_db_t *)NULL;
7042 
7043 	free_rip(&s->s_rip);
7044 
7045 	for (i = 0; i < mddb_maxcopies;	 i++) {
7046 		if (! s->s_mbiarray)
7047 			break;
7048 
7049 		if (! s->s_mbiarray[i])
7050 			continue;
7051 
7052 		dev = md_expldev(s->s_lbp->lb_locators[i].l_dev);
7053 		dev = md_xlate_targ_2_mini(dev);
7054 		if (dev != NODEV64) {
7055 			mddb_devclose(dev);
7056 			free_mbipp(&s->s_mbiarray[i]);
7057 		}
7058 	}
7059 
7060 	if (s->s_mbiarray) {
7061 		kmem_free((caddr_t)s->s_mbiarray,
7062 		    sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
7063 		s->s_mbiarray = (mddb_mb_ic_t **)NULL;
7064 	}
7065 
7066 	if (s->s_lnp) {
7067 		kmem_free((caddr_t)s->s_lnp, dbtob(s->s_lbp->lb_lnblkcnt));
7068 		s->s_lnp = (mddb_ln_t *)NULL;
7069 	}
7070 
7071 	if (s->s_lbp) {
7072 		mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);
7073 		kmem_free((caddr_t)s->s_lbp, dbtob(s->s_lbp->lb_blkcnt));
7074 		s->s_lbp = (mddb_lb_t *)NULL;
7075 	}
7076 
7077 	if (s->s_freebitmap) {
7078 		kmem_free((caddr_t)s->s_freebitmap, s->s_freebitmapsize);
7079 		s->s_freebitmap = NULL;
7080 		s->s_freebitmapsize = 0;
7081 	}
7082 
7083 	while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
7084 		kmem_free((caddr_t)bfp, sizeof (*bfp));
7085 
7086 	if (s->s_databuffer_size) {
7087 		kmem_free(s->s_databuffer, s->s_databuffer_size);
7088 		s->s_databuffer_size = 0;
7089 	}
7090 
7091 	if (s->s_setname != NULL)
7092 		kmem_free((caddr_t)s->s_setname, strlen(s->s_setname)+1);
7093 
7094 	/* Data tags not supported on MN sets. */
7095 	if (!(md_get_setstatus(setno) & MD_SET_MNSET))
7096 		dtl_freel(&s->s_dtlp);
7097 
7098 	md_set[setno].s_db = NULL;
7099 	ASSERT(s->s_singlelockwanted == 0);
7100 	kmem_free(s, sizeof (mddb_set_t));
7101 
7102 	/* Take care of things setup in the md_set array */
7103 	if (! (md_get_setstatus(setno) & MD_SET_KEEPTAG)) {
7104 		if (md_set[setno].s_dtp) {
7105 			kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
7106 			md_set[setno].s_dtp = NULL;
7107 		}
7108 	}
7109 
7110 	md_clr_setstatus(setno, MD_SET_ACCOK | MD_SET_ACCEPT |
7111 				MD_SET_TAGDATA | MD_SET_USETAG |
7112 				MD_SET_TOOFEW | MD_SET_STALE |
7113 				MD_SET_OWNERSHIP | MD_SET_BADTAG |
7114 				MD_SET_CLRTAG | MD_SET_MNSET |
7115 				MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK |
7116 				MD_SET_MN_MIR_STATE_RC);
7117 
7118 	mutex_exit(SETMUTEX(setno));
7119 }
7120 
7121 /*
7122  * returns 0 if name can be put into locator block
7123  * returns 1 if locator block prefixes are all used
7124  *
7125  * Takes splitname (suffix, prefix, sideno) and
7126  * stores it in the locator name structure.
7127  * For traditional diskset, the sideno is the index into the suffixes
7128  * array in the locator name structure.
7129  * For the MN diskset, the sideno is the nodeid which can be any number,
7130  * so the index passed in is the index into the mnsuffixes array
7131  * in the locator structure.  This index was computed by the
7132  * routine checklocator which basically checked the locator block
7133  * mnside locator structure.
7134  */
7135 static int
7136 splitname2locatorblock(
7137 	md_splitname	*spn,
7138 	mddb_ln_t	*lnp,
7139 	int		li,
7140 	side_t		sideno,
7141 	int		index
7142 )
7143 {
7144 	uchar_t			i;
7145 	md_name_suffix		*sn;
7146 	md_mnname_suffix_t	*mnsn;
7147 	mddb_mnln_t		*mnlnp;
7148 
7149 	for (i = 0; i < MDDB_PREFIXCNT; i++) {
7150 		if (lnp->ln_prefixes[i].pre_len != SPN_PREFIX(spn).pre_len)
7151 			continue;
7152 		if (bcmp(lnp->ln_prefixes[i].pre_data, SPN_PREFIX(spn).pre_data,
7153 		    SPN_PREFIX(spn).pre_len) == 0)
7154 			break;
7155 	}
7156 	if (i == MDDB_PREFIXCNT) {
7157 		for (i = 0; i < MDDB_PREFIXCNT; i++) {
7158 			if (lnp->ln_prefixes[i].pre_len == 0)
7159 				break;
7160 		}
7161 		if (i == MDDB_PREFIXCNT)
7162 			return (1);
7163 		bcopy(SPN_PREFIX(spn).pre_data, lnp->ln_prefixes[i].pre_data,
7164 		    SPN_PREFIX(spn).pre_len);
7165 		lnp->ln_prefixes[i].pre_len = SPN_PREFIX(spn).pre_len;
7166 	}
7167 
7168 	if (lnp->ln_revision == MDDB_REV_MNLN) {
7169 		/* If a MN diskset, use index */
7170 		mnlnp = (mddb_mnln_t *)lnp;
7171 		mnsn = &mnlnp->ln_mnsuffixes[index][li];
7172 		mnsn->mn_ln_sideno = sideno;
7173 		mnsn->mn_ln_suffix.suf_len = SPN_SUFFIX(spn).suf_len;
7174 		mnsn->mn_ln_suffix.suf_prefix = i;
7175 		bcopy(SPN_SUFFIX(spn).suf_data,
7176 		    mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_len);
7177 	} else {
7178 		sn = &lnp->ln_suffixes[sideno][li];
7179 		sn->suf_len = SPN_SUFFIX(spn).suf_len;
7180 		sn->suf_prefix = i;
7181 		bcopy(SPN_SUFFIX(spn).suf_data, sn->suf_data,
7182 		    SPN_SUFFIX(spn).suf_len);
7183 	}
7184 	return (0);
7185 }
7186 
7187 /*
7188  * Find the locator name for the given sideno and convert the locator name
7189  * information into a splitname structure.
7190  */
7191 void
7192 mddb_locatorblock2splitname(
7193 	mddb_ln_t	*lnp,
7194 	int		li,
7195 	side_t		sideno,
7196 	md_splitname	*spn
7197 )
7198 {
7199 	int			iprefix;
7200 	md_name_suffix		*sn;
7201 	md_mnname_suffix_t	*mnsn;
7202 	int			i;
7203 	mddb_mnln_t		*mnlnp;
7204 
7205 	if (lnp->ln_revision == MDDB_REV_MNLN) {
7206 		mnlnp = (mddb_mnln_t *)lnp;
7207 		for (i = 0; i < MD_MNMAXSIDES; i++) {
7208 			mnsn = &mnlnp->ln_mnsuffixes[i][li];
7209 			if (mnsn->mn_ln_sideno == sideno)
7210 				break;
7211 		}
7212 		if (i == MD_MNMAXSIDES)
7213 			return;
7214 
7215 		SPN_SUFFIX(spn).suf_len = mnsn->mn_ln_suffix.suf_len;
7216 		bcopy(mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_data,
7217 			SPN_SUFFIX(spn).suf_len);
7218 		iprefix = mnsn->mn_ln_suffix.suf_prefix;
7219 	} else {
7220 		sn = &lnp->ln_suffixes[sideno][li];
7221 		SPN_SUFFIX(spn).suf_len = sn->suf_len;
7222 		bcopy(sn->suf_data, SPN_SUFFIX(spn).suf_data,
7223 			SPN_SUFFIX(spn).suf_len);
7224 		iprefix = sn->suf_prefix;
7225 	}
7226 	SPN_PREFIX(spn).pre_len = lnp->ln_prefixes[iprefix].pre_len;
7227 	bcopy(lnp->ln_prefixes[iprefix].pre_data, SPN_PREFIX(spn).pre_data,
7228 	    SPN_PREFIX(spn).pre_len);
7229 }
7230 
7231 static int
7232 getdeldev(
7233 	mddb_config_t	*cp,
7234 	int		command,
7235 	md_error_t	*ep
7236 )
7237 {
7238 	mddb_set_t	*s;
7239 	mddb_lb_t	*lbp;
7240 	mddb_locator_t	*locators;
7241 	uint_t		loccnt;
7242 	mddb_mb_ic_t	*mbip;
7243 	mddb_block_t	blk;
7244 	int		err = 0;
7245 	int		i, j;
7246 	int		li;
7247 	uint_t		commitcnt;
7248 	set_t		setno = cp->c_setno;
7249 	uint_t		set_status;
7250 	md_dev64_t	dev;
7251 	int		flags = MDDB_MUSTEXIST;
7252 
7253 	cp->c_dbmax = MDDB_NLB;
7254 
7255 	/*
7256 	 * Data checking
7257 	 */
7258 	if (setno >= md_nsets || cp->c_id < 0 ||
7259 		cp->c_id > cp->c_dbmax) {
7260 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
7261 	}
7262 
7263 	if (cp->c_flags & MDDB_C_STALE)
7264 		flags |= MDDB_MN_STALE;
7265 
7266 	if ((s = mddb_setenter(setno, flags, &err)) == NULL)
7267 		return (mddbstatus2error(ep, err, NODEV32, setno));
7268 
7269 	cp->c_flags = 0;
7270 
7271 	lbp = s->s_lbp;
7272 	loccnt = lbp->lb_loccnt;
7273 	locators = lbp->lb_locators;
7274 
7275 	/* shorthand */
7276 	set_status = md_get_setstatus(setno);
7277 
7278 	if (set_status & MD_SET_STALE)
7279 		cp->c_flags |= MDDB_C_STALE;
7280 
7281 	if (set_status & MD_SET_TOOFEW)
7282 		cp->c_flags |= MDDB_C_TOOFEW;
7283 
7284 	cp->c_sideno = s->s_sideno;
7285 
7286 	cp->c_dbcnt = 0;
7287 	/*
7288 	 * go through and count active entries
7289 	 */
7290 	for (i = 0; i < loccnt;	 i++) {
7291 		if (locators[i].l_flags & MDDB_F_DELETED)
7292 			continue;
7293 		cp->c_dbcnt++;
7294 	}
7295 
7296 	/*
7297 	 * add the ability to accept a locator block index
7298 	 * which is not relative to previously deleted replicas.  This
7299 	 * is for support of MD_DEBUG=STAT in metastat since it asks for
7300 	 * replica information specifically for each of the mirror resync
7301 	 * records.  MDDB_CONFIG_SUBCMD uses one of the pad spares in
7302 	 * the mddb_config_t type.
7303 	 */
7304 	if (cp->c_subcmd == MDDB_CONFIG_ABS) {
7305 		if (cp->c_id < 0 || cp->c_id > cp->c_dbmax) {
7306 			mddb_setexit(s);
7307 			return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
7308 						setno));
7309 		}
7310 		li = cp->c_id;
7311 	} else {
7312 		if (cp->c_id >= cp->c_dbcnt) {
7313 			mddb_setexit(s);
7314 			return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
7315 						setno));
7316 		}
7317 
7318 		/* CSTYLED */
7319 		for (li = 0, j = 0; /* void */; li++) {
7320 			if (locators[li].l_flags & MDDB_F_DELETED)
7321 				continue;
7322 			j++;
7323 			if (j > cp->c_id)
7324 				break;
7325 		}
7326 	}
7327 
7328 	if (command == MDDB_ENDDEV) {
7329 		daddr_t ib = 0, jb;
7330 
7331 		blk = 0;
7332 		if ((s != NULL) && s->s_mbiarray[li]) {
7333 			mbip = s->s_mbiarray[li];
7334 			while ((jb = getphysblk(blk++, mbip)) > 0) {
7335 				if (jb > ib)
7336 					ib = jb;
7337 			}
7338 			cp->c_dbend = (int)ib;
7339 		} else {
7340 			cp->c_dbend = 0;
7341 		}
7342 	}
7343 
7344 	locator2cfgloc(lbp, &cp->c_locator, li, s->s_sideno, s->s_did_icp);
7345 	mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno, &cp->c_devname);
7346 
7347 	if (command != MDDB_DELDEV) {
7348 		mddb_setexit(s);
7349 		return (0);
7350 	}
7351 
7352 	/* Currently don't allow addition/deletion of sides during upgrade */
7353 	if (MD_UPGRADE) {
7354 		cmn_err(CE_WARN,
7355 		    "Deletion of replica not allowed during upgrade.\n");
7356 		mddb_setexit(s);
7357 		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
7358 	}
7359 
7360 	/*
7361 	 * If here, replica delete in progress.
7362 	 */
7363 	single_thread_start(s);
7364 
7365 	if ((! (locators[li].l_flags & MDDB_F_EMASTER)) &&
7366 	    (locators[li].l_flags & MDDB_F_ACTIVE)) {
7367 		commitcnt = lbp->lb_commitcnt;
7368 		lbp->lb_commitcnt = 0;
7369 		setidentifier(s, &lbp->lb_ident);
7370 		crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
7371 		/*
7372 		 * Don't need to write out device id area, since locator
7373 		 * block on this replica is being deleted by setting the
7374 		 * commitcnt to 0.
7375 		 */
7376 		(void) writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
7377 			MDDB_WR_ONLY_MASTER);
7378 		lbp->lb_commitcnt = commitcnt;
7379 	}
7380 
7381 	if (s->s_mbiarray[li])
7382 		free_mbipp(&s->s_mbiarray[li]);
7383 
7384 	if (! (locators[li].l_flags & MDDB_F_EMASTER)) {
7385 		dev = md_expldev(locators[li].l_dev);
7386 		dev = md_xlate_targ_2_mini(dev);
7387 		if (dev != NODEV64)
7388 			mddb_devclose(dev);
7389 	}
7390 
7391 	s->s_mbiarray[li] = 0;
7392 	lbp->lb_locators[li].l_flags = MDDB_F_DELETED;
7393 
7394 	/* Only support data tags for traditional and local sets */
7395 	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
7396 	    (!(lbp->lb_flags & MDDB_MNSET)) &&
7397 	    setno != MD_LOCAL_SET)
7398 		if (set_dtag(s, ep))
7399 			mdclrerror(ep);
7400 
7401 	/* Write data tags to all accessible devices */
7402 	/* Only support data tags for traditional and local sets */
7403 	if (!(lbp->lb_flags & MDDB_MNSET)) {
7404 		(void) dt_write(s);
7405 	}
7406 
7407 	/* Delete device id of deleted replica */
7408 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
7409 		(void) mddb_devid_delete(s, li);
7410 	}
7411 	/* write new locator to all devices */
7412 	err = writelocall(s);
7413 
7414 	(void) upd_med(s, "getdeldev(0)");
7415 
7416 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_REPLICA, setno,
7417 	    md_expldev(locators[li].l_dev));
7418 
7419 	computefreeblks(s); /* recompute always it may be larger */
7420 	cp->c_dbcnt--;
7421 	err |= fixoptrecords(s);
7422 	if (err) {
7423 		if (writeretry(s)) {
7424 			single_thread_end(s);
7425 			mddb_setexit(s);
7426 			return (mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno));
7427 		}
7428 	}
7429 
7430 	single_thread_end(s);
7431 	mddb_setexit(s);
7432 	return (0);
7433 }
7434 
7435 static int
7436 getdriver(
7437 	mddb_cfg_loc_t	*clp
7438 )
7439 {
7440 	major_t		majordev;
7441 
7442 	/*
7443 	 * Data checking
7444 	 */
7445 	if (clp->l_dev <= 0)
7446 		return (EINVAL);
7447 
7448 	majordev = getmajor(expldev(clp->l_dev));
7449 
7450 	if (ddi_major_to_name(majordev) == (char *)NULL)
7451 		return (EINVAL);
7452 
7453 	if (MD_UPGRADE)
7454 		(void) strcpy(clp->l_driver, md_targ_major_to_name(majordev));
7455 	else
7456 		(void) strcpy(clp->l_driver, ddi_major_to_name(majordev));
7457 	return (0);
7458 }
7459 
7460 /*
7461  * update_valid_replica - updates the locator block namespace (prefix
7462  * 	and/or suffix) with new pathname and devname.
7463  *	RETURN
7464  *		1	Error
7465  *		0	Success
7466  */
7467 static int
7468 update_valid_replica(
7469 	side_t		side,
7470 	mddb_locator_t	*lp,
7471 	mddb_set_t	*s,
7472 	int		li,
7473 	char		*devname,
7474 	char		*pathname,
7475 	md_dev64_t	devt
7476 )
7477 {
7478 	uchar_t		pre_len, suf_len;
7479 	md_name_suffix	*sn;
7480 	mddb_ln_t	*lnp;
7481 	uchar_t		pre_index;
7482 	uchar_t		i;
7483 
7484 	if (md_expldev(lp->l_dev) != devt) {
7485 		return (0);
7486 	}
7487 
7488 	if (pathname[strlen(pathname) - 1] == '/')
7489 		pathname[strlen(pathname) - 1] = '\0';
7490 
7491 	pre_len = (uchar_t)strlen(pathname);
7492 	suf_len = (uchar_t)strlen(devname);
7493 
7494 	if ((pre_len > MD_MAXPREFIX) || (suf_len > MD_MAXSUFFIX))
7495 		return (1);
7496 
7497 	lnp = s->s_lnp;
7498 
7499 	/*
7500 	 * Future note:  Need to do something here for the MN diskset case
7501 	 * when device ids are supported in disksets.
7502 	 * Can't add until merging devids_in_diskset code into code base
7503 	 * Currently only called with side of 0.
7504 	 */
7505 
7506 	sn = &lnp->ln_suffixes[side][li];
7507 
7508 	/*
7509 	 * Check if prefix (Ex: /dev/dsk) needs to be changed.
7510 	 * If new prefix is the same as the previous prefix - no change.
7511 	 *
7512 	 * If new prefix is not the same, check if new prefix
7513 	 * matches an existing one.  If so, use that one.
7514 	 *
7515 	 * If new prefix doesn't exist, add a new prefix.  If not enough
7516 	 * space, return failure.
7517 	 */
7518 	pre_index = sn->suf_prefix;
7519 	/* Check if new prefix is the same as the old prefix. */
7520 	if ((lnp->ln_prefixes[pre_index].pre_len != pre_len) ||
7521 	    (bcmp(lnp->ln_prefixes[pre_index].pre_data, pathname,
7522 	    pre_len) != 0)) {
7523 		/* Check if new prefix is an already known prefix. */
7524 		for (i = 0; i < MDDB_PREFIXCNT; i++) {
7525 			if (lnp->ln_prefixes[i].pre_len != pre_len) {
7526 				continue;
7527 			}
7528 			if (bcmp(lnp->ln_prefixes[i].pre_data, pathname,
7529 			    pre_len) == 0) {
7530 				break;
7531 			}
7532 		}
7533 		/* If no match found for new prefix - add the new prefix */
7534 		if (i == MDDB_PREFIXCNT) {
7535 			for (i = 0; i < MDDB_PREFIXCNT; i++) {
7536 				if (lnp->ln_prefixes[i].pre_len == 0)
7537 					break;
7538 			}
7539 			/* No space to add new prefix - return failure */
7540 			if (i == MDDB_PREFIXCNT) {
7541 				return (1);
7542 			}
7543 			bcopy(pathname, lnp->ln_prefixes[i].pre_data, pre_len);
7544 			lnp->ln_prefixes[i].pre_len = pre_len;
7545 		}
7546 		sn->suf_prefix = i;
7547 	}
7548 
7549 	/* Now, update the suffix (Ex: c0t0d0s0) if needed */
7550 	if ((sn->suf_len != suf_len) ||
7551 	    (bcmp(sn->suf_data, devname, suf_len) != 0)) {
7552 		bcopy(devname, sn->suf_data, suf_len);
7553 		sn->suf_len = suf_len;
7554 	}
7555 	return (0);
7556 }
7557 
7558 
7559 /*
7560  * md_update_locator_namespace - If in devid style and active and the devid's
7561  *		exist and are valid update the locator namespace pathname
7562  *		and devname.
7563  *	RETURN
7564  *		1	Error
7565  *		0	Success
7566  */
7567 int
7568 md_update_locator_namespace(
7569 	set_t		setno,		/* which set to get name from */
7570 	side_t		side,
7571 	char		*dname,
7572 	char		*pname,
7573 	md_dev64_t	devt
7574 )
7575 {
7576 	mddb_set_t	*s;
7577 	mddb_lb_t	*lbp;
7578 	int		li;
7579 	uint_t		flg;
7580 	int		err = 0;
7581 	mddb_ln_t	*lnp;
7582 
7583 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
7584 		return (1);
7585 	single_thread_start(s);
7586 	lbp = s->s_lbp;
7587 	/* must be DEVID_STYLE */
7588 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
7589 		for (li = 0; li < lbp->lb_loccnt; li++) {
7590 			mddb_locator_t *lp = &lbp->lb_locators[li];
7591 
7592 			if (lp->l_flags & MDDB_F_DELETED) {
7593 				continue;
7594 			}
7595 
7596 			/* replica also must be active */
7597 			if (lp->l_flags & MDDB_F_ACTIVE) {
7598 				flg = s->s_did_icp->did_ic_blkp->
7599 				    blk_info[li].info_flags;
7600 				/* only update if did exists and is valid */
7601 				if ((flg & MDDB_DID_EXISTS) &&
7602 				    (flg & MDDB_DID_VALID)) {
7603 					if (update_valid_replica(side, lp, s,
7604 					    li, dname, pname, devt)) {
7605 						err = 1;
7606 						goto out;
7607 					}
7608 				}
7609 			}
7610 		}
7611 	}
7612 	lnp = s->s_lnp;
7613 	uniqtime32(&lnp->ln_timestamp);
7614 	if (lbp->lb_flags & MDDB_MNSET)
7615 		lnp->ln_revision = MDDB_REV_MNLN;
7616 	else
7617 		lnp->ln_revision = MDDB_REV_LN;
7618 	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
7619 	err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
7620 		lbp->lb_lnblkcnt, 0);
7621 	/*
7622 	 * If a MN diskset and this is the master, set the PARSE_LOCNM
7623 	 * flag in the mddb_set structure to show that the locator
7624 	 * names have changed.
7625 	 */
7626 
7627 	if ((lbp->lb_flags & MDDB_MNSET) &&
7628 	    (md_set[s->s_setno].s_am_i_master)) {
7629 		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
7630 	}
7631 out:
7632 	single_thread_end(s);
7633 	mddb_setexit(s);
7634 	if (err)
7635 		return (1);
7636 	return (0);
7637 }
7638 
7639 /*
7640  * update_locatorblock - for active entries in the locator block, check
7641  *		the devt to see if it matches the given devt. If so, and
7642  *		there is an associated device id which is not the same
7643  *		as the passed in devid, delete old devid and add a new one.
7644  *	RETURN
7645  *		MDDB_E_NODEVID
7646  *		MDDB_E_NOLOCBLK
7647  *		1	Error
7648  *		0	Success
7649  */
7650 static int
7651 update_locatorblock(mddb_set_t *s, md_dev64_t dev, ddi_devid_t didptr)
7652 {
7653 	mddb_lb_t	*lbp = NULL;
7654 	mddb_locator_t	*lp;
7655 	int		li;
7656 	uint_t		flg;
7657 	ddi_devid_t	devid_ptr;
7658 	int		retval = 0;
7659 	char		*minor_name;
7660 
7661 	lbp = s->s_lbp;
7662 	/* find replicas that haven't been deleted */
7663 	for (li = 0; li < lbp->lb_loccnt; li++) {
7664 		lp = &lbp->lb_locators[li];
7665 
7666 		if ((lp->l_flags & MDDB_F_DELETED)) {
7667 			continue;
7668 		}
7669 		/*
7670 		 * check to see if locator devt matches given dev
7671 		 * and if there is a device ID associated with it
7672 		 */
7673 		flg = s->s_did_icp->did_ic_blkp-> blk_info[li].info_flags;
7674 		if ((md_expldev(lp->l_dev) == dev) &&
7675 		    (flg & MDDB_DID_EXISTS)) {
7676 			if (flg & MDDB_DID_VALID) {
7677 				continue; /* cont to nxt active entry */
7678 			}
7679 			devid_ptr = s->s_did_icp->did_ic_devid[li];
7680 			if (devid_ptr == NULL) {
7681 				return (MDDB_E_NODEVID);
7682 			}
7683 			if (ddi_devid_compare(devid_ptr, didptr) != 0) {
7684 				/*
7685 				 * devid's not equal so
7686 				 * delete and add
7687 				 */
7688 				if (ddi_lyr_get_minor_name(
7689 					md_dev64_to_dev(dev),
7690 					S_IFBLK, &minor_name) == DDI_SUCCESS) {
7691 					(void) mddb_devid_delete(s, li);
7692 					(void) mddb_devid_add(s, li, didptr,
7693 								minor_name);
7694 					kmem_free(minor_name,
7695 						    strlen(minor_name)+1);
7696 						break;
7697 				} else {
7698 					retval = 1;
7699 					goto err_out;
7700 				}
7701 			}
7702 		}
7703 	} /* end for */
7704 	retval = push_lb(s);
7705 err_out:
7706 	return (retval);
7707 }
7708 
7709 static int
7710 update_mb_devid(
7711 	mddb_set_t	*s,
7712 	mddb_ri_t	*rip,
7713 	ddi_devid_t	devidptr
7714 )
7715 {
7716 	mddb_mb_ic_t	*mbip;
7717 	mddb_mb_t	*mb = NULL;
7718 	daddr_t		blkno;
7719 	md_dev64_t	device;
7720 	uint_t		sz;
7721 	int		mb2free = 0;
7722 	int		err = 0;
7723 
7724 
7725 	/*
7726 	 * There is case where a disk may not have mddb,
7727 	 * and only has dummy mddb which contains
7728 	 * a valid devid we like to update and in this
7729 	 * case, the rip_lbp will be NULL but we still
7730 	 * like to update the devid embedded in the
7731 	 * dummy mb block.
7732 	 *
7733 	 */
7734 	if (rip->ri_mbip != (mddb_mb_ic_t *)NULL) {
7735 		mbip = rip->ri_mbip;
7736 		mb = &mbip->mbi_mddb_mb;
7737 	} else {
7738 		/*
7739 		 * Done if it is non-replicated set
7740 		 */
7741 		if (devidptr != (ddi_devid_t)NULL) {
7742 			mb = (mddb_mb_t *)kmem_zalloc(MDDB_BSIZE,
7743 				KM_SLEEP);
7744 			mb->mb_magic = MDDB_MAGIC_DU;
7745 			mb->mb_revision = MDDB_REV_MB;
7746 			mb2free = 1;
7747 		} else {
7748 			goto out;
7749 		}
7750 	}
7751 
7752 	blkno = rip->ri_blkno;
7753 	device = rip->ri_dev;
7754 	/*
7755 	 * Replace the mb_devid with the new/valid one
7756 	 */
7757 	if (devidptr != (ddi_devid_t)NULL) {
7758 		/*
7759 		 * Zero out what we have previously
7760 		 */
7761 		if (mb->mb_devid_len)
7762 			bzero(mb->mb_devid, mb->mb_devid_len);
7763 		sz = ddi_devid_sizeof(devidptr);
7764 		bcopy((char *)devidptr, (char *)mb->mb_devid, sz);
7765 		mb->mb_devid_len = sz;
7766 	}
7767 
7768 	mb->mb_setno = s->s_setno;
7769 	uniqtime32(&mb->mb_timestamp);
7770 	crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL);
7771 	/*
7772 	 * putblks will
7773 	 *
7774 	 *	- drop the s_dbmx lock
7775 	 *	- biowait
7776 	 *	- regain the s_dbmx lock
7777 	 *
7778 	 * Need to update this if we wants to handle
7779 	 * mb_next != NULL which it is unlikely will happen
7780 	 */
7781 	err = putblks(s, (caddr_t)mb, blkno, 1, device, 0);
7782 
7783 	if (mb2free) {
7784 		kmem_free(mb, MDDB_BSIZE);
7785 	}
7786 out:
7787 	return (err);
7788 }
7789 
7790 static int
7791 setdid(
7792 	mddb_config_t		*cp
7793 )
7794 {
7795 	ddi_devid_t		devidp;
7796 	dev_t			ddi_dev;
7797 	mddb_set_t		*s;
7798 	int			err = 0;
7799 	mddb_ri_t		*rip;
7800 
7801 	/*
7802 	 * Data integrity check
7803 	 */
7804 	if (cp->c_setno >= md_nsets || cp->c_devt <= 0)
7805 		return (EINVAL);
7806 
7807 	if ((md_get_setstatus(cp->c_setno) & MD_SET_STALE))
7808 		return (0);
7809 
7810 	ddi_dev = md_dev64_to_dev(cp->c_devt);
7811 	if (ddi_lyr_get_devid(ddi_dev, &devidp) != DDI_SUCCESS) {
7812 		return (-1);
7813 	}
7814 	if (devidp == NULL) {
7815 		return (-1);
7816 	}
7817 
7818 	if ((s = mddb_setenter(cp->c_setno, MDDB_MUSTEXIST, &err)) == NULL)
7819 		return (-1);
7820 	single_thread_start(s);
7821 
7822 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
7823 		if (rip->ri_lbp == (mddb_lb_t *)NULL)
7824 			continue;
7825 		/*
7826 		 * We only update what is asked
7827 		 */
7828 		if (rip->ri_dev == cp->c_devt) {
7829 			if (update_mb_devid(s, rip, devidp) != 0) {
7830 				err = -1;
7831 				goto out;
7832 			}
7833 		}
7834 	}
7835 
7836 	if (update_locatorblock(s, cp->c_devt, devidp)) {
7837 		err = -1;
7838 		goto out;
7839 	}
7840 
7841 out:
7842 	single_thread_end(s);
7843 	mddb_setexit(s);
7844 	ddi_devid_free(devidp);
7845 	return (err);
7846 }
7847 
7848 static int
7849 delnewside(
7850 	mddb_config_t		*cp,
7851 	int			command,
7852 	md_error_t		*ep
7853 )
7854 {
7855 	mddb_set_t		*s;
7856 	int			li;
7857 	mddb_lb_t		*lbp;		/* pointer to locator block */
7858 	mddb_ln_t		*lnp;		/* pointer to locator names */
7859 	mddb_mnln_t		*mnlnp;		/* pointer to locator names */
7860 	mddb_locator_t		*lp;
7861 	mddb_sidelocator_t	*slp;
7862 	mddb_cfg_loc_t		*clp;
7863 	int			err = 0;
7864 	set_t			setno = cp->c_setno;
7865 	ddi_devid_t		devid;
7866 	ddi_devid_t		ret_devid = NULL;
7867 	char			*minor_name;
7868 	uint_t			use_devid = 0;
7869 	dev_t			ddi_dev;
7870 	md_mnname_suffix_t	*mnsn;
7871 	mddb_mnlb_t		*mnlbp;
7872 	mddb_mnsidelocator_t	*mnslp;
7873 
7874 	/* Currently don't allow addition/deletion of sides during upgrade */
7875 	if (MD_UPGRADE) {
7876 		cmn_err(CE_WARN,
7877 		    "Addition and deletion of sides not allowed"
7878 		    " during upgrade. \n");
7879 		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
7880 	}
7881 
7882 	/*
7883 	 * Data integrity check
7884 	 */
7885 	if (setno >= md_nsets || cp->c_locator.l_dev <= 0)
7886 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
7887 
7888 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
7889 		return (mddbstatus2error(ep, err, NODEV32, setno));
7890 
7891 	single_thread_start(s);
7892 	clp = &cp->c_locator;
7893 
7894 	lbp = s->s_lbp;
7895 
7896 	if (lbp->lb_setno != setno) {
7897 		single_thread_end(s);
7898 		mddb_setexit(s);
7899 		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
7900 	}
7901 
7902 	/*
7903 	 * Find this device/blkno pair
7904 	 */
7905 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
7906 		ddi_dev = md_dev64_to_dev(clp->l_dev);
7907 		if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) &&
7908 		    (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, &minor_name)
7909 		    == DDI_SUCCESS)) {
7910 			if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) {
7911 				clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
7912 				use_devid = 1;
7913 				(void) strcpy(clp->l_minor_name, minor_name);
7914 			}
7915 			kmem_free(minor_name, strlen(minor_name)+1);
7916 		}
7917 		if (use_devid != 1 && ret_devid != NULL)
7918 			ddi_devid_free(ret_devid);
7919 	}
7920 	for (li = 0; li < lbp->lb_loccnt; li++) {
7921 		lp = &lbp->lb_locators[li];
7922 		if (lp->l_flags & MDDB_F_DELETED)
7923 			continue;
7924 		if (use_devid) {
7925 			if ((mddb_devid_get(s, li, &devid, &minor_name)) == 0)
7926 				continue;
7927 			if ((ddi_devid_compare(devid,
7928 			    (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) &&
7929 			    (strcmp(clp->l_minor_name, minor_name) == 0) &&
7930 			    ((daddr_t)lp->l_blkno == clp->l_blkno)) {
7931 				break;
7932 			}
7933 		} else {
7934 			if (lp->l_dev == clp->l_dev &&
7935 			    (daddr_t)lp->l_blkno == clp->l_blkno) {
7936 				break;
7937 			}
7938 		}
7939 	}
7940 
7941 	if (li == lbp->lb_loccnt) {
7942 		if (use_devid)
7943 			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
7944 		single_thread_end(s);
7945 		mddb_setexit(s);
7946 		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
7947 	}
7948 
7949 	lnp = s->s_lnp;
7950 	if (command == MDDB_NEWSIDE) {
7951 		int 	index = 0;
7952 		/*
7953 		 * If a MN diskset, need to find the index where the new
7954 		 * locator information is to be stored in the mnsidelocator
7955 		 * field of the locator block so that the locator name can
7956 		 * be stored at the same array index in the mnsuffixes
7957 		 * field of the locator names structure.
7958 		 */
7959 		if (lbp->lb_flags & MDDB_MNSET) {
7960 			if ((index = checklocator(lbp, li,
7961 			    cp->c_sideno)) == -1) {
7962 				if (use_devid) {
7963 					ddi_devid_free((ddi_devid_t)
7964 					    (uintptr_t)clp->l_devid);
7965 				}
7966 				single_thread_end(s);
7967 				mddb_setexit(s);
7968 				return (mdmddberror(ep, MDE_DB_TOOSMALL,
7969 					NODEV32, setno));
7970 			}
7971 		}
7972 
7973 		/*
7974 		 * Store the locator name before the sidelocator information
7975 		 * in case a panic occurs between these 2 steps.  Must have
7976 		 * the locator name information in order to print reasonable
7977 		 * error information.
7978 		 */
7979 		if (splitname2locatorblock(&cp->c_devname, lnp, li,
7980 		    cp->c_sideno, index)) {
7981 			if (use_devid)
7982 				ddi_devid_free(
7983 				    (ddi_devid_t)(uintptr_t)clp->l_devid);
7984 			single_thread_end(s);
7985 			mddb_setexit(s);
7986 			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
7987 						setno));
7988 		}
7989 
7990 		if (cfgloc2locator(lbp, clp, li, cp->c_sideno, index)) {
7991 			if (use_devid)
7992 				ddi_devid_free(
7993 				    (ddi_devid_t)(uintptr_t)clp->l_devid);
7994 			single_thread_end(s);
7995 			mddb_setexit(s);
7996 			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
7997 						setno));
7998 		}
7999 	}
8000 
8001 	if (use_devid)
8002 		ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8003 
8004 	if (command == MDDB_DELSIDE) {
8005 		int i;
8006 		for (i = 0; i < lbp->lb_loccnt; i++) {
8007 			if (lbp->lb_flags & MDDB_MNSET) {
8008 				int	j;
8009 				mnlbp = (mddb_mnlb_t *)lbp;
8010 				for (j = 0; j < MD_MNMAXSIDES; j++) {
8011 				    mnslp = &mnlbp->lb_mnsidelocators[j][i];
8012 				    if (mnslp->mnl_sideno == cp->c_sideno)
8013 					break;
8014 				}
8015 				if (j < MD_MNMAXSIDES) {
8016 					mnslp->mnl_mnum = NODEV32;
8017 					mnslp->mnl_sideno = 0;
8018 					mnlnp = (mddb_mnln_t *)lnp;
8019 					mnsn = &(mnlnp->ln_mnsuffixes[j][i]);
8020 					bzero((caddr_t)mnsn,
8021 						sizeof (md_mnname_suffix_t));
8022 				}
8023 			} else {
8024 				slp = &lbp->lb_sidelocators[cp->c_sideno][i];
8025 				bzero((caddr_t)&lnp->ln_suffixes
8026 				    [cp->c_sideno][i], sizeof (md_name_suffix));
8027 				slp->l_mnum = NODEV32;
8028 			}
8029 		}
8030 	}
8031 
8032 	/* write new locator names to all devices */
8033 	uniqtime32(&lnp->ln_timestamp);
8034 	if (lbp->lb_flags & MDDB_MNSET)
8035 		lnp->ln_revision = MDDB_REV_MNLN;
8036 	else
8037 		lnp->ln_revision = MDDB_REV_LN;
8038 	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
8039 	err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
8040 		lbp->lb_lnblkcnt, 0);
8041 	/*
8042 	 * If a MN diskset and this is the master, set the PARSE_LOCNM
8043 	 * flag in the mddb_set structure to show that the locator
8044 	 * names have changed.
8045 	 */
8046 
8047 	if ((lbp->lb_flags & MDDB_MNSET) &&
8048 	    (md_set[s->s_setno].s_am_i_master)) {
8049 		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
8050 	}
8051 	if (err) {
8052 		if (writeretry(s)) {
8053 			single_thread_end(s);
8054 			mddb_setexit(s);
8055 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8056 		}
8057 	}
8058 
8059 	uniqtime32(&lbp->lb_timestamp);
8060 	/* write new locator to all devices */
8061 	err = writelocall(s);
8062 	computefreeblks(s); /* recompute always it may be larger */
8063 	if (err) {
8064 		if (writeretry(s)) {
8065 			single_thread_end(s);
8066 			mddb_setexit(s);
8067 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8068 		}
8069 	}
8070 
8071 	single_thread_end(s);
8072 	mddb_setexit(s);
8073 
8074 	return (0);
8075 }
8076 
8077 static int
8078 newdev(
8079 	mddb_config_t	*cp,
8080 	int		command,
8081 	md_error_t	*ep
8082 )
8083 {
8084 	mddb_set_t	*s;
8085 	mddb_mb_ic_t	*mbip, *mbip1;
8086 	int		i, j;
8087 	int		li;
8088 	mddb_lb_t	*lbp;		/* pointer to locator block */
8089 	mddb_ln_t	*lnp;		/* pointer to locator names */
8090 	mddb_locator_t	*lp;
8091 	mddb_cfg_loc_t	*clp;
8092 	int		err = 0;
8093 	set_t		setno = cp->c_setno;
8094 	ddi_devid_t	devid2;
8095 	ddi_devid_t	ret_devid = NULL;
8096 	char		*minor_name;
8097 	uint_t		use_devid = 0;
8098 	dev_t		ddi_dev;
8099 	int		old_flags;
8100 	int		flags;
8101 	int		mn_set = 0;
8102 	int		index;
8103 
8104 
8105 	/* Currently don't allow addition of new replica during upgrade */
8106 	if (MD_UPGRADE) {
8107 		cmn_err(CE_WARN,
8108 		    "Addition of new replica not allowed during upgrade.\n");
8109 		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8110 	}
8111 
8112 	/*
8113 	 * Data integrity check
8114 	 */
8115 	if (setno >= md_nsets || cp->c_locator.l_dev <= 0)
8116 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
8117 
8118 	/* Determine the flag settings for multinode sets */
8119 	flags = MDDB_NOOLDOK;
8120 	if (cp->c_multi_node)
8121 		flags |= MDDB_MULTINODE;
8122 
8123 	if ((s = mddb_setenter(setno, flags, &err)) == NULL) {
8124 		if (err != MDDB_E_NOTOWNER)
8125 			return (mddbstatus2error(ep, err, NODEV32, setno));
8126 		s = init_set(cp, flags, &err);
8127 		if (s == NULL)
8128 			return (mddbstatus2error(ep, err, NODEV32, setno));
8129 	}
8130 
8131 	single_thread_start(s);
8132 
8133 	/* shorthand */
8134 	clp = &cp->c_locator;
8135 
8136 	/* shorthand */
8137 	lbp = s->s_lbp;
8138 
8139 	if (lbp->lb_setno != setno) {
8140 		single_thread_end(s);
8141 		mddb_setexit(s);
8142 		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
8143 	}
8144 
8145 	/*
8146 	 * See if this device/blkno pair is already a replica
8147 	 */
8148 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
8149 		ddi_dev = expldev(clp->l_dev);
8150 		if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) &&
8151 		    (ddi_lyr_get_minor_name(ddi_dev,
8152 		    S_IFBLK, &minor_name) == DDI_SUCCESS)) {
8153 			if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) {
8154 				clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
8155 				use_devid = 1;
8156 				(void) strcpy(clp->l_minor_name, minor_name);
8157 			}
8158 			kmem_free(minor_name, strlen(minor_name)+1);
8159 		}
8160 		if (use_devid != 1 && ret_devid != NULL)
8161 			ddi_devid_free(ret_devid);
8162 	}
8163 
8164 	for (i = 0; i < lbp->lb_loccnt;	 i++) {
8165 		lp = &lbp->lb_locators[i];
8166 		if (lp->l_flags & MDDB_F_DELETED)
8167 			continue;
8168 		if (use_devid) {
8169 			if ((mddb_devid_get(s, i, &devid2, &minor_name)) == 0)
8170 				continue;
8171 			if ((ddi_devid_compare(devid2,
8172 			    (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) &&
8173 			    (strcmp(clp->l_minor_name, minor_name) == 0) &&
8174 			    ((daddr_t)lp->l_blkno == clp->l_blkno)) {
8175 				if (command == MDDB_NEWDEV) {
8176 					ddi_devid_free((ddi_devid_t)(uintptr_t)
8177 						clp->l_devid);
8178 					single_thread_end(s);
8179 					mddb_setexit(s);
8180 					return (mdmddberror(ep,
8181 						MDE_DB_EXISTS, NODEV32, setno));
8182 				}
8183 			}
8184 		} else {
8185 			if (lp->l_dev == clp->l_dev &&
8186 			    (daddr_t)lp->l_blkno == clp->l_blkno) {
8187 				if (command == MDDB_NEWDEV) {
8188 					single_thread_end(s);
8189 					mddb_setexit(s);
8190 					return (mdmddberror(ep,
8191 						MDE_DB_EXISTS, NODEV32, setno));
8192 				}
8193 			}
8194 		}
8195 	}
8196 
8197 	/*
8198 	 * Really is a new replica, go get the master blocks
8199 	 */
8200 	mbip = getmasters(s, md_expldev(clp->l_dev), clp->l_blkno,
8201 	    (uint_t *)0, &mn_set);
8202 	if (! mbip) {
8203 		if (use_devid)
8204 			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8205 		single_thread_end(s);
8206 		mddb_setexit(s);
8207 		return (mdmddberror(ep, MDE_DB_MASTER, NODEV32, setno));
8208 	}
8209 
8210 	/*
8211 	 * Compute free blocks in replica.
8212 	 */
8213 	computefreeblks(s);
8214 
8215 	/*
8216 	 * Check if this is large enough
8217 	 */
8218 	for (mbip1 = mbip, i = 0; mbip1 != NULL; mbip1 = mbip1->mbi_next)
8219 		i += mbip1->mbi_mddb_mb.mb_blkcnt;
8220 	for (j = i; j < s->s_totalblkcnt; j++) {
8221 		if (blkcheck(s, j)) {
8222 			while (mbip) {
8223 				mbip1 = mbip->mbi_next;
8224 				kmem_free((caddr_t)mbip, MDDB_IC_BSIZE);
8225 				mbip = mbip1;
8226 			}
8227 			if (use_devid)
8228 				ddi_devid_free(
8229 				    (ddi_devid_t)(uintptr_t)clp->l_devid);
8230 			mddb_devclose(md_expldev(clp->l_dev));
8231 			single_thread_end(s);
8232 			mddb_setexit(s);
8233 			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
8234 						setno));
8235 		}
8236 	}
8237 
8238 	/* Look for a deleted slot */
8239 	for (li = 0; li < lbp->lb_loccnt; li++) {
8240 		lp = &lbp->lb_locators[li];
8241 		if (lp->l_flags & MDDB_F_DELETED)
8242 			break;
8243 	}
8244 
8245 	/* If no deleted slots, add a new one */
8246 	if (li == lbp->lb_loccnt) {
8247 		/* Already have the max replicas, bail */
8248 		if (lbp->lb_loccnt == MDDB_NLB) {
8249 			if (use_devid)
8250 				ddi_devid_free((ddi_devid_t)(uintptr_t)
8251 				    clp->l_devid);
8252 			mddb_devclose(md_expldev(clp->l_dev));
8253 			single_thread_end(s);
8254 			mddb_setexit(s);
8255 			return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32,
8256 			    setno));
8257 		}
8258 		lbp->lb_loccnt++;
8259 		lp = &lbp->lb_locators[li];
8260 	}
8261 
8262 	/* Initialize the new or deleted slot */
8263 	old_flags = lp->l_flags;
8264 	lp->l_dev = clp->l_dev;
8265 	lp->l_blkno = (daddr32_t)clp->l_blkno;
8266 	lp->l_flags = clp->l_flags;
8267 
8268 	/* shorthand */
8269 	lnp = s->s_lnp;
8270 
8271 	index = 0;
8272 	if ((lbp->lb_flags & MDDB_MNSET) || (flags & MDDB_MULTINODE)) {
8273 		/*
8274 		 * If a MN diskset, need to find the index where the new
8275 		 * locator information is to be stored in the mnsidelocator
8276 		 * field of the locator block so that the locator name can
8277 		 * be stored at the same array index in the mnsuffixes
8278 		 * field of the locator names structure.
8279 		 */
8280 		lbp->lb_flags |= MDDB_MNSET;
8281 		if ((index = checklocator(lbp, li, s->s_sideno)) == -1) {
8282 			if (use_devid)
8283 				ddi_devid_free((ddi_devid_t)(uintptr_t)clp->
8284 				    l_devid);
8285 			lp->l_flags = old_flags;
8286 			lbp->lb_loccnt--;
8287 			mddb_devclose(md_expldev(clp->l_dev));
8288 			single_thread_end(s);
8289 			mddb_setexit(s);
8290 			return (mdmddberror(ep, MDE_DB_TOOSMALL,
8291 				NODEV32, setno));
8292 		}
8293 	}
8294 	/*
8295 	 * Store the locator name before the sidelocator information
8296 	 * in case a panic occurs between these 2 steps.  Must have
8297 	 * the locator name information in order to print reasonable
8298 	 * error information.
8299 	 */
8300 	if (splitname2locatorblock(&cp->c_devname, lnp, li,
8301 	    s->s_sideno, index)) {
8302 		if (use_devid)
8303 			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8304 		lp->l_flags = old_flags;
8305 		lbp->lb_loccnt--;
8306 		mddb_devclose(md_expldev(clp->l_dev));
8307 		single_thread_end(s);
8308 		mddb_setexit(s);
8309 		return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno));
8310 	}
8311 
8312 	/*
8313 	 * Compute free blocks in replica before calling cfgloc2locator
8314 	 * since cfgloc2locator may attempt to alloc an unused block
8315 	 * to store the device id.
8316 	 * mbiarray needs to be setup before calling computefreeblks.
8317 	 */
8318 	s->s_mbiarray[li] = mbip;
8319 	computefreeblks(s);
8320 
8321 	if (cfgloc2locator(lbp, clp, li, s->s_sideno, index)) {
8322 		if (use_devid)
8323 			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8324 		lp->l_flags = old_flags;
8325 		lbp->lb_loccnt--;
8326 		s->s_mbiarray[li] = 0;
8327 		mddb_devclose(md_expldev(clp->l_dev));
8328 		single_thread_end(s);
8329 		mddb_setexit(s);
8330 		return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno));
8331 	}
8332 
8333 	if (use_devid)
8334 		ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8335 
8336 	uniqtime32(&lbp->lb_timestamp);
8337 	lp->l_flags = MDDB_F_ACTIVE;
8338 
8339 	/* write db copy to new device */
8340 	err = writecopy(s, li, MDDB_WRITECOPY_ALL);
8341 	lp->l_flags |= MDDB_F_UP2DATE;
8342 
8343 	/* write new locator names to all devices */
8344 	uniqtime32(&lnp->ln_timestamp);
8345 	if (lbp->lb_flags & MDDB_MNSET)
8346 		lnp->ln_revision = MDDB_REV_MNLN;
8347 	else
8348 		lnp->ln_revision = MDDB_REV_LN;
8349 	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
8350 	err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
8351 		lbp->lb_lnblkcnt, 0);
8352 	/*
8353 	 * If a MN diskset and this is the master, set the PARSE_LOCNM
8354 	 * flag in the mddb_set structure to show that the locator
8355 	 * names have changed.
8356 	 */
8357 
8358 	if ((lbp->lb_flags & MDDB_MNSET) &&
8359 	    (md_set[s->s_setno].s_am_i_master)) {
8360 		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
8361 	}
8362 	if (err) {
8363 		if (writeretry(s)) {
8364 			single_thread_end(s);
8365 			mddb_setexit(s);
8366 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8367 		}
8368 	}
8369 
8370 	/* Data tags not supported on MN sets */
8371 	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
8372 	    (!(lbp->lb_flags & MDDB_MNSET)) &&
8373 	    setno != MD_LOCAL_SET)
8374 		if (set_dtag(s, ep))
8375 			mdclrerror(ep);
8376 
8377 	/* Write data tags to all accessible devices */
8378 	/* Data tags not supported on MN sets */
8379 	if (!(lbp->lb_flags & MDDB_MNSET)) {
8380 		(void) dt_write(s);
8381 	}
8382 
8383 	/* write new locator to all devices */
8384 	err = writelocall(s);
8385 
8386 	(void) upd_med(s, "newdev(0)");
8387 
8388 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_REPLICA, setno,
8389 	    md_expldev(clp->l_dev));
8390 
8391 	computefreeblks(s); /* recompute always it may be smaller */
8392 	if (err) {
8393 		if (writeretry(s)) {
8394 			single_thread_end(s);
8395 			mddb_setexit(s);
8396 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8397 		}
8398 	}
8399 
8400 	single_thread_end(s);
8401 	mddb_setexit(s);
8402 
8403 	return (0);
8404 }
8405 
#ifdef DEBUG
/*
 * mddb_check_set - DEBUG only sanity check of one set.  Every directory
 *		entry of every directory block must reference a record
 *		block with the MDDB_MAGIC_RB magic, and any user data
 *		pointer that is set must be greater than 2000.  A set
 *		with no database (s_db) is silently skipped.
 */
static void
mddb_check_set(
	set_t	setno
)
{
	mddb_set_t	*s;
	mddb_db_t	*dbp;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;

	/* nothing to check when the set has no database */
	if (! md_set[setno].s_db)
		return;

	s = (mddb_set_t *)md_set[setno].s_db;

	dbp = s->s_dbp;
	while (dbp != NULL) {
		dep = dbp->db_firstentry;
		while (dep != NULL) {
			rbp = dep->de_rb;
			ASSERT(rbp->rb_magic == MDDB_MAGIC_RB);
			if (dep->de_rb_userdata)
				ASSERT((uintptr_t)dep->de_rb_userdata > 2000);
			dep = dep->de_next;
		}
		dbp = dbp->db_next;
	}
}
#endif /* DEBUG */
8433 
8434 /*
8435  * Exported Entry Points
8436  */
#ifdef DEBUG
/*
 * mddb_check - DEBUG only: run mddb_check_set() on every set that
 *		currently has a database.  Sets without one are skipped
 *		rather than ending the walk, so that a gap in the set
 *		numbers does not hide the sets that follow it.
 */
void
mddb_check(void)
{
	int	i;

	for (i = 0; i < md_nsets; i++) {
		if (! md_set[i].s_db)
			continue;

		mddb_check_set(i);
	}

}
#endif /* DEBUG */
8452 
8453 int
8454 mddb_configure(
8455 	mddb_cfgcmd_t	command,
8456 	mddb_config_t	*cp
8457 )
8458 {
8459 	mddb_set_t	*s;
8460 	md_error_t	*ep = &cp->c_mde;
8461 	int		flag = 0;
8462 	int		err = 0;
8463 	set_t		setno = cp->c_setno;
8464 
8465 	mdclrerror(ep);
8466 
8467 	switch (command) {
8468 	    case MDDB_NEWDEV:
8469 		err = newdev(cp, command, ep);
8470 		break;
8471 
8472 	    case MDDB_NEWSIDE:
8473 	    case MDDB_DELSIDE:
8474 		err = delnewside(cp, command, ep);
8475 		break;
8476 
8477 	    case MDDB_GETDEV:
8478 	    case MDDB_DELDEV:
8479 	    case MDDB_ENDDEV:
8480 		err = getdeldev(cp, command, ep);
8481 		break;
8482 
8483 	    case MDDB_GETDRVRNAME:
8484 		err = getdriver(&cp->c_locator);
8485 		break;
8486 
8487 	    case MDDB_USEDEV:
8488 		/*
8489 		 * Note: must allow USEDEV ioctl during upgrade to support
8490 		 * auto-take disksets.
8491 		 *
8492 		 * Also during the set import if the md_devid_destroy
8493 		 * flag is set then error out
8494 		 */
8495 
8496 		if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy)
8497 			return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
8498 
8499 		if (setno >= md_nsets)
8500 			return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
8501 
8502 		if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
8503 			if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
8504 				err = mddbstatus2error(ep, err, NODEV32, setno);
8505 				break;
8506 			}
8507 		}
8508 		if (setno == MD_LOCAL_SET)
8509 			flag = MDDB_F_IOCTL;
8510 		if (cp->c_locator.l_old_devid) {
8511 			md_set_setstatus(setno, MD_SET_REPLICATED_IMPORT);
8512 		}
8513 		if ((err = ridev(&s->s_rip, &cp->c_locator, NULL, flag)) != 0)
8514 			err = mddbstatus2error(ep, err, NODEV32, setno);
8515 		mddb_setexit(s);
8516 		break;
8517 
8518 	    case MDDB_RELEASESET:
8519 		mutex_enter(&mddb_lock);
8520 		mddb_unload_set(cp->c_setno);
8521 		mutex_exit(&mddb_lock);
8522 		break;
8523 
8524 	    case MDDB_SETDID:
8525 		err = setdid(cp);
8526 		break;
8527 
8528 	    default:
8529 		err = mdmddberror(ep, MDE_DB_INVALID, NODEV32, cp->c_setno);
8530 	}
8531 
8532 	return (err);
8533 }
8534 
8535 int
8536 mddb_getoptloc(
8537 	mddb_optloc_t		*ol
8538 )
8539 {
8540 	mddb_set_t		*s;
8541 	mddb_db_t		*dbp;
8542 	mddb_de_ic_t		*dep;
8543 	mddb_recid_t		id;
8544 	set_t			setno;
8545 
8546 	ol->li[0] = -1;
8547 	ol->li[1] = -1;
8548 
8549 	id = ol->recid;
8550 	setno = DBSET(id);
8551 	if (setno >= md_nsets)
8552 		return (EINVAL);
8553 
8554 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL)) == NULL)
8555 		return (0);
8556 
8557 	id = DBID(id);
8558 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8559 		for (dep = dbp->db_firstentry;
8560 		    dep != NULL; dep = dep->de_next) {
8561 			if (dep->de_recid != id)
8562 				continue;
8563 			ol->li[0] = dep->de_optinfo[0].o_li;
8564 			ol->li[1] = dep->de_optinfo[1].o_li;
8565 			mddb_setexit(s);
8566 			return (0);
8567 		}
8568 	}
8569 	mddb_setexit(s);
8570 	return (0);
8571 }
8572 
8573 void
8574 mddb_init(void)
8575 {
8576 	mddb_set_t	*s;
8577 
8578 	mutex_init(&mddb_lock, NULL, MUTEX_DEFAULT, NULL);
8579 	if ((s = init_set(NULL, MDDB_NOINIT, NULL)) != NULL)
8580 		mddb_setexit(s);
8581 }
8582 
8583 
8584 void
8585 mddb_unload(void)
8586 {
8587 	int	i;
8588 
8589 	mutex_enter(&mddb_lock);
8590 
8591 	for (i = 0; i < md_nsets; i++) {
8592 		md_clr_setstatus(i, MD_SET_KEEPTAG);
8593 		mddb_unload_set(i);
8594 	}
8595 
8596 	crcfreetab();
8597 
8598 	mutex_exit(&mddb_lock);
8599 }
8600 
8601 mddb_recid_t
8602 mddb_createrec(
8603 	size_t		usersize,	 /* size of db record */
8604 	mddb_type_t	type,		 /* type1 of db record */
8605 	uint_t		type2,		 /* type2 of db record */
8606 	md_create_rec_option_t	options, /* options for this creation  */
8607 	set_t		setno		 /* set number to create record in */
8608 )
8609 {
8610 	mddb_set_t	*s;
8611 	mddb_db_t	*dbp, *prevdbp, *newdbp;
8612 	mddb_db32_t	*db32p;
8613 	mddb_de_ic_t	*dep;
8614 	/* LINTED variable unused - used for sizeof calculations */
8615 	mddb_de32_t	*de32p;
8616 	mddb_rb32_t	*rbp;
8617 	size_t		recsize;
8618 	ulong_t		blkcnt;
8619 	ulong_t		maxblocks;
8620 	size_t		desize, desize_ic;
8621 	size_t		used;
8622 	mddb_recid_t	newid;
8623 	caddr_t		tmppnt;
8624 	int		i, err = 0;
8625 	void		*userdata;
8626 	uint_t		flag_type;
8627 
8628 #if defined(_ILP32) && !defined(lint)
8629 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
8630 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
8631 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
8632 #endif
8633 
8634 	/*
8635 	 * everyone is supposed to sepcify if it's a
8636 	 * 32 bit or a 64 bit record
8637 	 */
8638 	if ((options &(MD_CRO_32BIT|MD_CRO_64BIT)) == 0) {
8639 		return (MDDB_E_INVALID);
8640 	}
8641 
8642 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
8643 		return (err);
8644 
8645 	if (checkstate(s, MDDB_PROBE)) {
8646 		mddb_setexit(s);
8647 		return (MDDB_E_NOTNOW);
8648 	}
8649 
8650 	recsize = roundup((sizeof (*rbp) - sizeof (rbp->rb_data)) +
8651 				usersize, MDDB_BSIZE);
8652 	blkcnt = btodb(recsize);
8653 
8654 	if (mddb_maxblocks)
8655 		maxblocks = mddb_maxblocks;
8656 	else
8657 		maxblocks = (MDDB_BSIZE -
8658 			(sizeof (*db32p) + sizeof (*de32p) -
8659 			sizeof (de32p->de32_blks))) / sizeof (mddb_block_t);
8660 
8661 	if (blkcnt > maxblocks) {
8662 		mddb_setexit(s);
8663 		return (MDDB_E_INVALID);
8664 	}
8665 	/*
8666 	 * allocate record block
8667 	 * and new directory block so to avoid sleeping
8668 	 * after starting single_thread
8669 	 */
8670 	rbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
8671 	if ((options & MD_CRO_OPTIMIZE) == 0)
8672 		userdata = kmem_zalloc(usersize, KM_SLEEP);
8673 	newdbp = (mddb_db_t *)kmem_zalloc(sizeof (*newdbp), KM_SLEEP);
8674 
8675 	/*
8676 	 * if this is the largest record allocate new buffer for
8677 	 * checkcopy();
8678 	 */
8679 	if (recsize > s->s_databuffer_size) {
8680 		tmppnt = (caddr_t)kmem_zalloc(recsize, KM_SLEEP);
8681 		/*
8682 		 * this test is incase when to sleep during kmem_alloc
8683 		 * and some other task bumped max record size
8684 		 */
8685 		if (recsize > s->s_databuffer_size) {
8686 			if (s->s_databuffer_size)
8687 				kmem_free(s->s_databuffer,
8688 				    s->s_databuffer_size);
8689 			s->s_databuffer = tmppnt;
8690 			s->s_databuffer_size = recsize;
8691 		} else {
8692 			kmem_free(tmppnt, recsize);
8693 		}
8694 	}
8695 
8696 	single_thread_start(s);
8697 
8698 	newid = 0;
8699 	do {
8700 		newid++;
8701 		if (DBID(newid) == 0) {
8702 			kmem_free((caddr_t)newdbp, sizeof (*newdbp));
8703 			kmem_free((caddr_t)rbp, ((size_t)recsize));
8704 			if ((options & MD_CRO_OPTIMIZE) == 0)
8705 				kmem_free(userdata, usersize);
8706 			single_thread_end(s);
8707 			mddb_setexit(s);
8708 			return (MDDB_E_NOTNOW);
8709 		}
8710 
8711 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8712 			for (dep = dbp->db_firstentry; dep;
8713 			    dep = dep->de_next) {
8714 				if (dep->de_recid == newid)
8715 					break;
8716 			}
8717 			if (dep != NULL)
8718 				break;
8719 		}
8720 	} while (dbp);
8721 
8722 	desize = (sizeof (*de32p) - sizeof (de32p->de32_blks)) +
8723 			(sizeof (mddb_block_t) * blkcnt);
8724 
8725 	/*
8726 	 * see if a directory block exists which will hold this entry
8727 	 */
8728 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8729 		used = sizeof (*db32p);
8730 		for (dep = dbp->db_firstentry;
8731 		    dep != NULL; dep = dep->de_next) {
8732 			used += sizeof (*de32p) - sizeof (de32p->de32_blks);
8733 			used += sizeof (mddb_block_t) * dep->de_blkcount;
8734 		}
8735 		if ((used + desize) < MDDB_BSIZE)
8736 			break;
8737 	}
8738 	if (dbp) {
8739 		kmem_free((caddr_t)newdbp, sizeof (*newdbp));
8740 		if (blkcnt > s->s_freeblkcnt) {
8741 			kmem_free((caddr_t)rbp, ((size_t)recsize));
8742 			if ((options & MD_CRO_OPTIMIZE) == 0)
8743 				kmem_free(userdata, usersize);
8744 			single_thread_end(s);
8745 			mddb_setexit(s);
8746 			return (MDDB_E_NOSPACE);
8747 		}
8748 		prevdbp = NULL;
8749 	} else {
8750 		/*
8751 		 * need to add directory block
8752 		 */
8753 		if ((blkcnt + 1) > s->s_freeblkcnt) {
8754 			kmem_free((caddr_t)newdbp, sizeof (*newdbp));
8755 			kmem_free((caddr_t)rbp, ((size_t)recsize));
8756 			if ((options & MD_CRO_OPTIMIZE) == 0)
8757 				kmem_free(userdata, usersize);
8758 			single_thread_end(s);
8759 			mddb_setexit(s);
8760 			return (MDDB_E_NOSPACE);
8761 		}
8762 		for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next);
8763 		dbp->db_next = newdbp;
8764 		bzero((caddr_t)dbp->db_next, sizeof (*newdbp));
8765 		dbp->db_nextblk = getfreeblks(s, 1);
8766 		dbp->db_next->db_blknum = dbp->db_nextblk;
8767 		prevdbp = dbp;
8768 		dbp = dbp->db_next;
8769 		dbp->db_nextblk = 0;
8770 		dbp->db_firstentry = NULL;
8771 		dbp->db_recsum = 0;
8772 		dbp->db_magic = MDDB_MAGIC_DB;
8773 	}
8774 	/*
8775 	 * ready to add record
8776 	 */
8777 	desize_ic = (sizeof (*dep) - sizeof (dep->de_blks)) +
8778 			(sizeof (mddb_block_t) * blkcnt);
8779 	if (dbp->db_firstentry) {
8780 		for (dep = dbp->db_firstentry; dep->de_next;
8781 		    dep = dep->de_next);
8782 		dep->de_next = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
8783 		dep = dep->de_next;
8784 	} else {
8785 		dep = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
8786 		dbp->db_firstentry = dep;
8787 	}
8788 	bzero((caddr_t)dep, desize_ic);
8789 	dep->de_recid = newid;
8790 	/*
8791 	 * Optimized records have an owner node associated with them in
8792 	 * a MN diskset.  The owner is only set on a node that is actively
8793 	 * writing to that record.  The other nodes will show that record
8794 	 * as having an invalid owner.  The owner for an optimized record
8795 	 * is used during fixoptrecord to determine which node should
8796 	 * write out the record when the replicas associated with that
8797 	 * optimized record have been changed.
8798 	 */
8799 	if (MD_MNSET_SETNO(s->s_setno)) {
8800 		dep->de_owner_nodeid = MD_MN_INVALID_NID;
8801 	}
8802 	dep->de_type1 =	type;
8803 	dep->de_type2 = type2;
8804 	dep->de_reqsize = usersize;
8805 	dep->de_recsize = recsize;
8806 	dep->de_blkcount = blkcnt;
8807 	flag_type = options &
8808 	    (MD_CRO_OPTIMIZE | MD_CRO_STRIPE | MD_CRO_MIRROR | MD_CRO_RAID |
8809 		MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG |
8810 		MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG);
8811 	switch (flag_type) {
8812 	case MD_CRO_OPTIMIZE:
8813 		dep->de_flags = MDDB_F_OPT;
8814 		getoptdev(s, dep, 0);
8815 		getoptdev(s, dep, 1);
8816 		break;
8817 	case MD_CRO_STRIPE:
8818 		dep->de_flags = MDDB_F_STRIPE;
8819 		break;
8820 	case MD_CRO_MIRROR:
8821 		dep->de_flags = MDDB_F_MIRROR;
8822 		break;
8823 	case MD_CRO_RAID:
8824 		dep->de_flags = MDDB_F_RAID;
8825 		break;
8826 	case MD_CRO_SOFTPART:
8827 		dep->de_flags = MDDB_F_SOFTPART;
8828 		break;
8829 	case MD_CRO_TRANS_MASTER:
8830 		dep->de_flags = MDDB_F_TRANS_MASTER;
8831 		break;
8832 	case MD_CRO_TRANS_LOG:
8833 		dep->de_flags = MDDB_F_TRANS_LOG;
8834 		break;
8835 	case MD_CRO_HOTSPARE:
8836 		dep->de_flags = MDDB_F_HOTSPARE;
8837 		break;
8838 	case MD_CRO_HOTSPARE_POOL:
8839 		dep->de_flags = MDDB_F_HOTSPARE_POOL;
8840 		break;
8841 	case MD_CRO_CHANGELOG:
8842 		dep->de_flags = MDDB_F_CHANGELOG;
8843 		break;
8844 	}
8845 	/*
8846 	 * try to get all blocks consecutive. If not possible
8847 	 * just get them one at a time
8848 	 */
8849 	dep->de_blks[0] = getfreeblks(s, blkcnt);
8850 	if (dep->de_blks[0]) {
8851 		for (i = 1; i < blkcnt; i++)
8852 			dep->de_blks[i] = dep->de_blks[0] + i;
8853 	} else {
8854 		for (i = 0; i < blkcnt;	 i++)
8855 			dep->de_blks[i] = getfreeblks(s, 1);
8856 	}
8857 	dep->de_rb = rbp;
8858 	bzero((caddr_t)rbp, recsize);
8859 	rbp->rb_magic = MDDB_MAGIC_RB;
8860 
8861 	/* Do we have to create an old style (32 bit) record?  */
8862 	if (options & MD_CRO_32BIT) {
8863 		rbp->rb_revision = MDDB_REV_RB;
8864 	} else {
8865 		rbp->rb_revision = MDDB_REV_RB64;
8866 	}
8867 
8868 	/* set de_rb_userdata for non optimization records */
8869 	if ((options & MD_CRO_OPTIMIZE) == 0) {
8870 		dep->de_rb_userdata = userdata;
8871 	}
8872 
8873 	uniqtime32(&rbp->rb_timestamp);
8874 	/* Generate the crc for this record */
8875 	rec_crcgen(s, dep, rbp);
8876 	tmppnt = (caddr_t)rbp;
8877 	/*
8878 	 * the following code writes new records to all instances of
8879 	 * the data base. Writing one block at a time to each instance
8880 	 * is safe because they are not yet in a directory entry which
8881 	 * has been written to the data base
8882 	 */
8883 	err = 0;
8884 	if ((options & MD_CRO_OPTIMIZE) == 0) {
8885 		for (i = 0; i < blkcnt;	 i++) {
8886 			err |= writeall(s, (caddr_t)tmppnt,
8887 				dep->de_blks[i], 1, 0);
8888 			tmppnt += MDDB_BSIZE;
8889 		}
8890 	} else {
8891 		if ((MD_MNSET_SETNO(s->s_setno)) &&
8892 		    md_set[s->s_setno].s_am_i_master) {
8893 		/*
8894 		 * If a MN diskset then only master writes out newly
8895 		 * created optimized record.
8896 		 */
8897 			err |= writeoptrecord(s, dep);
8898 		}
8899 	}
8900 	uniqtime32(&dbp->db_timestamp);
8901 	dbp->db_revision = MDDB_REV_DB;
8902 	/* Don't include opt resync and change log records in global XOR */
8903 	if (!(dep->de_flags & MDDB_F_OPT) &&
8904 	    !(dep->de_flags & MDDB_F_CHANGELOG))
8905 		dbp->db_recsum ^= rbp->rb_checksum;
8906 	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
8907 	create_db32rec(db32p, dbp);
8908 	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
8909 	err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0);
8910 	if (prevdbp) {
8911 		dbp = prevdbp;
8912 		uniqtime32(&dbp->db_timestamp);
8913 		dbp->db_revision = MDDB_REV_DB;
8914 		create_db32rec(db32p, dbp);
8915 		crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
8916 		err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0);
8917 	}
8918 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
8919 	if (err) {
8920 		if (writeretry(s)) {
8921 			s->s_zombie = newid;
8922 			single_thread_end(s);
8923 			mddb_setexit(s);
8924 			return (MDDB_E_NOTNOW);
8925 		}
8926 	}
8927 	single_thread_end(s);
8928 	mddb_setexit(s);
8929 
8930 	ASSERT((newid & MDDB_SETMASK) == 0);
8931 	return (MAKERECID(setno, newid));
8932 }
8933 
8934 int
8935 mddb_deleterec(
8936 	mddb_recid_t	id
8937 )
8938 {
8939 	mddb_set_t	*s;
8940 	mddb_db_t	*dbp;
8941 	mddb_db32_t	*db32p;
8942 	mddb_de_ic_t	*dep, *dep1;
8943 	int		i;
8944 
8945 #if defined(_ILP32) && !defined(lint)
8946 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
8947 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
8948 #endif
8949 
8950 	s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL);
8951 	ASSERT(s != NULL);
8952 
8953 	id = DBID(id);
8954 	if (checkstate(s, MDDB_PROBE)) {
8955 		mddb_setexit(s);
8956 		return (MDDB_E_NOTNOW);
8957 	}
8958 
8959 	ASSERT(s->s_lbp != NULL);
8960 	single_thread_start(s);
8961 
8962 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8963 		dep1 = NULL;
8964 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
8965 			if (dep->de_recid == id)
8966 				break;
8967 			dep1 = dep;
8968 		}
8969 		if (dep != NULL)
8970 			break;
8971 	}
8972 	/*
8973 	 * no such record
8974 	 */
8975 	if (dep == NULL) {
8976 		single_thread_end(s);
8977 		ASSERT(s->s_staledeletes != 0);
8978 		s->s_staledeletes--;
8979 		mddb_setexit(s);
8980 		return (0);
8981 	}
8982 
8983 	if (!(dep->de_flags & MDDB_F_OPT) &&
8984 	    !(dep->de_flags & MDDB_F_CHANGELOG)) {
8985 		dbp->db_recsum ^= dep->de_rb->rb_checksum;
8986 		dbp->db_recsum ^= dep->de_rb->rb_checksum_fiddle;
8987 	}
8988 
8989 	if (dep->de_rb_userdata != NULL) {
8990 		if (dep->de_icreqsize)
8991 			kmem_free(dep->de_rb_userdata_ic, dep->de_icreqsize);
8992 		else
8993 			kmem_free(dep->de_rb_userdata, dep->de_reqsize);
8994 	}
8995 
8996 	kmem_free((caddr_t)dep->de_rb, dep->de_recsize);
8997 
8998 	for (i = 0; i < dep->de_blkcount; i++)
8999 		blkfree(s, dep->de_blks[i]);
9000 	if (dep1)
9001 		dep1->de_next = dep->de_next;
9002 	else
9003 		dbp->db_firstentry = dep->de_next;
9004 
9005 	kmem_free(dep, sizeofde(dep));
9006 
9007 	uniqtime32(&dbp->db_timestamp);
9008 	dbp->db_revision = MDDB_REV_DB;
9009 	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
9010 	create_db32rec(db32p, dbp);
9011 	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
9012 	if (writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0)) {
9013 		if (writeretry(s)) {
9014 			/*
9015 			 * staledelete is used to mark deletes which failed.
9016 			 * its only use is to not panic when the user retries
9017 			 * the delete once the database is active again
9018 			 */
9019 			single_thread_end(s);
9020 			s->s_staledeletes++;
9021 			kmem_free((caddr_t)db32p, MDDB_BSIZE);
9022 			mddb_setexit(s);
9023 			return (MDDB_E_NOTNOW);
9024 		}
9025 	}
9026 	single_thread_end(s);
9027 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
9028 	mddb_setexit(s);
9029 	return (0);
9030 }
9031 
9032 mddb_recid_t
9033 mddb_getnextrec(
9034 	mddb_recid_t		id,
9035 	mddb_type_t		typ,
9036 	uint_t			type2
9037 )
9038 {
9039 	mddb_set_t		*s;
9040 	mddb_db_t		*dbp;
9041 	mddb_de_ic_t		*dep;
9042 	int			searching, err;
9043 	set_t			setno;
9044 
9045 	setno = DBSET(id);
9046 	id = DBID(id);
9047 	searching = id;
9048 
9049 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
9050 		return (err);
9051 
9052 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9053 		for (dep = dbp->db_firstentry;
9054 		    dep != NULL; dep = dep->de_next) {
9055 			if (searching) {
9056 				if (dep->de_recid == id)
9057 					searching = 0;
9058 			} else {
9059 				if ((typ == MDDB_ALL || dep->de_type1 == typ) &&
9060 				    (type2 == 0 || dep->de_type2 == type2)) {
9061 					id = dep->de_recid;
9062 					mddb_setexit(s);
9063 					ASSERT((id & MDDB_SETMASK) == 0);
9064 					return (MAKERECID(setno, id));
9065 				}
9066 			}
9067 		}
9068 	}
9069 
9070 	mddb_setexit(s);
9071 
9072 	if (searching)
9073 		return (MDDB_E_NORECORD);
9074 	return (0);
9075 }
9076 
9077 void *
9078 mddb_getrecaddr(
9079 	mddb_recid_t		id
9080 )
9081 {
9082 	mddb_set_t		*s;
9083 	mddb_db_t		*dbp;
9084 	mddb_de_ic_t		*dep;
9085 	void			*rval;
9086 
9087 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9088 		return (NULL);
9089 
9090 	id = DBID(id);
9091 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9092 		for (dep = dbp->db_firstentry;
9093 		    dep != NULL; dep = dep->de_next) {
9094 			if (dep->de_recid != id)
9095 				continue;
9096 			if (dep->de_rb_userdata)
9097 				rval = (void *)dep->de_rb_userdata;
9098 			else
9099 				rval = (void *)dep->de_rb->rb_data;
9100 			mddb_setexit(s);
9101 			return (rval);
9102 		}
9103 	}
9104 
9105 	mddb_setexit(s);
9106 	return (NULL);
9107 }
9108 
9109 
9110 mddb_de_ic_t *
9111 mddb_getrecdep(
9112 	mddb_recid_t		id
9113 )
9114 {
9115 	mddb_set_t		*s;
9116 	mddb_db_t		*dbp;
9117 	mddb_de_ic_t		*dep;
9118 
9119 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9120 		return (NULL);
9121 
9122 	id = DBID(id);
9123 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9124 		for (dep = dbp->db_firstentry;
9125 		    dep != NULL; dep = dep->de_next) {
9126 			if (dep->de_recid != id)
9127 				continue;
9128 			mddb_setexit(s);
9129 			return (dep);
9130 		}
9131 	}
9132 
9133 	mddb_setexit(s);
9134 	return (NULL);
9135 }
9136 
9137 void *
9138 mddb_getrecaddr_resize(
9139 	mddb_recid_t		id,
9140 	size_t			icsize,
9141 	off_t			off
9142 )
9143 {
9144 	mddb_set_t		*s;
9145 	mddb_db_t		*dbp;
9146 	mddb_de_ic_t		*dep;
9147 	void			*rval = NULL;
9148 
9149 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9150 		return (NULL);
9151 
9152 	id = DBID(id);
9153 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9154 		for (dep = dbp->db_firstentry;
9155 		    dep != NULL; dep = dep->de_next) {
9156 			if (dep->de_recid != id)
9157 				continue;
9158 			if (dep->de_rb_userdata)
9159 				rval = (void *)dep->de_rb_userdata;
9160 			else
9161 				rval = (void *)dep->de_rb->rb_data;
9162 			break;
9163 		}
9164 		if (rval != NULL)
9165 			break;
9166 	}
9167 
9168 	if (rval == NULL) {
9169 		mddb_setexit(s);
9170 		return (NULL);
9171 	}
9172 
9173 	if (dep->de_rb_userdata) {
9174 		caddr_t nud;
9175 
9176 		if (dep->de_icreqsize || (dep->de_reqsize >= icsize)) {
9177 			mddb_setexit(s);
9178 			return (rval);
9179 		}
9180 		ASSERT((dep->de_reqsize + off) <= icsize);
9181 		nud = kmem_zalloc(icsize, KM_SLEEP);
9182 		bcopy(dep->de_rb_userdata, nud + off, dep->de_reqsize);
9183 		kmem_free(dep->de_rb_userdata, dep->de_reqsize);
9184 		dep->de_rb_userdata = nud + off;
9185 		dep->de_rb_userdata_ic = nud;
9186 		dep->de_icreqsize = icsize;
9187 		rval = nud;
9188 	} else {
9189 		size_t recsize;
9190 		/* LINTED variable unused - used for sizeof calculations */
9191 		mddb_rb32_t *nrbp;
9192 
9193 		recsize = roundup((sizeof (*nrbp) - sizeof (nrbp->rb_data)) +
9194 				icsize, MDDB_BSIZE);
9195 		if (dep->de_recsize < recsize)
9196 			cmn_err(CE_PANIC, "mddb_getrecaddr_resize: only "
9197 				"nonoptimized records can be resized\n");
9198 	}
9199 
9200 	mddb_setexit(s);
9201 	return (rval);
9202 }
9203 
9204 int
9205 mddb_getrecprivate(
9206 	mddb_recid_t		id
9207 )
9208 {
9209 	mddb_set_t		*s;
9210 	mddb_db_t		*dbp;
9211 	mddb_de_ic_t		*dep;
9212 	int			err = 0;
9213 	int			private;
9214 
9215 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9216 		return (err);
9217 
9218 	id = DBID(id);
9219 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9220 		for (dep = dbp->db_firstentry;
9221 		    dep != NULL; dep = dep->de_next) {
9222 			if (dep->de_recid != id)
9223 				continue;
9224 			private = (int)dep->de_rb->rb_private;
9225 			mddb_setexit(s);
9226 			return (private);
9227 		}
9228 	}
9229 
9230 	mddb_setexit(s);
9231 	return (MDDB_E_NORECORD);
9232 }
9233 
9234 void
9235 mddb_setrecprivate(
9236 	mddb_recid_t		id,
9237 	uint_t			private
9238 )
9239 {
9240 	mddb_set_t		*s;
9241 	mddb_db_t		*dbp;
9242 	mddb_de_ic_t		*dep;
9243 
9244 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) {
9245 		ASSERT(0);
9246 		return;
9247 	}
9248 
9249 	id = DBID(id);
9250 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9251 		for (dep = dbp->db_firstentry;
9252 		    dep != NULL; dep = dep->de_next) {
9253 			if (dep->de_recid != id)
9254 				continue;
9255 			dep->de_rb->rb_private = private;
9256 			mddb_setexit(s);
9257 			return;
9258 		}
9259 	}
9260 
9261 	mddb_setexit(s);
9262 	ASSERT(0);
9263 }
9264 
9265 mddb_type_t
9266 mddb_getrectype1(
9267 	mddb_recid_t		id
9268 )
9269 {
9270 	mddb_set_t		*s;
9271 	mddb_db_t		*dbp;
9272 	mddb_de_ic_t		*dep;
9273 	int			err = 0;
9274 	mddb_type_t		rval;
9275 
9276 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9277 		return (err);
9278 
9279 	id = DBID(id);
9280 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9281 		for (dep = dbp->db_firstentry;
9282 		    dep != NULL; dep = dep->de_next) {
9283 			if (dep->de_recid != id)
9284 				continue;
9285 			rval = dep->de_type1;
9286 			mddb_setexit(s);
9287 			return (rval);
9288 		}
9289 	}
9290 
9291 	mddb_setexit(s);
9292 	return (MDDB_E_NORECORD);
9293 }
9294 
9295 int
9296 mddb_getrectype2(
9297 	mddb_recid_t		id
9298 )
9299 {
9300 	mddb_set_t		*s;
9301 	mddb_db_t		*dbp;
9302 	mddb_de_ic_t		*dep;
9303 	int			err = 0;
9304 	int			rval;
9305 
9306 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9307 		return (err);
9308 
9309 	id = DBID(id);
9310 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9311 		for (dep = dbp->db_firstentry;
9312 		    dep != NULL; dep = dep->de_next) {
9313 			if (dep->de_recid != id)
9314 				continue;
9315 			rval = (int)dep->de_type2;
9316 			mddb_setexit(s);
9317 			return (rval);
9318 		}
9319 	}
9320 
9321 	mddb_setexit(s);
9322 	return (MDDB_E_NORECORD);
9323 }
9324 
9325 int
9326 mddb_getrecsize(
9327 	mddb_recid_t		id
9328 )
9329 {
9330 	mddb_set_t		*s;
9331 	mddb_db_t		*dbp;
9332 	mddb_de_ic_t		*dep;
9333 	int			err = 0;
9334 	int			rval;
9335 
9336 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9337 		return (err);
9338 
9339 	id = DBID(id);
9340 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9341 		for (dep = dbp->db_firstentry;
9342 		    dep != NULL; dep = dep->de_next) {
9343 			if (dep->de_recid != id)
9344 				continue;
9345 			rval = (int)dep->de_reqsize;
9346 			mddb_setexit(s);
9347 			return (rval);
9348 		}
9349 	}
9350 
9351 	mddb_setexit(s);
9352 	return (MDDB_E_NORECORD);
9353 }
9354 
9355 
9356 mddb_recstatus_t
9357 mddb_getrecstatus(
9358 	mddb_recid_t		id
9359 )
9360 {
9361 	mddb_set_t		*s;
9362 	mddb_db_t		*dbp;
9363 	mddb_de_ic_t		*dep;
9364 	int			err = 0;
9365 	mddb_recstatus_t	e_err;
9366 
9367 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9368 		return ((mddb_recstatus_t)err);
9369 
9370 	id = DBID(id);
9371 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9372 		for (dep = dbp->db_firstentry;
9373 		    dep != NULL; dep = dep->de_next) {
9374 			if (dep->de_recid == id)
9375 				break;
9376 		}
9377 		if (dep)
9378 			break;
9379 	}
9380 
9381 	e_err = MDDB_OK;
9382 
9383 	if (! dep)
9384 		e_err = MDDB_NORECORD;
9385 	else if (! dep->de_rb->rb_commitcnt)
9386 		e_err = MDDB_NODATA;
9387 	else if (md_get_setstatus(s->s_setno) & MD_SET_STALE)
9388 		e_err = MDDB_STALE;
9389 
9390 	mddb_setexit(s);
9391 	return (e_err);
9392 }
9393 
9394 /*
9395  * Commit given record to disk.
9396  * If committing an optimized record, do not call
9397  * with md ioctl lock held.
9398  */
9399 int
9400 mddb_commitrec(
9401 	mddb_recid_t	id
9402 )
9403 {
9404 	mddb_set_t			*s;
9405 	mddb_db_t			*dbp;
9406 	mddb_de_ic_t			*dep;
9407 	mddb_recid_t			ids[2];
9408 	mddb_rb32_t			*rbp;
9409 	static int			err = 0;
9410 	md_mn_msg_mddb_optrecerr_t	*msg_recerr;
9411 	md_mn_kresult_t			*kres;
9412 	mddb_lb_t			*lbp;
9413 	mddb_mnlb_t			*mnlbp;
9414 	mddb_locator_t			*lp;
9415 	mddb_mnsidelocator_t		*mnslp;
9416 	mddb_drvnm_t			*dn;
9417 	int				li;
9418 	md_replica_recerr_t		*recerr;
9419 	int				i, j;
9420 	int				rval;
9421 	int				hit_err = 0;
9422 
9423 	s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL);
9424 	ASSERT(s != NULL);
9425 
9426 	if (checkstate(s, MDDB_PROBE)) {
9427 		mddb_setexit(s);
9428 		return (MDDB_E_NOTNOW);
9429 	}
9430 
9431 	if (DBID(id) == 0) {
9432 		mddb_setexit(s);
9433 		return (0);
9434 	}
9435 
9436 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9437 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
9438 			if (dep->de_recid == DBID(id))
9439 				break;
9440 		}
9441 		if (dep)
9442 			break;
9443 	}
9444 
9445 	if (dep == NULL) {
9446 		mddb_setexit(s);
9447 		return (MDDB_E_NORECORD);
9448 	}
9449 
9450 	if (! (dep->de_flags & MDDB_F_OPT)) {
9451 		ids[0] = id;
9452 		ids[1] = 0;
9453 		mddb_setexit(s);
9454 		return (mddb_commitrecs(ids));
9455 	}
9456 
9457 	/*
9458 	 * following code allows multiple processes to be doing
9459 	 * optimization commits in parallel.
9460 	 * NOTE: if lots of optimization commits then the lock
9461 	 * will not get released until it winds down
9462 	 */
9463 	if (s->s_optwaiterr) {
9464 		while (s->s_optwaiterr) {
9465 			s->s_opthungerr = 1;
9466 			cv_wait(&s->s_opthungerr_cv, SETMUTEX(s->s_setno));
9467 		}
9468 		if (checkstate(s, MDDB_PROBE)) {
9469 			mddb_setexit(s);
9470 			return (MDDB_E_NOTNOW);
9471 		}
9472 	}
9473 	if (s->s_optcmtcnt++ == 0) {
9474 		single_thread_start(s);
9475 		s->s_opthavelck = 1;
9476 		if (s->s_optwantlck) {
9477 			cv_broadcast(&s->s_optwantlck_cv);
9478 			s->s_optwantlck = 0;
9479 		}
9480 	} else {
9481 		while (! s->s_opthavelck) {
9482 			s->s_optwantlck = 1;
9483 			cv_wait(&s->s_optwantlck_cv, SETMUTEX(s->s_setno));
9484 		}
9485 	}
9486 
9487 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9488 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
9489 			if (dep->de_recid == DBID(id))
9490 				break;
9491 		}
9492 		if (dep)
9493 			break;
9494 	}
9495 
9496 	if (dep == NULL) {
9497 		if (! (--s->s_optcmtcnt)) {
9498 			single_thread_end(s);
9499 			s->s_opthavelck = 0;
9500 		}
9501 		mddb_setexit(s);
9502 		return (MDDB_E_NORECORD);
9503 	}
9504 
9505 	rbp = dep->de_rb;
9506 	rbp->rb_commitcnt++;
9507 	uniqtime32(&rbp->rb_timestamp);
9508 	/* Generate the crc for this record */
9509 	rec_crcgen(s, dep, rbp);
9510 
9511 	if (writeoptrecord(s, dep)) {
9512 		if (MD_MNSET_SETNO(s->s_setno)) {
9513 			hit_err = 1;
9514 		}
9515 		s->s_optwaiterr++;
9516 	}
9517 	if (MD_MNSET_SETNO(s->s_setno)) {
9518 		/* If last thread out, release single_thread_start */
9519 		if (! (--s->s_optcmtcnt)) {
9520 			single_thread_end(s);
9521 			s->s_opthavelck = 0;
9522 		}
9523 		/*
9524 		 * If this thread had a writeoptrecords failure, then
9525 		 * need to send message to master.
9526 		 * But, multiple threads could all be running on the
9527 		 * same single_thread_start, so serialize the threads
9528 		 * by making each thread grab single_thread_start.
9529 		 *
9530 		 * After return from sending message to master message,
9531 		 * replicas associated with optimized record will havei
9532 		 * been changed (via a callback from the master to all
9533 		 * nodes), so retry call to writeoptrecord.
9534 		 * This code is replacing the call to writeretry that
9535 		 * occurs for the local and traditional disksets.
9536 		 */
9537 		if (hit_err) {
9538 			single_thread_start(s);
9539 			/*
9540 			 * If > 50% of replicas are alive then continue
9541 			 * to send message to master until writeoptrecord
9542 			 * succeeds.  For now, assume that minor name,
9543 			 * major number on this node is the same as on
9544 			 * the master node.  Once devids are turned on
9545 			 * for MN disksets, can send devid.
9546 			 */
9547 			kres = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
9548 			msg_recerr = kmem_zalloc(
9549 			    sizeof (md_mn_msg_mddb_optrecerr_t), KM_SLEEP);
9550 			while (!(md_get_setstatus(s->s_setno) &
9551 			    MD_SET_TOOFEW)) {
9552 				bzero((caddr_t)msg_recerr,
9553 				    sizeof (md_mn_msg_mddb_optrecerr_t));
9554 				lbp = s->s_lbp;
9555 				mnlbp = (mddb_mnlb_t *)lbp;
9556 				for (i = 0; i < 2; i++) {
9557 				    li = dep->de_optinfo[i].o_li;
9558 				    lp = &lbp->lb_locators[li];
9559 				    for (j = 0; j < MD_MNMAXSIDES; j++) {
9560 					mnslp =
9561 					    &mnlbp->lb_mnsidelocators[j][li];
9562 					if (mnslp->mnl_sideno == s->s_sideno)
9563 					    break;
9564 				    }
9565 				    if (j == MD_MNMAXSIDES)
9566 					continue;
9567 
9568 				    dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
9569 				    recerr = &msg_recerr->msg_recerr[i];
9570 				    recerr->r_li = li;
9571 				    recerr->r_flags =
9572 					dep->de_optinfo[i].o_flags;
9573 				    recerr->r_blkno = lp->l_blkno;
9574 				    recerr->r_mnum = md_getminor(lp->l_dev);
9575 				    (void) strncpy(recerr->r_driver_name,
9576 					dn->dn_data, MD_MAXDRVNM);
9577 				}
9578 
9579 				/* Release locks */
9580 				single_thread_end(s);
9581 				mutex_exit(SETMUTEX(s->s_setno));
9582 
9583 				/*
9584 				 * Send message to master about optimized
9585 				 * record failure.  After return, master
9586 				 * should have marked failed replicas
9587 				 * and sent parse message to slaves causing
9588 				 * slaves to have fixed up the optimized
9589 				 * record.
9590 				 * On return from ksend_message, retry
9591 				 * the write since this node should have fixed
9592 				 * the optimized resync records it owns.
9593 				 */
9594 				rval = mdmn_ksend_message(s->s_setno,
9595 					MD_MN_MSG_MDDB_OPTRECERR,
9596 					MD_MSGF_NO_BCAST,
9597 					(char *)msg_recerr,
9598 					sizeof (md_mn_msg_mddb_optrecerr_t),
9599 					kres);
9600 				if (!MDMN_KSEND_MSG_OK(rval, kres)) {
9601 					cmn_err(CE_WARN, "mddb_commitrec: "
9602 						"Unable to send optimized "
9603 						"resync record failure "
9604 						"message to other nodes in "
9605 						"diskset %s\n", s->s_setname);
9606 					mdmn_ksend_show_error(rval, kres,
9607 					    "MD_MN_MSG_MDDB_OPTRECERR");
9608 				}
9609 
9610 				/* Regrab locks */
9611 				mutex_enter(SETMUTEX(s->s_setno));
9612 				single_thread_start(s);
9613 
9614 				/* Start over in case mddb changed */
9615 				for (dbp = s->s_dbp; dbp != NULL;
9616 				    dbp = dbp->db_next) {
9617 					for (dep = dbp->db_firstentry; dep;
9618 					    dep = dep->de_next) {
9619 						if (dep->de_recid == DBID(id))
9620 							break;
9621 					}
9622 					if (dep)
9623 						break;
9624 				}
9625 				if (dep) {
9626 					rbp = dep->de_rb;
9627 					rbp->rb_commitcnt++;
9628 					uniqtime32(&rbp->rb_timestamp);
9629 					/* Generate the crc for this record */
9630 					rec_crcgen(s, dep, rbp);
9631 
9632 					/*
9633 					 * If writeoptrecord succeeds, then
9634 					 * break out.
9635 					 */
9636 					if (!(writeoptrecord(s, dep)))
9637 						break;
9638 				}
9639 			}
9640 			kmem_free(kres, sizeof (md_mn_kresult_t));
9641 			kmem_free(msg_recerr,
9642 				sizeof (md_mn_msg_mddb_optrecerr_t));
9643 
9644 			/* Resync record should be fixed - if possible */
9645 			s->s_optwaiterr--;
9646 			if (s->s_optwaiterr == 0) {
9647 				/* All errors have been handled */
9648 				if (s->s_opthungerr) {
9649 					s->s_opthungerr = 0;
9650 					cv_broadcast(&s->s_opthungerr_cv);
9651 				}
9652 			}
9653 			single_thread_end(s);
9654 			mddb_setexit(s);
9655 			if (md_get_setstatus(s->s_setno) & MD_SET_TOOFEW) {
9656 				return (MDDB_E_NOTNOW);
9657 			} else {
9658 				return (0);
9659 			}
9660 		}
9661 	} else {
9662 		/* If set is a traditional or local set */
9663 		if (! (--s->s_optcmtcnt)) {
9664 			err = 0;
9665 			if (s->s_optwaiterr) {
9666 				err = writeretry(s);
9667 				s->s_optwaiterr = 0;
9668 				if (s->s_opthungerr) {
9669 					s->s_opthungerr = 0;
9670 					cv_broadcast(&s->s_opthungerr_cv);
9671 				}
9672 			}
9673 			single_thread_end(s);
9674 			s->s_opthavelck = 0;
9675 			mddb_setexit(s);
9676 			if (err)
9677 				return (MDDB_E_NOTNOW);
9678 			return (0);
9679 		}
9680 		if (s->s_optwaiterr) {
9681 			while (s->s_optwaiterr) {
9682 				s->s_opthungerr = 1;
9683 				cv_wait(&s->s_opthungerr_cv,
9684 				    SETMUTEX(s->s_setno));
9685 			}
9686 			if (checkstate(s, MDDB_NOPROBE)) {
9687 				mddb_setexit(s);
9688 				return (MDDB_E_NOTNOW);
9689 			}
9690 		}
9691 	}
9692 
9693 	mddb_setexit(s);
9694 	return (0);
9695 }
9696 
9697 int
9698 mddb_commitrecs(
9699 	mddb_recid_t	ids[]
9700 )
9701 {
9702 	mddb_set_t	*s;
9703 	mddb_db_t	*dbp;
9704 	mddb_de_ic_t	*dep;
9705 	mddb_rb32_t	*rbp;
9706 	mddb_rb32_t	*saverbp;
9707 	mddb_lb_t	*lbp;
9708 	int		li;
9709 	uint_t		checksum;
9710 	mddb_recid_t	*idp;
9711 	int		err = 0;
9712 	set_t		setno;
9713 
9714 	if (panicstr)
9715 		cmn_err(CE_PANIC, "md: mddb: commit not allowed");
9716 
9717 	/*
9718 	 * scan through and make sure ids are from the same set
9719 	 */
9720 	setno = DBSET(ids[0]);
9721 	for (idp = ids; *idp != NULL; idp++)
9722 		ASSERT(DBSET(*idp) == setno);
9723 
9724 	s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL);
9725 
9726 	if (checkstate(s, MDDB_PROBE)) {
9727 		mddb_setexit(s);
9728 		return (MDDB_E_NOTNOW);
9729 	}
9730 
9731 	ASSERT(s->s_lbp != NULL);
9732 	err = 0;
9733 
9734 	if (! ids[0]) {
9735 		mddb_setexit(s);
9736 		return (0);
9737 	}
9738 
9739 	single_thread_start(s);
9740 	/*
9741 	 * scan through and make sure ids all exist
9742 	 */
9743 	for (idp = ids; *idp != NULL; idp++) {
9744 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9745 			for (dep = dbp->db_firstentry; dep;
9746 			    dep = dep->de_next) {
9747 				if (dep->de_recid == DBID(*idp))
9748 					break;
9749 			}
9750 			if (dep != NULL)
9751 				break;
9752 		}
9753 		if (dep == NULL) {
9754 			single_thread_end(s);
9755 			mddb_setexit(s);
9756 			return (MDDB_E_NORECORD);
9757 		}
9758 	}
9759 
9760 	/*
9761 	 * scan through records fix commit counts and
9762 	 * zero fiddles and update time stamp and rechecksum record
9763 	 */
9764 	checksum = 0;
9765 	idp = ids;
9766 	saverbp = NULL;
9767 	while (*idp) {
9768 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9769 			for (dep = dbp->db_firstentry; dep;
9770 			    dep = dep->de_next) {
9771 				if (dep->de_recid == DBID(*idp))
9772 					break;
9773 			}
9774 			if (dep != NULL)
9775 				break;
9776 		}
9777 		rbp = dep->de_rb;
9778 		ASSERT(! (dep->de_flags & MDDB_F_OPT));
9779 
9780 		getuserdata(setno, dep);
9781 		/* Don't do fiddles for CHANGE LOG records */
9782 		if (!(dep->de_flags & MDDB_F_CHANGELOG)) {
9783 			checksum ^= rbp->rb_checksum_fiddle;
9784 			rbp->rb_checksum_fiddle = 0;
9785 			checksum ^= rbp->rb_checksum;
9786 			saverbp = rbp;
9787 		}
9788 		rbp->rb_commitcnt++;
9789 		uniqtime32(&rbp->rb_timestamp);
9790 		/* Generate the crc for this record */
9791 		rec_crcgen(s, dep, rbp);
9792 
9793 		/* Don't do fiddles for CHANGE LOG records */
9794 		if (!(dep->de_flags & MDDB_F_CHANGELOG)) {
9795 			checksum ^= rbp->rb_checksum;
9796 		}
9797 		idp++;
9798 	}
9799 
9800 	if (saverbp)
9801 		saverbp->rb_checksum_fiddle = checksum;
9802 
9803 	/*
9804 	 * If this is a MN set but we are not the master, then we are not
9805 	 * supposed to update the mddb on disk. So we finish at this point.
9806 	 */
9807 	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
9808 	    (md_set[setno].s_am_i_master == 0)) {
9809 		single_thread_end(s);
9810 		mddb_setexit(s);
9811 		return (0);
9812 	}
9813 
9814 	lbp = s->s_lbp;
9815 	for (li = 0; li < lbp->lb_loccnt; li++) {
9816 		if (! (lbp->lb_locators[li].l_flags & MDDB_F_ACTIVE))
9817 			continue;
9818 
9819 		idp = ids;
9820 		while (*idp) {
9821 			for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9822 				dep = dbp->db_firstentry;
9823 				while (dep && (dep->de_recid != DBID(*idp)))
9824 					dep = dep->de_next;
9825 				if (dep != NULL)
9826 					break;
9827 			}
9828 			rbp = dep->de_rb;
9829 			err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
9830 			    dep->de_blkcount, li, (mddb_bf_t **)0,
9831 			    MDDB_WR_ONLY_MASTER);
9832 			if (err)
9833 				break;
9834 			idp++;
9835 		}
9836 		if (err)
9837 			break;
9838 	}
9839 	if (err) {
9840 		if (writeretry(s)) {
9841 			single_thread_end(s);
9842 			mddb_setexit(s);
9843 			return (MDDB_E_NOTNOW);
9844 		}
9845 	}
9846 	single_thread_end(s);
9847 	mddb_setexit(s);
9848 	return (0);
9849 }
9850 
9851 mddb_recid_t
9852 mddb_makerecid(
9853 	set_t		setno,
9854 	mddb_recid_t	id
9855 )
9856 {
9857 	return (MAKERECID(setno, id));
9858 }
9859 
9860 set_t
9861 mddb_getsetnum(
9862 	mddb_recid_t	id
9863 )
9864 {
9865 	return (DBSET(id));
9866 }
9867 
9868 char *
9869 mddb_getsetname(
9870 	set_t	setno
9871 )
9872 {
9873 	return (((mddb_set_t *)md_set[setno].s_db)->s_setname);
9874 }
9875 
9876 side_t
9877 mddb_getsidenum(
9878 	set_t	setno
9879 )
9880 {
9881 	if (md_set[setno].s_db)
9882 		return (((mddb_set_t *)md_set[setno].s_db)->s_sideno);
9883 	return (0);
9884 }
9885 
9886 int
9887 mddb_ownset(
9888 	set_t	setno
9889 )
9890 {
9891 	if ((md_get_setstatus(setno) & MD_SET_TAGDATA) && md_set[setno].s_db)
9892 		return (1);
9893 
9894 	if (md_set[setno].s_db && ((mddb_set_t *)md_set[setno].s_db)->s_lbp)
9895 		return (1);
9896 
9897 	return (0);
9898 }
9899 
9900 /*ARGSUSED*/
9901 int
9902 getmed_ioctl(mddb_med_parm_t *medpp, int mode)
9903 {
9904 	mddb_set_t	*s;
9905 	int		err = 0;
9906 	set_t		setno = medpp->med_setno;
9907 	md_error_t	*ep = &medpp->med_mde;
9908 
9909 	mdclrerror(ep);
9910 
9911 	if (setno >= md_nsets)
9912 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
9913 
9914 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
9915 		return (0);
9916 
9917 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
9918 		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
9919 
9920 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
9921 		return (mddbstatus2error(ep, err, NODEV32, setno));
9922 
9923 	medpp->med = s->s_med;			/* structure assignment */
9924 
9925 	mddb_setexit(s);
9926 
9927 	return (0);
9928 }
9929 
9930 int
9931 setmed_ioctl(mddb_med_parm_t *medpp, int mode)
9932 {
9933 
9934 	mddb_set_t	*s;
9935 	int		err = 0;
9936 	set_t		setno = medpp->med_setno;
9937 	md_error_t	*ep = &medpp->med_mde;
9938 
9939 	mdclrerror(ep);
9940 
9941 	if ((mode & FWRITE) == 0)
9942 		return (mdsyserror(ep, EACCES));
9943 
9944 	/*
9945 	 * This should be the only thing that prevents LOCAL sets from having
9946 	 * mediators, at least in the kernel, userland needs to have some code
9947 	 * written.
9948 	 */
9949 	if (setno == MD_LOCAL_SET)
9950 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
9951 
9952 	if (setno >= md_nsets)
9953 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
9954 
9955 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
9956 		return (0);
9957 
9958 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
9959 		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
9960 
9961 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
9962 		return (mddbstatus2error(ep, err, NODEV32, setno));
9963 
9964 	s->s_med = medpp->med;			/* structure assignment */
9965 
9966 	mddb_setexit(s);
9967 
9968 	return (0);
9969 }
9970 
9971 int
9972 updmed_ioctl(mddb_med_upd_parm_t *medpp, int mode)
9973 {
9974 
9975 	mddb_set_t	*s;
9976 	int		err = 0;
9977 	set_t		setno = medpp->med_setno;
9978 	md_error_t	*ep = &medpp->med_mde;
9979 
9980 	mdclrerror(ep);
9981 
9982 	if ((mode & FWRITE) == 0)
9983 		return (mdsyserror(ep, EACCES));
9984 
9985 	if (setno >= md_nsets)
9986 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
9987 
9988 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
9989 		return (0);
9990 
9991 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
9992 		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
9993 
9994 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
9995 		return (mddbstatus2error(ep, err, NODEV32, setno));
9996 
9997 	single_thread_start(s);
9998 	(void) upd_med(s, "updmed_ioctl()");
9999 	single_thread_end(s);
10000 
10001 	mddb_setexit(s);
10002 
10003 	return (0);
10004 }
10005 
10006 int
10007 take_set(mddb_config_t *cp, int mode)
10008 {
10009 	int			err = 0;
10010 	mddb_med_upd_parm_t	medup;
10011 	set_t			setno = cp->c_setno;
10012 	md_error_t		*ep = &cp->c_mde;
10013 	int			snarf_ok = 0;
10014 
10015 	if (md_get_setstatus(setno) & MD_SET_SNARFED)
10016 		return (0);
10017 
10018 	err = mddb_configure(MDDB_GETDEV, cp);
10019 	if (! err && mdisok(ep)) {
10020 		if (md_snarf_db_set(setno, ep) != 0)
10021 			goto out;
10022 		snarf_ok = 1;
10023 	}
10024 
10025 	if (! err && mdisok(ep)) {
10026 		if (! cp->c_flags) {
10027 			medup.med_setno = setno;
10028 			mdclrerror(&medup.med_mde);
10029 
10030 			err = updmed_ioctl(&medup, mode);
10031 			if (! mdisok(&medup.med_mde))
10032 				(void) mdstealerror(ep, &medup.med_mde);
10033 		}
10034 	}
10035 
10036 out:
10037 	/*
10038 	 * In the case that the snarf failed, the diskset is
10039 	 * left with s_db set, but s_lbp not set.  The node is not
10040 	 * an owner of the set and won't be allowed to release the
10041 	 * diskset in order to cleanup.  With s_db set, any call to the
10042 	 * GETDEV or ENDDEV ioctl (done by libmeta routine metareplicalist)
10043 	 * will cause the diskset to be loaded.  So, cleanup the diskset so
10044 	 * that an inadvertent start of the diskset doesn't happen later.
10045 	 */
10046 	if ((snarf_ok == 0) && md_set[setno].s_db &&
10047 	    (((mddb_set_t *)md_set[setno].s_db)->s_lbp == 0)) {
10048 		mutex_enter(&mddb_lock);
10049 		mddb_unload_set(setno);
10050 		mutex_exit(&mddb_lock);
10051 	}
10052 	return (err);
10053 }
10054 
10055 /*ARGSUSED*/
10056 int
10057 release_set(mddb_config_t *cp, int mode)
10058 {
10059 	int			err = 0;
10060 	set_t			setno = cp->c_setno;
10061 	md_error_t		*ep = &cp->c_mde;
10062 
10063 	/*
10064 	 * Data integrity check
10065 	 */
10066 	if (setno >= md_nsets)
10067 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10068 
10069 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
10070 	md_haltsnarf_enter(setno);
10071 	/*
10072 	 * Attempt to mark set as HOLD. If it is marked as HOLD, this means
10073 	 * that the mirror code is currently searching all mirrors for a
10074 	 * errored component that needs a hotspare. While this search is in
10075 	 * progress, we cannot release the set and thgerefore we return EBUSY.
10076 	 * Once we have set HOLD, the mirror function (check_4_hotspares) will
10077 	 * block before the search until the set is released.
10078 	 */
10079 	if (md_holdset_testandenter(setno) != 0) {
10080 		md_haltsnarf_exit(setno);
10081 		rw_exit(&md_unit_array_rw.lock);
10082 		return (EBUSY);
10083 	}
10084 
10085 	if ((err = md_halt_set(setno, MD_HALT_ALL)) == 0)
10086 		err = mddb_configure(MDDB_RELEASESET, cp);
10087 
10088 	md_holdset_exit(setno);
10089 	md_haltsnarf_exit(setno);
10090 	rw_exit(&md_unit_array_rw.lock);
10091 
10092 	if (! err && mdisok(ep)) {
10093 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RELEASE, SVM_TAG_SET, setno,
10094 		    NODEV64);
10095 	}
10096 
10097 	return (err);
10098 }
10099 
10100 int
10101 gettag_ioctl(mddb_dtag_get_parm_t *dtgpp, int mode)
10102 {
10103 	mddb_set_t	*s;
10104 	int		err = 0;
10105 	mddb_dtag_lst_t	*dtlp;
10106 	set_t		setno = dtgpp->dtgp_setno;
10107 	md_error_t	*ep = &dtgpp->dtgp_mde;
10108 
10109 	mdclrerror(ep);
10110 
10111 	if ((mode & FREAD) == 0)
10112 		return (mdsyserror(ep, EACCES));
10113 
10114 	if (setno >= md_nsets)
10115 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10116 
10117 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10118 		return (0);
10119 
10120 	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL)
10121 		return (mddbstatus2error(ep, err, NODEV32, setno));
10122 
10123 	/*
10124 	 * Data tags not supported on MN sets so return invalid operation.
10125 	 * This ioctl could be called before the mddb has been read in so
10126 	 * the set status may not yet be set to MNSET, so code following
10127 	 * this check must handle a MN diskset properly.
10128 	 */
10129 	if (md_get_setstatus(setno) & MD_SET_MNSET) {
10130 		mddb_setexit(s);
10131 		return (mderror(ep, MDE_INVAL_MNOP));
10132 	}
10133 
10134 	/* s_dtlp is NULL for MN diskset */
10135 	dtlp = s->s_dtlp;
10136 	while (dtlp != NULL) {
10137 		if (dtgpp->dtgp_dt.dt_id == 0 ||
10138 		    dtgpp->dtgp_dt.dt_id == dtlp->dtl_dt.dt_id) {
10139 			bcopy((caddr_t)&dtlp->dtl_dt, (caddr_t)&dtgpp->dtgp_dt,
10140 			    sizeof (mddb_dtag_t));
10141 			break;
10142 		}
10143 		dtlp = dtlp->dtl_nx;
10144 	}
10145 
10146 	/* Walked the whole list and id not found, return error */
10147 	if (dtlp == (mddb_dtag_lst_t *)NULL) {
10148 		mddb_setexit(s);
10149 		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
10150 	}
10151 
10152 	mddb_setexit(s);
10153 
10154 	return (0);
10155 }
10156 
10157 int
10158 usetag_ioctl(mddb_dtag_use_parm_t *dtupp, int mode)
10159 {
10160 	mddb_set_t	*s;
10161 	int		err = 0;
10162 	mddb_config_t	*cp;
10163 	mddb_ri_t	*trip = NULL;
10164 	mddb_dtag_t	*dtagp = NULL;
10165 	set_t		setno = dtupp->dtup_setno;
10166 	md_error_t	*ep = &dtupp->dtup_mde;
10167 
10168 	mdclrerror(ep);
10169 
10170 	if ((mode & FWRITE) == 0)
10171 		return (mdsyserror(ep, EACCES));
10172 
10173 	if (setno >= md_nsets)
10174 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10175 
10176 	if (dtupp->dtup_id < 0)
10177 		return (mdsyserror(ep, EINVAL));
10178 	else if (dtupp->dtup_id == 0)
10179 		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
10180 
10181 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10182 		return (0);
10183 
10184 	if ((md_get_setstatus(setno) & MD_SET_TAGDATA) == 0)
10185 		return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));
10186 
10187 	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL)
10188 		return (mddbstatus2error(ep, err, NODEV32, setno));
10189 
10190 	/*
10191 	 * Data tags not supported on MN sets so return invalid operation.
10192 	 * This ioctl could be called before the mddb has been read in so
10193 	 * the set status may not yet be set to MNSET, so code following
10194 	 * this check must handle a MN diskset properly.
10195 	 */
10196 	if (md_get_setstatus(setno) & MD_SET_MNSET) {
10197 		mddb_setexit(s);
10198 		return (mderror(ep, MDE_INVAL_MNOP));
10199 	}
10200 
10201 	/* Validate and find the id requested - nothing found if MN diskset */
10202 	if ((dtagp = dtl_findl(s, dtupp->dtup_id)) == NULL) {
10203 		mddb_setexit(s);
10204 		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
10205 	}
10206 
10207 	/* Usetag is only valid when more than one tag exists */
10208 	if (dtl_cntl(s) < 2) {
10209 		mddb_setexit(s);
10210 		return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));
10211 	}
10212 
10213 	/* Put the selected tag in place */
10214 	dt_setup(s, dtagp);
10215 
10216 	cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP);
10217 
10218 	/* Save the hint information */
10219 	trip = save_rip(s);
10220 
10221 	cp->c_timestamp = s->s_ident.createtime;	/* struct assignment */
10222 	cp->c_setno = setno;
10223 	cp->c_sideno = s->s_sideno;
10224 	(void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME);
10225 	cp->c_setname[MD_MAX_SETNAME] = '\0';
10226 	cp->c_med = s->s_med;				/* struct assignment */
10227 
10228 	mddb_setexit(s);
10229 
10230 	s = NULL;
10231 
10232 	/* shorthand */
10233 	setno = cp->c_setno;
10234 
10235 	/* Let unload know not to free the tag */
10236 	md_set_setstatus(setno, MD_SET_KEEPTAG);
10237 
10238 	/* Release the set */
10239 	if (err = release_set(cp, mode))
10240 		goto out;
10241 
10242 	if (! mdisok(&cp->c_mde)) {
10243 		(void) mdstealerror(ep, &cp->c_mde);
10244 		err = 1;
10245 		goto out;
10246 	}
10247 
10248 	/* Re-init set using the saved mddb_config_t structure */
10249 	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
10250 		if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
10251 			err = mddbstatus2error(ep, err, NODEV32, setno);
10252 			goto out;
10253 		}
10254 	}
10255 
10256 	ASSERT(s->s_rip == (mddb_ri_t *)NULL);
10257 
10258 	/* use the saved rip structure */
10259 	s->s_rip = trip;
10260 	trip = (mddb_ri_t *)NULL;
10261 
10262 	/* Let the take code know a tag is being used */
10263 	md_set_setstatus(setno, MD_SET_USETAG);
10264 
10265 	mddb_setexit(s);
10266 
10267 	s = NULL;
10268 
10269 	/* Take the set */
10270 	if (err = take_set(cp, mode))
10271 		goto out;
10272 
10273 	if (! mdisok(&cp->c_mde))
10274 		(void) mdstealerror(ep, &cp->c_mde);
10275 
10276 out:
10277 	md_clr_setstatus(setno, (MD_SET_USETAG | MD_SET_KEEPTAG));
10278 
10279 	kmem_free(cp, sizeof (mddb_config_t));
10280 
10281 	if (trip)
10282 		free_rip(&trip);
10283 
10284 	if (s)
10285 		mddb_setexit(s);
10286 
10287 	return (err);
10288 }
10289 
10290 int
10291 accept_ioctl(mddb_accept_parm_t *accpp, int mode)
10292 {
10293 	mddb_set_t	*s;
10294 	int		err = 0;
10295 	mddb_config_t	*cp;
10296 	mddb_ri_t	*trip = NULL;
10297 	set_t		setno = accpp->accp_setno;
10298 	md_error_t	*ep = &accpp->accp_mde;
10299 
10300 	mdclrerror(ep);
10301 
10302 	if ((mode & FWRITE) == 0)
10303 		return (mdsyserror(ep, EACCES));
10304 
10305 	if (setno >= md_nsets)
10306 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10307 
10308 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10309 		return (0);
10310 
10311 	if ((md_get_setstatus(setno) & MD_SET_ACCOK) == 0)
10312 		return (mdmddberror(ep, MDE_DB_ACCNOTOK, NODEV32, setno));
10313 
10314 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10315 		return (mddbstatus2error(ep, err, NODEV32, setno));
10316 
10317 	/*
10318 	 * Data tags not supported on MN sets so return invalid operation.
10319 	 * mddb is guaranteed to be incore at this point, so this
10320 	 * check will catch all MN disksets.
10321 	 */
10322 	if (md_get_setstatus(setno) & MD_SET_MNSET) {
10323 		mddb_setexit(s);
10324 		return (mderror(ep, MDE_INVAL_MNOP));
10325 	}
10326 
10327 	cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP);
10328 
10329 	trip = save_rip(s);
10330 
10331 	cp->c_timestamp = s->s_ident.createtime;	/* struct assignment */
10332 	cp->c_setno = setno;
10333 	cp->c_sideno = s->s_sideno;
10334 	(void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME);
10335 	cp->c_setname[MD_MAX_SETNAME] = '\0';
10336 	cp->c_med = s->s_med;				/* struct assignment */
10337 
10338 	/* Tag the data */
10339 	if (err = set_dtag(s, ep)) {
10340 		err = mdsyserror(ep, err);
10341 		goto out;
10342 	}
10343 
10344 	/* If we had a BADTAG, it will be re-written, so clear the bit. */
10345 	if (md_get_setstatus(setno) & MD_SET_BADTAG)
10346 		md_clr_setstatus(setno, MD_SET_BADTAG);
10347 
10348 	if (err = dt_write(s)) {
10349 		err = mdsyserror(ep, err);
10350 		goto out;
10351 	}
10352 
10353 	mddb_setexit(s);
10354 
10355 	s = NULL;
10356 
10357 	/* shorthand */
10358 	setno = cp->c_setno;
10359 
10360 	/* Clear the keeptag */
10361 	md_clr_setstatus(setno, MD_SET_KEEPTAG);
10362 
10363 	/* Release the set */
10364 	if (err = release_set(cp, mode))
10365 		goto out;
10366 
10367 	if (! mdisok(&cp->c_mde)) {
10368 		(void) mdstealerror(ep, &cp->c_mde);
10369 		goto out;
10370 	}
10371 
10372 	/* Re-init set using the saved mddb_config_t structure */
10373 	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
10374 		if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
10375 			err = mddbstatus2error(ep, err, NODEV32, setno);
10376 			goto out;
10377 		}
10378 	}
10379 
10380 	ASSERT(s->s_rip == (mddb_ri_t *)NULL);
10381 
10382 	/* Free the allocated rip structure */
10383 	if (s->s_rip != (mddb_ri_t *)NULL)
10384 		free_rip(&s->s_rip);
10385 
10386 	/* use the saved rip structure */
10387 	s->s_rip = trip;
10388 	trip = (mddb_ri_t *)NULL;
10389 
10390 	/* Let the set init code know an accept is in progress */
10391 	md_set_setstatus(setno, MD_SET_ACCEPT);
10392 
10393 	mddb_setexit(s);
10394 
10395 	s = NULL;
10396 
10397 	/* Take the set */
10398 	if (err = take_set(cp, mode))
10399 		goto out;
10400 
10401 	if (! mdisok(&cp->c_mde))
10402 		(void) mdstealerror(ep, &cp->c_mde);
10403 
10404 out:
10405 	md_clr_setstatus(setno, (MD_SET_ACCOK | MD_SET_ACCEPT));
10406 
10407 	kmem_free(cp, sizeof (mddb_config_t));
10408 
10409 	if (trip)
10410 		free_rip(&trip);
10411 
10412 	if (s)
10413 		mddb_setexit(s);
10414 
10415 	return (err);
10416 }
10417 
10418 /*
10419  * mddb_getinvlb_devid - cycles through the locator block and determines
10420  *		if the device id's for any of the replica disks are invalid.
10421  *		If so, it returns the diskname in the ctdptr.
10422  *	RETURN
10423  *		-1	Error
10424  *		cnt	number of invalid device id's
10425  */
10426 int
10427 mddb_getinvlb_devid(
10428 	set_t	setno,
10429 	int	count,
10430 	int	size,
10431 	char	**ctdptr
10432 )
10433 {
10434 	mddb_set_t	*s;
10435 	int		err = 0;
10436 	mddb_lb_t	*lbp;
10437 	int		li;
10438 	mddb_did_blk_t	*did_blk;
10439 	mddb_did_info_t	*did_info;
10440 	int		len;
10441 	int		cnt = 0;
10442 	char		*cptr;
10443 	md_name_suffix	*sn;
10444 	int		i, dont_add_it;
10445 	char		*tmpctd, *diskname;
10446 	char		*tmpname;
10447 
10448 	cptr = *ctdptr;
10449 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
10450 		return (-1);
10451 	}
10452 
10453 	single_thread_start(s);
10454 	lbp = s->s_lbp;
10455 
10456 	if (lbp->lb_setno != setno) {
10457 		single_thread_end(s);
10458 		mddb_setexit(s);
10459 		return (-1);
10460 	}
10461 
10462 	/* check for lb being devid style */
10463 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
10464 		did_blk = s->s_did_icp->did_ic_blkp;
10465 		for (li = 0; li < lbp->lb_loccnt; li++) {
10466 			did_info = &(did_blk->blk_info[li]);
10467 			/* Only if devid exists and isn't valid */
10468 			if ((did_info->info_flags & MDDB_DID_EXISTS) &&
10469 			    !(did_info->info_flags & MDDB_DID_VALID)) {
10470 				/*
10471 				 * if we count more invalid did's than
10472 				 * was passed in there's an error somewhere
10473 				 */
10474 				if (cnt++ > count) {
10475 					single_thread_end(s);
10476 					mddb_setexit(s);
10477 					return (-1);
10478 				}
10479 
10480 				/*
10481 				 * Future note: Need to do something here
10482 				 * for the MN diskset case when device ids
10483 				 * are supported in disksets.
10484 				 * Can't add until merging devids_in_diskset
10485 				 * code into code base.
10486 				 */
10487 
10488 				sn = &s->s_lnp->ln_suffixes[0][li];
10489 				/*
10490 				 * check to make sure length of device name is
10491 				 * not greater than computed first time through
10492 				 */
10493 				len = sn->suf_len;
10494 				if (len > size) {
10495 					single_thread_end(s);
10496 					mddb_setexit(s);
10497 					return (-1);
10498 				}
10499 				tmpctd = *ctdptr;
10500 				/* strip off slice part */
10501 				diskname = md_strdup(sn->suf_data);
10502 				tmpname = strrchr(diskname, 's');
10503 				*tmpname = '\0';
10504 				dont_add_it = 0;
10505 				/* look to see if diskname is already in list */
10506 				for (i = 0; i < (cnt-1); i++) {
10507 					if (strcmp(diskname, tmpctd) == 0) {
10508 						/* already there, don't add */
10509 						dont_add_it = 1;
10510 						break;
10511 					}
10512 					/* point to next diskname in list */
10513 					tmpctd += size;
10514 				}
10515 				if (dont_add_it == 0) {
10516 					/* add diskname to list */
10517 					(void) strcpy(cptr, diskname);
10518 					cptr += size;
10519 				}
10520 				kmem_free(diskname, strlen(sn->suf_data) + 1);
10521 			}
10522 		}
10523 	}
10524 	/* null terminate the list */
10525 	*cptr = '\0';
10526 	/*
10527 	 * need to save the new pointer so that calling routine can continue
10528 	 * to add information onto the end.
10529 	 */
10530 	*ctdptr = cptr;
10531 	single_thread_end(s);
10532 	mddb_setexit(s);
10533 	return (cnt);
10534 }
10535 
10536 /*
10537  * mddb_validate_lb - count the number of lb's with invalid device id's. Keep
10538  *		track of length of longest devicename.
10539  *	RETURN
10540  *		-1	error
10541  *		 cnt	number of lb's with invalid devid's
10542  */
10543 int
10544 mddb_validate_lb(
10545 	set_t	setno,
10546 	int	*rmaxsz
10547 )
10548 {
10549 	mddb_set_t	*s;
10550 	int		err = 0;
10551 	mddb_lb_t	*lbp;
10552 	int		li;
10553 	mddb_did_blk_t	*did_blk;
10554 	mddb_did_info_t	*did_info;
10555 	int		len;
10556 	int		cnt = 0;
10557 
10558 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10559 		return (-1);
10560 
10561 	single_thread_start(s);
10562 	lbp = s->s_lbp;
10563 
10564 	if (lbp->lb_setno != setno) {
10565 		single_thread_end(s);
10566 		mddb_setexit(s);
10567 		return (-1);
10568 	}
10569 
10570 	/* lb must be in devid style */
10571 	if ((lbp->lb_flags & MDDB_DEVID_STYLE) == 0)
10572 		goto mvl_out;
10573 
10574 	did_blk = s->s_did_icp->did_ic_blkp;
10575 	for (li = 0; li < lbp->lb_loccnt; li++) {
10576 		char		*minor_name;
10577 		mddb_locator_t	*lp;
10578 		dev_t		ddi_dev;
10579 		ddi_devid_t	devid;
10580 		ddi_devid_t	rtn_devid = NULL;
10581 		int		get_rval;
10582 
10583 		did_info = &(did_blk->blk_info[li]);
10584 		if (((did_info->info_flags & MDDB_DID_EXISTS) == 0) ||
10585 		    (did_info->info_flags & MDDB_DID_VALID))
10586 			continue;
10587 
10588 		/* Here we know, did exists but isn't valid */
10589 
10590 		lp = &lbp->lb_locators[li];
10591 		ddi_dev = expldev(lp->l_dev);
10592 		get_rval = mddb_devid_get(s, li, &devid, &minor_name);
10593 		ASSERT(get_rval == 1);
10594 		if ((ddi_lyr_get_devid(ddi_dev, &rtn_devid) == DDI_SUCCESS) &&
10595 		    (ddi_devid_compare(rtn_devid, devid) == 0)) {
10596 			did_info->info_flags = MDDB_DID_VALID |
10597 						MDDB_DID_EXISTS |
10598 						MDDB_DID_UPDATED;
10599 		} else {
10600 			cnt++;
10601 			/*
10602 			 * Future note: Need to do something here
10603 			 * for the MN diskset case when device ids
10604 			 * are supported in disksets.
10605 			 * Can't add until merging devids_in_diskset
10606 			 * code into code base.
10607 			 */
10608 			len = (&s->s_lnp->ln_suffixes[0][li])-> suf_len;
10609 			if (*rmaxsz < len)
10610 				*rmaxsz = len;
10611 		}
10612 		if (rtn_devid != NULL)
10613 			ddi_devid_free(rtn_devid);
10614 	}
10615 
10616 mvl_out:
10617 
10618 	if (push_lb(s) != 0)
10619 		cnt = -1;
10620 	single_thread_end(s);
10621 	mddb_setexit(s);
10622 	return (cnt);
10623 }
10624 
10625 int
10626 check_active_locators()
10627 {
10628 	mddb_set_t	*s;
10629 	mddb_lb_t	*lbp;
10630 	int		li;
10631 	int		active = 0;
10632 
10633 	mutex_enter(&mddb_lock);
10634 	/* there is nothing here..so we can unload */
10635 	if ((mddb_set_t *)md_set[MD_LOCAL_SET].s_db == NULL) {
10636 		mutex_exit(&mddb_lock);
10637 		return (0);
10638 	}
10639 	s = (mddb_set_t *)md_set[MD_LOCAL_SET].s_db;
10640 	lbp = s->s_lbp;
10641 	if (lbp == NULL) {
10642 		mutex_exit(&mddb_lock);
10643 		return (0);
10644 	}
10645 
10646 	for (li = 0; li < lbp->lb_loccnt; li++) {
10647 		mddb_locator_t *lp = &lbp->lb_locators[li];
10648 		if (lp->l_flags & MDDB_F_ACTIVE) {
10649 			active = 1;
10650 			break;
10651 		}
10652 	}
10653 	mutex_exit(&mddb_lock);
10654 	return (active);
10655 }
10656 
10657 /*
10658  * regetoptrecord:
10659  * --------------
10660  *	Update the in-core optimized resync record contents by re-reading the
10661  *	record from the on-disk metadb.
10662  *	The contents of the resync record will be overwritten by calling this
10663  *	routine. This means that callers that require the previous contents to
10664  *	be preserved must save the data before calling this routine.
10665  */
10666 static void
10667 regetoptrecord(
10668 	mddb_set_t	*s,
10669 	mddb_de_ic_t	*dep
10670 )
10671 {
10672 	mddb_lb_t	*lbp;
10673 	mddb_locator_t	*lp;
10674 	mddb_rb32_t	*rbp, *crbp;
10675 	int		li;
10676 	int		i;
10677 	int		err = 0;
10678 	size_t		recsize;
10679 
10680 #if defined(_ILP32) && !defined(lint)
10681 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
10682 #endif
10683 
10684 	recsize = dep->de_recsize;
10685 	crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
10686 
10687 	single_thread_start(s);
10688 	rbp = dep->de_rb;
10689 
10690 	dep->de_optinfo[0].o_flags |= MDDB_F_EDATA;
10691 	dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
10692 
10693 	lbp = s->s_lbp;
10694 
10695 	for (i = 0; i < 2; i++) {
10696 		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
10697 			continue;
10698 		li = dep->de_optinfo[i].o_li;
10699 		lp = &lbp->lb_locators[li];
10700 
10701 		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
10702 		    (lp->l_flags & MDDB_F_EMASTER))
10703 			continue;
10704 
10705 		err = readblklst(s, (caddr_t)rbp, dep->de_blks,
10706 		    dep->de_blkcount, li);
10707 
10708 		if (err)
10709 			continue;
10710 
10711 		if (rbp->rb_magic != MDDB_MAGIC_RB)
10712 			continue;
10713 
10714 		if (revchk(MDDB_REV_RB, rbp->rb_revision))
10715 			continue;
10716 
10717 		/* Check the crc for this record */
10718 		if (rec_crcchk(s, dep, rbp)) {
10719 			continue;
10720 		}
10721 		dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE;
10722 
10723 		if (rbp == crbp) {
10724 			if (rbp->rb_checksum != crbp->rb_checksum)
10725 				dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
10726 			break;
10727 		}
10728 		rbp = crbp;
10729 	}
10730 
10731 	single_thread_end(s);
10732 
10733 	if (rbp == crbp) {
10734 		rbp->rb_private = 0;
10735 		kmem_free((caddr_t)crbp, recsize);
10736 		return;
10737 	}
10738 	uniqtime32(&rbp->rb_timestamp);
10739 	/* Generate the crc for this record */
10740 	rec_crcgen(s, dep, rbp);
10741 	kmem_free((caddr_t)crbp, recsize);
10742 }
10743 
10744 /*
10745  * mddb_reread_rr:
10746  *	Re-read the resync record from the on-disk copy. This is required for
10747  *	multi-node support so that a new mirror-owner can determine if a resync
10748  *	operation is required to guarantee data integrity.
10749  *
10750  * Arguments:
10751  *	setno	Associated set
10752  *	id	Resync record ID
10753  *
10754  * Return Value:
10755  *	0	successful reread
10756  *	-1	invalid set (not multi-node or non-existant)
10757  *	>0	metadb state invalid
10758  */
10759 int
10760 mddb_reread_rr(
10761 	set_t		setno,
10762 	mddb_recid_t	id
10763 )
10764 {
10765 	mddb_set_t	*s;
10766 	int		err = 0;
10767 	mddb_db_t	*dbp;
10768 	mddb_de_ic_t	*dep;
10769 
10770 	if (setno >= md_nsets)
10771 		return (-1);
10772 
10773 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10774 		return (-1);
10775 
10776 	if ((setno == MD_LOCAL_SET) || !(s->s_lbp->lb_flags & MDDB_MNSET)) {
10777 		mddb_setexit(s);
10778 		return (-1);
10779 	}
10780 
10781 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
10782 		dep = dbp->db_firstentry;
10783 		while (dep && (dep->de_recid != DBID(id)))
10784 			dep = dep->de_next;
10785 		if (dep != NULL)
10786 			break;
10787 	}
10788 
10789 	if (dep != NULL) {
10790 		regetoptrecord(s, dep);
10791 		err = 0;
10792 	} else {
10793 		err = -1;
10794 	}
10795 	mddb_setexit(s);
10796 	return (err);
10797 }
10798 
10799 /*
10800  * Set owner associated with MN optimized resync record.
10801  *
10802  * Optimized records have an owner node associated with them in
10803  * a MN diskset.  The owner is only set on a node that is actively
10804  * writing to that record.  The other nodes will show that record
10805  * as having an invalid owner.  The owner for an optimized record
10806  * is used during fixoptrecord to determine which node should
10807  * write out the record when the replicas associated with that
10808  * optimized record have been changed.
10809  *
10810  * Called directly from mirror driver and not from an ioctl.
10811  *
10812  * Returns
10813  *	NULL if successful.
10814  *	MDDB_E_NORECORD if record not found.
10815  */
10816 int
10817 mddb_setowner(
10818 	mddb_recid_t		id,
10819 	md_mn_nodeid_t		owner
10820 )
10821 {
10822 	mddb_set_t		*s;
10823 	mddb_db_t		*dbp;
10824 	mddb_de_ic_t		*dep;
10825 	int			found = 0;
10826 
10827 
10828 	if (DBSET(id) >= md_nsets)
10829 		return (MDDB_E_NORECORD);
10830 
10831 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
10832 		return (MDDB_E_NORECORD);
10833 
10834 	id = DBID(id);
10835 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
10836 		for (dep = dbp->db_firstentry;
10837 		    dep != NULL; dep = dep->de_next) {
10838 			if (dep->de_recid != id)
10839 				continue;
10840 			dep->de_owner_nodeid = owner;
10841 			found = 1;
10842 			break;
10843 		}
10844 		if (found)
10845 			break;
10846 	}
10847 
10848 	mddb_setexit(s);
10849 
10850 	if (!found) {
10851 		return (MDDB_E_NORECORD);
10852 	}
10853 
10854 	return (NULL);
10855 }
10856 
/*
 * mddb_parse re-reads portions of the mddb from disk given a list
 * of good replicas to read from and flags describing
 * which portion of the mddb to read in.
 *
 * Used in a MN diskset when the master has made a change to some part
 * of the mddb and wants to relay this information to the slaves.
 *
 * Parameters
 *	mpp	- parse request.  Fields read here:
 *		  c_setno	set to operate on,
 *		  c_parse_flags	any of MDDB_PARSE_LOCBLK, MDDB_PARSE_LOCNM
 *				and MDDB_PARSE_OPTRECS,
 *		  c_lb_flags[]	per-replica flags; only entries with
 *				MDDB_F_ACTIVE are tried for the locator
 *				block read,
 *		  c_mde		error structure handed to the helpers.
 *
 * Returns
 *	EINVAL if c_setno is out of range or the set is not a MN diskset.
 *	The value of mddbstatus2error() if the set cannot be entered.
 *	0 in every other case (including when md_snarf_db_set fails and
 *	    when this node is the master); rval is never set to a non-zero
 *	    value in this function.
 */
int
mddb_parse(mddb_parse_parm_t *mpp)
{
	mddb_set_t	*s;
	int		err = 0;
	mddb_locator_t	*lp, *old_lp;
	mddb_lb_t	*lbp, *old_lbp;
	int		rval = 0;
	int		i, li;
	int		found_good_one = 0;
	mddb_ln_t	*lnp;
	mddb_block_t	ln_blkcnt;
	md_error_t	*ep = &mpp->c_mde;

	if (mpp->c_setno >= md_nsets)
		return (EINVAL);

	/*
	 * A failure here is reported as 0; ep is handed to
	 * md_snarf_db_set, presumably so the error detail is recorded
	 * there - TODO confirm.
	 */
	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
		return (0);

	if ((s = mddb_setenter(mpp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
		return (mddbstatus2error(ep, err, NODEV32, mpp->c_setno));
	}

	if (!(MD_MNSET_SETNO(mpp->c_setno))) {
		mddb_setexit_no_parse(s);
		return (EINVAL);
	}

	/*
	 * Master node initiated this request, so there's no work for
	 * the master node to do.
	 */
	if (md_set[mpp->c_setno].s_am_i_master) {
		mddb_setexit_no_parse(s);
		return (rval);
	}

	single_thread_start(s);

	if (mpp->c_parse_flags & MDDB_PARSE_LOCBLK) {
		/* lbp is a scratch buffer, allocated lazily in the loop */
		lbp = 0;
		for (i = 0; i < MDDB_NLB; i++) {
			/* Walk through master's active list */
			if (!(mpp->c_lb_flags[i] & MDDB_F_ACTIVE))
				continue;
			if (s->s_mbiarray[i] == NULL)
				continue;

			/* Assumes master blocks are already setup */
			if (lbp == (mddb_lb_t *)NULL) {
				lbp = (mddb_lb_t *)kmem_zalloc(
					dbtob(MDDB_MNLBCNT), KM_SLEEP);
			}
			/*
			 * NOTE(review): the block count passed here is read
			 * from the scratch buffer itself, which is all zeroes
			 * until a read has filled it in.  Confirm readblks
			 * behaves as intended with that count (the buffer is
			 * sized for MDDB_MNLBCNT blocks).
			 *
			 * NOTE(review): err is OR-ed in, so once any read
			 * fails the check below skips every later replica as
			 * well - confirm this is intended.
			 */
			err |= readblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, i);

			if (err)
				continue;

			/* Validate what was read; try next replica if bad */
			if (lbp->lb_magic != MDDB_MAGIC_LB)
				continue;
			if (lbp->lb_blkcnt != MDDB_MNLBCNT)
				continue;
			if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
				continue;
			if (crcchk(lbp, &lbp->lb_checksum, dbtob(MDDB_MNLBCNT),
			    NULL))
				continue;
			if (lbp->lb_setno != s->s_setno)
				continue;
			/*
			 * a commit count of zero means this locator has
			 * been deleted
			 */
			if (lbp->lb_commitcnt == 0) {
				continue;
			}
			/* Found a good locator - keep it */
			found_good_one = 1;
			break;
		}

		/*
		 * If found a good copy of the mddb, then read it into
		 * this node's locator block.  Fix up the set's s_mbiarray
		 * pointer (master block incore array pointer) to be
		 * in sync with the newly read in locator block.  If a
		 * new mddb was added, read in the master blocks associated
		 * with the new mddb.  If an mddb was deleted, free the
		 * master blocks associated with deleted mddb.
		 */
		if (found_good_one)  {
			/* Compare old and new view of mddb locator blocks */
			old_lbp = s->s_lbp;
			for (li = 0; li < lbp->lb_loccnt; li++) {
				/* passed to getmasters; not read afterwards */
				int	mn_set;

				lp = &lbp->lb_locators[li];
				old_lp = &old_lbp->lb_locators[li];

				/* If old and new views match, continue */
				if ((lp->l_flags & MDDB_F_ACTIVE) ==
				    (old_lp->l_flags & MDDB_F_ACTIVE))
					continue;

				if (lp->l_flags & MDDB_F_ACTIVE) {
					/*
					 * If new mddb has been added - delete
					 * old mbiarray and get new one.
					 *
					 * When devids are supported, will
					 * need to get dev from devid.
					 */
					if (s->s_mbiarray[li]) {
						free_mbipp(&s->s_mbiarray[li]);
					}
					/*
					 * If getmasters fails, getmasters
					 * will set appropriate error flags.
					 */
					s->s_mbiarray[li] = getmasters(s,
					    md_expldev(lp->l_dev), lp->l_blkno,
					    (uint_t *)&(lp->l_flags), &mn_set);
				} else if (lp->l_flags & MDDB_F_DELETED) {
					/*
					 * If old one has been deleted -
					 * delete old mbiarray.
					 */
					if (s->s_mbiarray[li]) {
						free_mbipp(&s->s_mbiarray[li]);
					}
				}
			}

			/* Free this node's old view of mddb locator blocks */
			kmem_free((caddr_t)s->s_lbp,
				dbtob(s->s_lbp->lb_blkcnt));
			/* The scratch buffer becomes the set's locator block */
			s->s_lbp = lbp;
		} else {
			/* No usable copy was read; discard the scratch buffer */
			if (lbp)
				kmem_free(lbp, dbtob(MDDB_MNLBCNT));
		}
	}

	if (mpp->c_parse_flags & MDDB_PARSE_LOCNM) {
		/*
		 * Remember the current locator names and their block count
		 * so they can be restored on failure or freed on success.
		 */
		lnp = s->s_lnp;
		lbp = s->s_lbp;
		ln_blkcnt = lbp->lb_lnblkcnt;
		s->s_lnp = NULL; /* readlocnames does this anyway */
		for (li = 0; li < lbp->lb_loccnt; li++) {
			lp = &lbp->lb_locators[li];

			if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
			    (lp->l_flags & MDDB_F_EMASTER))
				continue;

			/* Successfully read the locator names */
			if (readlocnames(s, li) == 0)
				break;
		}

		/* li reaching lb_loccnt means the loop never hit its break */
		if (li == lbp->lb_loccnt) {
			/* Did not successfully read locnames; restore lnp */
			s->s_lnp = lnp;
		} else {
			/* readlocnames successful, free old struct */
			kmem_free((caddr_t)lnp, dbtob(ln_blkcnt));
		}
	}

	if (mpp->c_parse_flags & MDDB_PARSE_OPTRECS) {
		mddb_de_ic_t	*dep, *tdep, *first_dep, *dep2;
		mddb_db_t	*dbp;
		mddb_db32_t	*db32p;
		mddb_de32_t	*de32p, *de32p2;
		int		writeout;

		lbp = s->s_lbp;
		/*
		 * Walk through directory block and directory entry incore
		 * linked list looking for optimized resync records.
		 * For each opt record found, re-read in directory block.
		 * The directory block consists of a number of directory
		 * entries.  The directory entry for this opt record will
		 * describe which 2 mddbs actually contain the resync record
		 * since it could have been relocated by the master node
		 * due to mddb failure or mddb deletion.  If this node
		 * is the record owner for this opt record, then write out
		 * the record to the 2 mddbs listed in the directory entry
		 * if the mddbs locations are different than previously known.
		 */
		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
			for (dep = dbp->db_firstentry; dep;
			    dep = dep->de_next) {
				/* Found an opt record */
				if (dep->de_flags & MDDB_F_OPT)
					break;
			}
			/* If no opt records found, go to next dbp */
			if (dep == NULL)
				continue;

			/*
			 * Reread directory block from disk since
			 * master could have rewritten it during fixoptrecord.
			 */
			db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE,
				KM_SLEEP);
			/*
			 * Seed the buffer from the incore block; the read
			 * below takes its block number from db32_blknum.
			 */
			create_db32rec(db32p, dbp);
			for (li = 0; li < lbp->lb_loccnt; li++) {
				lp = &lbp->lb_locators[li];

				if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
				    (lp->l_flags & MDDB_F_EMASTER))
					continue;

				err = readblks(s, (caddr_t)db32p,
					db32p->db32_blknum, 1, li);
				if (err)
					continue;

				/* Reverify db; go to next mddb if bad */
				if ((db32p->db32_magic != MDDB_MAGIC_DB) ||
				    (revchk(MDDB_REV_DB,
					db32p->db32_revision)) ||
				    (crcchk(db32p, &db32p->db32_checksum,
					MDDB_BSIZE, NULL))) {
					continue;
				} else {
					break;
				}
			}
			/*
			 * If all mddbs are unavailable then panic since
			 * this slave cannot be allowed to continue out-of-sync
			 * with the master node.  Since the optimized resync
			 * records are written by all nodes, all nodes must
			 * stay in sync with the master.
			 *
			 * This also handles the case when all storage
			 * connectivity to a slave node has failed.  The
			 * slave node will send an MDDB_OPTRECERR message to
			 * the master node when the slave node has been unable
			 * to write an optimized resync record to both
			 * designated mddbs.  After the master has fixed the
			 * optimized records to be on available mddbs, the
			 * MDDB_PARSE message (with the flag MDDB_PARSE_OPTRECS)
			 * is sent to all slave nodes.  If a slave node is
			 * unable to access any mddb in order to read in the
			 * relocated optimized resync record, then the slave
			 * node must panic.
			 *
			 * NOTE(review): db32p is freed before the panic call
			 * and is dereferenced again below; this relies on
			 * cmn_err(CE_PANIC, ...) not returning - confirm.
			 */
			if (li == lbp->lb_loccnt) {
				kmem_free((caddr_t)db32p, MDDB_BSIZE);
				cmn_err(CE_PANIC, "md: mddb: Node unable to "
					"access any SVM state database "
					"replicas for diskset %s\n",
					s->s_setname);
			}
			/*
			 * Setup temp copy of linked list of de's.
			 * Already have an incore copy, but need to walk
			 * the directory entry list contained in the
			 * new directory block that was just read in above.
			 * After finding the directory entry of an opt record
			 * by walking the incore list, find the corresponding
			 * entry in the temporary list and then update
			 * the incore directory entry record with
			 * the (possibly changed) mddb location stored
			 * for the optimized resync records.
			 */
			/*
			 * The first on-disk entry is taken to start right
			 * after the db32_firstentry field of the block.
			 */
			de32p = (mddb_de32_t *)
			    ((void *) ((caddr_t)
			    (&db32p->db32_firstentry)
			    + sizeof (db32p->db32_firstentry)));
			/*
			 * Each temp entry is sized as mddb_de_ic_t with room
			 * for de32_blkcount mddb_block_t values in place of
			 * the single one counted in the structure size.
			 */
			tdep = (mddb_de_ic_t *)
			    kmem_zalloc(sizeof (mddb_de_ic_t) -
			    sizeof (mddb_block_t) +
			    sizeof (mddb_block_t) *
			    de32p->de32_blkcount, KM_SLEEP);
			de32tode(de32p, tdep);
			first_dep = tdep;
			/* Convert and chain the remaining on-disk entries */
			while (de32p && de32p->de32_next) {
				de32p2 = nextentry(de32p);
				dep2 = (mddb_de_ic_t *)kmem_zalloc(
				    sizeof (mddb_de_ic_t) -
				    sizeof (mddb_block_t) +
				    sizeof (mddb_block_t) *
				    de32p2->de32_blkcount, KM_SLEEP);
				de32tode(de32p2, dep2);
				tdep->de_next = dep2;
				tdep = dep2;
				de32p = de32p2;
			}

			/* Now, walk the incore directory entry list */
			for (dep = dbp->db_firstentry; dep;
			    dep = dep->de_next) {
				if (! (dep->de_flags & MDDB_F_OPT))
					continue;
				/*
				 * Found an opt record in the incore copy.
				 * Find the corresponding entry in the temp
				 * list.  If anything has changed in the
				 * opt record info between the incore copy
				 * and the temp copy, update the incore copy
				 * and set a flag to writeout the opt record
				 * to the new mddb locations.
				 */
				for (tdep = first_dep; tdep;
				    tdep = tdep->de_next) {
					if (dep->de_recid == tdep->de_recid) {
					    writeout = 0;
					    /* Check first mddb location */
					    if ((dep->de_optinfo[0].o_li !=
						tdep->de_optinfo[0].o_li) ||
						(dep->de_optinfo[0].o_flags !=
						tdep->de_optinfo[0].o_flags)) {
						    dep->de_optinfo[0] =
						    tdep->de_optinfo[0];
						    writeout = 1;
					    }
					    /* Check second mddb location */
					    if ((dep->de_optinfo[1].o_li !=
						tdep->de_optinfo[1].o_li) ||
						(dep->de_optinfo[1].o_flags !=
						tdep->de_optinfo[1].o_flags)) {
						    dep->de_optinfo[1] =
						    tdep->de_optinfo[1];
						    writeout = 1;
					    }
					    /* Record owner should rewrite it */
					    if ((writeout) &&
						(dep->de_owner_nodeid ==
						md_set[mpp->c_setno].
						s_nodeid)) {
						    /* result is ignored */
						    (void) writeoptrecord(s,
							dep);
					    }
					    break;
					}
				}
			}
			/*
			 * Update the incore checksum information for this
			 * directory block to match the newly read in checksum.
			 * This should have only changed if the incore and
			 * temp directory entries differed, but it takes
			 * more code to do the check than to just update
			 * the information everytime.
			 */
			dbp->db_checksum = db32p->db32_checksum;

			/* Now free everything */
			tdep = first_dep;
			while (tdep) {
				dep2 = tdep->de_next;
				kmem_free((caddr_t)tdep,
				    sizeofde(tdep));
				tdep = dep2;
			}
			kmem_free((caddr_t)db32p, MDDB_BSIZE);
		}
		rval = 0;
	}
	/* NOTE(review): no goto in this function targets this label */
out:
	single_thread_end(s);
	mddb_setexit_no_parse(s);
	return (rval);
}
11235 
11236 int
11237 mddb_block(mddb_block_parm_t *mbp)
11238 {
11239 	mddb_set_t	*s;
11240 	int		err = 0;
11241 	md_error_t	*ep = &mbp->c_mde;
11242 
11243 	if (mbp->c_setno >= md_nsets)
11244 		return (EINVAL);
11245 
11246 	/*
11247 	 * If the new_master flag is set for this setno we are in the middle
11248 	 * of a reconfig cycle, and blocking or unblocking is not needed.
11249 	 * Hence we can return success immediately
11250 	 */
11251 	if (md_get_setstatus(mbp->c_setno) & MD_SET_MN_NEWMAS_RC) {
11252 		return (0);
11253 	}
11254 
11255 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11256 		return (0);
11257 
11258 	if ((s = mddb_setenter(mbp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
11259 		return (mddbstatus2error(ep, err, NODEV32, mbp->c_setno));
11260 	}
11261 
11262 	if (!(MD_MNSET_SETNO(mbp->c_setno))) {
11263 		mddb_setexit_no_parse(s);
11264 		return (EINVAL);
11265 	}
11266 
11267 	single_thread_start(s);
11268 
11269 	if (mbp->c_blk_flags & MDDB_BLOCK_PARSE)
11270 		md_set_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK);
11271 
11272 	if (mbp->c_blk_flags & MDDB_UNBLOCK_PARSE)
11273 		md_clr_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK);
11274 
11275 	single_thread_end(s);
11276 	mddb_setexit_no_parse(s);
11277 	return (err);
11278 }
11279 
11280 /*
11281  * mddb_optrecfix marks up to 2 mddbs as failed and calls fixoptrecords
11282  * to relocate any optimized resync records to available mddbs.
11283  * This routine is only called on the master node.
11284  *
11285  * Used in a MN diskset when a slave node has failed to write an optimized
11286  * resync record.  The failed mddb information is sent to the master node
11287  * so the master can relocate the optimized records, if possible.  If the
11288  * failed mddb information has a mddb marked as failed that was previously
11289  * marked active on the master, the master sets its incore mddb state to
11290  * EWRITE and sets the PARSE_LOCBLK flag.  The master node then attempts
11291  * to relocate any optimized records on the newly failed mddbs by calling
11292  * fixoptrecords.  (fixoptrecords will set the PARSE_OPTRECS flag if any
11293  * optimized records are relocated.)
11294  *
11295  * When mddb_optrecfix is finished, the ioctl exit code will notice the PARSE
11296  * flags and will send a PARSE message to the slave nodes.  The PARSE_LOCBLK
11297  * flag causes the slave node to re-read in the locator block from disk.
11298  * The PARSE_OPTRECS flag causes the slave node to re-read in the directory
11299  * blocks and write out any optimized resync records that have been
11300  * relocated to a different mddb.
11301  */
11302 int
11303 mddb_optrecfix(mddb_optrec_parm_t *mop)
11304 {
11305 	mddb_set_t		*s;
11306 	int			err = 0;
11307 	mddb_lb_t		*lbp;
11308 	mddb_mnlb_t		*mnlbp;
11309 	mddb_locator_t		*lp;
11310 	int			li;
11311 	mddb_mnsidelocator_t	*mnslp;
11312 	mddb_drvnm_t		*dn;
11313 	int			i, j;
11314 	md_replica_recerr_t	*recerr;
11315 	md_error_t		*ep = &mop->c_mde;
11316 	int			something_changed = 0;
11317 	int			alc, lc;
11318 	int			setno;
11319 
11320 	setno = mop->c_setno;
11321 	if (mop->c_setno >= md_nsets)
11322 		return (EINVAL);
11323 
11324 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11325 		return (0);
11326 
11327 	if ((s = mddb_setenter(mop->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
11328 		return (mddbstatus2error(ep, err, NODEV32, mop->c_setno));
11329 	}
11330 
11331 	if (!(MD_MNSET_SETNO(mop->c_setno))) {
11332 		mddb_setexit(s);
11333 		return (EINVAL);
11334 	}
11335 
11336 	single_thread_start(s);
11337 	lbp = s->s_lbp;
11338 	mnlbp = (mddb_mnlb_t *)lbp;
11339 
11340 	/*
11341 	 * If slave node has seen an mddb failure, but the master node
11342 	 * hasn't encountered this failure, mark the mddb as failed on
11343 	 * the master node and set the something_changed flag to 1.
11344 	 */
11345 	for (i = 0; i < 2; i++) {
11346 		recerr = &mop->c_recerr[i];
11347 		if (recerr->r_flags & MDDB_F_EWRITE) {
11348 			li = recerr->r_li;
11349 			lp = &lbp->lb_locators[li];
11350 			for (j = 0; j < MD_MNMAXSIDES; j++) {
11351 				mnslp = &mnlbp->lb_mnsidelocators[j][li];
11352 				if (mnslp->mnl_sideno == s->s_sideno)
11353 					break;
11354 			}
11355 			/* Do quick check using li */
11356 			if (j != MD_MNMAXSIDES)
11357 				dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
11358 
11359 			if ((j != MD_MNMAXSIDES) &&
11360 			    (strncmp(dn->dn_data, recerr->r_driver_name,
11361 			    MD_MAXDRVNM) == 0) &&
11362 			    (recerr->r_blkno == lp->l_blkno) &&
11363 			    (recerr->r_mnum == mnslp->mnl_mnum)) {
11364 				if ((lp->l_flags & MDDB_F_ACTIVE) ||
11365 				    ((lp->l_flags & MDDB_F_EWRITE) == 0)) {
11366 					something_changed = 1;
11367 					lp->l_flags |= MDDB_F_EWRITE;
11368 					lp->l_flags &= ~MDDB_F_ACTIVE;
11369 				}
11370 			} else {
11371 			/*
11372 			 * Passed in li from slave does not match
11373 			 * the replica in the master's structures.
11374 			 * This could have occurred if a delete
11375 			 * mddb command was running when the
11376 			 * optimized resync record had a failure.
11377 			 * Search all replicas for this entry.
11378 			 * If no match, just ignore.
11379 			 * If a match, set replica in error.
11380 			 */
11381 			    for (li = 0; li < lbp->lb_loccnt; li++) {
11382 				lp = &lbp->lb_locators[li];
11383 				if (lp->l_flags & MDDB_F_DELETED)
11384 					continue;
11385 
11386 				for (j = 0; j < MD_MNMAXSIDES; j++) {
11387 					mnslp =
11388 					    &mnlbp->lb_mnsidelocators[j][li];
11389 					if (mnslp->mnl_sideno == s->s_sideno)
11390 						break;
11391 				}
11392 				if (j == MD_MNMAXSIDES)
11393 					continue;
11394 
11395 				dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
11396 				if ((strncmp(dn->dn_data, recerr->r_driver_name,
11397 				    MD_MAXDRVNM) == 0) &&
11398 				    (recerr->r_blkno == lp->l_blkno) &&
11399 				    (recerr->r_mnum == mnslp->mnl_mnum)) {
11400 					if ((lp->l_flags & MDDB_F_ACTIVE) ||
11401 					    ((lp->l_flags & MDDB_F_EWRITE)
11402 					    == 0)) {
11403 						something_changed = 1;
11404 						lp->l_flags |= MDDB_F_EWRITE;
11405 						lp->l_flags &= ~MDDB_F_ACTIVE;
11406 					}
11407 					break;
11408 				}
11409 			    }
11410 			}
11411 		}
11412 	}
11413 
11414 	/*
11415 	 * If this message changed nothing, then we're done since this
11416 	 * failure has already been handled.
11417 	 * If some mddb state has been changed, send a parse message to
11418 	 * the slave nodes so that the slaves will re-read the locator
11419 	 * block from disk.
11420 	 */
11421 	if (something_changed == 0) {
11422 		single_thread_end(s);
11423 		mddb_setexit(s);
11424 		return (0);
11425 	} else {
11426 		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
11427 	}
11428 
11429 	/*
11430 	 * Scan replicas setting MD_SET_TOOFEW if
11431 	 * 50% or more of the mddbs have seen errors.
11432 	 * Note: Don't call selectreplicas or writeretry
11433 	 * since these routines may end up setting the ACTIVE flag
11434 	 * on a failed mddb if the master is able to access the mddb
11435 	 * but the slave node couldn't.  Need to have the ACTIVE flag
11436 	 * turned off in order to relocate the optimized records to
11437 	 * mddbs that are (hopefully) available on all nodes.
11438 	 */
11439 	alc = 0;
11440 	lc = 0;
11441 	for (li = 0; li < lbp->lb_loccnt; li++) {
11442 		lp = &lbp->lb_locators[li];
11443 		if (lp->l_flags & MDDB_F_DELETED)
11444 			continue;
11445 		lc++;
11446 		if (! (lp->l_flags & MDDB_F_ACTIVE))
11447 			continue;
11448 		alc++;
11449 	}
11450 
11451 	/*
11452 	 * If more than 50% mddbs have failed, then don't relocate opt recs.
11453 	 * The node sending the mddb failure information will detect TOOFEW
11454 	 * and will panic when it attempts to re-write the optimized record.
11455 	 */
11456 	if (alc < ((lc + 1) / 2)) {
11457 		md_set_setstatus(setno, MD_SET_TOOFEW);
11458 		(void) push_lb(s);
11459 		single_thread_end(s);
11460 		mddb_setexit(s);
11461 		return (0);
11462 	}
11463 
11464 	/* Attempt to relocate optimized records that are on failed mddbs */
11465 	(void) fixoptrecords(s);
11466 
11467 	/* Push changed locator block out to disk */
11468 	(void) push_lb(s);
11469 
11470 	/* Recheck for TOOFEW after writing out locator blocks */
11471 	alc = 0;
11472 	lc = 0;
11473 	for (li = 0; li < lbp->lb_loccnt; li++) {
11474 		lp = &lbp->lb_locators[li];
11475 		if (lp->l_flags & MDDB_F_DELETED)
11476 			continue;
11477 		lc++;
11478 		if (! (lp->l_flags & MDDB_F_ACTIVE))
11479 			continue;
11480 		alc++;
11481 	}
11482 
11483 	/* If more than 50% mddbs have failed, then don't relocate opt recs */
11484 	if (alc < ((lc + 1) / 2)) {
11485 		md_set_setstatus(setno, MD_SET_TOOFEW);
11486 		single_thread_end(s);
11487 		mddb_setexit(s);
11488 		return (0);
11489 	}
11490 
11491 	single_thread_end(s);
11492 	mddb_setexit(s);
11493 	return (0);
11494 }
11495 
11496 /*
11497  * Check if incore mddb on master node matches ondisk mddb.
11498  * If not, master writes out incore view to all mddbs.
11499  * Have previously verified that master is an owner of the
11500  * diskset (master has snarfed diskset) and that diskset is
11501  * not stale.
11502  *
11503  * Meant to be called during reconfig cycle during change of master.
11504  * Previous master in diskset may have changed the mddb and
11505  * panic'd before relaying information to slave nodes.  New
11506  * master node just writes out its incore view of the mddb and
11507  * the replay of the change log will resync all the nodes.
11508  *
11509  * Only supported for MN disksets.
11510  *
11511  * Return values:
11512  *	0 - success
11513  *	non-zero - failure
11514  */
11515 int
11516 mddb_check_write_ioctl(mddb_config_t *info)
11517 {
11518 	int			err = 0;
11519 	set_t			setno = info->c_setno;
11520 	mddb_set_t		*s;
11521 	int			li;
11522 	mddb_locator_t		*lp;
11523 	mddb_lb_t		*lbp;
11524 	mddb_mnlb_t		*mnlbp_od;
11525 	mddb_ln_t		*lnp;
11526 	mddb_mnln_t		*mnlnp_od;
11527 	mddb_db_t		*dbp;
11528 	mddb_de_ic_t		*dep;
11529 	int			write_out_mddb;
11530 	md_error_t		*ep = &info->c_mde;
11531 	int			mddb_err = 0;
11532 	int			prev_li = 0;
11533 	int			rval = 0;
11534 	int			alc, lc;
11535 	int			mddbs_present = 0;
11536 
11537 	/* Verify that setno is in valid range */
11538 	if (setno >= md_nsets)
11539 		return (EINVAL);
11540 
11541 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11542 		return (0);
11543 
11544 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
11545 		return (mddbstatus2error(ep, err, NODEV32, setno));
11546 	}
11547 
11548 	/* Calling diskset must be a MN diskset */
11549 	if (!(MD_MNSET_SETNO(setno))) {
11550 		mddb_setexit(s);
11551 		return (EINVAL);
11552 	}
11553 
11554 	/* Re-verify that set is not stale */
11555 	if (md_get_setstatus(setno) & MD_SET_STALE) {
11556 		mddb_setexit(s);
11557 		return (mdmddberror(ep, MDE_DB_STALE,
11558 			NODEV32, setno));
11559 	}
11560 
11561 	lbp = s->s_lbp;
11562 	lnp = s->s_lnp;
11563 
11564 	/*
11565 	 * Previous master could have died during the write of data to
11566 	 * the mddbs so that the ondisk mddbs may not be consistent.
11567 	 * So, need to check the contents of the first and last active mddb
11568 	 * to see if the mddbs need to be rewritten.
11569 	 */
11570 	for (li = 0; li < lbp->lb_loccnt; li++) {
11571 		int	checkcopy_err;
11572 
11573 		lp = &lbp->lb_locators[li];
11574 		/* Find replica that is active */
11575 		if (lp->l_flags & MDDB_F_DELETED)
11576 			continue;
11577 		mddbs_present = 1;
11578 		if (! (lp->l_flags & MDDB_F_ACTIVE))
11579 			continue;
11580 		if (s->s_mbiarray[li] == NULL)
11581 			continue;
11582 		/* Check locator block */
11583 		mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT),
11584 		    KM_SLEEP);
11585 		/* read in on-disk locator block */
11586 		err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li);
11587 
11588 		/* If err, try next mddb */
11589 		if (err) {
11590 			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
11591 			continue;
11592 		}
11593 
11594 		/*
11595 		 * We resnarf all changelog entries for this set.
11596 		 * They may have been altered by the previous master
11597 		 */
11598 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11599 		    for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
11600 			if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) {
11601 				continue;
11602 			}
11603 			/* This has been alloc'ed while joining the set */
11604 			if (dep->de_rb) {
11605 				kmem_free(dep->de_rb, dep->de_recsize);
11606 				dep->de_rb = (mddb_rb32_t *)NULL;
11607 			}
11608 			if (dep->de_rb_userdata) {
11609 				kmem_free(dep->de_rb_userdata, dep->de_reqsize);
11610 				dep->de_rb_userdata = (caddr_t)NULL;
11611 			}
11612 
11613 			err = getrecord(s, dep, li);
11614 			if (err) {
11615 				/*
11616 				 * When we see on error while reading the
11617 				 * changelog entries, we move on to the next
11618 				 * mddb
11619 				 */
11620 				err = 1;
11621 				break; /* out of inner for-loop */
11622 			}
11623 			allocuserdata(dep);
11624 		    }
11625 		    if (err)
11626 			    break; /* out of outer for-loop */
11627 		}
11628 
11629 		/* If err, try next mddb */
11630 		if (err) {
11631 			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
11632 			continue;
11633 		}
11634 
11635 		/* Is incore locator block same as ondisk? */
11636 		if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
11637 									== 1) {
11638 			write_out_mddb = 1;
11639 			kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11640 			break;
11641 		}
11642 
11643 		kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11644 
11645 		/* If lb ok, check locator names */
11646 		mnlnp_od = (mddb_mnln_t *)kmem_zalloc(dbtob(MDDB_MNLNCNT),
11647 		    KM_SLEEP);
11648 		/* read in on-disk locator names */
11649 		err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
11650 			lbp->lb_lnblkcnt, li);
11651 
11652 		/* If err, try next mddb */
11653 		if (err) {
11654 			kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT));
11655 			continue;
11656 		}
11657 
11658 		/* Are incore locator names same as ondisk? */
11659 		if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
11660 									== 1) {
11661 			kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11662 			write_out_mddb = 1;
11663 			break;
11664 		}
11665 
11666 		kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11667 
11668 		/*
11669 		 * Check records in mddb.
11670 		 * If a read error is encountered, set the error flag and
11671 		 * continue to the next mddb.  Otherwise, if incore data is
11672 		 * different from ondisk, then set the flag to write out
11673 		 * the mddb and break out.
11674 		 */
11675 		checkcopy_err = checkcopy(s, li);
11676 		if (checkcopy_err == MDDB_F_EREAD) {
11677 			lp->l_flags |= MDDB_F_EREAD;
11678 			mddb_err = 1;
11679 			continue;
11680 		} else if (checkcopy_err == 1) {
11681 			write_out_mddb = 1;
11682 			break;
11683 		}
11684 		/*
11685 		 * Have found first active mddb and the data is the same as
11686 		 * incore - break out of loop
11687 		 */
11688 		write_out_mddb = 0;
11689 		break;
11690 	}
11691 
11692 	/*
11693 	 * Skip checking for last active mddb if:
11694 	 *	- already found a mismatch in the first active mddb
11695 	 *		(write_out_mddb is 1)  OR
11696 	 * 	- didn't find a readable mddb when looking for first
11697 	 *	  active mddb (there are mddbs present but all failed
11698 	 *	  when read was attempted).
11699 	 *
11700 	 * In either case, go to write_out_mddb label in order to attempt
11701 	 * to write out the data. If < 50% mddbs are available, panic.
11702 	 */
11703 	if ((write_out_mddb == 1) ||
11704 	    ((li == lbp->lb_loccnt) && mddbs_present)) {
11705 		write_out_mddb = 1;
11706 		goto write_out_mddb;
11707 	}
11708 
11709 	/*
11710 	 * Save which index was checked for the first active mddb.  If only 1
11711 	 * active mddb, don't want to recheck the same mddb when looking for
11712 	 * last active mddb.
11713 	 */
11714 	prev_li = li;
11715 
11716 	/*
11717 	 * Now, checking for last active mddb.  If found same index as before
11718 	 * (only 1 active mddb), then skip.
11719 	 */
11720 	for (li = (lbp->lb_loccnt - 1); li >= 0; li--) {
11721 		int	checkcopy_err;
11722 
11723 		lp = &lbp->lb_locators[li];
11724 		/* Find replica that is active */
11725 		if (! (lp->l_flags & MDDB_F_ACTIVE))
11726 			continue;
11727 		if (lp->l_flags & MDDB_F_DELETED)
11728 			continue;
11729 		if (s->s_mbiarray[li] == NULL)
11730 			continue;
11731 		/* If already checked mddb, bail out */
11732 		if (li == prev_li)
11733 			break;
11734 		/* Check locator block */
11735 		mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT),
11736 		    KM_SLEEP);
11737 		/* read in on-disk locator block */
11738 		err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li);
11739 
11740 		/* If err, try next mddb */
11741 		if (err) {
11742 			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
11743 			continue;
11744 		}
11745 
11746 
11747 		/* Is incore locator block same as ondisk? */
11748 		if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
11749 									== 1) {
11750 			kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11751 			write_out_mddb = 1;
11752 			break;
11753 		}
11754 
11755 		kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11756 
11757 		/* If lb ok, check locator names */
11758 		mnlnp_od = (mddb_mnln_t *)
11759 		    kmem_zalloc(dbtob(MDDB_MNLNCNT), KM_SLEEP);
11760 
11761 		/* read in on-disk locator names */
11762 		err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
11763 		    lbp->lb_lnblkcnt, li);
11764 
11765 		/* If err, try next mddb */
11766 		if (err) {
11767 			kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT));
11768 			continue;
11769 		}
11770 
11771 		/* Are incore locator names same as ondisk? */
11772 		if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
11773 									== 1) {
11774 			kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11775 			write_out_mddb = 1;
11776 			break;
11777 		}
11778 
11779 		kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11780 
11781 		/*
11782 		 * Check records in mddb.
11783 		 * If a read error is encountered, set the error flag and
11784 		 * continue to the next mddb.  Otherwise, if incore data is
11785 		 * different from ondisk, then set the flag to write out
11786 		 * the mddb and break out.
11787 		 */
11788 		checkcopy_err = checkcopy(s, li);
11789 		if (checkcopy_err == MDDB_F_EREAD) {
11790 			lp->l_flags |= MDDB_F_EREAD;
11791 			mddb_err = 1;
11792 			continue;
11793 		} else if (checkcopy_err == 1) {
11794 			write_out_mddb = 1;
11795 			break;
11796 		}
11797 		/*
11798 		 * Have found last active mddb and the data is the same as
11799 		 * incore - break out of loop
11800 		 */
11801 		write_out_mddb = 0;
11802 		break;
11803 	}
11804 
11805 	/*
11806 	 * If ondisk and incore versions of the mddb don't match, then
11807 	 * write out this node's incore version to disk.
11808 	 * Or, if unable to read a copy of the mddb, attempt to write
11809 	 * out a new one.
11810 	 */
11811 write_out_mddb:
11812 	if (write_out_mddb) {
11813 		/* Recompute free blocks based on incore information */
11814 		computefreeblks(s); /* set up free block bits */
11815 
11816 		/*
11817 		 * Write directory entries and record blocks.
11818 		 * Use flag MDDB_WRITECOPY_SYNC so that writecopy
11819 		 * routine won't write out change log records.
11820 		 */
11821 		for (li = 0; li < lbp->lb_loccnt; li++) {
11822 			lp = &lbp->lb_locators[li];
11823 			/* Don't write to inactive or deleted mddbs */
11824 			if (! (lp->l_flags & MDDB_F_ACTIVE))
11825 				continue;
11826 			if (lp->l_flags & MDDB_F_DELETED)
11827 				continue;
11828 			if (s->s_mbiarray[li] == NULL)
11829 				continue;
11830 			/* If encounter a write error, save it for later */
11831 			if (writecopy(s, li, MDDB_WRITECOPY_SYNC)) {
11832 				lp->l_flags |= MDDB_F_EWRITE;
11833 				mddb_err = 1;
11834 			}
11835 		}
11836 
11837 		/*
11838 		 * Write out locator blocks to all replicas.
11839 		 * push_lb will set MDDB_F_EWRITE on replicas that fail.
11840 		 */
11841 		if (push_lb(s))
11842 			mddb_err = 1;
11843 
11844 		/* Write out locator names to all replicas */
11845 		lnp = s->s_lnp;
11846 		uniqtime32(&lnp->ln_timestamp);
11847 		lnp->ln_revision = MDDB_REV_MNLN;
11848 		crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
11849 
11850 		/* writeall sets MDDB_F_EWRITE if writes fails to replica */
11851 		if (writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
11852 		    lbp->lb_lnblkcnt, 0))
11853 			mddb_err = 1;
11854 
11855 		/*
11856 		 * The writes to the replicas above would have set
11857 		 * the MDDB_F_EWRITE flags if any write error was
11858 		 * encountered.
11859 		 * If < 50% of the mddbs are available, panic.
11860 		 */
11861 		lc = alc = 0;
11862 		for (li = 0; li < lbp->lb_loccnt; li++) {
11863 			lp = &lbp->lb_locators[li];
11864 			if (lp->l_flags & MDDB_F_DELETED)
11865 				continue;
11866 			lc++;
11867 			/*
11868 			 * If mddb:
11869 			 *	- is not active (previously had an error)
11870 			 *	- had an error reading the master blocks  or
11871 			 *	- had an error in writing to the mddb
11872 			 * then don't count this mddb in the active count.
11873 			 */
11874 			if (! (lp->l_flags & MDDB_F_ACTIVE) ||
11875 			    (lp->l_flags & MDDB_F_EMASTER) ||
11876 			    (lp->l_flags & MDDB_F_EWRITE))
11877 				continue;
11878 			alc++;
11879 		}
11880 		if (alc < ((lc + 1) / 2)) {
11881 			cmn_err(CE_PANIC,
11882 			    "md: Panic due to lack of DiskSuite state\n"
11883 			    " database replicas. Fewer than 50%% of "
11884 			    "the total were available,\n so panic to "
11885 			    "ensure data integrity.");
11886 		}
11887 	}
11888 
11889 	/*
11890 	 * If encountered an error during checking or writing of
11891 	 * mddbs, call selectreplicas so that replica error can
11892 	 * be properly handled. This will involve another attempt
11893 	 * to write the mddb out to any mddb marked MDDB_F_EWRITE.
11894 	 * If mddb still fails, it will have the MDDB_F_ACTIVE bit
11895 	 * turned off. Set the MDDB_SCANALLSYNC flag so that
11896 	 * selectreplicas doesn't overwrite the change log entries.
11897 	 *
11898 	 * Set the PARSE_LOCBLK flag in the mddb_set structure to show
11899 	 * that the locator block has been changed.
11900 	 */
11901 	if (mddb_err) {
11902 		(void) selectreplicas(s, MDDB_SCANALLSYNC);
11903 		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
11904 	}
11905 
11906 write_out_end:
11907 	mddb_setexit(s);
11908 	return (rval);
11909 }
11910 
11911 /*
11912  * Set/reset/get set flags in set structure.
11913  * Used during reconfig cycle
11914  * Only supported for MN disksets.
11915  *
11916  * Return values:
11917  *	0 - success
11918  *	non-zero - failure
11919  */
11920 int
11921 mddb_setflags_ioctl(mddb_setflags_config_t *info)
11922 {
11923 	set_t			setno = info->sf_setno;
11924 
11925 	/* Verify that setno is in valid range */
11926 	if (setno >= md_nsets)
11927 		return (EINVAL);
11928 
11929 	/*
11930 	 * When setting the flags, the set may not
11931 	 * be snarfed yet. So, don't check for SNARFED or MNset
11932 	 * and don't call mddb_setenter.
11933 	 * In order to discourage bad ioctl calls,
11934 	 * verify that magic field in structure is set correctly.
11935 	 */
11936 	if (info->sf_magic != MDDB_SETFLAGS_MAGIC)
11937 		return (EINVAL);
11938 
11939 	switch (info->sf_flags) {
11940 	case MDDB_NM_SET:
11941 		if (info->sf_setflags & MD_SET_MN_NEWMAS_RC)
11942 			md_set_setstatus(setno, MD_SET_MN_NEWMAS_RC);
11943 		if (info->sf_setflags & MD_SET_MN_START_RC)
11944 			md_set_setstatus(setno, MD_SET_MN_START_RC);
11945 		if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC)
11946 			md_set_setstatus(setno, MD_SET_MN_MIR_STATE_RC);
11947 		break;
11948 
11949 	case MDDB_NM_RESET:
11950 		if (info->sf_setflags & MD_SET_MN_NEWMAS_RC)
11951 			md_clr_setstatus(setno, MD_SET_MN_NEWMAS_RC);
11952 		if (info->sf_setflags & MD_SET_MN_START_RC)
11953 			md_clr_setstatus(setno, MD_SET_MN_START_RC);
11954 		if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC)
11955 			md_clr_setstatus(setno, MD_SET_MN_MIR_STATE_RC);
11956 		break;
11957 
11958 	case MDDB_NM_GET:
11959 		info->sf_setflags = md_get_setstatus(setno) &
11960 		    (MD_SET_MN_NEWMAS_RC|MD_SET_MN_START_RC|
11961 		    MD_SET_MN_MIR_STATE_RC);
11962 		break;
11963 	}
11964 
11965 	return (0);
11966 }
11967 
11968 int
11969 md_update_minor(
11970 	set_t	setno,
11971 	side_t	side,
11972 	mdkey_t	key
11973 )
11974 {
11975 	struct nm_next_hdr	*nh;
11976 	struct nm_name		*n;
11977 	char			*shn;
11978 	int			retval = 1;
11979 
11980 	/*
11981 	 * Load the devid name space if it exists
11982 	 */
11983 	(void) md_load_namespace(setno, NULL, NM_DEVID);
11984 	if (! md_load_namespace(setno, NULL, 0L)) {
11985 		/*
11986 		 * Unload the devid namespace
11987 		 */
11988 		(void) md_unload_namespace(setno, NM_DEVID);
11989 		return (0);
11990 	}
11991 
11992 	rw_enter(&nm_lock.lock, RW_READER);
11993 
11994 	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
11995 		retval = 0;
11996 		goto out;
11997 	}
11998 
11999 	/*
12000 	 * Look up the key
12001 	 */
12002 	if ((n = lookup_entry(nh, setno, side, key, NODEV64, 0L)) != NULL) {
12003 		/*
12004 		 * Find the entry, update its n_minor if metadevice
12005 		 */
12006 		if ((shn = (char *)getshared_name(setno, n->n_drv_key, 0L))
12007 		    == NULL) {
12008 			retval = 0;
12009 			goto out;
12010 		}
12011 
12012 		if (strcmp(shn, "md") == 0) {
12013 			n->n_minor = MD_MKMIN(setno, MD_MIN2UNIT(n->n_minor));
12014 		}
12015 	}
12016 
12017 out:
12018 	rw_exit(&nm_lock.lock);
12019 	return (retval);
12020 }
12021 
12022 static void
12023 md_imp_nm(
12024 	mddb_set_t	*s
12025 )
12026 {
12027 	mddb_db_t		*dbp;
12028 	mddb_de_ic_t		*dep;
12029 	struct nm_rec_hdr	*hdr;
12030 	struct nm_header	*hhdr;
12031 	set_t			setno = s->s_setno;
12032 
12033 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
12034 		for (dep = dbp->db_firstentry; dep != NULL;
12035 		    dep = dep->de_next) {
12036 			switch (dep->de_type1) {
12037 
12038 			case MDDB_NM_HDR:
12039 			case MDDB_DID_NM_HDR:
12040 
12041 				hhdr = (struct nm_header *)
12042 				    dep->de_rb_userdata;
12043 
12044 				hdr = &hhdr->h_names;
12045 				if (hdr->r_next_recid > 0) {
12046 					hdr->r_next_recid = MAKERECID(setno,
12047 					    DBID(hdr->r_next_recid));
12048 				}
12049 
12050 				hdr = &hhdr->h_shared;
12051 				if (hdr->r_next_recid > 0) {
12052 					hdr->r_next_recid = MAKERECID(setno,
12053 					    DBID(hdr->r_next_recid));
12054 				}
12055 				break;
12056 
12057 			case MDDB_NM:
12058 			case MDDB_DID_NM:
12059 			case MDDB_SHR_NM:
12060 			case MDDB_DID_SHR_NM:
12061 
12062 				hdr = (struct nm_rec_hdr *)
12063 				    dep->de_rb_userdata;
12064 
12065 				if (hdr->r_next_recid > 0) {
12066 					hdr->r_next_recid = MAKERECID
12067 					    (setno, DBID(hdr->r_next_recid));
12068 				}
12069 				break;
12070 
12071 			default:
12072 				break;
12073 			}
12074 		}
12075 	}
12076 }
12077 
12078 static int
12079 update_db_rec(
12080 	mddb_set_t	*s
12081 )
12082 {
12083 	mddb_db_t	*dbp;
12084 	mddb_de_ic_t	*dep;
12085 	mddb_recid_t	ids[2];
12086 
12087 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
12088 		for (dep = dbp->db_firstentry; dep != NULL;
12089 		    dep = dep->de_next) {
12090 			if (! (dep->de_flags & MDDB_F_OPT)) {
12091 				ids[0] = MAKERECID(s->s_setno, dep->de_recid);
12092 				ids[1] = 0;
12093 				if (mddb_commitrecs(ids)) {
12094 					return (MDDB_E_NORECORD);
12095 				}
12096 			}
12097 		}
12098 	}
12099 	return (0);
12100 }
12101 
12102 static int
12103 update_mb(
12104 	mddb_set_t	*s
12105 )
12106 {
12107 	mddb_ri_t	*rip;
12108 	int	err = 0;
12109 
12110 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
12111 
12112 		if (md_get_setstatus(s->s_setno) &
12113 			MD_SET_REPLICATED_IMPORT) {
12114 			/*
12115 			 * It is a replicated set
12116 			 */
12117 			if (rip->ri_devid == (ddi_devid_t)NULL) {
12118 				return (-1);
12119 			}
12120 			err = update_mb_devid(s, rip, rip->ri_devid);
12121 		} else {
12122 			/*
12123 			 * It is a non-replicated set
12124 			 * and there is no need to update
12125 			 * devid
12126 			 */
12127 			err = update_mb_devid(s, rip, NULL);
12128 		}
12129 
12130 		if (err)
12131 			return (err);
12132 	}
12133 
12134 	return (0);
12135 }
12136 
12137 static int
12138 update_setname(
12139 	set_t	setno
12140 )
12141 {
12142 	struct nm_next_hdr	*nh;
12143 	struct nm_shared_name	*shn, *new_shn;
12144 	char			*prefix = "/dev/md/";
12145 	char			*shrname;
12146 	int			len;
12147 	mdkey_t			o_key;
12148 	uint32_t		o_count, o_data;
12149 	mddb_recid_t		recid, ids[3];
12150 	int			err = 0;
12151 	mddb_set_t		*dbp;
12152 
12153 	/* Import setname */
12154 	dbp = (mddb_set_t *)md_set[setno].s_db;
12155 	len = strlen(prefix) + strlen(dbp->s_setname) + strlen("/dsk/") + 1;
12156 	shrname = kmem_zalloc(len, KM_SLEEP);
12157 	(void) sprintf(shrname, "%s%s%s", prefix, dbp->s_setname, "/dsk/");
12158 
12159 	rw_enter(&nm_lock.lock, RW_WRITER);
12160 	if ((nh = get_first_record(setno, 0, NM_SHARED)) == NULL) {
12161 		err = MD_KEYBAD;
12162 		goto out;
12163 	}
12164 
12165 	if ((shn = (struct nm_shared_name *)lookup_shared_entry(nh,
12166 	    0, prefix, NULL, NM_SHARED | NM_IMP_SHARED)) == NULL) {
12167 		/*
12168 		 * No metadevice is okay
12169 		 */
12170 		err = 0;
12171 		goto out;
12172 	}
12173 
12174 	/*
12175 	 * We have it, go ahead and update the namespace.
12176 	 */
12177 	o_key = shn->sn_key;
12178 	o_count = shn->sn_count;
12179 	o_data = shn->sn_data;
12180 
12181 	if (remove_shared_entry(nh, o_key, NULL, 0L | NM_IMP_SHARED |
12182 	    NM_NOCOMMIT)) {
12183 		err = MD_KEYBAD;
12184 		goto out;
12185 	}
12186 	if ((new_shn = (struct nm_shared_name *)alloc_entry(
12187 	    nh, md_set[setno].s_nmid, len, NM_SHARED |
12188 	    NM_NOCOMMIT, &recid)) == NULL) {
12189 		err = MD_KEYBAD;
12190 		goto out;
12191 	}
12192 
12193 	new_shn->sn_key = o_key;
12194 	new_shn->sn_count = o_count;
12195 	new_shn->sn_data = o_data;
12196 	new_shn->sn_namlen = (ushort_t)len;
12197 	(void) strcpy(new_shn->sn_name, shrname);
12198 
12199 	ids[0] = recid;
12200 	ids[1] = md_set[setno].s_nmid;
12201 	ids[2] = 0;
12202 	err = mddb_commitrecs(ids);
12203 
12204 out:
12205 	if (shrname)
12206 		kmem_free(shrname, len);
12207 	rw_exit(&nm_lock.lock);
12208 	return (err);
12209 }
12210 
12211 static int
12212 md_imp_db(
12213 	set_t	setno
12214 )
12215 {
12216 	mddb_set_t	*s;
12217 	int		err = 0;
12218 	mddb_dt_t	*dtp;
12219 
12220 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
12221 		return (err);
12222 	}
12223 
12224 	/* Update dt */
12225 	if ((dtp = (mddb_dt_t *)md_set[setno].s_dtp) != NULL) {
12226 		crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);
12227 	}
12228 
12229 	if ((err = dt_write(s)) != 0) {
12230 		mddb_setexit(s);
12231 		return (err);
12232 	}
12233 
12234 	/* Update lb */
12235 	if ((err = writelocall(s)) != 0) {
12236 		mddb_setexit(s);
12237 		return (err);
12238 	}
12239 
12240 
12241 	/* Update mb */
12242 	if ((err = update_mb(s)) != 0) {
12243 		mddb_setexit(s);
12244 		return (err);
12245 	}
12246 
12247 	mddb_setexit(s);
12248 
12249 	/* Update db records */
12250 	if ((err = update_db_rec(s)) != 0)
12251 		return (err);
12252 
12253 	/* Update setname embedded in the namespace */
12254 	err = update_setname(setno);
12255 
12256 	return (err);
12257 }
12258 
12259 static void
12260 md_dr_add(
12261 	md_set_record	*sr,
12262 	md_drive_record	*dr
12263 )
12264 {
12265 	md_drive_record	*drv;
12266 
12267 	if (sr->sr_driverec == 0) {
12268 		sr->sr_driverec = dr->dr_selfid;
12269 		return;
12270 	}
12271 
12272 	for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec);
12273 	    drv->dr_nextrec != 0;
12274 	    drv = (md_drive_record *)mddb_getrecaddr(drv->dr_nextrec))
12275 		;
12276 	drv->dr_nextrec = dr->dr_selfid;
12277 }
12278 
12279 static void
12280 md_setup_recids(
12281 	md_set_record	*sr,
12282 	mddb_recid_t	**ids,
12283 	size_t		size
12284 )
12285 {
12286 	md_drive_record	*drv;
12287 	int		cnt;
12288 	mddb_recid_t	*recids;
12289 
12290 	recids = (mddb_recid_t *)kmem_zalloc(sizeof (mddb_recid_t)
12291 	    * size, KM_SLEEP);
12292 	recids[0] = sr->sr_selfid;
12293 	cnt = 1;
12294 
12295 	for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec);
12296 	    /* CSTYLED */
12297 	    drv != NULL;) {
12298 		recids[cnt++] = drv->dr_selfid;
12299 		if (drv->dr_nextrec != 0)
12300 			drv = (md_drive_record *)mddb_getrecaddr
12301 			    (drv->dr_nextrec);
12302 		else
12303 			drv = NULL;
12304 	}
12305 	recids[cnt] = 0;
12306 	*ids = &recids[0];
12307 }
12308 
12309 static int
12310 md_imp_create_set(
12311 	set_t	setno
12312 )
12313 {
12314 	mddb_set_t	*s;
12315 	int		drc = 0, err = 0;
12316 	size_t		sr_size = sizeof (md_set_record);
12317 	md_set_record	*sr;
12318 	mddb_recid_t	sr_recid, dr_recid, *ids = NULL;
12319 	mddb_ri_t	*rip, *trip;
12320 	md_drive_record	*dr;
12321 	size_t		dr_size = sizeof (md_drive_record);
12322 	mdkey_t		dr_key;
12323 
12324 
12325 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
12326 		return (err);
12327 
12328 	/* Create and fill in set record */
12329 	if ((sr_recid = mddb_createrec(sr_size, MDDB_USER, MDDB_UR_SR,
12330 	    MD_CRO_32BIT, MD_LOCAL_SET)) < 0) {
12331 		mddb_setexit(s);
12332 		return (MDDB_E_INVALID);
12333 	}
12334 
12335 	sr = (md_set_record *)mddb_getrecaddr(sr_recid);
12336 	sr->sr_selfid = sr_recid;
12337 	sr->sr_setno = s->s_setno;
12338 	(void) strcpy(sr->sr_setname, s->s_setname);
12339 	uniqtime32(&sr->sr_ctime);
12340 	sr->sr_genid = 0;
12341 	sr->sr_revision = MD_SET_RECORD_REVISION;
12342 	sr->sr_flags |= MD_SR_OK;
12343 	sr->sr_mhiargs = defmhiargs;
12344 	(void) strcpy(sr->sr_nodes[0], utsname.nodename);
12345 
12346 	/* Create and fillin drive records */
12347 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
12348 		/*
12349 		 * Add entry and create the record
12350 		 */
12351 		if ((dr_key = md_setdevname(MD_LOCAL_SET, 1, MD_KEYWILD,
12352 		    rip->ri_driver, md_getminor(rip->ri_dev),
12353 		    rip->ri_devname, setno)) == 0)
12354 			continue;
12355 
12356 		if (dr_key < 0) {
12357 			mddb_setexit(s);
12358 			return (MD_KEYBAD);
12359 		}
12360 
12361 		if ((dr_recid = mddb_createrec(dr_size, MDDB_USER,
12362 		    MDDB_UR_DR, MD_CRO_32BIT, MD_LOCAL_SET)) < 0) {
12363 			mddb_setexit(s);
12364 			return (MDDB_E_INVALID);
12365 		}
12366 
12367 		dr = (md_drive_record *)mddb_getrecaddr(dr_recid);
12368 		dr->dr_selfid = dr_recid;
12369 
12370 		/*
12371 		 * We need to check to see if the drive on
12372 		 * the rip has a replica. If it doesn't have
12373 		 * a replica, then we need to set the dr_dbcnt
12374 		 * and dr_dbsize to 0 to reflect that.
12375 		 */
12376 		if (rip->ri_mbip == NULL) {
12377 			dr->dr_dbcnt = 0;
12378 			dr->dr_dbsize = 0;
12379 		} else {
12380 			dr->dr_dbcnt = 1;
12381 
12382 			for (trip = s->s_rip; trip != NULL;
12383 			    trip = trip->ri_next) {
12384 
12385 				if (trip == rip)
12386 					continue;
12387 
12388 				if ((trip->ri_dev == rip->ri_dev) &&
12389 				    (strcmp(trip->ri_devname, rip->ri_devname)
12390 				    == 0))
12391 					dr->dr_dbcnt++;
12392 			}
12393 
12394 			dr->dr_dbsize = rip->ri_mbip->mbi_mddb_mb.mb_blkcnt + 1;
12395 		}
12396 		dr->dr_key = dr_key;
12397 		uniqtime32(&dr->dr_ctime);
12398 		dr->dr_genid = 1;
12399 		dr->dr_revision = MD_DRIVE_RECORD_REVISION;
12400 		dr->dr_flags = MD_SR_OK;
12401 		drc++;
12402 
12403 		/* Add on the linked list */
12404 		(void) md_dr_add(sr, dr);
12405 	}
12406 
12407 	/*
12408 	 * Alloc and setup recids which include set record
12409 	 */
12410 	(void) md_setup_recids(sr, &ids, drc + 2);
12411 
12412 	/*
12413 	 * Commit all the records
12414 	 */
12415 	err = mddb_commitrecs(ids);
12416 
12417 	if (ids)
12418 		kmem_free(ids, sizeof (mddb_recid_t) * (drc + 2));
12419 	mddb_setexit(s);
12420 	return (err);
12421 }
12422 
12423 /*
12424  * namespace is loaded before this is called.
12425  * The purpose of this function is to update the device ids in the entire
12426  * namespace using the data in the ri structure. Compare the devid found in
12427  * the namespace with ri_old_devid and if they are the same, update with the
12428  * devid in ri_devid.
12429  */
12430 static int
12431 md_imp_update_namespace_did(mddb_set_t *s)
12432 {
12433 	set_t			setno = s->s_lbp->lb_setno;
12434 	struct nm_next_hdr	*nh;
12435 	mdkey_t			key = MD_KEYWILD;
12436 	side_t			side = MD_SIDEWILD;
12437 	mddb_ri_t		*rip = NULL;
12438 	mddb_recid_t		recids[3];
12439 	struct did_min_name	*n;
12440 	struct nm_next_hdr	*did_shr_nh;
12441 	struct did_shr_name	*shr_n;
12442 	mdkey_t			ent_did_key;
12443 	uint32_t		ent_did_count;
12444 	uint32_t		ent_did_data;
12445 	size_t			ent_size, size;
12446 	ddi_devid_t		devid = NULL;
12447 	struct did_shr_name	*shn;
12448 	size_t			offset;
12449 	struct nm_next_hdr	*this_did_shr_nh;
12450 
12451 	/*
12452 	 * It is okay if we dont have any configuration
12453 	 */
12454 	offset = (sizeof (struct devid_shr_rec) - sizeof (struct did_shr_name));
12455 	if ((nh = get_first_record(setno, 0, NM_DEVID | NM_NOTSHARED))
12456 	    == NULL) {
12457 		return (0);
12458 	}
12459 	while ((key = md_getnextkey(setno, side, key, NULL)) != MD_KEYWILD) {
12460 		/* check out every entry in the namespace */
12461 		if ((n = (struct did_min_name *)lookup_entry(nh, setno,
12462 		    side, key, NODEV64, NM_DEVID)) == NULL) {
12463 			break;
12464 		} else {
12465 			did_shr_nh = get_first_record(setno, 0, NM_DEVID |
12466 			    NM_SHARED);
12467 			if (did_shr_nh == NULL) {
12468 				return (ENOENT);
12469 			}
12470 			this_did_shr_nh = did_shr_nh->nmn_nextp;
12471 			shr_n = (struct did_shr_name *)lookup_shared_entry(
12472 			    did_shr_nh, n->min_devid_key, (char *)0,
12473 			    &recids[0], NM_DEVID);
12474 			if (shr_n == NULL) {
12475 				return (ENOENT);
12476 			}
12477 			rw_enter(&nm_lock.lock, RW_WRITER);
12478 			devid = (ddi_devid_t)shr_n->did_devid;
12479 			/* find this devid in the incore replica  */
12480 			for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
12481 				if (ddi_devid_compare(devid, rip->ri_old_devid)
12482 				    == 0) {
12483 					/*
12484 					 * found the corresponding entry
12485 					 * update with new devid
12486 					 */
12487 					/* first remove old devid info */
12488 					ent_did_key = shr_n ->did_key;
12489 					ent_did_count = shr_n->did_count;
12490 					ent_did_data = shr_n->did_data;
12491 					ent_size = DID_SHR_NAMSIZ(shr_n);
12492 					size = ((struct nm_rec_hdr *)
12493 					    this_did_shr_nh->nmn_record)->
12494 					    r_used_size - offset - ent_size;
12495 					if (size == 0) {
12496 						(void) bzero(shr_n, ent_size);
12497 					} else {
12498 						(void) ovbcopy((caddr_t)shr_n +
12499 						    ent_size, shr_n, size);
12500 						(void) bzero((caddr_t)shr_n +
12501 						    size, ent_size);
12502 					}
12503 					((struct nm_rec_hdr *)this_did_shr_nh->
12504 					    nmn_record)->r_used_size -=
12505 					    ent_size;
12506 					/* add in new devid info */
12507 					if ((shn = (struct did_shr_name *)
12508 					    alloc_entry(did_shr_nh,
12509 					    md_set[setno].s_did_nmid,
12510 					    ddi_devid_sizeof(rip->ri_devid),
12511 					    NM_DEVID | NM_SHARED | NM_NOCOMMIT,
12512 					    &recids[0])) == NULL) {
12513 						rw_exit(&nm_lock.lock);
12514 						return (ENOMEM);
12515 					}
12516 					shn->did_key = ent_did_key;
12517 					shn->did_count = ent_did_count;
12518 					ent_did_data |= NM_DEVID_VALID;
12519 					shn->did_data = ent_did_data;
12520 					shn->did_size = ddi_devid_sizeof(
12521 					    rip->ri_devid);
12522 					bcopy((void *)rip->ri_devid, (void *)
12523 					    shn->did_devid, shn->did_size);
12524 					recids[1] = md_set[setno].s_nmid;
12525 					recids[2] = 0;
12526 					mddb_commitrecs_wrapper(recids);
12527 				}
12528 			}
12529 			rw_exit(&nm_lock.lock);
12530 		}
12531 	}
12532 	return (0);
12533 }
12534 
12535 /*ARGSUSED*/
12536 int
12537 md_imp_snarf_set(
12538 	set_t	*setnum,
12539 	int	mode
12540 )
12541 {
12542 	set_t		setno = *setnum; /* import setno */
12543 	mddb_set_t	*s;
12544 	int		i, err = 0;
12545 	md_ops_t	*ops;
12546 
12547 	if (setno >= md_nsets) {
12548 		return (EINVAL);
12549 	}
12550 
12551 	md_haltsnarf_enter(setno);
12552 	if (md_get_setstatus(setno) & MD_SET_IMPORT) {
12553 		goto out;
12554 	}
12555 
12556 	/* Set the bit first otherwise load_old_replicas can fail */
12557 	md_set_setstatus(setno, MD_SET_IMPORT);
12558 
12559 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
12560 		goto out;
12561 	}
12562 
12563 	/*
12564 	 * Upon completion of load_old_replicas, the old setno is
12565 	 * restored from the disk so we need to reset
12566 	 */
12567 	s->s_lbp->lb_setno = setno;
12568 
12569 	/*
12570 	 * Fixup the NM records before loading namespace
12571 	 */
12572 	(void) md_imp_nm(s);
12573 	mddb_setexit(s);
12574 
12575 	/*
12576 	 * Load the devid name space if it exists
12577 	 * and ask each module to fixup unit records
12578 	 */
12579 	if (!md_load_namespace(setno, NULL, NM_DEVID)) {
12580 		err = ENOENT;
12581 		goto cleanup;
12582 	}
12583 	if (!md_load_namespace(setno, NULL, 0L)) {
12584 		(void) md_unload_namespace(setno, NM_DEVID);
12585 		err = ENOENT;
12586 		goto cleanup;
12587 	}
12588 
12589 	do {
12590 		i = 0;
12591 		for (ops = md_opslist; ops != NULL; ops = ops->md_next)
12592 			if (ops->md_imp_set != NULL)
12593 				i += ops->md_imp_set(setno);
12594 	} while (i);
12595 
12596 	/*
12597 	 * Fixup
12598 	 *	(1) locator block
12599 	 *	(2) locator name block if necessary
12600 	 *	(3) master block
12601 	 *	(4) directory block
12602 	 * calls appropriate writes to push changes out
12603 	 */
12604 	if ((err = md_imp_db(setno)) != 0)
12605 		goto cleanup;
12606 
12607 	/*
12608 	 * Create set in MD_LOCAL_SET
12609 	 */
12610 	if ((err = md_imp_create_set(setno)) != 0)
12611 		goto cleanup;
12612 
12613 	/*
12614 	 * update the namespace device ids if necessary (ie. block copy disk)
12615 	 */
12616 	if ((md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT)) {
12617 		if ((err = md_imp_update_namespace_did(s)) != 0) {
12618 			goto cleanup;
12619 		}
12620 	}
12621 
12622 cleanup:
12623 	/*
12624 	 * Halt the set
12625 	 */
12626 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
12627 	(void) md_halt_set(setno, MD_HALT_ALL);
12628 	rw_exit(&md_unit_array_rw.lock);
12629 
12630 	/*
12631 	 * Unload the namespace for the imported set
12632 	 */
12633 	mutex_enter(&mddb_lock);
12634 	mddb_unload_set(setno);
12635 	mutex_exit(&mddb_lock);
12636 
12637 out:
12638 	md_haltsnarf_exit(setno);
12639 	md_clr_setstatus(setno, MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);
12640 	return (err);
12641 }
12642 #endif	/* MDDB_FAKE */
12643