xref: /onnv-gate/usr/src/uts/common/io/lvm/md/md_mddb.c (revision 1945:74cee1cd404b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/conf.h>
30 #include <sys/time.h>
31 #include <sys/uio.h>
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/systeminfo.h>
35 #include <sys/sysmacros.h>
36 #include <sys/buf.h>
37 #include <sys/kmem.h>
38 #include <sys/file.h>
39 #include <sys/open.h>
40 #include <sys/debug.h>
41 #include <sys/stat.h>
42 #include <sys/lvm/mdvar.h>
43 #include <sys/lvm/md_crc.h>
44 #include <sys/lvm/md_convert.h>
45 #include <sys/types.h>
46 #include <sys/kmem.h>
47 #include <sys/lvm/mdmn_commd.h>
48 #include <sys/cladm.h>
49 
/*
 * Default mhd_mhiargs_t settings.
 * NOTE(review): the meaning and units of the individual values are
 * determined by the mhd_mhiargs_t declaration, which is not visible
 * here -- confirm against that header before changing them.
 */
mhd_mhiargs_t	defmhiargs = {
	1000,
	{ 6000, 6000, 30000 }
};

/*
 * MDDB is defined ahead of the second group of includes below.
 * NOTE(review): presumably those headers test it to expose declarations
 * private to this file -- confirm in the headers.
 */
#define	MDDB
56 
57 #include <sys/lvm/mdvar.h>
58 #include <sys/lvm/mdmed.h>
59 #include <sys/lvm/md_names.h>
60 #include <sys/cred.h>
61 #include <sys/ddi.h>
62 #include <sys/sunddi.h>
63 #include <sys/esunddi.h>
64 
65 #include <sys/sysevent/eventdefs.h>
66 #include <sys/sysevent/svm.h>
67 
/* Defined outside this file. */
extern char svm_bootpath[];

/*
 * Tunables.
 * NOTE(review): the code that consumes md_maxbootlist, mddb_maxbufheaders
 * and mddb_maxcopies is not in this part of the file; confirm their exact
 * effect there before changing the defaults.
 */
int			md_maxbootlist = MAXBOOTLIST;
static ulong_t		mddb_maxblocks = 0;	/* tune for small records */
static int		mddb_maxbufheaders = 50;
static uint_t		mddb_maxcopies = MDDB_NLB;

/*
 * If this is set, more detailed messages about DB init will be given, instead
 * of just the MDE_DB_NODB.
 */
static int		mddb_db_err_detail = 0;

/*
 * This lock is used to single-thread load/unload of all sets
 */
static kmutex_t		mddb_lock;

/*
 * You really do NOT want to change this boolean.
 * It can be VERY dangerous to do so.  Loss of
 * data may occur. USE AT YOUR OWN RISK!!!!
 */
static int		mddb_allow_half = 0;
/*
 * For mirrored root allow reboot with only half the replicas available
 * Flag inserted for Santa Fe project.
 */
int mirrored_root_flag;

/* True when c is a space, tab, carriage return or newline. */
#define	ISWHITE(c)	(((c) == ' ') || ((c) == '\t') || \
			    ((c) == '\r') || ((c) == '\n'))
/* True when c is one of the characters '0' through '9'. */
#define	ISNUM(c)	(((c) >= '0') && ((c) <= '9'))

/* Address of the s_dbmx mutex belonging to set number setno. */
#define	SETMUTEX(setno)	(&md_set[setno].s_dbmx)

extern md_krwlock_t	md_unit_array_rw;	/* md.c */
extern set_t		md_nsets;		/* md.c */
extern int		md_nmedh;		/* md.c */
extern md_set_t		md_set[];		/* md.c */
/*
 * Optional strategy test hook.  mddb_rwdata() calls it when it is non-NULL
 * and skips bdev_strategy() if the hook returns non-zero.
 */
extern int		(*mdv_strategy_tstpnt)(buf_t *, int, void*);
extern dev_info_t	*md_devinfo;
extern int		md_init_debug;
extern int		md_status;
extern md_ops_t		*md_opslist;
extern md_krwlock_t	nm_lock;

/* Forward declaration; the definition appears elsewhere in this file. */
static int 		update_locatorblock(mddb_set_t *s, md_dev64_t dev,
				ddi_devid_t didptr, ddi_devid_t old_didptr);
117 
/*
 * Defines for crc calculation for records.  Both macros are thin wrappers
 * around rec_crcfunc(), differing only in the mode argument they pass:
 *
 * rec_crcgen generates a crc checksum for a record block (REC_CRCGEN);
 *	the value returned by rec_crcfunc() is discarded.
 * rec_crcchk checks the crc checksum for a record block (REC_CRCCHK);
 *	the macro evaluates to the value returned by rec_crcfunc().
 */
#define	REC_CRCGEN	0
#define	REC_CRCCHK	1
#define	rec_crcgen(s, dep, rbp) \
	(void) rec_crcfunc(s, dep, rbp, REC_CRCGEN)
#define	rec_crcchk(s, dep, rbp) \
	rec_crcfunc(s, dep, rbp, REC_CRCCHK)
129 
130 /*
131  * During upgrade, SVM basically runs with the devt from the target
132  * being upgraded.  Translations are made from the target devt to the
133  * miniroot devt when writing data out to the disk.  This is done by
134  * the following routines:
135  *	wrtblklst
136  *	writeblks
137  *	readblklst
138  *	readblks
139  *	dt_read
140  *
141  * The following routines are used by the routines listed above and
142  * expect a translated (aka miniroot) devt:
143  *	getblks
144  * 	getmasters
145  *
146  * Also, when calling any system routines, such as ddi_lyr_get_devid,
147  * the translated (aka miniroot) devt must be used.
148  *
149  * By the same token, the major number and major name conversion operations
150  * need to use the name_to_major file from the target system instead
151  * of the name_to_major file on the miniroot.  So, calls to
152  * ddi_name_to_major must be replaced with calls to md_targ_name_to_major
153  * when running on an upgrade.  Same is true with calls to
154  * ddi_major_to_name.
155  */
156 
157 
158 #ifndef MDDB_FAKE
159 
160 static int
161 mddb_rwdata(
162 	mddb_set_t	*s,	/* incore db set structure */
163 	int		flag,	/* B_ASYNC, B_FAILFAST or 0 passed in here */
164 	buf_t		*bp
165 )
166 {
167 	int		err = 0;
168 
169 	bp->b_flags = (flag | B_BUSY) & (~B_ASYNC);
170 
171 	mutex_exit(SETMUTEX(s->s_setno));
172 	if (mdv_strategy_tstpnt == NULL ||
173 	    (*mdv_strategy_tstpnt)(bp, 0, NULL) == 0)
174 		(void) bdev_strategy(bp);
175 
176 	if (flag & B_ASYNC) {
177 		mutex_enter(SETMUTEX(s->s_setno));
178 		return (0);
179 	}
180 
181 	err = biowait(bp);
182 	mutex_enter(SETMUTEX(s->s_setno));
183 	return (err);
184 }
185 
186 static void
187 setidentifier(
188 	mddb_set_t	*s,
189 	identifier_t	*ident
190 )
191 {
192 	if (s->s_setno == MD_LOCAL_SET)
193 		(void) strcpy(&ident->serial[0], s->s_ident.serial);
194 	else
195 		ident->createtime = s->s_ident.createtime;
196 }
197 
198 static int
199 cmpidentifier(
200 	mddb_set_t	*s,
201 	identifier_t	*ident
202 )
203 {
204 	if (s->s_setno == MD_LOCAL_SET)
205 		return (strcmp(ident->serial, s->s_ident.serial));
206 	else
207 		return (timercmp(&ident->createtime,
208 		    /*CSTYLED*/
209 		    &s->s_ident.createtime, !=));
210 }
211 
212 static int
213 mddb_devopen(
214 	md_dev64_t	dev
215 )
216 {
217 	dev_t		ddi_dev = md_dev64_to_dev(dev);
218 
219 	if (dev_lopen(&ddi_dev, FREAD|FWRITE, OTYP_LYR, kcred) == 0)
220 		return (0);
221 	return (1);
222 }
223 
224 static void
225 mddb_devclose(
226 	md_dev64_t	dev
227 )
228 {
229 	(void) dev_lclose(md_dev64_to_dev(dev), FREAD|FWRITE, OTYP_LYR, kcred);
230 }
231 
232 /*
233  * stripe_skip_ts
234  *
235  * Returns a list of fields to be skipped in the stripe record structure.
236  * These fields are ms_timestamp in the component structure.
237  * Used to skip these fields when calculating the checksum.
238  */
239 static crc_skip_t *
240 stripe_skip_ts(void *un, uint_t revision)
241 {
242 	struct ms_row32_od	*small_mdr;
243 	struct ms_row		*big_mdr;
244 	uint_t			row, comp, ncomps, compoff;
245 	crc_skip_t		*skip;
246 	crc_skip_t		*skip_prev;
247 	crc_skip_t		skip_start = {0, 0, 0};
248 	ms_unit_t		*big_un;
249 	ms_unit32_od_t		*small_un;
250 	uint_t			rb_off = offsetof(mddb_rb32_t, rb_data[0]);
251 
252 	switch (revision) {
253 	case MDDB_REV_RB:
254 	case MDDB_REV_RBFN:
255 		small_un = (ms_unit32_od_t *)un;
256 		skip_prev = &skip_start;
257 
258 		if (small_un->un_nrows == 0)
259 			return (NULL);
260 		/*
261 		 * walk through all rows to find the total number
262 		 * of components
263 		 */
264 		small_mdr   = &small_un->un_row[0];
265 		ncomps = 0;
266 		for (row = 0; (row < small_un->un_nrows); row++) {
267 			ncomps += small_mdr[row].un_ncomp;
268 		}
269 
270 		/* Now walk through the components */
271 		compoff = small_un->un_ocomp + rb_off;
272 		for (comp = 0; (comp < ncomps); ++comp) {
273 			uint_t	mdcp = compoff +
274 			    (comp * sizeof (ms_comp32_od_t));
275 			skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
276 			    KM_SLEEP);
277 			skip->skip_offset = mdcp +
278 			    offsetof(ms_comp32_od_t, un_mirror.ms_timestamp);
279 			skip->skip_size = sizeof (md_timeval32_t);
280 			skip_prev->skip_next = skip;
281 			skip_prev = skip;
282 		}
283 		break;
284 	case MDDB_REV_RB64:
285 	case MDDB_REV_RB64FN:
286 		big_un = (ms_unit_t *)un;
287 		skip_prev = &skip_start;
288 
289 		if (big_un->un_nrows == 0)
290 			return (NULL);
291 		/*
292 		 * walk through all rows to find the total number
293 		 * of components
294 		 */
295 		big_mdr   = &big_un->un_row[0];
296 		ncomps = 0;
297 		for (row = 0; (row < big_un->un_nrows); row++) {
298 			ncomps += big_mdr[row].un_ncomp;
299 		}
300 
301 		/* Now walk through the components */
302 		compoff = big_un->un_ocomp + rb_off;
303 		for (comp = 0; (comp < ncomps); ++comp) {
304 			uint_t	mdcp = compoff +
305 			    (comp * sizeof (ms_comp_t));
306 			skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
307 			    KM_SLEEP);
308 			skip->skip_offset = mdcp +
309 			    offsetof(ms_comp_t, un_mirror.ms_timestamp);
310 			skip->skip_size = sizeof (md_timeval32_t);
311 			skip_prev->skip_next = skip;
312 			skip_prev = skip;
313 		}
314 		break;
315 	}
316 	/* Return the start of the list of fields to skip */
317 	return (skip_start.skip_next);
318 }
319 
320 /*
321  * mirror_skip_ts
322  *
323  * Returns a list of fields to be skipped in the mirror record structure.
324  * This includes un_last_read and sm_timestamp for each submirror
325  * Used to skip these fields when calculating the checksum.
326  */
327 static crc_skip_t *
328 mirror_skip_ts(uint_t revision)
329 {
330 	int		i;
331 	crc_skip_t	*skip;
332 	crc_skip_t	*skip_prev;
333 	crc_skip_t	skip_start = {0, 0, 0};
334 	uint_t		rb_off = offsetof(mddb_rb32_t, rb_data[0]);
335 
336 	skip_prev = &skip_start;
337 
338 	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
339 	switch (revision) {
340 	case MDDB_REV_RB:
341 	case MDDB_REV_RBFN:
342 		skip->skip_offset = offsetof(mm_unit32_od_t,
343 		    un_last_read) + rb_off;
344 		break;
345 	case MDDB_REV_RB64:
346 	case MDDB_REV_RB64FN:
347 		skip->skip_offset = offsetof(mm_unit_t,
348 		    un_last_read) + rb_off;
349 		break;
350 	}
351 	skip->skip_size = sizeof (int);
352 	skip_prev->skip_next = skip;
353 	skip_prev = skip;
354 
355 	for (i = 0; i < NMIRROR; i++) {
356 		skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
357 		switch (revision) {
358 		case MDDB_REV_RB:
359 		case MDDB_REV_RBFN:
360 			skip->skip_offset = offsetof(mm_unit32_od_t,
361 			    un_sm[i].sm_timestamp) + rb_off;
362 			break;
363 		case MDDB_REV_RB64:
364 		case MDDB_REV_RB64FN:
365 			skip->skip_offset = offsetof(mm_unit_t,
366 			    un_sm[i].sm_timestamp) + rb_off;
367 			break;
368 		}
369 		skip->skip_size = sizeof (md_timeval32_t);
370 		skip_prev->skip_next = skip;
371 		skip_prev = skip;
372 	}
373 	/* Return the start of the list of fields to skip */
374 	return (skip_start.skip_next);
375 }
376 
377 /*
378  * hotspare_skip_ts
379  *
380  * Returns a list of the timestamp fields in the hotspare record structure.
381  * Used to skip these fields when calculating the checksum.
382  */
383 static crc_skip_t *
384 hotspare_skip_ts(uint_t revision)
385 {
386 	crc_skip_t	*skip;
387 	uint_t		rb_off = offsetof(mddb_rb32_t, rb_data[0]);
388 
389 	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
390 	switch (revision) {
391 	case MDDB_REV_RB:
392 	case MDDB_REV_RBFN:
393 		skip->skip_offset = offsetof(hot_spare32_od_t, hs_timestamp) +
394 		    rb_off;
395 		break;
396 	case MDDB_REV_RB64:
397 	case MDDB_REV_RB64FN:
398 		skip->skip_offset = offsetof(hot_spare_t, hs_timestamp) +
399 		    rb_off;
400 		break;
401 	}
402 	skip->skip_size = sizeof (md_timeval32_t);
403 	return (skip);
404 }
405 
406 /*
407  * rec_crcfunc
408  *
409  * Calculate or check the checksum for a record
410  * Calculate the crc if check == 0, Check the crc if check == 1
411  *
412  * Record block may be written by different nodes in a multi-owner diskset
413  * (in case of master change), the function rec_crcchk excludes timestamp
414  * fields in crc computation of record data.
415  * Otherwise, timestamp fields will cause each node to have a different
416  * checksum for same record block causing the exclusive-or of all record block
417  * checksums and data block record sums to be non-zero after new master writes
418  * at least one record block.
419  */
420 static uint_t
421 rec_crcfunc(
422 	mddb_set_t	*s,
423 	mddb_de_ic_t	*dep,
424 	mddb_rb32_t	*rbp,
425 	int		check
426 )
427 {
428 	crc_skip_t	*skip;
429 	crc_skip_t	*skip_tail;
430 	mddb_type_t	type = dep->de_type1;
431 	uint_t		ret;
432 
433 	/*
434 	 * Generate a list of the areas to be skipped when calculating
435 	 * the checksum.
436 	 * First skip rb_checksum, rb_private and rb_userdata.
437 	 */
438 	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
439 	skip->skip_offset = offsetof(mddb_rb32_t, rb_checksum_fiddle);
440 	skip->skip_size = 3 * sizeof (uint_t);
441 	skip_tail = skip;
442 	if (MD_MNSET_SETNO(s->s_setno)) {
443 		/* For a MN set, skip rb_timestamp */
444 		skip_tail = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
445 		    KM_SLEEP);
446 		skip_tail->skip_offset = offsetof(mddb_rb32_t, rb_timestamp);
447 		skip_tail->skip_size = sizeof (md_timeval32_t);
448 		skip->skip_next = skip_tail;
449 
450 		/* Now add a list of timestamps to be skipped */
451 		if (type >= MDDB_FIRST_MODID) {
452 			switch (dep->de_flags) {
453 				case MDDB_F_STRIPE:
454 					skip_tail->skip_next =
455 					    stripe_skip_ts((void *)rbp->rb_data,
456 					    rbp->rb_revision);
457 					break;
458 				case MDDB_F_MIRROR:
459 					skip_tail->skip_next =
460 					    mirror_skip_ts(rbp->rb_revision);
461 					break;
462 				case MDDB_F_HOTSPARE:
463 					skip_tail->skip_next =
464 					    hotspare_skip_ts(rbp->rb_revision);
465 					break;
466 				default:
467 					break;
468 			}
469 		}
470 	}
471 
472 	if (check) {
473 		ret = crcchk(rbp, &rbp->rb_checksum, dep->de_recsize, skip);
474 	} else {
475 		crcgen(rbp, &rbp->rb_checksum, dep->de_recsize, skip);
476 		ret = rbp->rb_checksum;
477 	}
478 	while (skip) {
479 		crc_skip_t	*skip_save = skip;
480 
481 		skip = skip->skip_next;
482 		kmem_free(skip_save, sizeof (crc_skip_t));
483 	}
484 	return (ret);
485 }
486 
487 static mddb_bf_t *
488 allocbuffer(
489 	mddb_set_t	*s,
490 	int		sleepflag
491 )
492 {
493 	mddb_bf_t	*bfp;
494 
495 	while ((bfp = s->s_freebufhead) == NULL) {
496 		if (sleepflag == MDDB_NOSLEEP)
497 			return ((mddb_bf_t *)NULL);
498 		++s->s_bufmisses;
499 #ifdef	DEBUG
500 		if (s->s_bufmisses == 1)
501 			cmn_err(CE_NOTE,
502 			    "md: mddb: set %u sleeping for buffer", s->s_setno);
503 #endif
504 		s->s_bufwakeup = 1;
505 		cv_wait(&s->s_buf_cv, SETMUTEX(s->s_setno));
506 	}
507 	s->s_freebufhead = bfp->bf_next;
508 	bzero((caddr_t)bfp, sizeof (*bfp));
509 	bfp->bf_buf.b_back = bfp->bf_buf.b_forw = &bfp->bf_buf;
510 	bfp->bf_buf.b_flags = B_BUSY;	/* initialize flags */
511 	return (bfp);
512 }
513 
514 static void
515 freebuffer(
516 	mddb_set_t		*s,
517 	mddb_bf_t	*bfp
518 )
519 {
520 	bfp->bf_next = s->s_freebufhead;
521 	s->s_freebufhead = bfp;
522 	if (s->s_bufwakeup) {
523 		cv_broadcast(&s->s_buf_cv);
524 		s->s_bufwakeup = 0;
525 	}
526 }
527 
528 
529 static void
530 blkbusy(
531 	mddb_set_t	*s,
532 	mddb_block_t	blk
533 )
534 {
535 	int		bit, byte;
536 
537 	s->s_freeblkcnt--;
538 	byte = blk / 8;
539 	bit = 1 << (blk & 7);
540 	ASSERT(! (s->s_freebitmap[byte] & bit));
541 	s->s_freebitmap[byte] |= bit;
542 }
543 
544 static void
545 blkfree(
546 	mddb_set_t	*s,
547 	mddb_block_t	blk
548 )
549 {
550 	int		bit, byte;
551 
552 	s->s_freeblkcnt++;
553 	byte = blk / 8;
554 	bit = 1 << (blk & 7);
555 	ASSERT(s->s_freebitmap[byte] & bit);
556 	s->s_freebitmap[byte] &= ~bit;
557 }
558 
559 static int
560 blkcheck(
561 	mddb_set_t	*s,
562 	mddb_block_t	blk
563 )
564 {
565 	int		bit, byte;
566 
567 	byte = blk / 8;
568 	bit = 1 << (blk & 7);
569 	return (s->s_freebitmap[byte] & bit);
570 }
571 
572 /*
573  * not fast but simple
574  */
575 static mddb_block_t
576 getfreeblks(
577 	mddb_set_t	*s,
578 	size_t		count
579 )
580 {
581 	int		i;
582 	size_t		contig;
583 
584 	contig = 0;
585 	for (i = 0; i < s->s_totalblkcnt; i++) {
586 		if (blkcheck(s, i)) {
587 			contig = 0;
588 		} else {
589 			contig++;
590 			if (contig == count) {
591 				contig = i - count + 1;
592 				for (i = (int)contig; i < contig + count; i++)
593 					blkbusy(s, i);
594 				return ((mddb_block_t)contig);
595 			}
596 		}
597 	}
598 	return (0);
599 }
600 
/*
 * computefreeblks
 *
 * Rebuild the block accounting of set s from the incore structures:
 *	- find the highest logical block number in use by any directory
 *	  block or directory entry,
 *	- flag every locator whose replica is not larger than that block
 *	  number as MDDB_F_TOOSMALL (and clear MDDB_F_ACTIVE on it),
 *	- set s_totalblkcnt and s_freeblkcnt to the size of the smallest
 *	  usable replica, capped at the maximum for the set type,
 *	- allocate s_freebitmap on first use, clear it, and mark every
 *	  block known to be in use as busy.
 *
 * Each blkbusy() call also decrements s_freeblkcnt, so on return
 * s_freeblkcnt is the total minus the number of blocks marked here.
 */
static void
computefreeblks(
	mddb_set_t	*s
)
{
	mddb_db_t	*dbp;
	mddb_de_ic_t	*dep;
	int		i;
	int		minblks;
	int		freeblks;
	mddb_mb_ic_t	*mbip;
	mddb_lb_t	*lbp;
	mddb_block_t	maxblk;
	mddb_did_db_t	*did_dbp;
	int		nblks;

	minblks = 0;
	lbp = s->s_lbp;
	maxblk = 0;

	/*
	 * Determine the max number of blocks.
	 */
	nblks = (lbp->lb_flags & MDDB_MNSET) ? MDDB_MN_MAXBLKS : MDDB_MAXBLKS;
	/*
	 * go through and find highest logical block
	 */
	for (dbp = s->s_dbp; dbp != 0;	dbp = dbp->db_next) {
		if (dbp->db_blknum > maxblk)
			maxblk = dbp->db_blknum;
		for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
			for (i = 0; i < dep->de_blkcount; i++)
				if (dep->de_blks[i] > maxblk)
					maxblk = dep->de_blks[i];
	}

	/*
	 * Size up each replica.  Despite its name, freeblks holds the sum
	 * of mb_blkcnt over the master block chain of locator i, i.e. the
	 * total number of blocks in that replica.
	 */
	for (i = 0; i < lbp->lb_loccnt; i++) {
		mddb_locator_t	*lp = &lbp->lb_locators[i];

		/* deleted locators and ones flagged EMASTER are ignored */
		if ((lp->l_flags & MDDB_F_DELETED) ||
		    (lp->l_flags & MDDB_F_EMASTER))
			continue;

		freeblks = 0;
		for (mbip = s->s_mbiarray[i]; mbip != NULL;
					mbip = mbip->mbi_next) {
			freeblks += mbip->mbi_mddb_mb.mb_blkcnt;
		}
		if (freeblks == 0)	/* this happens when there is no */
			continue;	/*	master blk		*/

		/* replica cannot hold the highest block already in use */
		if (freeblks <= maxblk) {
			lp->l_flags |= MDDB_F_TOOSMALL;
			lp->l_flags &= ~MDDB_F_ACTIVE;
		}

		/* track the smallest replica; 0 means "none seen yet" */
		if (freeblks < minblks || minblks == 0)
			minblks = freeblks;
	}
	/*
	 * set up reasonable freespace if no
	 * data bases exist
	 */
	if (minblks == 0)
		minblks = 100;
	if (minblks > nblks)
		minblks = nblks;
	s->s_freeblkcnt = minblks;
	s->s_totalblkcnt = minblks;
	/*
	 * The bitmap is allocated only once, sized for nblks blocks (one bit
	 * per block), and is cleared on every call.
	 */
	if (! s->s_freebitmapsize) {
		s->s_freebitmapsize = nblks / 8;
		s->s_freebitmap = (uchar_t *)kmem_zalloc(s->s_freebitmapsize,
		    KM_SLEEP);
	}
	bzero((caddr_t)s->s_freebitmap, s->s_freebitmapsize);

	/* locator block sectors */
	for (i = 0; i < s->s_lbp->lb_blkcnt; i++)
		blkbusy(s, i);

	/* locator name sectors */
	for (i = 0; i < s->s_lbp->lb_lnblkcnt; i++)
		blkbusy(s, (s->s_lbp->lb_lnfirstblk + i));

	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
		/* locator block device id information */
		for (i = 0; i < s->s_lbp->lb_didblkcnt; i++)
			blkbusy(s, (s->s_lbp->lb_didfirstblk + i));

		/* disk blocks containing actual device ids */
		did_dbp = s->s_did_icp->did_ic_dbp;
		while (did_dbp) {
			for (i = 0; i < did_dbp->db_blkcnt; i++) {
				blkbusy(s, did_dbp->db_firstblk + i);
			}
			did_dbp = did_dbp->db_next;
		}
	}

	/* Only use data tags if not a MN set */
	if (!(lbp->lb_flags & MDDB_MNSET)) {
		/* Found a bad tag, do NOT mark the data tag blks busy here */
		if (! (md_get_setstatus(s->s_setno) & MD_SET_BADTAG)) {
			for (i = 0; i < s->s_lbp->lb_dtblkcnt; i++)
				blkbusy(s, (s->s_lbp->lb_dtfirstblk + i));
		}
	}

	/* directory block/entry sectors */
	for (dbp = s->s_dbp; dbp != 0;	dbp = dbp->db_next) {
		blkbusy(s, dbp->db_blknum);
		for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
			for (i = 0; i < dep->de_blkcount; i++)
				blkbusy(s, dep->de_blks[i]);
	}
}
717 
718 /*
719  * Add free space to the device id incore free list.
720  * Called:
721  *    - During startup when all devid blocks are temporarily placed on the
722  *       free list
723  *    - After a devid has been deleted via the metadb command.
724  *    - When mddb_devid_free_get adds unused space from a disk block
725  *       to free list
726  */
727 static int
728 mddb_devid_free_add(
729 	mddb_set_t *s,
730 	uint_t firstblk,
731 	uint_t offset,
732 	uint_t length
733 )
734 {
735 	mddb_did_free_t	*did_freep;
736 
737 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
738 		return (0);
739 	}
740 
741 	did_freep = (mddb_did_free_t *)kmem_zalloc(sizeof (mddb_did_free_t),
742 	    KM_SLEEP);
743 	did_freep->free_blk = firstblk;
744 	did_freep->free_offset = offset;
745 	did_freep->free_length = length;
746 	did_freep->free_next = s->s_did_icp->did_ic_freep;
747 	s->s_did_icp->did_ic_freep = did_freep;
748 
749 	return (0);
750 }
751 
752 /*
753  * Remove specific free space from the device id incore free list.
754  * Called at startup (after all devid blocks have been placed on
755  * free list) in order to remove the free space from the list that
756  * contains actual devids.
757  * Returns 0 if area successfully removed.
758  * Returns 1 if no matching area is found - so nothing removed.
759  */
760 static int
761 mddb_devid_free_delete(
762 	mddb_set_t *s,
763 	uint_t firstblk,
764 	uint_t offset,
765 	uint_t length
766 )
767 {
768 	int		block_found = 0;
769 	mddb_did_free_t	*did_freep1;		/* next free block */
770 	mddb_did_free_t	*did_freep2 = 0;	/* previous free block */
771 	mddb_did_free_t *did_freep_before;	/* area before offset, len */
772 	mddb_did_free_t	*did_freep_after;	/* area after offset, len */
773 	uint_t		old_length;
774 
775 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
776 		return (1);
777 	}
778 
779 	/* find free block for this devid */
780 	did_freep1 = s->s_did_icp->did_ic_freep;
781 	while (did_freep1) {
782 		/*
783 		 * Look through free list of <block, offset, length> to
784 		 * find our entry in the free list.  Our entry should
785 		 * exist since the entire devid block was placed into
786 		 * this free list at startup.  This code is just removing
787 		 * the non-free (in-use) portions of the devid block so
788 		 * that the remaining linked list does indeed just
789 		 * contain a free list.
790 		 *
791 		 * Our entry has been found if
792 		 *   - the blocks match,
793 		 *   - the offset (starting address) in the free list is
794 		 *	less than the offset of our entry and
795 		 *   - the length+offset (ending address) in the free list is
796 		 *	greater than the length+offset of our entry.
797 		 */
798 		if ((did_freep1->free_blk == firstblk) &&
799 		    (did_freep1->free_offset <= offset) &&
800 		    ((did_freep1->free_length + did_freep1->free_offset) >=
801 			(length + offset))) {
802 			/* Have found our entry - remove from list */
803 			block_found = 1;
804 			did_freep_before = did_freep1;
805 			old_length = did_freep1->free_length;
806 			/* did_freep1 - pts to next free block */
807 			did_freep1 = did_freep1->free_next;
808 			if (did_freep2) {
809 				did_freep2->free_next = did_freep1;
810 			} else {
811 				s->s_did_icp->did_ic_freep = did_freep1;
812 			}
813 
814 			/*
815 			 * did_freep_before points to area in block before
816 			 * offset, length.
817 			 */
818 			did_freep_before->free_length = offset -
819 				did_freep_before->free_offset;
820 			/*
821 			 * did_freep_after points to area in block after
822 			 * offset, length.
823 			 */
824 			did_freep_after = (mddb_did_free_t *)kmem_zalloc
825 					(sizeof (mddb_did_free_t), KM_SLEEP);
826 			did_freep_after->free_blk = did_freep_before->free_blk;
827 			did_freep_after->free_offset = offset + length;
828 			did_freep_after->free_length = old_length - length -
829 				did_freep_before->free_length;
830 			/*
831 			 * Add before and after areas to free list
832 			 * If area before or after offset, length has length
833 			 * of 0, that entry is not added.
834 			 */
835 			if (did_freep_after->free_length) {
836 				did_freep_after->free_next = did_freep1;
837 				if (did_freep2) {
838 				    did_freep2->free_next = did_freep_after;
839 				} else {
840 				    s->s_did_icp->did_ic_freep =
841 					did_freep_after;
842 				}
843 				did_freep1 = did_freep_after;
844 			} else {
845 				kmem_free(did_freep_after,
846 					sizeof (mddb_did_free_t));
847 			}
848 
849 			if (did_freep_before->free_length) {
850 				did_freep_before->free_next = did_freep1;
851 				if (did_freep2) {
852 				    did_freep2->free_next = did_freep_before;
853 				} else {
854 				    s->s_did_icp->did_ic_freep =
855 					did_freep_before;
856 				}
857 			} else {
858 				kmem_free(did_freep_before,
859 					sizeof (mddb_did_free_t));
860 			}
861 			break;
862 		} else {
863 			did_freep2 = did_freep1;
864 			did_freep1 = did_freep1->free_next;
865 		}
866 	}
867 	if (block_found == 0) {
868 		return (1);
869 	} else {
870 		return (0);
871 	}
872 }
873 
874 /*
875  * Find free space of devid length and remove free space from list.
876  * Return a pointer to the previously free area.
877  *
878  * If there's not enough free space on the free list, get an empty
879  * disk block, put the empty disk block on the did_ic_dbp linked list,
880  * and add the disk block space not used for devid to the free list.
881  *
882  * Return pointer to address (inside disk block) of free area for devid.
883  * Return 0 if error.
884  */
885 static caddr_t
886 mddb_devid_free_get(
887 	mddb_set_t *s,
888 	uint_t len,
889 	uint_t *blk,
890 	uint_t *cnt,
891 	uint_t *offset
892 )
893 {
894 	mddb_did_free_t	*freep, *freep2;
895 	mddb_did_db_t	*dbp;
896 	uint_t		blk_cnt, blk_num;
897 	ddi_devid_t	devid_ptr = NULL;
898 
899 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
900 		return (0);
901 	}
902 
903 	freep = s->s_did_icp->did_ic_freep;
904 	freep2 = (mddb_did_free_t *)NULL;
905 	while (freep) {
906 		/* found a free area - remove from free list */
907 		if (len <= freep->free_length) {
908 			*blk = freep->free_blk;
909 			*offset = freep->free_offset;
910 			/* find disk block pointer that contains free area */
911 			dbp = s->s_did_icp->did_ic_dbp;
912 			while (dbp) {
913 				if (dbp->db_firstblk == *blk)
914 					break;
915 				else
916 					dbp = dbp->db_next;
917 			}
918 			/*
919 			 * If a disk block pointer can't be found - something
920 			 * is wrong, so don't use this free space.
921 			 */
922 			if (dbp == NULL) {
923 				freep2 = freep;
924 				freep = freep->free_next;
925 				continue;
926 			}
927 
928 			devid_ptr = (ddi_devid_t)(dbp->db_ptr + *offset);
929 			*cnt = dbp->db_blkcnt;
930 
931 			/* Update free list information */
932 			freep->free_offset += len;
933 			freep->free_length -= len;
934 			if (freep->free_length == 0) {
935 				if (freep2) {
936 					freep2->free_next =
937 					freep->free_next;
938 				} else {
939 					s->s_did_icp->did_ic_freep =
940 					freep->free_next;
941 				}
942 				kmem_free(freep, sizeof (mddb_did_free_t));
943 			}
944 			break;
945 		}
946 		freep2 = freep;
947 		freep = freep->free_next;
948 	}
949 
950 	/* Didn't find a free spot */
951 	if (freep == NULL) {
952 		/* get free logical disk blk in replica */
953 		blk_cnt = btodb(len + (MDDB_BSIZE - 1));
954 		blk_num = getfreeblks(s, blk_cnt);
955 		if (blk_num == 0)
956 			return (0);
957 
958 		/* Add disk block to disk block linked list */
959 		dbp = kmem_zalloc(sizeof (mddb_did_db_t), KM_SLEEP);
960 		dbp->db_firstblk = blk_num;
961 		dbp->db_blkcnt = blk_cnt;
962 		dbp->db_ptr = (caddr_t)kmem_zalloc(dbtob(blk_cnt), KM_SLEEP);
963 		dbp->db_next = s->s_did_icp->did_ic_dbp;
964 		s->s_did_icp->did_ic_dbp = dbp;
965 		devid_ptr = (ddi_devid_t)dbp->db_ptr;
966 
967 		/* Update return values */
968 		*blk = blk_num;
969 		*offset = 0;
970 		*cnt = blk_cnt;
971 
972 		/* Add unused part of block to free list */
973 		(void) mddb_devid_free_add(s, blk_num,
974 			len, (dbtob(blk_cnt) - len));
975 	}
976 
977 	return ((caddr_t)devid_ptr);
978 }
979 
980 /*
981  * Add device id information for locator index to device id area in set.
982  * Get free area to store device id from free list.   Update checksum
983  * for mddb_did_blk.
984  *
985  * This routine does not write any data out to disk.
986  * After this routine has been called, the routine, writelocall, should
987  * be called to write both the locator block and device id area out
988  * to disk.
989  */
990 static int
991 mddb_devid_add(
992 	mddb_set_t	*s,
993 	uint_t		index,
994 	ddi_devid_t	devid,
995 	char		*minor_name
996 )
997 {
998 	uint_t		devid_len;
999 	uint_t		blk, offset;
1000 	ddi_devid_t	devid_ptr;
1001 	mddb_did_info_t	*did_info;
1002 	uint_t		blkcnt, i;
1003 	mddb_did_blk_t	*did_blk;
1004 
1005 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
1006 		return (1);
1007 	}
1008 	if (strlen(minor_name) > (MDDB_MINOR_NAME_MAX - 1))
1009 		return (1);
1010 
1011 	/* Check if device id has already been added */
1012 	did_blk = s->s_did_icp->did_ic_blkp;
1013 	did_info = &(did_blk->blk_info[index]);
1014 	if (did_info->info_flags & MDDB_DID_EXISTS)
1015 		return (0);
1016 
1017 	devid_len = ddi_devid_sizeof(devid);
1018 	devid_ptr = (ddi_devid_t)
1019 			mddb_devid_free_get(s, devid_len, &blk, &blkcnt,
1020 				&offset);
1021 	if (devid_ptr == NULL) {
1022 		return (1);
1023 	}
1024 
1025 	/* Copy devid into devid free area */
1026 	for (i = 0; i < devid_len; i++)
1027 		((char *)devid_ptr)[i] = ((char *)devid)[i];
1028 
1029 	/* Update mddb_did_info area for new device id */
1030 	did_info->info_flags = MDDB_DID_EXISTS | MDDB_DID_VALID;
1031 
1032 	/*
1033 	 * Only set UPDATED flag for non-replicated import cases.
1034 	 * This allows the side locator driver name index to get
1035 	 * updated in load_old_replicas.
1036 	 */
1037 	if (!(md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT))
1038 		did_info->info_flags |= MDDB_DID_UPDATED;
1039 
1040 	did_info->info_firstblk = blk;
1041 	did_info->info_blkcnt = blkcnt;
1042 	did_info->info_offset = offset;
1043 	did_info->info_length = devid_len;
1044 	(void) strcpy(did_info->info_minor_name, minor_name);
1045 	crcgen(devid_ptr, &did_info->info_checksum, devid_len, NULL);
1046 
1047 	/* Add device id pointer to did_ic_devid array */
1048 	s->s_did_icp->did_ic_devid[index] = devid_ptr;
1049 
1050 	return (0);
1051 }
1052 
1053 
1054 /*
1055  * Delete device id information for locator index from device id area in set.
1056  * Add device id space to free area.
1057  *
1058  * This routine does not write any data out to disk.
1059  * After this routine has been called, the routine, writelocall, should
1060  * be called to write both the locator block and device id area out
1061  * to disk.
1062  */
1063 static int
1064 mddb_devid_delete(mddb_set_t *s, uint_t index)
1065 {
1066 	mddb_did_info_t	*did_info;
1067 	mddb_did_blk_t	*did_blk;
1068 
1069 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
1070 		return (1);
1071 	}
1072 
1073 	/* Get device id information from mddb_did_blk */
1074 	did_blk = s->s_did_icp->did_ic_blkp;
1075 	did_info = &(did_blk->blk_info[index]);
1076 
1077 	/*
1078 	 * Ensure that the underlying device supports device ids
1079 	 * before arbitrarily removing them.
1080 	 */
1081 	if (!(did_info->info_flags & MDDB_DID_EXISTS)) {
1082 		return (1);
1083 	}
1084 
1085 	/* Remove device id information from mddb_did_blk */
1086 	did_info->info_flags = 0;
1087 
1088 	/* Remove device id from incore area */
1089 	s->s_did_icp->did_ic_devid[index] = (ddi_devid_t)NULL;
1090 
1091 	/* Add new free space in disk block to free list */
1092 	(void) mddb_devid_free_add(s, did_info->info_firstblk,
1093 		did_info->info_offset, did_info->info_length);
1094 
1095 	return (0);
1096 }
1097 
1098 /*
1099  * Check if there is a device id for a locator index.
1100  *
1101  * Caller of this routine should not free devid or minor_name since
1102  * these will point to internal data structures that should not
1103  * be freed.
1104  */
1105 static int
1106 mddb_devid_get(
1107 	mddb_set_t *s,
1108 	uint_t index,
1109 	ddi_devid_t *devid,
1110 	char **minor_name
1111 )
1112 {
1113 	mddb_did_info_t	*did_info;
1114 
1115 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
1116 		return (0);
1117 	}
1118 	did_info = &(s->s_did_icp->did_ic_blkp->blk_info[index]);
1119 
1120 	if (did_info->info_flags & MDDB_DID_EXISTS) {
1121 		*devid = s->s_did_icp->did_ic_devid[index];
1122 		*minor_name =
1123 		    s->s_did_icp->did_ic_blkp->blk_info[index].info_minor_name;
1124 		return (1);
1125 	} else
1126 		return (0);
1127 
1128 
1129 }
1130 
1131 /*
1132  * Check if device id is valid on current system.
1133  * Needs devid, previously known dev_t and current minor_name.
1134  *
1135  * Success:
1136  * 	Returns 0 if valid device id is found and updates
1137  * 	dev_t if the dev_t associated with the device id is
1138  *	different than dev_t.
1139  * Failure:
1140  * 	Returns 1 if device id not valid on current system.
1141  */
1142 static int
1143 mddb_devid_validate(ddi_devid_t devid, md_dev64_t *dev, char *minor_name)
1144 {
1145 	int		retndevs;
1146 	dev_t		*ddi_devs;
1147 	int		devid_flag = 0;
1148 	int 		cnt;
1149 
1150 	if (dev == 0)
1151 		return (1);
1152 	/*
1153 	 * See if devid is valid in the current system.
1154 	 * If so, set dev to match the devid.
1155 	 */
1156 	if (ddi_lyr_devid_to_devlist(devid, minor_name,
1157 	    &retndevs, &ddi_devs) == DDI_SUCCESS) {
1158 		if (retndevs > 0) {
1159 			/* devid is valid to use */
1160 			devid_flag = 1;
1161 			/* does dev_t in list match dev */
1162 			cnt = 0;
1163 			while (cnt < retndevs) {
1164 				if (*dev == md_expldev(ddi_devs[cnt]))
1165 					break;
1166 				cnt++;
1167 			}
1168 			/*
1169 			 * If a different dev_t, then setup
1170 			 * new dev and new major name
1171 			 */
1172 			if (cnt == retndevs) {
1173 				*dev = md_expldev(ddi_devs[0]);
1174 			}
1175 			ddi_lyr_free_devlist(ddi_devs, retndevs);
1176 		}
1177 	}
1178 	if (devid_flag)
1179 		return (0);
1180 	else
1181 		return (1);
1182 }
1183 
1184 
1185 /*
1186  * Free the devid incore data areas
1187  */
1188 static void
1189 mddb_devid_icp_free(mddb_did_ic_t **did_icp, mddb_lb_t *lbp)
1190 {
1191 	mddb_did_free_t	*did_freep1, *did_freep2;
1192 	mddb_did_db_t	*did_dbp1, *did_dbp2;
1193 	mddb_did_ic_t	*icp = *did_icp;
1194 
1195 	if (icp) {
1196 		if (icp->did_ic_blkp) {
1197 			kmem_free((caddr_t)icp->did_ic_blkp,
1198 			    dbtob(lbp->lb_didblkcnt));
1199 			icp->did_ic_blkp = (mddb_did_blk_t *)NULL;
1200 		}
1201 
1202 		if (icp->did_ic_dbp) {
1203 			did_dbp1 = icp->did_ic_dbp;
1204 			while (did_dbp1) {
1205 				did_dbp2 = did_dbp1->db_next;
1206 				kmem_free((caddr_t)did_dbp1->db_ptr,
1207 				    dbtob(did_dbp1->db_blkcnt));
1208 				kmem_free((caddr_t)did_dbp1,
1209 				    sizeof (mddb_did_db_t));
1210 				did_dbp1 = did_dbp2;
1211 			}
1212 		}
1213 
1214 		if (icp->did_ic_freep) {
1215 			did_freep1 = icp->did_ic_freep;
1216 			while (did_freep1) {
1217 				did_freep2 = did_freep1->free_next;
1218 				kmem_free((caddr_t)did_freep1,
1219 				    sizeof (mddb_did_free_t));
1220 				did_freep1 = did_freep2;
1221 			}
1222 		}
1223 
1224 		kmem_free((caddr_t)icp, sizeof (mddb_did_ic_t));
1225 		*did_icp = (mddb_did_ic_t *)NULL;
1226 	}
1227 
1228 }
1229 
1230 static daddr_t
1231 getphysblk(
1232 	mddb_block_t		blk,
1233 	mddb_mb_ic_t		*mbip
1234 )
1235 {
1236 	mddb_mb_t	*mbp = &(mbip->mbi_mddb_mb);
1237 
1238 	while (blk >= mbp->mb_blkcnt) {
1239 		if (! mbip->mbi_next)
1240 			return ((daddr_t)-1);	/* no such block */
1241 		blk -= mbp->mb_blkcnt;
1242 		mbip = mbip->mbi_next;
1243 		mbp = &(mbip->mbi_mddb_mb);
1244 	}
1245 
1246 	if (blk >= mbp->mb_blkmap.m_consecutive)
1247 		return ((daddr_t)-1);	/* no such block */
1248 
1249 	return ((daddr_t)(mbp->mb_blkmap.m_firstblk + blk));
1250 }
1251 
1252 /*
1253  * when a buf header is passed in the new buffer must be
1254  * put on the front of the chain. writerec counts on it
1255  */
1256 static int
1257 putblks(
1258 	mddb_set_t	*s,		/* incore db set structure */
1259 	caddr_t		buffer,		/* adr of buffer to be written */
1260 	daddr_t		blk,		/* block number for first block */
1261 	int		cnt,		/* number of blocks to be written */
1262 	md_dev64_t	device,		/* device to be written to */
1263 	mddb_bf_t	**bufhead	/* if non-zero then ASYNC I/O */
1264 					/*    and put buf address here */
1265 )
1266 {
1267 	buf_t		*bp;
1268 	mddb_bf_t	*bfp;
1269 	int		err = 0;
1270 
1271 	bfp = allocbuffer(s, MDDB_SLEEPOK);
1272 	bp = &bfp->bf_buf;
1273 	bp->b_bcount = MDDB_BSIZE * cnt;
1274 	bp->b_un.b_addr = buffer;
1275 	bp->b_blkno = blk;
1276 	bp->b_edev = md_dev64_to_dev(device);
1277 	/*
1278 	 * if a header for a buf chain is passed in this is async io.
1279 	 * currently only done for optimize  records
1280 	 */
1281 	if (bufhead) {
1282 		bfp->bf_next = *bufhead;
1283 		*bufhead = bfp;
1284 		(void) mddb_rwdata(s, B_WRITE|B_ASYNC, bp);
1285 		return (0);
1286 	}
1287 	err = mddb_rwdata(s, B_WRITE, bp);
1288 	freebuffer(s, bfp);
1289 	if (err) {
1290 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
1291 		    s->s_setno, device);
1292 		return (MDDB_F_EWRITE);
1293 	}
1294 	return (0);
1295 }
1296 
1297 /*
1298  * wrtblklst - takes an array of logical block numbers
1299  *		and writes the buffer to those blocks (scatter).
1300  * If called during upgrade, this routine expects a
1301  * non-translated (aka target) dev.
1302  */
1303 static int
1304 wrtblklst(
1305 	mddb_set_t	*s,		/* incore set structure */
1306 	caddr_t		buffer,		/* buffer to be written (record blk) */
1307 	mddb_block_t	blka[],		/* list of logical blks for record */
1308 	daddr_t		cnt,		/* number of logical blks */
1309 	const int	li,		/* locator index */
1310 	mddb_bf_t	**bufhead,	/* if non-zero then ASYNC I/O */
1311 					/*    and put buf address here */
1312 	int		master_only	/* allow only master node to write */
1313 )
1314 {
1315 	daddr_t		blk;
1316 	daddr_t		blk1;
1317 	int		err = 0;
1318 	int		cons;
1319 	mddb_lb_t	*lbp = s->s_lbp;
1320 	mddb_locator_t	*lp = &lbp->lb_locators[li];
1321 	md_dev64_t	dev;
1322 	mddb_mb_ic_t	*mbip = s->s_mbiarray[li];
1323 
1324 	/*
1325 	 * If a MN diskset and only the master can write,
1326 	 * then a non-master node will just return success.
1327 	 */
1328 	if ((lbp->lb_flags & MDDB_MNSET) &&
1329 	    (master_only == MDDB_WR_ONLY_MASTER)) {
1330 
1331 		/* return successfully if we aren't the master */
1332 		if (!(md_set[s->s_setno].s_am_i_master)) {
1333 			return (0);
1334 		}
1335 	}
1336 
1337 	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
1338 	if (dev == NODEV64) {
1339 		return (1);
1340 	}
1341 
1342 	blk = getphysblk(blka[0], mbip);
1343 	ASSERT(blk >= 0);
1344 
1345 	cons = 1;
1346 	while (cnt) {
1347 		if (cons != cnt) {
1348 			blk1 = getphysblk(blka[cons], mbip);
1349 			ASSERT(blk1 >= 0);
1350 			if ((blk + cons) == blk1) {
1351 				cons++;
1352 				continue;
1353 			}
1354 		}
1355 		if (err = putblks(s, buffer, blk, cons, dev, bufhead)) {
1356 			/*
1357 			 * If an MN diskset and any_node_can_write
1358 			 * then this request is coming from writeoptrecord
1359 			 * and l_flags field should not be updated.
1360 			 * l_flags will be updated as a result of sending
1361 			 * a class1 message to the master.  Setting l_flags
1362 			 * here will cause slave to be out of sync with
1363 			 * master.
1364 			 *
1365 			 * Otherwise, set the error in l_flags
1366 			 * (this occurs if this is not a MN diskset or
1367 			 * only_master_can_write is set).
1368 			 */
1369 			if ((!(lbp->lb_flags & MDDB_MNSET)) ||
1370 			    (master_only == MDDB_WR_ONLY_MASTER)) {
1371 				lp->l_flags |= MDDB_F_EWRITE;
1372 			}
1373 			return (err);
1374 		}
1375 		if (bufhead)
1376 			(*bufhead)->bf_locator = lp;
1377 
1378 		buffer += MDDB_BSIZE * cons;
1379 		cnt -= cons;
1380 		blka += cons;
1381 		if (cnt) {
1382 			blk = getphysblk(blka[0], mbip);
1383 			ASSERT(blk >= 0);
1384 		}
1385 		cons = 1;
1386 	}
1387 
1388 	return (0);
1389 }
1390 
1391 /*
1392  * writeblks - takes a logical block number/block count pair
1393  * 		and writes the buffer to those contiguous logical blocks.
1394  * If called during upgrade, this routine expects a non-translated
1395  * (aka target) dev.
1396  */
1397 static int
1398 writeblks(
1399 	mddb_set_t	*s,		/* incore set structure */
1400 	caddr_t		buffer,		/* buffer to be written */
1401 	mddb_block_t	blk,		/* starting logical block number */
1402 	int		cnt,		/* number of log blocks to be written */
1403 	const int	li,		/* locator index */
1404 	int		master_only	/* allow only master node to write */
1405 )
1406 {
1407 	daddr_t		physblk;
1408 	int		err = 0;
1409 	int		i;
1410 	mddb_lb_t	*lbp = s->s_lbp;
1411 	mddb_locator_t	*lp = &lbp->lb_locators[li];
1412 	md_dev64_t	dev;
1413 	mddb_block_t	*blkarray;
1414 	int		size;
1415 	int		ret;
1416 
1417 	/*
1418 	 * If a MN diskset and only the master can write,
1419 	 * then a non-master node will just return success.
1420 	 */
1421 	if ((lbp->lb_flags & MDDB_MNSET) &&
1422 	    (master_only == MDDB_WR_ONLY_MASTER)) {
1423 		/* return successfully if we aren't the master */
1424 		if (!(md_set[s->s_setno].s_am_i_master)) {
1425 			return (0);
1426 		}
1427 	}
1428 
1429 	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
1430 	if (dev == NODEV64) {
1431 		return (1);
1432 	}
1433 
1434 	if (cnt > 1) {
1435 		size = sizeof (mddb_block_t) * cnt;
1436 		blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
1437 		for (i = 0; i < cnt; i++)
1438 			blkarray[i] = blk + i;
1439 		ret = wrtblklst(s, buffer, blkarray, cnt,
1440 			li, 0, MDDB_WR_ONLY_MASTER);
1441 		kmem_free(blkarray, size);
1442 		return (ret);
1443 	}
1444 	physblk = getphysblk(blk, s->s_mbiarray[li]);
1445 	ASSERT(physblk > 0);
1446 	if (err = putblks(s, buffer, physblk, 1, dev, (mddb_bf_t **)0)) {
1447 		lp->l_flags |= MDDB_F_EWRITE;
1448 		return (err);
1449 	}
1450 	return (0);
1451 }
1452 
1453 /*
1454  * writeall - will write the buffer to all ACTIVE/NON-ERRORED replicas.
1455  */
1456 static int
1457 writeall(
1458 	mddb_set_t	*s,		/* incore set structure */
1459 	caddr_t		buffer,		/* buffer to be written */
1460 	mddb_block_t	block,		/* starting logical block number */
1461 	int		cnt,		/* number of log blocks to be written */
1462 	int		master_only	/* allow only master node to write */
1463 )
1464 {
1465 	int		li;
1466 	int		err = 0;
1467 	mddb_lb_t	*lbp = s->s_lbp;
1468 
1469 	for (li = 0; li < lbp->lb_loccnt; li++) {
1470 		mddb_locator_t	*lp = &lbp->lb_locators[li];
1471 
1472 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
1473 		    (lp->l_flags & MDDB_F_EWRITE))
1474 			continue;
1475 
1476 		err |= writeblks(s, buffer, block, cnt, li, master_only);
1477 	}
1478 
1479 	return (err);
1480 }
1481 
1482 /*
1483  * writelocall - write the locator block and device id information (if
1484  * replica is in device id format) to all ACTIVE/NON-ERRORER replicas.
1485  *
1486  * Increments the locator block's commitcnt.  Updates the device id area's
1487  * commitcnt if the replica is in device id format.  Regenerates the
1488  * checksums after updating the commitcnt(s).
1489  */
1490 static int
1491 writelocall(
1492 	mddb_set_t	*s	/* incore set structure */
1493 )
1494 {
1495 	int		li;
1496 	int		err = 0;
1497 	mddb_lb_t	*lbp = s->s_lbp;
1498 	mddb_did_blk_t	*did_blk;
1499 	mddb_did_db_t	*did_dbp;
1500 
1501 	s->s_lbp->lb_commitcnt++;
1502 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
1503 		did_blk = s->s_did_icp->did_ic_blkp;
1504 		did_blk->blk_commitcnt = s->s_lbp->lb_commitcnt;
1505 		crcgen(did_blk, &did_blk->blk_checksum,
1506 			dbtob(lbp->lb_didblkcnt), NULL);
1507 	}
1508 	crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
1509 
1510 	for (li = 0; li < lbp->lb_loccnt; li++) {
1511 		mddb_locator_t	*lp = &lbp->lb_locators[li];
1512 
1513 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
1514 		    (lp->l_flags & MDDB_F_EWRITE))
1515 			continue;
1516 
1517 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
1518 			/* write out blocks containing actual device ids */
1519 			did_dbp = s->s_did_icp->did_ic_dbp;
1520 			while (did_dbp) {
1521 				err |= writeblks(s, (caddr_t)did_dbp->db_ptr,
1522 					did_dbp->db_firstblk,
1523 					did_dbp->db_blkcnt, li,
1524 					MDDB_WR_ONLY_MASTER);
1525 				did_dbp = did_dbp->db_next;
1526 			}
1527 
1528 			/* write out device id area block */
1529 			err |= writeblks(s, (caddr_t)did_blk,
1530 				lbp->lb_didfirstblk, lbp->lb_didblkcnt, li,
1531 				MDDB_WR_ONLY_MASTER);
1532 		}
1533 		/* write out locator block */
1534 		err |= writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
1535 			MDDB_WR_ONLY_MASTER);
1536 	}
1537 
1538 	/*
1539 	 * If a MN diskset and this is the master, set the PARSE_LOCBLK flag
1540 	 * in the mddb_set structure to show that the locator block has
1541 	 * been changed.
1542 	 */
1543 
1544 	if ((lbp->lb_flags & MDDB_MNSET) &&
1545 	    (md_set[s->s_setno].s_am_i_master)) {
1546 		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
1547 	}
1548 	return (err);
1549 }
1550 
1551 /*
1552  * If called during upgrade, this routine expects a translated
1553  * (aka miniroot) dev.
1554  */
1555 static int
1556 getblks(
1557 	mddb_set_t	*s,	/* incore db set structure */
1558 	caddr_t		buffer,	/* buffer to read data into */
1559 	md_dev64_t	device,	/* device to read from */
1560 	daddr_t		blk,	/* physical block number to read */
1561 	int		cnt,	/* number of blocks to read */
1562 	int		flag	/* flags for I/O */
1563 )
1564 {
1565 	buf_t		*bp;
1566 	mddb_bf_t	*bfp;
1567 	int		err = 0;
1568 
1569 	bfp = allocbuffer(s, MDDB_SLEEPOK);	/* this will never sleep */
1570 	bp = &bfp->bf_buf;
1571 	bp->b_bcount = MDDB_BSIZE * cnt;
1572 	bp->b_un.b_addr = buffer;
1573 	bp->b_blkno = blk;
1574 	bp->b_edev = md_dev64_to_dev(device);
1575 	err = mddb_rwdata(s, (B_READ | flag), bp);
1576 	freebuffer(s, bfp);
1577 	if (err) {
1578 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
1579 		    s->s_setno, device);
1580 		return (MDDB_F_EREAD);
1581 	}
1582 	return (0);
1583 }
1584 
1585 /*
1586  * readblklst - takes an array of logical block numbers
1587  * 		and reads those blocks (gather) into the buffer.
1588  * If called during upgrade, this routine expects a non-translated
1589  * (aka target) dev.
1590  */
1591 static int
1592 readblklst(
1593 	mddb_set_t	*s,	/* incore set structure */
1594 	caddr_t		buffer,	/* buffer to be read (record block) */
1595 	mddb_block_t	blka[],	/* list of logical blocks to be read */
1596 	daddr_t		cnt,	/* number of logical blocks */
1597 	int		li,	/* locator index */
1598 	int		flag	/* flags for I/O */
1599 )
1600 {
1601 	daddr_t		blk;
1602 	daddr_t		blk1;
1603 	int		err = 0;
1604 	int		cons;
1605 	md_dev64_t	dev;
1606 	mddb_mb_ic_t	*mbip;
1607 
1608 	mbip = s->s_mbiarray[li];
1609 	dev = md_expldev(s->s_lbp->lb_locators[li].l_dev);
1610 	dev = md_xlate_targ_2_mini(dev);
1611 	if (dev == NODEV64) {
1612 		return (1);
1613 	}
1614 
1615 	blk = getphysblk(blka[0], mbip);
1616 	ASSERT(blk >= 0);
1617 
1618 	cons = 1;
1619 	while (cnt) {
1620 		if (cons != cnt) {
1621 			blk1 = getphysblk(blka[cons], mbip);
1622 			ASSERT(blk1 >= 0);
1623 			if ((blk + cons) == blk1) {
1624 				cons++;
1625 				continue;
1626 			}
1627 		}
1628 		if (err = getblks(s, buffer, dev, blk, cons, flag))
1629 			return (err);
1630 		buffer += MDDB_BSIZE * cons;
1631 		cnt -= cons;
1632 		blka += cons;
1633 		if (cnt) {
1634 			blk = getphysblk(blka[0], mbip);
1635 			ASSERT(blk >= 0);
1636 		}
1637 		cons = 1;
1638 	}
1639 	return (0);
1640 }
1641 
1642 /*
1643  * readblks - takes a logical block number/block count pair
1644  * 		and reads those contiguous logical blocks into the buffer.
1645  * If called during upgrade, this routine expects a non-translated
1646  * (aka target) dev.
1647  */
1648 static int
1649 readblks(
1650 	mddb_set_t	*s,	/* incore set structure */
1651 	caddr_t		buffer,	/* buffer to be read into */
1652 	mddb_block_t	blk,	/* logical block number to be read */
1653 	int		cnt,	/* number of logical blocks to be read */
1654 	int		li	/* locator index */
1655 )
1656 {
1657 	daddr_t		physblk;
1658 	md_dev64_t	device;
1659 	int		i;
1660 	mddb_block_t	*blkarray;
1661 	int		size;
1662 	int		ret;
1663 
1664 	if (cnt > 1) {
1665 		size = sizeof (mddb_block_t) * cnt;
1666 		blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
1667 		for (i = 0; i < cnt; i++)
1668 			blkarray[i] = blk + i;
1669 		ret = readblklst(s, buffer, blkarray, cnt, li, 0);
1670 		kmem_free(blkarray, size);
1671 		return (ret);
1672 	}
1673 	physblk = getphysblk(blk, s->s_mbiarray[li]);
1674 	ASSERT(physblk > 0);
1675 	device = md_expldev(s->s_lbp->lb_locators[li].l_dev);
1676 	device = md_xlate_targ_2_mini(device);
1677 	if (device == NODEV64) {
1678 		return (1);
1679 	}
1680 	return (getblks(s, buffer, device, physblk, 1, 0));
1681 }
1682 
1683 static void
1684 single_thread_start(
1685 	mddb_set_t	*s
1686 )
1687 {
1688 	while (s->s_singlelockgotten) {
1689 		s->s_singlelockwanted++;
1690 		cv_wait(&s->s_single_thread_cv, SETMUTEX(s->s_setno));
1691 	}
1692 	s->s_singlelockgotten++;
1693 }
1694 
1695 static void
1696 single_thread_end(
1697 	mddb_set_t	*s
1698 )
1699 {
1700 	ASSERT(s->s_singlelockgotten);
1701 	s->s_singlelockgotten = 0;
1702 	if (s->s_singlelockwanted) {
1703 		s->s_singlelockwanted = 0;
1704 		cv_broadcast(&s->s_single_thread_cv);
1705 	}
1706 }
1707 
1708 static size_t
1709 sizeofde(
1710 	mddb_de_ic_t	*dep
1711 )
1712 {
1713 	size_t		size;
1714 
1715 	size = sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) +
1716 		    sizeof (mddb_block_t) * dep->de_blkcount;
1717 	return (size);
1718 }
1719 
1720 static size_t
1721 sizeofde32(
1722 	mddb_de32_t	*dep
1723 )
1724 {
1725 	size_t		size;
1726 
1727 	size = sizeof (*dep) - sizeof (dep->de32_blks) +
1728 		    sizeof (mddb_block_t) * dep->de32_blkcount;
1729 	return (size);
1730 }
1731 
1732 static mddb_de32_t *
1733 nextentry(
1734 	mddb_de32_t	*dep
1735 )
1736 {
1737 	mddb_de32_t	*ret;
1738 
1739 	ret = (mddb_de32_t *)((void *)((caddr_t)dep + sizeofde32(dep)));
1740 	return (ret);
1741 }
1742 
1743 static void
1744 create_db32rec(
1745 	mddb_db32_t *db32p,
1746 	mddb_db_t *dbp
1747 )
1748 {
1749 	mddb_de_ic_t *dep;
1750 	mddb_de32_t *de32p;
1751 
1752 #if defined(_ILP32) && !defined(lint)
1753 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
1754 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
1755 #endif
1756 
1757 	dbtodb32(dbp, db32p);
1758 	if ((dbp->db_firstentry != NULL) && (db32p->db32_firstentry == 0))
1759 		db32p->db32_firstentry = 0x4;
1760 	de32p = (mddb_de32_t *)((void *) ((caddr_t)(&db32p->db32_firstentry)
1761 		+ sizeof (db32p->db32_firstentry)));
1762 	for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
1763 		detode32(dep, de32p);
1764 		if ((dep->de_next != NULL) && (de32p->de32_next == 0))
1765 			de32p->de32_next = 0x4;
1766 		de32p = nextentry(de32p);
1767 	}
1768 	ASSERT((uintptr_t)de32p <= (uintptr_t)de32p + MDDB_BSIZE);
1769 }
1770 
1771 /*
1772  * If called during upgrade, this routine expects a translated
1773  * (aka miniroot) dev.
1774  * If master blocks are found, set the mn_set parameter to 1 if the
1775  * the master block revision number is MDDB_REV_MNMB; otherwise,
1776  * set it to 0.
1777  * If master blocks are not found, do not change the mnset parameter.
1778  */
1779 static mddb_mb_ic_t *
1780 getmasters(
1781 	mddb_set_t	*s,
1782 	md_dev64_t	dev,
1783 	daddr_t		blkno,
1784 	uint_t		*flag,
1785 	int		*mn_set
1786 )
1787 {
1788 	mddb_mb_ic_t	*mbi = NULL;
1789 	mddb_mb_t	*mb;
1790 	int		error = 0;
1791 	ddi_devid_t	devid;
1792 
1793 
1794 	if (mddb_devopen(dev)) {
1795 		if (flag)
1796 			*flag |= MDDB_F_EMASTER;
1797 		return ((mddb_mb_ic_t *)NULL);
1798 	}
1799 
1800 
1801 	mbi = (mddb_mb_ic_t *)kmem_zalloc(MDDB_IC_BSIZE, KM_SLEEP);
1802 	mb = &(mbi->mbi_mddb_mb);
1803 	if (error = getblks(s, (caddr_t)mb, dev, blkno,
1804 	    btodb(MDDB_BSIZE), 0)) {
1805 		error |= MDDB_F_EMASTER;
1806 	}
1807 	if (mb->mb_magic != MDDB_MAGIC_MB) {
1808 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1809 	}
1810 	/* Check for MDDB_REV_MNMB and lower */
1811 	if (revchk(MDDB_REV_MNMB, mb->mb_revision)) {
1812 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1813 	}
1814 	if (crcchk(mb, &mb->mb_checksum, MDDB_BSIZE, NULL)) {
1815 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1816 	}
1817 
1818 	if (!(md_get_setstatus(s->s_setno) &
1819 	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
1820 	    (mb->mb_setno != s->s_setno)) {
1821 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1822 	}
1823 	if (mb->mb_blkno != blkno) {
1824 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1825 	}
1826 	mb->mb_next = NULL;
1827 	mbi->mbi_next = NULL;
1828 
1829 	if (error)
1830 		goto out;
1831 
1832 	/*
1833 	 * Check the md_devid_destroy and md_keep_repl_state flags
1834 	 * to see if we need to regen the devid or not.
1835 	 *
1836 	 * Don't care about devid in local set since it is not used
1837 	 * and this should not be part of set importing
1838 	 */
1839 	if ((s->s_setno != MD_LOCAL_SET) &&
1840 	    !(md_get_setstatus(s->s_setno) &
1841 	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT))) {
1842 		/*
1843 		 * Now check the destroy flag. We also need to handle
1844 		 * the case where the destroy flag is reset after the
1845 		 * destroy
1846 		 */
1847 		if (md_devid_destroy || (mb->mb_devid_len == 0)) {
1848 
1849 			if (md_devid_destroy) {
1850 				bzero(mb->mb_devid, mb->mb_devid_len);
1851 				mb->mb_devid_len = 0;
1852 			}
1853 
1854 			/*
1855 			 * Try to regenerate it if the 'keep' flag is not set
1856 			 */
1857 			if (!md_keep_repl_state) {
1858 				if (ddi_lyr_get_devid(md_dev64_to_dev(dev),
1859 				    &devid) == DDI_SUCCESS) {
1860 					mb->mb_devid_len =
1861 					    ddi_devid_sizeof(devid);
1862 					bcopy(devid, mb->mb_devid,
1863 					    mb->mb_devid_len);
1864 					ddi_devid_free(devid);
1865 				} else {
1866 					error = MDDB_F_EFMT | MDDB_F_EMASTER;
1867 				}
1868 			}
1869 
1870 			crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL);
1871 
1872 			/*
1873 			 * Push
1874 			 */
1875 			if (putblks(s, (caddr_t)mb, blkno, 1, dev, 0) != 0) {
1876 				error = MDDB_F_EFMT | MDDB_F_EMASTER;
1877 			}
1878 		}
1879 	}
1880 
1881 	if (! error) {
1882 		/* Set mn_set parameter to 1 if a MN set */
1883 		if (mb->mb_revision == MDDB_REV_MNMB)
1884 			*mn_set = 1;
1885 		else
1886 			*mn_set = 0;
1887 		return (mbi);
1888 	}
1889 
1890 out:
1891 	/* Error Out */
1892 	if (flag)
1893 		*flag |= error;
1894 
1895 	kmem_free((caddr_t)mbi, MDDB_IC_BSIZE);
1896 	mddb_devclose(dev);
1897 	return ((mddb_mb_ic_t *)NULL);
1898 }
1899 
1900 static int
1901 getrecord(
1902 	mddb_set_t	*s,
1903 	mddb_de_ic_t	*dep,
1904 	int		li
1905 )
1906 {
1907 	int		err = 0;
1908 	mddb_rb32_t	*rbp;
1909 
1910 #if defined(_ILP32) && !defined(lint)
1911 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
1912 #endif
1913 
1914 
1915 	dep->de_rb = (mddb_rb32_t *)kmem_zalloc(dep->de_recsize, KM_SLEEP);
1916 	rbp = dep->de_rb;
1917 
1918 	err = readblklst(s, (caddr_t)rbp, dep->de_blks,
1919 	    dep->de_blkcount, li, 0);
1920 	if (err) {
1921 		return (MDDB_F_EDATA | err);
1922 	}
1923 	if (rbp->rb_magic != MDDB_MAGIC_RB) {
1924 		return (MDDB_F_EFMT | MDDB_F_EDATA);
1925 	}
1926 	if ((revchk(MDDB_REV_RB, rbp->rb_revision) != 0) &&
1927 	    (revchk(MDDB_REV_RB64, rbp->rb_revision) != 0) &&
1928 	    (revchk(MDDB_REV_RBFN, rbp->rb_revision) != 0) &&
1929 	    (revchk(MDDB_REV_RB64FN, rbp->rb_revision) != 0)) {
1930 		return (MDDB_F_EFMT | MDDB_F_EDATA);
1931 	}
1932 	/* Check crc for this record */
1933 	if (rec_crcchk(s, dep, rbp)) {
1934 		return (MDDB_F_EFMT | MDDB_F_EDATA);
1935 	}
1936 	return (0);
1937 }
1938 
1939 /*
1940  * Code to read in the locator name information
1941  */
1942 static int
1943 readlocnames(
1944 	mddb_set_t	*s,
1945 	int		li
1946 )
1947 {
1948 	mddb_ln_t	*lnp;
1949 	int		err = 0;
1950 	mddb_block_t	ln_blkcnt, ln_blkno;
1951 
1952 	/*
1953 	 * read in the locator name blocks
1954 	 */
1955 	s->s_lnp = NULL;
1956 
1957 	ln_blkno = s->s_lbp->lb_lnfirstblk;
1958 	ln_blkcnt = s->s_lbp->lb_lnblkcnt;
1959 	lnp = (mddb_ln_t *)kmem_zalloc(dbtob(ln_blkcnt), KM_SLEEP);
1960 
1961 	err = readblks(s, (caddr_t)lnp, ln_blkno, ln_blkcnt, li);
1962 	if (err) {
1963 		err |= MDDB_F_EDATA;
1964 		goto out;
1965 	}
1966 	if (lnp->ln_magic != MDDB_MAGIC_LN) {
1967 		err = MDDB_F_EDATA | MDDB_F_EFMT;
1968 		goto out;
1969 	}
1970 	if (s->s_lbp->lb_flags & MDDB_MNSET) {
1971 		if (revchk(MDDB_REV_MNLN, lnp->ln_revision)) {
1972 			err = MDDB_F_EDATA | MDDB_F_EFMT;
1973 			goto out;
1974 		}
1975 	} else {
1976 		if (revchk(MDDB_REV_LN, lnp->ln_revision)) {
1977 			err = MDDB_F_EDATA | MDDB_F_EFMT;
1978 			goto out;
1979 		}
1980 	}
1981 	if (crcchk(lnp, &lnp->ln_checksum, dbtob(ln_blkcnt), NULL)) {
1982 		err = MDDB_F_EDATA | MDDB_F_EFMT;
1983 		goto out;
1984 	}
1985 out:
1986 	/*
1987 	 *	if error occurred in locator name blocks free them
1988 	 *	and return
1989 	 */
1990 	if (err) {
1991 		kmem_free((caddr_t)lnp, dbtob(ln_blkcnt));
1992 		return (err);
1993 	}
1994 	s->s_lnp = lnp;
1995 	return (0);
1996 }
1997 
1998 /*
1999  * code to read in a copy of the database.
2000  */
2001 
2002 static int
2003 readcopy(
2004 	mddb_set_t	*s,
2005 	int		li
2006 )
2007 {
2008 	uint_t		blk;
2009 	mddb_db_t	*dbp, *dbp1, *dbhp;
2010 	mddb_db32_t	*db32p;
2011 	mddb_de_ic_t	*dep, *dep2;
2012 	mddb_de32_t	*de32p, *de32p2;
2013 	int		err = 0;
2014 	uint_t		checksum;
2015 
2016 
2017 #if defined(_ILP32) && !defined(lint)
2018 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
2019 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2020 #endif
2021 
2022 	dbp = NULL;
2023 	dbhp = NULL;
2024 	/*
2025 	 *	read in all the directory blocks
2026 	 */
2027 	blk = s->s_lbp->lb_dbfirstblk;
2028 	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
2029 
2030 	for (; blk != 0; blk = dbp->db_nextblk) {
2031 		dbp1 = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
2032 		if (! dbhp) {
2033 			dbhp = dbp1;
2034 		} else {
2035 			dbp->db_next = dbp1;
2036 		}
2037 		dbp = dbp1;
2038 
2039 		err = readblks(s, (caddr_t)db32p, blk, 1, li);
2040 		if (err) {
2041 			err |= MDDB_F_EDATA;
2042 			break;
2043 		}
2044 		db32todb(db32p, dbp);
2045 		if (db32p->db32_magic != MDDB_MAGIC_DB) {
2046 			err = MDDB_F_EDATA | MDDB_F_EFMT;
2047 			break;
2048 		}
2049 		if (revchk(MDDB_REV_DB, db32p->db32_revision)) {
2050 			err = MDDB_F_EDATA | MDDB_F_EFMT;
2051 			break;
2052 		}
2053 		if (crcchk(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL)) {
2054 			err = MDDB_F_EDATA | MDDB_F_EFMT;
2055 			break;
2056 		}
2057 		/*
2058 		 * first go through and fix up all de_next pointers
2059 		 */
2060 		if (dbp->db_firstentry) {
2061 
2062 			de32p = (mddb_de32_t *)
2063 			    ((void *) ((caddr_t)(&db32p->db32_firstentry)
2064 			    + sizeof (db32p->db32_firstentry)));
2065 
2066 			dep = (mddb_de_ic_t *)
2067 			    kmem_zalloc(sizeof (mddb_de_ic_t) -
2068 				sizeof (mddb_block_t) +
2069 				sizeof (mddb_block_t) * de32p->de32_blkcount,
2070 				KM_SLEEP);
2071 			de32tode(de32p, dep);
2072 
2073 			dbp->db_firstentry = dep;
2074 			while (de32p && de32p->de32_next) {
2075 
2076 				de32p2 = nextentry(de32p);
2077 
2078 				dep2 = (mddb_de_ic_t *)kmem_zalloc(
2079 					sizeof (mddb_de_ic_t) -
2080 					sizeof (mddb_block_t) +
2081 					sizeof (mddb_block_t) *
2082 					de32p2->de32_blkcount, KM_SLEEP);
2083 
2084 				de32tode(de32p2, dep2);
2085 
2086 				dep->de_next = dep2;
2087 				dep = dep2;
2088 				de32p = de32p2;
2089 			}
2090 		}
2091 		/*
2092 		 * go through and make all of the pointer to record blocks
2093 		 * are null;
2094 		 */
2095 		for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
2096 			dep->de_rb = NULL;
2097 	}
2098 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
2099 	dbp->db_next = NULL;
2100 	/*
2101 	 *	if error occurred in directory blocks free them
2102 	 *	and return
2103 	 */
2104 	if (err) {
2105 		dbp = dbhp;
2106 		while (dbp) {
2107 			dep = dbp->db_firstentry;
2108 			while (dep) {
2109 				/* No mddb_rb32_t structures yet */
2110 				dep2 = dep->de_next;
2111 				kmem_free((caddr_t)dep, sizeofde(dep));
2112 				dep = dep2;
2113 			}
2114 			dbp1 = dbp->db_next;
2115 			kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
2116 			dbp = dbp1;
2117 		}
2118 		s->s_dbp = NULL;
2119 		return (err);
2120 
2121 	}
2122 	/*
2123 	 */
2124 	err = 0;
2125 	checksum = MDDB_GLOBAL_XOR;
2126 	for (dbp = dbhp; dbp != NULL; dbp = dbp->db_next) {
2127 		checksum ^= dbp->db_recsum;
2128 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2129 			if (dep->de_flags & MDDB_F_OPT)
2130 				continue;
2131 			err = getrecord(s, dep, li);
2132 			if (err)
2133 				break;
2134 			/* Don't include CHANGELOG in big XOR */
2135 			if (dep->de_flags & MDDB_F_CHANGELOG)
2136 				continue;
2137 			checksum ^= dep->de_rb->rb_checksum;
2138 			checksum ^= dep->de_rb->rb_checksum_fiddle;
2139 		}
2140 		if (err)
2141 			break;
2142 	}
2143 	if (checksum) {
2144 		if (! err)
2145 			err = MDDB_F_EDATA | MDDB_F_EFMT;
2146 	}
2147 	if (err) {
2148 		dbp = dbhp;
2149 		dbhp = NULL;
2150 		while (dbp) {
2151 			dep = dbp->db_firstentry;
2152 			while (dep) {
2153 				if (dep->de_rb)
2154 					kmem_free((caddr_t)dep->de_rb,
2155 					    dep->de_recsize);
2156 				dep2 = dep->de_next;
2157 				kmem_free((caddr_t)dep, sizeofde(dep));
2158 				dep = dep2;
2159 			}
2160 			dbp1 = dbp->db_next;
2161 			kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
2162 			dbp = dbp1;
2163 		}
2164 	}
2165 	s->s_dbp = dbhp;
2166 	return (err);
2167 }
2168 
2169 static int
2170 getoptcnt(
2171 	mddb_set_t	*s,
2172 	int		li)
2173 {
2174 	int		result;
2175 	mddb_de_ic_t	*dep;
2176 	mddb_db_t	*dbp;
2177 
2178 #if defined(_ILP32) && !defined(lint)
2179 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
2180 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2181 #endif
2182 
2183 	result = 0;
2184 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2185 		dep = dbp->db_firstentry;
2186 		for (; dep != NULL; dep = dep->de_next) {
2187 			if (! (dep->de_flags & MDDB_F_OPT))
2188 				continue;
2189 			if (((dep->de_optinfo[0].o_flags & MDDB_F_ACTIVE) &&
2190 			    (li == dep->de_optinfo[0].o_li)) ||
2191 			    ((dep->de_optinfo[1].o_flags & MDDB_F_ACTIVE) &&
2192 			    (li == dep->de_optinfo[1].o_li)))
2193 			result++;
2194 		}
2195 	}
2196 	return (result);
2197 }
2198 
2199 static void
2200 getoptdev(
2201 	mddb_set_t	*s,
2202 	mddb_de_ic_t	*rdep,
2203 	int		opti
2204 )
2205 {
2206 	mddb_lb_t	*lbp;
2207 	mddb_locator_t	*lp;
2208 	mddb_optinfo_t	*otherop;
2209 	mddb_optinfo_t	*resultop;
2210 	int		li;
2211 	dev_t		otherdev;
2212 	int		blkonly = 0;
2213 	int		mincnt;
2214 	int		thiscnt;
2215 
2216 	lbp = s->s_lbp;
2217 
2218 	resultop = &rdep->de_optinfo[opti];
2219 	otherop = &rdep->de_optinfo[1-opti];
2220 
2221 	resultop->o_flags = 0;
2222 
2223 	/*
2224 	 * scan through and see if data bases have to vary by only device
2225 	 */
2226 
2227 	if (otherop->o_flags & MDDB_F_ACTIVE) {
2228 		blkonly = 1;
2229 		otherdev = expldev(lbp->lb_locators[otherop->o_li].l_dev);
2230 		for (li = 0; li < lbp->lb_loccnt; li++) {
2231 			lp = &lbp->lb_locators[li];
2232 			if (! (lp->l_flags & MDDB_F_ACTIVE))
2233 				continue;
2234 			if (expldev(lp->l_dev) != otherdev) {
2235 				blkonly = 0;
2236 				break;
2237 			}
2238 		}
2239 	}
2240 
2241 	mincnt = 999999;
2242 	for (li = 0; li < lbp->lb_loccnt; li++) {
2243 		dev_info_t	*devi;
2244 		int		removable = 0;
2245 
2246 		lp = &lbp->lb_locators[li];
2247 		if (! (lp->l_flags & MDDB_F_ACTIVE))
2248 			continue;
2249 		if (otherop->o_flags & MDDB_F_ACTIVE) {
2250 			if (blkonly) {
2251 				if (otherop->o_li == li)
2252 					continue;
2253 			} else {
2254 				if (otherdev == expldev(lp->l_dev))
2255 					continue;
2256 			}
2257 		}
2258 
2259 		/*
2260 		 * Check if this is a removable device.  If it is we
2261 		 * assume it is something like a USB flash disk, a zip disk
2262 		 * or even a floppy that is being used to help maintain
2263 		 * mddb quorum.  We don't want to put any optimized resync
2264 		 * records on these kinds of disks since they are usually
2265 		 * slower or don't have the same read/write lifetimes as
2266 		 * a regular fixed disk.
2267 		 */
2268 		if ((devi = e_ddi_hold_devi_by_dev(lp->l_dev, 0)) != NULL) {
2269 			int		error;
2270 			struct cb_ops	*cb;
2271 			ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
2272 			int		propvalue = 0;
2273 			int		proplength = sizeof (int);
2274 
2275 			if ((cb = devopsp[getmajor(lp->l_dev)]->devo_cb_ops)
2276 			    != NULL) {
2277 				error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi,
2278 					prop_op,
2279 					DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
2280 					"removable-media",
2281 					(caddr_t)&propvalue, &proplength);
2282 
2283 				if (error == DDI_PROP_SUCCESS)
2284 					removable = 1;
2285 			}
2286 
2287 			ddi_release_devi(devi);
2288 		}
2289 
2290 		if (removable)
2291 			continue;
2292 
2293 		thiscnt = getoptcnt(s, li);
2294 		if (thiscnt < mincnt) {
2295 			resultop->o_li  = li;
2296 			mincnt = thiscnt;
2297 			resultop->o_flags = MDDB_F_ACTIVE;
2298 		}
2299 	}
2300 }
2301 
2302 static void
2303 allocuserdata(
2304 	mddb_de_ic_t	*dep
2305 )
2306 {
2307 	mddb_rb32_t	*rbp;
2308 
2309 #if defined(_ILP32) && !defined(lint)
2310 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2311 #endif
2312 
2313 	rbp = dep->de_rb;
2314 	rbp->rb_private = 0;
2315 	dep->de_rb_userdata = kmem_zalloc(dep->de_reqsize, KM_SLEEP);
2316 	rbp->rb_userdata = 0x4;	/* Make sure this is non-zero */
2317 	bcopy((caddr_t)rbp->rb_data, dep->de_rb_userdata, dep->de_reqsize);
2318 }
2319 
2320 
2321 static void
2322 getuserdata(
2323 	set_t		setno,
2324 	mddb_de_ic_t	*dep
2325 )
2326 {
2327 	mddb_rb32_t	 *rbp;
2328 
2329 
2330 	mddb_type_t	type = dep->de_type1;
2331 	caddr_t		data, udata;
2332 
2333 #if defined(_ILP32) && !defined(lint)
2334 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2335 #endif
2336 	rbp = dep->de_rb;
2337 	data = (caddr_t)rbp->rb_data;
2338 	udata = (caddr_t)dep->de_rb_userdata;
2339 
2340 	/*
2341 	 * If it's a driver record, and an old style record, and not a DRL
2342 	 * record, we must convert it because it was incore as a 64 bit
2343 	 * structure but its on disk layout has only 32 bit for block sizes
2344 	 */
2345 	if (!(md_get_setstatus(setno) &
2346 	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
2347 	    (type >= MDDB_FIRST_MODID) &&
2348 	    ((rbp->rb_revision == MDDB_REV_RB) ||
2349 		(rbp->rb_revision == MDDB_REV_RBFN))) {
2350 
2351 		switch (dep->de_flags) {
2352 
2353 			case MDDB_F_STRIPE:
2354 				stripe_convert(data, udata, BIG_2_SMALL);
2355 				break;
2356 
2357 			case MDDB_F_MIRROR:
2358 				mirror_convert(data, udata, BIG_2_SMALL);
2359 				break;
2360 
2361 			case MDDB_F_RAID:
2362 				raid_convert(data, udata, BIG_2_SMALL);
2363 				break;
2364 
2365 			case MDDB_F_SOFTPART:
2366 				softpart_convert(data, udata, BIG_2_SMALL);
2367 				break;
2368 
2369 			case MDDB_F_TRANS_MASTER:
2370 				trans_master_convert(data, udata, BIG_2_SMALL);
2371 				break;
2372 
2373 			case MDDB_F_TRANS_LOG:
2374 				trans_log_convert(data, udata, BIG_2_SMALL);
2375 				break;
2376 
2377 			case MDDB_F_HOTSPARE:
2378 				hs_convert(data, udata, BIG_2_SMALL);
2379 				break;
2380 
2381 			case MDDB_F_OPT:
2382 			default:
2383 				bcopy(udata, data, dep->de_reqsize);
2384 		}
2385 	} else {
2386 		bcopy(udata, data, dep->de_reqsize);
2387 	}
2388 }
2389 
2390 static void
2391 getoptrecord(
2392 	mddb_set_t	*s,
2393 	mddb_de_ic_t	*dep
2394 )
2395 {
2396 	mddb_lb_t	*lbp;
2397 	mddb_locator_t	*lp;
2398 	mddb_rb32_t	*rbp, *crbp;
2399 	int		li;
2400 	int		i;
2401 	int		err = 0;
2402 	size_t		recsize;
2403 
2404 #if defined(_ILP32) && !defined(lint)
2405 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2406 #endif
2407 
2408 	lbp = s->s_lbp;
2409 
2410 	recsize = dep->de_recsize;
2411 	dep->de_rb = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
2412 	rbp = dep->de_rb;
2413 	crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
2414 
2415 	dep->de_optinfo[0].o_flags |= MDDB_F_EDATA;
2416 	dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
2417 
2418 	for (i = 0; i < 2; i++) {
2419 		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
2420 			continue;
2421 		li = dep->de_optinfo[i].o_li;
2422 		lp = &lbp->lb_locators[li];
2423 
2424 		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
2425 		    (lp->l_flags & MDDB_F_EMASTER))
2426 			continue;
2427 
2428 		err = readblklst(s, (caddr_t)rbp, dep->de_blks,
2429 		    dep->de_blkcount, li, 0);
2430 
2431 		if (err)
2432 			continue;
2433 
2434 		if (rbp->rb_magic != MDDB_MAGIC_RB)
2435 			continue;
2436 
2437 		if (revchk(MDDB_REV_RB, rbp->rb_revision))
2438 			continue;
2439 
2440 		/* Check the crc for this record */
2441 		if (rec_crcchk(s, dep, rbp)) {
2442 			continue;
2443 		}
2444 
2445 		dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE;
2446 
2447 		if (rbp == crbp) {
2448 			if (rbp->rb_checksum != crbp->rb_checksum)
2449 				dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
2450 			break;
2451 		}
2452 		rbp = crbp;
2453 	}
2454 
2455 	if (rbp == crbp) {
2456 		rbp->rb_private = 0;
2457 		kmem_free((caddr_t)crbp, recsize);
2458 		return;
2459 	}
2460 	bzero((caddr_t)rbp, recsize);
2461 	rbp->rb_magic = MDDB_MAGIC_RB;
2462 	rbp->rb_revision = MDDB_REV_RB;
2463 	uniqtime32(&rbp->rb_timestamp);
2464 	/* Generate the crc for this record */
2465 	rec_crcgen(s, dep, rbp);
2466 	kmem_free((caddr_t)crbp, recsize);
2467 }
2468 
2469 /*
2470  * writeoptrecord writes out an optimized record.
2471  */
2472 static int
2473 writeoptrecord(
2474 	mddb_set_t	*s,
2475 	mddb_de_ic_t	*dep
2476 )
2477 {
2478 	mddb_rb32_t	*rbp;
2479 	int		li;
2480 	int		err = 0, wrt_err = 0;
2481 	mddb_bf_t	*bufhead, *bfp;
2482 	mddb_lb_t	*lbp = s->s_lbp;
2483 	mddb_locator_t	*lp;
2484 	int		i;
2485 
2486 #if defined(_ILP32) && !defined(lint)
2487 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2488 #endif
2489 
2490 	bufhead = NULL;
2491 	err = 0;
2492 
2493 	while (s->s_opthavequeuinglck) {
2494 		s->s_optwantqueuinglck++;
2495 		cv_wait(&s->s_optqueuing_cv, SETMUTEX(s->s_setno));
2496 	}
2497 	s->s_opthavequeuinglck++;
2498 	rbp = dep->de_rb;
2499 	for (i = 0; i < 2; i++) {
2500 		/*
2501 		 * only possible error is xlate. This can
2502 		 * occur if a replica was off line and came
2503 		 * back. During the mean time the database grew
2504 		 * large than the now on line replica can store
2505 		 */
2506 		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
2507 			continue;
2508 		li = dep->de_optinfo[i].o_li;
2509 		/*
2510 		 * In a MN diskset, any node can write optimized record(s).
2511 		 */
2512 		wrt_err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
2513 			dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE);
2514 		/*
2515 		 * For MN diskset, set error in optinfo structure so
2516 		 * that mddb_commitrec knows which replica failed.
2517 		 */
2518 		if ((MD_MNSET_SETNO(s->s_setno)) &&
2519 		    (wrt_err & MDDB_F_EWRITE)) {
2520 			dep->de_optinfo[i].o_flags |= MDDB_F_EWRITE;
2521 		}
2522 		err |= wrt_err;
2523 	}
2524 	s->s_opthavequeuinglck = 0;
2525 	if (s->s_optwantqueuinglck) {
2526 		s->s_optwantqueuinglck = 0;
2527 		cv_broadcast(&s->s_optqueuing_cv);
2528 	}
2529 	for (bfp = bufhead; bfp; bfp = bufhead) {
2530 		mutex_exit(SETMUTEX(s->s_setno));
2531 		(void) biowait(&bfp->bf_buf);
2532 		mutex_enter(SETMUTEX(s->s_setno));
2533 		if (bfp->bf_buf.b_flags & B_ERROR) {
2534 			/*
2535 			 * If an MN diskset, don't set replica
2536 			 * in error since this hasn't been set in master.
2537 			 * Setting replica in error before master could
2538 			 * leave the nodes with different views of the
2539 			 * world since a class 1 configuration change
2540 			 * could occur in mddb_commitrec as soon as
2541 			 * all locks are dropped.  Must keep this
2542 			 * node the same as master and can't afford a
2543 			 * failure from the class 1 config change
2544 			 * if master succeeded.
2545 			 */
2546 			if (!(MD_MNSET_SETNO(s->s_setno))) {
2547 				bfp->bf_locator->l_flags |= MDDB_F_EWRITE;
2548 			} else {
2549 				/*
2550 				 * Find which de_optinfo (which replica)
2551 				 * had a failure and set the failure in
2552 				 * the o_flags field.
2553 				 */
2554 				lp = &lbp->lb_locators[dep->de_optinfo[0].o_li];
2555 				if (lp == bfp->bf_locator) {
2556 					dep->de_optinfo[0].o_flags |=
2557 						MDDB_F_EWRITE;
2558 				} else {
2559 					dep->de_optinfo[1].o_flags |=
2560 						MDDB_F_EWRITE;
2561 				}
2562 			}
2563 			err |= MDDB_F_EWRITE;
2564 		}
2565 		bufhead = bfp->bf_next;
2566 		freebuffer(s, bfp);
2567 	}
2568 	return (err);
2569 }
2570 
2571 /*
2572  * Fix up the optimized resync record.  Used in the traditional and local
2573  * disksets to move an optimized record from a failed or deleted mddb
2574  * to an active one.
2575  *
2576  * In a MN diskset, the fixing of the optimized record is split between
2577  * the master and slave nodes.  If the master node moves the optimized
2578  * resync record, then the master node will send a MDDB_PARSE_OPTRECS
2579  * message to the slave nodes causing the slave nodes to reget the
2580  * directory entry containing the location of the optimized resync record.
2581  * After the record is reread from disk, then writeoptrecord is called
2582  * if the location of the optimized resync record or flags have changed.
2583  * When writeoptrecord is called, the node that is the owner of this record
2584  * will write the optimized record to the location specified in the directory
2585  * entry.  Since the master node uses the highest class message (PARSE)
2586  * the record owner node is guaranteed to already have an updated
2587  * directory entry incore.
2588  *
2589  * The other difference between the traditional/local set and MN diskset
2590  * is that the directory entry can be written to disk before the optimized
2591  * record in a MN diskset if the record is owned by a slave node.  So,
2592  * the users of an optimized record must handle the failure case when no
2593  * data is available from an optimized record since the master node could
2594  * have failed during the relocation of the optimized record to another mddb.
2595  */
2596 static int
2597 fixoptrecord(
2598 	mddb_set_t	*s,
2599 	mddb_de_ic_t	*dep,
2600 	mddb_db_t	*dbp
2601 )
2602 {
2603 	int		changed;
2604 	int		writedata;
2605 	int		err = 0;
2606 	int		i;
2607 	mddb_lb_t	*lbp;
2608 	mddb_optinfo_t	*op;
2609 	mddb_db32_t	*db32p;
2610 	int		rec_owner;	/* Is node owner of record? */
2611 
2612 #if defined(_ILP32) && !defined(lint)
2613 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2614 #endif
2615 
2616 	lbp = s->s_lbp;
2617 	changed = 0;
2618 	writedata = 0;
2619 	for (i = 0; i < 2; i++) {
2620 		op = &dep->de_optinfo[i];
2621 
2622 		if (! (lbp->lb_locators[op->o_li].l_flags & MDDB_F_ACTIVE))
2623 			op->o_flags = 0;
2624 
2625 		/*
2626 		 * If optimized record has seen a replica failure,
2627 		 * assign new replica to record and re-write data
2628 		 * to new record.
2629 		 */
2630 		if (! (op->o_flags & MDDB_F_ACTIVE)) {
2631 			getoptdev(s, dep, i);
2632 			writedata++;
2633 			changed++;
2634 			/* Set flag for slaves to reread dep and write rec */
2635 			if (lbp->lb_flags & MDDB_MNSET) {
2636 				s->s_mn_parseflags |= MDDB_PARSE_OPTRECS;
2637 			}
2638 		}
2639 
2640 		/*
2641 		 * If just an error in the data was seen, set
2642 		 * the optimized record's replica flag to active (ok)
2643 		 * and try again.
2644 		 */
2645 		if (op->o_flags & MDDB_F_EDATA) {
2646 			dep->de_optinfo[0].o_flags = MDDB_F_ACTIVE;
2647 			writedata++;
2648 		}
2649 	}
2650 
2651 	rec_owner = 0;
2652 	if (lbp->lb_flags & MDDB_MNSET) {
2653 		/*
2654 		 * If a MN diskset then check the owner of optimized record.
2655 		 * If the master node owns the record or if there is
2656 		 * no owner of the record, then the master can write the
2657 		 * optimized record to disk.
2658 		 * Master node can write the optimized record now, but
2659 		 * slave nodes write their records during handling of
2660 		 * the MDDB_PARSE_OPTRECS message.
2661 		 */
2662 		if ((dep->de_owner_nodeid == MD_MN_INVALID_NID) ||
2663 		    (dep->de_owner_nodeid == md_set[s->s_setno].s_nodeid)) {
2664 			rec_owner = 1;
2665 		}
2666 	} else {
2667 		/*
2668 		 * In traditional diskset and local set, this node
2669 		 * is always the record owner and always the master.
2670 		 */
2671 		rec_owner = 1;
2672 	}
2673 
2674 	/*
2675 	 * If this node is the record owner, write out record.
2676 	 */
2677 	if ((writedata) && (rec_owner)) {
2678 		if (err = writeoptrecord(s, dep)) {
2679 			return (err);
2680 		}
2681 	}
2682 	if (! changed)
2683 		return (0);
2684 	uniqtime32(&dbp->db_timestamp);
2685 	dbp->db_revision = MDDB_REV_DB;
2686 	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
2687 	create_db32rec(db32p, dbp);
2688 	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
2689 	err = writeall(s, (caddr_t)db32p, db32p->db32_blknum,
2690 		1, MDDB_WR_ONLY_MASTER);
2691 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
2692 	return (err);
2693 }
2694 
2695 static int
2696 fixoptrecords(
2697 	mddb_set_t		*s
2698 )
2699 {
2700 	mddb_de_ic_t	*dep;
2701 	mddb_db_t	*dbp;
2702 	int		err = 0;
2703 	set_t		setno;
2704 
2705 	/*
2706 	 * In a MN diskset, the master node is the only node that runs
2707 	 * fixoptrecords.  If the master node changes anything, then the
2708 	 * master node sends PARSE message to the slave nodes.  The slave
2709 	 * nodes will then re-read in the locator block or re-read in the
2710 	 * directory blocks and re-write the optimized resync records.
2711 	 */
2712 	setno = s->s_setno;
2713 	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
2714 	    (md_set[setno].s_am_i_master == 0)) {
2715 		return (0);
2716 	}
2717 
2718 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2719 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2720 			if (! (dep->de_flags & MDDB_F_OPT))
2721 				continue;
2722 			err = fixoptrecord(s, dep, dbp);
2723 			if (err != 0)
2724 				return (err);
2725 		}
2726 	}
2727 	return (0);
2728 }
2729 
2730 /*
2731  * Checks incore version of mddb data to mddb data ondisk.
2732  *
2733  * Returns:
2734  *	- 0 if the data was successfully read and is good.
2735  *	- MDDB_F_EREAD if a read error occurred.
2736  *	- 1 if the data read is bad (checksum failed, etc)
2737  */
2738 static int
2739 checkcopy
2740 (
2741 	mddb_set_t	*s,
2742 	int		li
2743 )
2744 {
2745 	mddb_db_t	*dbp;
2746 	mddb_db32_t	*cdb32p;
2747 	mddb_de_ic_t	*dep;
2748 	mddb_de32_t	*cde32p;
2749 	mddb_rb32_t	*rbp, *crbp;
2750 	size_t		size;
2751 	int		i;
2752 	int		retval = 1;
2753 
2754 #if defined(_ILP32) && !defined(lint)
2755 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
2756 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2757 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2758 #endif
2759 
2760 	if (s->s_databuffer_size == 0) {
2761 		size_t maxrecsize = MDDB_BSIZE;
2762 
2763 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
2764 			for (dep = dbp->db_firstentry; dep; dep = dep->de_next)
2765 				if (! (dep->de_flags & MDDB_F_OPT) &&
2766 				    dep->de_recsize > maxrecsize)
2767 					maxrecsize = dep->de_recsize;
2768 
2769 		s->s_databuffer = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
2770 		s->s_databuffer_size = maxrecsize;
2771 	}
2772 
2773 	cdb32p = (mddb_db32_t *)s->s_databuffer;
2774 
2775 	/*
2776 	 * first go through and make sure all directory stuff
2777 	 * is the same
2778 	 */
2779 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2780 		if (readblks(s, (caddr_t)cdb32p, dbp->db_blknum, 1, li)) {
2781 			retval = MDDB_F_EREAD;
2782 			goto err;
2783 		}
2784 		if (cdb32p->db32_magic != MDDB_MAGIC_DB)
2785 			goto err;
2786 		if (revchk(MDDB_REV_DB, cdb32p->db32_revision))
2787 			goto err;
2788 		if (crcchk(cdb32p, &cdb32p->db32_checksum, MDDB_BSIZE, NULL))
2789 			goto err;
2790 		if (cdb32p->db32_nextblk != dbp->db_nextblk)
2791 			goto err;
2792 		if (cdb32p->db32_recsum != dbp->db_recsum)
2793 			goto err;
2794 		if (cdb32p->db32_firstentry) {
2795 			cde32p = (mddb_de32_t *)
2796 			    ((void *)((caddr_t)(&cdb32p->db32_firstentry)
2797 			    + sizeof (cdb32p->db32_firstentry)));
2798 		} else
2799 			cde32p = NULL;
2800 
2801 		dep = dbp->db_firstentry;
2802 		/*
2803 		 * check if all directory entries are identical
2804 		 */
2805 		while (dep && cde32p) {
2806 			if (dep->de_recid != cde32p->de32_recid)
2807 				goto err;
2808 			if (dep->de_type1 != cde32p->de32_type1)
2809 				goto err;
2810 			if (dep->de_type2 != cde32p->de32_type2)
2811 				goto err;
2812 			if (dep->de_reqsize != cde32p->de32_reqsize)
2813 				goto err;
2814 			if (dep->de_flags != cde32p->de32_flags)
2815 				goto err;
2816 
2817 			for (i = 0; i < 2; i++) {
2818 				if (dep->de_optinfo[i].o_li !=
2819 				    cde32p->de32_optinfo[i].o_li)
2820 					break;
2821 			}
2822 			if (i != 2)
2823 				goto err;
2824 			size = sizeof (mddb_block_t) * dep->de_blkcount;
2825 			if (bcmp((caddr_t)dep->de_blks,
2826 			    (caddr_t)cde32p->de32_blks, size))
2827 				goto err;
2828 			dep = dep->de_next;
2829 			if (cde32p->de32_next)
2830 				cde32p = nextentry(cde32p);
2831 			else
2832 				cde32p = NULL;
2833 		}
2834 		if (dep || cde32p)
2835 			goto err;
2836 	}
2837 	/*
2838 	 * If here, all directories are functionally identical
2839 	 * check to make sure all records are identical
2840 	 * the reason the records are not just bcmped is that the
2841 	 * lock flag does not want to be compared.
2842 	 */
2843 	crbp = (mddb_rb32_t *)cdb32p;
2844 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2845 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2846 			if ((dep->de_flags & MDDB_F_OPT) ||
2847 			    (dep->de_flags & MDDB_F_CHANGELOG))
2848 				continue;
2849 			rbp = (mddb_rb32_t *)dep->de_rb;
2850 			if (readblklst(s, (caddr_t)crbp, dep->de_blks,
2851 			    dep->de_blkcount, li, 0)) {
2852 				retval = MDDB_F_EREAD;
2853 				goto err;
2854 			}
2855 			/* Check the crc for this record */
2856 			if (rec_crcchk(s, dep, crbp))
2857 				goto err;
2858 
2859 			if (rbp->rb_checksum != crbp->rb_checksum ||
2860 			    rbp->rb_checksum_fiddle != crbp->rb_checksum_fiddle)
2861 				goto err;
2862 		}
2863 	}
2864 	return (0);
2865 err:
2866 	return (retval);
2867 }
2868 
2869 /*
2870  * Determine if the location information for two mddbs is the same.
2871  * The device slice and block offset should match.  If both have devids then
2872  * use that for the comparison, otherwise we compare the dev_ts.
2873  * Comparing with the devid allows us to handle the case where a mddb was
2874  * relocated to a dead mddbs dev_t.  The live mddb will have the dev_t of
2875  * the dead mddb but the devid comparison will catch this and not match.
2876  *
2877  * Return 1 if the location of the two mddbs match, 0 if not.
2878  */
2879 static int
2880 match_mddb(mddb_ri_t *rip, ddi_devid_t devid, char *minor, md_dev64_t dev,
2881 	daddr32_t blkno)
2882 {
2883 	if (rip->ri_flags & MDDB_F_EMASTER) {
2884 		/*
2885 		 * If this element is errored then we don't try to match on it.
2886 		 * If we try to match we could erroneously match on the dev_t
2887 		 * of a relocated disk.
2888 		 */
2889 		return (0);
2890 	}
2891 
2892 	if (rip->ri_devid && devid && minor) {
2893 		/*
2894 		 * If old devid exists, then this is a replicated diskset
2895 		 * and both old and new devids must be checked.
2896 		 */
2897 		if (rip->ri_old_devid) {
2898 			if (((ddi_devid_compare(rip->ri_devid, devid) != 0) &&
2899 			    (ddi_devid_compare(rip->ri_old_devid,
2900 			    devid) != 0)) ||
2901 			    (strcmp(rip->ri_minor_name, minor) != 0))
2902 				return (0);
2903 		} else {
2904 			if (ddi_devid_compare(rip->ri_devid, devid) != 0 ||
2905 			    strcmp(rip->ri_minor_name, minor) != 0)
2906 				return (0);
2907 		}
2908 	} else {
2909 		if (rip->ri_dev != dev)
2910 			return (0);
2911 	}
2912 
2913 	if (rip->ri_blkno != blkno)
2914 		return (0);
2915 
2916 	return (1);
2917 }
2918 
2919 static int
2920 ridev(
2921 	mddb_ri_t	**rip,
2922 	mddb_cfg_loc_t	*clp,
2923 	dev32_t		*dev_2b_fixed,
2924 	int		flag)
2925 {
2926 	mddb_ri_t	*r, *r1;
2927 	md_dev64_t	ldev, ndev;
2928 	major_t		majordev;
2929 	int		sz;
2930 
2931 	if (MD_UPGRADE) {
2932 		ldev = md_makedevice(md_targ_name_to_major(clp->l_driver),
2933 			clp->l_mnum);
2934 	} else {
2935 		if (ddi_name_to_major(clp->l_driver) == (major_t)-1)
2936 			return (EINVAL);
2937 
2938 		ldev = md_makedevice(ddi_name_to_major(clp->l_driver),
2939 			clp->l_mnum);
2940 	}
2941 
2942 	if (clp->l_devid != 0) {
2943 		/*
2944 		 * Get dev associated with device id and minor name.
2945 		 * Setup correct driver name if dev is now different.
2946 		 * Don't change driver name if during upgrade.
2947 		 */
2948 		ndev = ldev;
2949 		if (!mddb_devid_validate((ddi_devid_t)(uintptr_t)clp->l_devid,
2950 		    &ndev, clp->l_minor_name)) {
2951 			if ((ndev != ldev) && (!(MD_UPGRADE))) {
2952 				majordev = md_getmajor(ndev);
2953 				(void) strcpy(clp->l_driver,
2954 				    ddi_major_to_name(majordev));
2955 				clp->l_mnum = md_getminor(ndev);
2956 				clp->l_devid_flags |= MDDB_DEVID_VALID;
2957 				ldev = ndev;
2958 			}
2959 		} else {
2960 			/* Mark as invalid */
2961 			clp->l_devid_flags &= ~MDDB_DEVID_VALID;
2962 		}
2963 	}
2964 
2965 	clp->l_dev = md_cmpldev(ldev);
2966 	if (dev_2b_fixed)
2967 		*dev_2b_fixed = clp->l_dev;
2968 	r = *rip;
2969 
2970 	while (r) {
2971 		if (match_mddb(r, (ddi_devid_t)(uintptr_t)clp->l_devid,
2972 		    clp->l_minor_name, ldev, clp->l_blkno)) {
2973 			if ((clp->l_devid != 0) &&
2974 			    !(clp->l_devid_flags & MDDB_DEVID_VALID)) {
2975 				r->ri_flags |= MDDB_F_EMASTER;
2976 			} else {
2977 				r->ri_flags |= flag;
2978 			}
2979 			return (0);	/* already entered return success */
2980 		}
2981 		r = r->ri_next;
2982 	}
2983 
2984 	/*
2985 	 * This replica not represented in the current rip list,
2986 	 * so add it to the list.
2987 	 */
2988 	r = (mddb_ri_t *)kmem_zalloc(sizeof (**rip), KM_SLEEP);
2989 	r->ri_dev = ldev;
2990 	r->ri_blkno = clp->l_blkno;
2991 	(void) strncpy(r->ri_driver, clp->l_driver, MD_MAXDRVNM);
2992 	if (strlen(clp->l_driver) >= MD_MAXDRVNM) {
2993 		r->ri_driver[(MD_MAXDRVNM -1)] = '\0';
2994 	}
2995 	if (clp->l_devname != NULL) {
2996 		(void) strcpy(r->ri_devname, clp->l_devname);
2997 	}
2998 	r->ri_flags |= flag;
2999 	if (clp->l_devid != 0) {
3000 		sz = clp->l_devid_sz;
3001 		r->ri_devid = (ddi_devid_t)kmem_zalloc(sz, KM_SLEEP);
3002 		bcopy((void *)(uintptr_t)clp->l_devid, (char *)r->ri_devid, sz);
3003 
3004 		if (clp->l_old_devid != NULL) {
3005 			sz = clp->l_old_devid_sz;
3006 			r->ri_old_devid = (ddi_devid_t)kmem_zalloc(sz,
3007 			    KM_SLEEP);
3008 			bcopy((char *)(uintptr_t)clp->l_old_devid,
3009 			    (char *)r->ri_old_devid, sz);
3010 		} else {
3011 			r->ri_old_devid = 0;
3012 		}
3013 		if (strlen(clp->l_minor_name) < MDDB_MINOR_NAME_MAX)
3014 			(void) strcpy(r->ri_minor_name, clp->l_minor_name);
3015 
3016 		if (!(clp->l_devid_flags & MDDB_DEVID_VALID)) {
3017 			/*
3018 			 * Devid is present, but not valid.  This could
3019 			 * happen if device has been powered off or if
3020 			 * the device has been removed.  Mark the device in
3021 			 * error.  Don't allow any writes to this device
3022 			 * based on the dev_t since another device could
3023 			 * have been placed in its spot and be responding to
3024 			 * the dev_t accesses.
3025 			 */
3026 			r->ri_flags |= MDDB_F_EMASTER;
3027 		}
3028 	} else {
3029 		r->ri_devid = 0;
3030 		r->ri_old_devid = 0;
3031 	}
3032 
3033 	/*
3034 	 * If the rip list is empty then this entry
3035 	 * is the list.
3036 	 */
3037 	if (*rip == NULL) {
3038 		*rip = r;
3039 		return (0);
3040 	}
3041 
3042 	/*
3043 	 * Add this entry to the end of the rip list
3044 	 */
3045 	r1 = *rip;
3046 	while (r1->ri_next)
3047 		r1 = r1->ri_next;
3048 	r1->ri_next = r;
3049 	return (0);
3050 }
3051 
3052 /*
3053  * writecopy writes the incore data blocks out to all of the replicas.
3054  * This is called from writestart
3055  *	- when a diskset is started or
3056  *	- when an error has been enountered during the write to a mddb.
3057  * and from newdev when a new mddb is being added.
3058  *
3059  * flag can be 2 values:
3060  *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
3061  *		always used for traditional and local disksets.
3062  *		For MN diskset:
3063  *			All nodes can call writecopy, but only the
3064  *			master node actually writes data to the disk
3065  *			except for optimized resync records.
3066  *			An optimized resync record can only be written to
3067  *			by the record owner.
3068  *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
3069  *		master has been chosen, the new master may need to
3070  * 		write its incore mddb to disk (this is the case where the
3071  *		old master had executed a message but hadn't relayed it
3072  *		to this slave yet).  New master should not write the
3073  *		change log records since new master would be overwriting
3074  *		valuable data.  Only used during a reconfig cycle.
3075  */
3076 static int
3077 writecopy(
3078 	mddb_set_t	*s,
3079 	int		li,
3080 	int		flag
3081 )
3082 {
3083 	mddb_db_t	*dbp;
3084 	mddb_db32_t	*db32p;
3085 	mddb_de_ic_t	*dep;
3086 	mddb_rb32_t	*rbp;
3087 	uint_t		checksum;
3088 	int		err = 0;
3089 
3090 #if defined(_ILP32) && !defined(lint)
3091 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
3092 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
3093 #endif
3094 
3095 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
3096 		db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
3097 		create_db32rec(db32p, dbp);
3098 		crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
3099 		err = writeblks(s, (caddr_t)db32p, dbp->db_blknum, 1, li,
3100 			MDDB_WR_ONLY_MASTER);
3101 		kmem_free((caddr_t)db32p, MDDB_BSIZE);
3102 		if (err)
3103 			return (err);
3104 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
3105 			/*
3106 			 * In a multinode diskset, when a new master is
3107 			 * chosen the new master may need to write its
3108 			 * incore copy of the mddb to disk.  In this case,
3109 			 * don't want to overwrite the change log records
3110 			 * so new master sets flag to MDDB_WRITECOPY_SYNC.
3111 			 */
3112 			if (flag == MDDB_WRITECOPY_SYNC) {
3113 				if (dep->de_flags & MDDB_F_CHANGELOG)
3114 					continue;
3115 			}
3116 			/*
3117 			 * In a multinode diskset, don't write out optimized
3118 			 * resync resyncs since only the mirror owner node
3119 			 * will have the correct data.  If writecopy is
3120 			 * being called from writestart as a result of
3121 			 * an mddb failure, then writestart will handle
3122 			 * the optimized records when it calls fixoptrecords.
3123 			 */
3124 			if ((MD_MNSET_SETNO(s->s_setno)) &&
3125 			    (dep->de_flags & MDDB_F_OPT)) {
3126 				continue;
3127 			}
3128 
3129 			rbp = dep->de_rb;
3130 			checksum = rbp->rb_checksum_fiddle;
3131 			checksum ^= rbp->rb_checksum;
3132 			/* Generate the crc for this record */
3133 			rec_crcgen(s, dep, rbp);
3134 			checksum ^= rbp->rb_checksum;
3135 			rbp->rb_checksum_fiddle = checksum;
3136 			if (err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
3137 			    dep->de_blkcount, li, (mddb_bf_t **)0,
3138 			    MDDB_WR_ONLY_MASTER))
3139 				return (err);
3140 		}
3141 	}
3142 	return (0);
3143 }
3144 
3145 static int
3146 upd_med(
3147 	mddb_set_t	*s,
3148 	char		*tag
3149 )
3150 {
3151 	med_data_t	meddb;
3152 	int		medok;
3153 	mddb_lb_t	*lbp = s->s_lbp;
3154 	set_t		setno = s->s_setno;
3155 	int		li;
3156 	int		alc;
3157 	int		lc;
3158 
3159 
3160 	/* If no mediator hosts, nothing to do */
3161 	if (s->s_med.n_cnt == 0)
3162 		return (0);
3163 
3164 	/*
3165 	 * If this is a MN set and we are not the master, then don't
3166 	 * update mediator hosts or mark mediator as golden since
3167 	 * only master node should do that.
3168 	 */
3169 	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
3170 	    (md_set[setno].s_am_i_master == 0)) {
3171 		return (0);
3172 	}
3173 
3174 	bzero((char *)&meddb, sizeof (med_data_t));
3175 	meddb.med_dat_mag = MED_DATA_MAGIC;
3176 	meddb.med_dat_rev = MED_DATA_REV;
3177 	meddb.med_dat_fl = 0;
3178 	meddb.med_dat_sn = setno;
3179 	meddb.med_dat_cc = lbp->lb_commitcnt;
3180 	TIMEVAL32_TO_TIMEVAL(&meddb.med_dat_id, &lbp->lb_ident.createtime);
3181 	crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);
3182 
3183 	/* count accessible mediators */
3184 	medok = upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);
3185 
3186 	/* count accessible and existing replicas */
3187 	for (li = 0, alc = 0, lc = 0; li < lbp->lb_loccnt; li++) {
3188 		mddb_locator_t	*lp = &lbp->lb_locators[li];
3189 
3190 		if (lp->l_flags & MDDB_F_DELETED)
3191 			continue;
3192 
3193 		lc++;
3194 
3195 		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
3196 		    (lp->l_flags & MDDB_F_EMASTER) ||
3197 		    (lp->l_flags & MDDB_F_EWRITE))
3198 			continue;
3199 
3200 		alc++;
3201 	}
3202 
3203 	/*
3204 	 * Mediator update quorum is >= 50%: check for less than
3205 	 * "mediator update" quorum.
3206 	 */
3207 	if ((medok * 2) < s->s_med.n_cnt) {
3208 		/* panic if <= 50% of all replicas are accessible */
3209 		if ((lc > 0) && ((alc * 2) <= lc)) {
3210 			cmn_err(CE_PANIC,
3211 			    "md: Update of 50%% of the mediator hosts failed");
3212 			/* NOTREACHED */
3213 		}
3214 
3215 		cmn_err(CE_WARN,
3216 		    "md: Update of 50%% of the mediator hosts failed");
3217 	}
3218 
3219 	/*
3220 	 * If we have mediator update quorum and exactly 50% of the replicas
3221 	 * are accessible then mark the mediator as golden.
3222 	 */
3223 	if (((medok * 2) >= (s->s_med.n_cnt + 1)) && (lc > 0) &&
3224 	    ((alc * 2) == lc)) {
3225 		meddb.med_dat_fl = MED_DFL_GOLDEN;
3226 		crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);
3227 		(void) upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);
3228 	}
3229 
3230 	return (0);
3231 }
3232 
3233 static int
3234 push_lb(mddb_set_t *s)
3235 {
3236 	mddb_lb_t	*lbp = s->s_lbp;
3237 
3238 	/* push the change to all the replicas */
3239 	uniqtime32(&lbp->lb_timestamp);
3240 	if (MD_MNSET_SETNO(s->s_setno)) {
3241 		lbp->lb_revision = MDDB_REV_MNLB;
3242 	} else {
3243 		lbp->lb_revision = MDDB_REV_LB;
3244 	}
3245 	/*
3246 	 * The updates to the mediator hosts are done
3247 	 * by the callers of this function.
3248 	 */
3249 	return (writelocall(s));
3250 }
3251 
3252 /* Should not call for MN diskset since data tags are not supported */
3253 static int
3254 dtl_cmp(const mddb_dtag_t *odtp, const mddb_dtag_t *ndtp)
3255 {
3256 	int 		diff = 0;
3257 
3258 	diff = (int)(odtp->dt_setno - ndtp->dt_setno);
3259 	if (diff)
3260 		return (diff);
3261 
3262 	diff = strncmp(odtp->dt_sn, ndtp->dt_sn, MDDB_SN_LEN);
3263 	if (diff)
3264 		return (diff);
3265 
3266 	diff = strncmp(odtp->dt_hn, ndtp->dt_hn, MD_MAX_NODENAME_PLUS_1);
3267 	if (diff)
3268 		return (diff);
3269 
3270 	/*CSTYLED*/
3271 	return (timercmp(&odtp->dt_tv, &ndtp->dt_tv, !=));
3272 }
3273 
3274 /* Should not call for MN diskset since data tags are not supported */
3275 static int
3276 dtl_addl(mddb_set_t *s, const mddb_dtag_t *ndtp)
3277 {
3278 	int		nextid = 0;
3279 	mddb_dtag_lst_t **dtlpp = &s->s_dtlp;
3280 
3281 	/* Run to the end of the list */
3282 	for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) {
3283 		if (dtl_cmp(&(*dtlpp)->dtl_dt, ndtp) == 0)
3284 			return (0);
3285 		nextid++;
3286 	}
3287 
3288 	/* Add the new member */
3289 	*dtlpp = kmem_zalloc(sizeof (**dtlpp), KM_SLEEP);
3290 
3291 	/* Update the dtag portion of the list */
3292 	bcopy((caddr_t)ndtp, (caddr_t)&((*dtlpp)->dtl_dt),
3293 	    sizeof (mddb_dtag_t));
3294 
3295 	/* Fix up the id value */
3296 	(*dtlpp)->dtl_dt.dt_id = ++nextid;
3297 
3298 	return (0);
3299 }
3300 
3301 /*
3302  * Even though data tags are not supported in MN disksets, dt_cntl may
3303  * be called for a MN diskset since this routine is called even before
3304  * it is known the kind of diskset being read in from disk.
3305  * For a MNdiskset, s_dtlp is 0 so a count of 0 is returned.
3306  */
3307 static int
3308 dtl_cntl(mddb_set_t *s)
3309 {
3310 	mddb_dtag_lst_t	*dtlp = s->s_dtlp;
3311 	int		ndt = 0;
3312 
3313 	while (dtlp != NULL) {
3314 		ndt++;
3315 		dtlp = dtlp->dtl_nx;
3316 	}
3317 
3318 	return (ndt);
3319 }
3320 
3321 /*
3322  * Even though data tags are not supported in MN disksets, dt_cntl may
3323  * be called for a MN diskset since this routine is called even before
3324  * it is known the kind of diskset being read in from disk.
3325  * For a MNdiskset, s_dtlp is 0 so a 0 is returned.
3326  */
3327 static mddb_dtag_t *
3328 dtl_findl(mddb_set_t *s, int id)
3329 {
3330 	mddb_dtag_lst_t	*dtlp = s->s_dtlp;
3331 
3332 	while (dtlp != NULL) {
3333 		if (dtlp->dtl_dt.dt_id == id)
3334 			return (&dtlp->dtl_dt);
3335 		dtlp = dtlp->dtl_nx;
3336 	}
3337 	return ((mddb_dtag_t *)NULL);
3338 }
3339 
3340 /* Should not call for MN diskset since data tags are not supported */
3341 static void
3342 dtl_freel(mddb_dtag_lst_t **dtlpp)
3343 {
3344 	mddb_dtag_lst_t	*dtlp;
3345 	mddb_dtag_lst_t	*tdtlp;
3346 
3347 
3348 	for (tdtlp = *dtlpp; tdtlp != NULL; tdtlp = dtlp) {
3349 		dtlp = tdtlp->dtl_nx;
3350 		kmem_free(tdtlp, sizeof (mddb_dtag_lst_t));
3351 	}
3352 	*dtlpp = (mddb_dtag_lst_t *)NULL;
3353 }
3354 
3355 /*
3356  * Even though data tags are not supported in MN disksets, dt_setup will
3357  * be called for a MN diskset since this routine is called even before
3358  * it is known the kind of diskset being read in from disk.
3359  * Once this set is known as a MN diskset, the dtp area will be freed.
3360  */
3361 static void
3362 dt_setup(mddb_set_t *s, const mddb_dtag_t *dtagp)
3363 {
3364 	mddb_dt_t	*dtp;
3365 	set_t		setno = s->s_setno;
3366 
3367 
3368 	if (md_set[setno].s_dtp == (mddb_dt_t *)NULL)
3369 		md_set[setno].s_dtp = kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);
3370 	else if (dtagp == (mddb_dtag_t *)NULL)
3371 		bzero((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
3372 
3373 	/* shorthand */
3374 	dtp = (mddb_dt_t *)md_set[setno].s_dtp;
3375 
3376 	dtp->dt_mag = MDDB_MAGIC_DT;
3377 	dtp->dt_rev = MDDB_REV_DT;
3378 
3379 	if (dtagp != NULL)
3380 		dtp->dt_dtag = *dtagp;		/* structure assignment */
3381 
3382 	/* Initialize the setno */
3383 	dtp->dt_dtag.dt_setno = setno;
3384 
3385 	/* Clear the id and flags, this is only used in user land */
3386 	dtp->dt_dtag.dt_id = 0;
3387 
3388 	/* Checksum it */
3389 	crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);
3390 }
3391 
3392 /* Should not call for MN diskset since data tags are not supported */
3393 static int
3394 set_dtag(mddb_set_t *s, md_error_t *ep)
3395 {
3396 	mddb_lb_t	*lbp = s->s_lbp;
3397 	mddb_dtag_t	tag;
3398 
3399 	if (lbp->lb_dtblkcnt == 0) {
3400 		/* Data tags not used in a MN set - so no failure returned */
3401 		if (lbp->lb_flags & MDDB_MNSET)
3402 			return (0);
3403 
3404 		cmn_err(CE_WARN,
3405 		    "No tag record allocated, unable to tag data");
3406 		(void) mdmddberror(ep, MDE_DB_NOTAGREC, NODEV32, s->s_setno);
3407 		return (1);
3408 	}
3409 
3410 	/* Clear the stack variable */
3411 	bzero((caddr_t)&tag, sizeof (mddb_dtag_t));
3412 
3413 	/* Get the HW serial number for this host */
3414 	(void) strncpy(tag.dt_sn, hw_serial, MDDB_SN_LEN);
3415 	tag.dt_sn[MDDB_SN_LEN - 1] = '\0';
3416 
3417 	/* Get the nodename that this host goes by */
3418 	(void) strncpy(tag.dt_hn, utsname.nodename, MD_MAX_NODENAME);
3419 	tag.dt_hn[MD_MAX_NODENAME] = '\0';
3420 
3421 	/* Get a time stamp for NOW */
3422 	uniqtime32(&tag.dt_tv);
3423 
3424 	/* Setup the data tag record */
3425 	dt_setup(s, &tag);
3426 
3427 	/* Free any list of tags if they exist */
3428 	dtl_freel(&s->s_dtlp);
3429 
3430 	/* Put the new tag onto the tag list */
3431 	(void) dtl_addl(s, &tag);
3432 
3433 	return (0);
3434 }
3435 
3436 /*
3437  * If called during upgrade, this routine expects a non-translated
3438  * (aka target) dev.
3439  * Should not call for MN diskset since data tags are not supported.
3440  */
3441 static int
3442 dt_read(mddb_set_t *s, mddb_lb_t *lbp, mddb_ri_t *rip)
3443 {
3444 	int		err = 0;
3445 	md_dev64_t	dev;
3446 	caddr_t		tbuf;
3447 	daddr_t		physblk;
3448 	mddb_block_t	blk;
3449 	mddb_dt_t	*dtp;
3450 	mddb_dtag_t	*dtagp;
3451 	set_t		setno = s->s_setno;
3452 
3453 	/* If have not allocated a data tag record, there is nothing to do */
3454 	if (lbp->lb_dtblkcnt == 0)
3455 		return (1);
3456 
3457 	dtp = rip->ri_dtp = (mddb_dt_t *)kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);
3458 
3459 	if (dtp == (mddb_dt_t *)NULL)
3460 		return (1);
3461 
3462 	/* shorthand */
3463 	dev = md_xlate_targ_2_mini(rip->ri_dev);
3464 	if (dev == NODEV64) {
3465 		return (1);
3466 	}
3467 
3468 	tbuf = (caddr_t)rip->ri_dtp;
3469 
3470 	for (blk = 0; blk < lbp->lb_dtblkcnt; blk++) {
3471 		physblk = getphysblk((blk + lbp->lb_dtfirstblk), rip->ri_mbip);
3472 		err = getblks(s, tbuf, dev, physblk, btodb(MDDB_BSIZE), 0);
3473 		/* error reading the tag */
3474 		if (err) {
3475 			err = 1;
3476 			goto out;
3477 		}
3478 		tbuf += MDDB_BSIZE;
3479 	}
3480 
3481 	/* magic is valid? */
3482 	if (dtp->dt_mag != MDDB_MAGIC_DT) {
3483 		err = 1;
3484 		goto out;
3485 	}
3486 
3487 	/* revision is valid? */
3488 	if (revchk(MDDB_REV_DT, dtp->dt_rev)) {
3489 		err = 1;
3490 		goto out;
3491 	}
3492 
3493 	/* crc is valid? */
3494 	if (crcchk(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL)) {
3495 		err = 1;
3496 		goto out;
3497 	}
3498 
3499 	/* shorthand */
3500 	dtagp = &dtp->dt_dtag;
3501 
3502 	/* set number match? */
3503 	if (dtagp->dt_setno != setno) {
3504 		err = 1;
3505 		goto out;
3506 	}
3507 
3508 	/* tag is not empty? */
3509 	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
3510 	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
3511 	    dtagp->dt_id == 0) {
3512 		err = 2;
3513 		goto out;
3514 	}
3515 
3516 	/* Mark the locator as having tagged data */
3517 	rip->ri_flags |= MDDB_F_TAGDATA;
3518 
3519 out:
3520 	if (err) {
3521 		if (err == 1) {
3522 			md_set_setstatus(setno, MD_SET_BADTAG);
3523 			rip->ri_flags |= MDDB_F_BADTAG;
3524 		}
3525 		if (dtp != NULL) {
3526 			kmem_free(dtp, MDDB_DT_BYTES);
3527 			rip->ri_dtp = (mddb_dt_t *)NULL;
3528 		}
3529 	}
3530 
3531 	return (err);
3532 }
3533 
3534 /* Should not call for MN diskset since data tags are not supported */
3535 static int
3536 dt_write(mddb_set_t *s)
3537 {
3538 	int		li;
3539 	int		err = 0;
3540 	int		werr;
3541 	int		empty_tag = 0;
3542 	mddb_dtag_t	*dtagp;
3543 	mddb_dt_t	*dtp;
3544 	mddb_lb_t	*lbp = s->s_lbp;
3545 	set_t		setno = s->s_setno;
3546 	uint_t		set_status = md_get_setstatus(setno);
3547 
3548 
3549 	ASSERT(md_set[setno].s_dtp != NULL);
3550 
3551 	/* Nowhere to write to */
3552 	if (lbp->lb_dtblkcnt == 0)
3553 		return (err);
3554 
3555 	if (set_status & MD_SET_BADTAG)
3556 		return (err);
3557 
3558 	/* shorthand */
3559 	dtp = (mddb_dt_t *)md_set[setno].s_dtp;
3560 	dtagp = &dtp->dt_dtag;
3561 
3562 	/* See if the tag is empty. */
3563 	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
3564 	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
3565 	    dtagp->dt_id == 0)
3566 		empty_tag = 1;
3567 
3568 	/* Write the tag to the locators and reset appropriate flags. */
3569 	for (li = 0; li < lbp->lb_loccnt; li++) {
3570 		mddb_locator_t	*lp = &lbp->lb_locators[li];
3571 
3572 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3573 		    (lp->l_flags & MDDB_F_DELETED) ||
3574 		    (lp->l_flags & MDDB_F_EWRITE))
3575 			continue;
3576 
3577 		werr = writeblks(s, (caddr_t)dtp, lbp->lb_dtfirstblk,
3578 		    MDDB_DT_BLOCKS, li, MDDB_WR_ONLY_MASTER);
3579 
3580 		if (werr) {
3581 			err |= werr;
3582 			continue;
3583 		}
3584 
3585 		if (empty_tag)
3586 			lp->l_flags &= ~(MDDB_F_BADTAG | MDDB_F_TAGDATA);
3587 		else {
3588 			lp->l_flags |= MDDB_F_TAGDATA;
3589 			lp->l_flags &= ~MDDB_F_BADTAG;
3590 		}
3591 	}
3592 
3593 	if (err)
3594 		return (err);
3595 
3596 
3597 	/* If the tags were written, check to see if any tags remain. */
3598 	for (li = 0; li < lbp->lb_loccnt; li++) {
3599 		mddb_locator_t	*lp = &lbp->lb_locators[li];
3600 
3601 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3602 		    (lp->l_flags & MDDB_F_DELETED) ||
3603 		    (lp->l_flags & MDDB_F_EWRITE))
3604 			continue;
3605 
3606 		if (lp->l_flags & MDDB_F_TAGDATA)
3607 			break;
3608 	}
3609 
3610 	/* If there are no tags, then clear CLRTAG and TAGDATA */
3611 	if (li == lbp->lb_loccnt) {
3612 		md_clr_setstatus(setno, MD_SET_CLRTAG);
3613 		md_clr_setstatus(setno, MD_SET_TAGDATA);
3614 	}
3615 
3616 	return (err);
3617 }
3618 
3619 /* Should not call for MN diskset since data tags are not supported */
3620 static int
3621 dt_alloc_if_needed(mddb_set_t *s)
3622 {
3623 	int		i;
3624 	int		li;
3625 	int		moveit = 0;
3626 	mddb_lb_t	*lbp = s->s_lbp;
3627 	mddb_block_t	blkcnt = lbp->lb_dtblkcnt;
3628 	set_t		setno = s->s_setno;
3629 	uint_t		set_status = md_get_setstatus(setno);
3630 
3631 	/*
3632 	 * If the data tag record is allocated (blkcnt != 0) and a bad tag was
3633 	 * not detected, there is nothing to do.
3634 	 */
3635 	if (blkcnt != 0 && ! (set_status & MD_SET_BADTAG))
3636 		return (0);
3637 
3638 	/* Bitmap not setup, checks can't be done */
3639 	if (s->s_totalblkcnt == 0)
3640 		return (0);
3641 
3642 	/* While reading the tag(s) an invalid tag data record was seen */
3643 	if (set_status & MD_SET_BADTAG)
3644 		/* See if the invalid tag needs to be moved */
3645 		for (i = 0; i < MDDB_DT_BLOCKS; i++)
3646 			if (blkcheck(s, (i + lbp->lb_dtfirstblk))) {
3647 				moveit = 1;
3648 				break;
3649 			}
3650 
3651 	/* Need to move or allocate the tag data record */
3652 	if (moveit || blkcnt == 0) {
3653 		lbp->lb_dtfirstblk = getfreeblks(s, MDDB_DT_BLOCKS);
3654 		if (lbp->lb_dtfirstblk == 0) {
3655 			cmn_err(CE_WARN,
3656 			    "Unable to allocate data tag record");
3657 			return (0);
3658 		}
3659 		lbp->lb_dtblkcnt = MDDB_DT_BLOCKS;
3660 
3661 		/* Mark the locators so that they get written to disk. */
3662 		for (li = 0; li < lbp->lb_loccnt; li++) {
3663 			mddb_locator_t	*lp = &lbp->lb_locators[li];
3664 
3665 			if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3666 			    (lp->l_flags & MDDB_F_DELETED) ||
3667 			    (lp->l_flags & MDDB_F_EWRITE))
3668 				continue;
3669 
3670 			lp->l_flags |= MDDB_F_BADTAG;
3671 		}
3672 		return (1);
3673 	}
3674 
3675 	/*
3676 	 * Make sure the blocks are owned, since the calculation in
3677 	 * computefreeblks() is bypassed when MD_SET_BADTAG is set.
3678 	 */
3679 	for (i = 0; i < MDDB_DT_BLOCKS; i++)
3680 		blkbusy(s, (i + lbp->lb_dtfirstblk));
3681 
3682 	return (1);
3683 }
3684 
3685 /*
3686  * Writestart writes the incore mddb out to all of the replicas.
3687  * This is called when a diskset is started and when an error has
3688  * been enountered during the write to a mddb.
3689  *
3690  * flag can be 2 values:
3691  *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
3692  *		always used for traditional and local disksets.
3693  *		This is the normal path for MN disksets since the slave
3694  *		nodes aren't actually allowed to write to disk.
3695  *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
3696  *		master has been chosen, the new master may need to
3697  * 		write its incore mddb to disk (this is the case where the
3698  *		old master had executed a message but hadn't relayed it
3699  *		to this slave yet).  New master should not write the
3700  *		change log records since new master would be overwriting
3701  *		valuable data.  Only used during a reconfig cycle.
3702  */
3703 static int
3704 writestart(
3705 	mddb_set_t	*s,
3706 	int		flag
3707 )
3708 {
3709 	int		li;
3710 	mddb_locator_t	*lp;
3711 	mddb_lb_t	*lbp;
3712 	mddb_ln_t	*lnp;
3713 	int		err = 0;
3714 	uint_t		set_status;
3715 
3716 	lbp = s->s_lbp;
3717 
3718 	for (li = 0; li < lbp->lb_loccnt; li++) {
3719 		lp = &lbp->lb_locators[li];
3720 		if (! (lp->l_flags & MDDB_F_ACTIVE))
3721 			continue;
3722 		if (! (lp->l_flags & MDDB_F_SUSPECT))
3723 			continue;
3724 		if (writecopy(s, li, flag))
3725 			return (1);
3726 		lp->l_flags |= MDDB_F_UP2DATE;
3727 	}
3728 
3729 	for (li = 0; li < lbp->lb_loccnt; li++) {
3730 		lp = &lbp->lb_locators[li];
3731 		if (! (lp->l_flags & MDDB_F_ACTIVE))
3732 			continue;
3733 		if ((lp->l_flags & MDDB_F_UP2DATE))
3734 			continue;
3735 		if (checkcopy(s, li))
3736 			if (err = writecopy(s, li, flag))
3737 				return (1);
3738 		lp->l_flags |= MDDB_F_UP2DATE;
3739 	}
3740 
3741 	/*
3742 	 * Call fixoptrecord even during a reconfig cycle since a replica
3743 	 * failure may force the master to re-assign the optimized
3744 	 * resync record to another replica.
3745 	 */
3746 	if (fixoptrecords(s))
3747 		return (1);
3748 
3749 	set_status = md_get_setstatus(s->s_setno);
3750 
3751 	/* See if any (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT) */
3752 	for (li = 0; li < lbp->lb_loccnt; li++) {
3753 		lp = &lbp->lb_locators[li];
3754 
3755 		if (lp->l_flags & MDDB_F_DELETED)
3756 			continue;
3757 
3758 		if (((lp->l_flags & MDDB_F_ACTIVE) != 0 &&
3759 		    (lp->l_flags & MDDB_F_OLDACT) == 0) ||
3760 		    ((lp->l_flags & MDDB_F_ACTIVE) == 0 &&
3761 		    (lp->l_flags & MDDB_F_OLDACT) != 0))
3762 			break;
3763 
3764 		if ((set_status & MD_SET_TAGDATA) ||
3765 		    (set_status & MD_SET_CLRTAG))
3766 			if ((lp->l_flags & MDDB_F_TAGDATA) ||
3767 			    (lp->l_flags & MDDB_F_BADTAG))
3768 				break;
3769 	}
3770 
3771 	/*
3772 	 * If we found (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT)
3773 	 * the lbp identifier and the set identifier doesn't match.
3774 	 */
3775 	if (li != lbp->lb_loccnt || cmpidentifier(s, &lbp->lb_ident)) {
3776 
3777 		/* Only call for traditional and local sets */
3778 		if (!(lbp->lb_flags & MDDB_MNSET))
3779 			(void) dt_write(s);
3780 
3781 		setidentifier(s, &lbp->lb_ident);
3782 
3783 		if (err = push_lb(s)) {
3784 			(void) upd_med(s, "writestart(0)");
3785 			return (err);
3786 		}
3787 
3788 		(void) upd_med(s, "writestart(0)");
3789 
3790 		if (err = push_lb(s)) {
3791 			(void) upd_med(s, "writestart(1)");
3792 			return (err);
3793 		}
3794 
3795 		(void) upd_med(s, "writestart(1)");
3796 
3797 		lnp = s->s_lnp;
3798 		uniqtime32(&lnp->ln_timestamp);
3799 		if (lbp->lb_flags & MDDB_MNSET)
3800 			lnp->ln_revision = MDDB_REV_MNLN;
3801 		else
3802 			lnp->ln_revision = MDDB_REV_LN;
3803 		crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
3804 		err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
3805 			lbp->lb_lnblkcnt, 0);
3806 		/*
3807 		 * If a MN diskset and this is the master, set the PARSE_LOCNM
3808 		 * flag in the mddb_set structure to show that the locator
3809 		 * names have changed.
3810 		 * Don't set parseflags as a result of a new master sync
3811 		 * during reconfig cycle since slaves nodes are already
3812 		 * in-sync with the new master.
3813 		 */
3814 
3815 		if ((lbp->lb_flags & MDDB_MNSET) &&
3816 		    (md_set[s->s_setno].s_am_i_master) &&
3817 		    (flag != MDDB_WRITECOPY_SYNC)) {
3818 			s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
3819 		}
3820 
3821 		if (err)
3822 			return (err);
3823 	}
3824 
3825 	for (li = 0; li < lbp->lb_loccnt; li++) {
3826 		lp = &lbp->lb_locators[li];
3827 		if (lp->l_flags & MDDB_F_DELETED)
3828 			continue;
3829 		if (lp->l_flags & MDDB_F_ACTIVE) {
3830 			lp->l_flags |= MDDB_F_OLDACT;
3831 		} else {
3832 			lp->l_flags &= ~MDDB_F_OLDACT;
3833 		}
3834 	}
3835 
3836 	md_clr_setstatus(s->s_setno, MD_SET_STALE);
3837 
3838 	return (0);
3839 }
3840 
3841 /*
3842  * selectreplicas selects the working replicas and may write the incore
3843  * version of the mddb out to the replicas ondisk.
3844  *
3845  * flag can be 3 values:
3846  *	MDDB_RETRYSCAN - quick scan to see if there is an error.
3847  *			If no new error, returns without writing mddb
3848  *			to disks.  If a new error is seen, writes out
3849  *			mddb to disks.
3850  *	MDDB_SCANALL  - lengthy scan to check out mddbs and always writes
3851  *			out mddb to the replica ondisk.  Calls writecopy
3852  *			with MDDB_WRITECOPY_ALL flag which writes out
3853  *			all records to the replicas ondisk.
3854  *	MDDB_SCANALLSYNC - called during reconfig cycle to sync up incore
3855  *			and ondisk mddbs by writing incore values to disk.
3856  *			Calls writecopy with MDDB_WRITECOPY_SYNC flag so
3857  *			that change log records are not written out.
3858  *			Only used by MN disksets.
3859  *
3860  * Returns:
3861  *	0 - Successful
3862  *	1 - Unable to write incore mddb data to disk since < 50% replicas.
3863  */
3864 int
3865 selectreplicas(
3866 	mddb_set_t	*s,
3867 	int		flag
3868 )
3869 {
3870 	int		li;
3871 	int		alc;
3872 	int		lc;
3873 	mddb_locator_t	*lp;
3874 	mddb_lb_t	*lbp = s->s_lbp;
3875 	set_t		setno = s->s_setno;
3876 	int		wc_flag;
3877 
3878 	/*
3879 	 * can never transition from stale to not stale
3880 	 */
3881 	if (md_get_setstatus(setno) & MD_SET_STALE) {
3882 		for (li = 0; li < lbp->lb_loccnt; li++) {
3883 			lp = &lbp->lb_locators[li];
3884 			if (lp->l_flags & MDDB_F_DELETED)
3885 				continue;
3886 			if (! (lp->l_flags & MDDB_F_EMASTER)) {
3887 				lp->l_flags |= MDDB_F_ACTIVE;
3888 			} else {
3889 				lp->l_flags &= ~MDDB_F_ACTIVE;
3890 			}
3891 		}
3892 		return (1);
3893 	}
3894 
3895 	if ((flag == MDDB_SCANALL) || (flag == MDDB_SCANALLSYNC)) {
3896 		for (li = 0; li < lbp->lb_loccnt; li++) {
3897 			lp = &lbp->lb_locators[li];
3898 			if (lp->l_flags & MDDB_F_DELETED)
3899 				continue;
3900 			if (lp->l_flags & MDDB_F_ACTIVE) {
3901 				lp->l_flags |= MDDB_F_OLDACT;
3902 				lp->l_flags &= ~MDDB_F_SUSPECT;
3903 			} else {
3904 				lp->l_flags |= MDDB_F_SUSPECT;
3905 				lp->l_flags &= ~MDDB_F_OLDACT;
3906 			}
3907 
3908 			if (! (lp->l_flags & MDDB_F_EMASTER)) {
3909 				lp->l_flags |= MDDB_F_ACTIVE;
3910 				lp->l_flags &= ~MDDB_F_EWRITE;
3911 				lp->l_flags &= ~MDDB_F_TOOSMALL;
3912 			} else {
3913 				lp->l_flags &= ~MDDB_F_ACTIVE;
3914 			}
3915 		}
3916 		computefreeblks(s); /* set up free block bits */
3917 	} else {
3918 		for (li = 0; li < lbp->lb_loccnt; li++) {
3919 			lp = &lbp->lb_locators[li];
3920 			if (! (lp->l_flags & MDDB_F_ACTIVE))
3921 				continue;
3922 			if (lp->l_flags & MDDB_F_EWRITE)
3923 				break;
3924 		}
3925 
3926 		/*
3927 		 * if there are no errors this is error has already
3928 		 * been processed return current state
3929 		 */
3930 		if (li == lbp->lb_loccnt)
3931 			return (md_get_setstatus(setno) & MD_SET_TOOFEW);
3932 
3933 		lp->l_flags &= ~MDDB_F_ACTIVE;
3934 		do {
3935 			lp = &lbp->lb_locators[li];
3936 			lp->l_flags &= ~MDDB_F_UP2DATE;
3937 		} while (++li < lbp->lb_loccnt);
3938 	}
3939 
3940 	alc = 0;
3941 	lc = 0;
3942 	for (li = 0; li < lbp->lb_loccnt; li++) {
3943 		lp = &lbp->lb_locators[li];
3944 		if (lp->l_flags & MDDB_F_DELETED)
3945 			continue;
3946 		lc++;
3947 		if (! (lp->l_flags & MDDB_F_ACTIVE))
3948 			continue;
3949 		alc++;
3950 	}
3951 
3952 	if (alc < ((lc + 1) / 2)) {
3953 		md_set_setstatus(setno, MD_SET_TOOFEW);
3954 		return (1);
3955 	}
3956 
3957 	/* Set wc_flag based on flag passed in. */
3958 	if (flag == MDDB_SCANALLSYNC)
3959 		wc_flag = MDDB_WRITECOPY_SYNC;
3960 	else
3961 		wc_flag = MDDB_WRITECOPY_ALL;
3962 
3963 	do {
3964 		if (! writestart(s, wc_flag)) {
3965 			md_clr_setstatus(setno, MD_SET_TOOFEW);
3966 			return (0);
3967 		}
3968 		alc  = 0;
3969 		for (li = 0; li < lbp->lb_loccnt; li++) {
3970 			lp = &lbp->lb_locators[li];
3971 			if ((lp->l_flags & MDDB_F_DELETED) ||
3972 			    (lp->l_flags & MDDB_F_EMASTER))
3973 				continue;
3974 
3975 			if (lp->l_flags & MDDB_F_EWRITE) {
3976 				lp->l_flags &= ~MDDB_F_ACTIVE;
3977 				lp->l_flags &= ~MDDB_F_UP2DATE;
3978 				continue;
3979 			}
3980 			alc++;
3981 		}
3982 	} while (alc >= ((lc + 1) / 2));
3983 	md_set_setstatus(setno, MD_SET_TOOFEW);
3984 	return (1);
3985 }
3986 
3987 static int
3988 checkstate(
3989 	mddb_set_t	*s,
3990 	int		probe
3991 )
3992 {
3993 	int		error;
3994 	uint_t		set_status = md_get_setstatus(s->s_setno);
3995 
3996 	ASSERT(s != NULL);
3997 
3998 	if (! (set_status & MD_SET_STALE) && ! (set_status & MD_SET_TOOFEW))
3999 		return (0);
4000 
4001 	if (probe == MDDB_NOPROBE)
4002 		return (1);
4003 
4004 	single_thread_start(s);
4005 	error = selectreplicas(s, MDDB_SCANALL);
4006 	single_thread_end(s);
4007 
4008 	if (error == 0 && s->s_zombie != 0) {
4009 		mutex_exit(SETMUTEX(s->s_setno));
4010 		error = mddb_deleterec(s->s_zombie);
4011 		mutex_enter(SETMUTEX(s->s_setno));
4012 		if (error == 0)
4013 			s->s_zombie = 0;
4014 	}
4015 	return (error);
4016 }
4017 
4018 static int
4019 writeretry(
4020 	mddb_set_t	*s
4021 )
4022 {
4023 	if (selectreplicas(s, MDDB_RETRYSCAN))
4024 		if (selectreplicas(s, MDDB_SCANALL))
4025 			return (1);
4026 	return (0);
4027 }
4028 
4029 static void
4030 free_mbipp(mddb_mb_ic_t **mbipp)
4031 {
4032 	mddb_mb_ic_t	*mbip1, *mbip2;
4033 
4034 	for (mbip1 = *mbipp; mbip1 != NULL; mbip1 = mbip2) {
4035 		mbip2 = mbip1->mbi_next;
4036 		kmem_free((caddr_t)mbip1, MDDB_IC_BSIZE);
4037 	}
4038 	*mbipp = (mddb_mb_ic_t *)NULL;
4039 }
4040 
4041 static mddb_ri_t *
4042 save_rip(mddb_set_t *s)
4043 {
4044 	mddb_ri_t	*trip = s->s_rip;
4045 	mddb_ri_t	*nrip = NULL;
4046 	mddb_ri_t	**nripp = &nrip;
4047 	mddb_ri_t	*rip;
4048 
4049 	while (trip) {
4050 		/* Run to the end of the list */
4051 		for (/* void */; (*nripp != NULL); nripp = &(*nripp)->ri_next)
4052 			/* void */;
4053 
4054 		/* Add the new member */
4055 		*nripp = kmem_zalloc(sizeof (**nripp), KM_SLEEP);
4056 
4057 		ASSERT(*nripp != NULL);
4058 
4059 		/* shorthand */
4060 		rip = *nripp;
4061 
4062 		*rip = *trip;			/* structure assignment */
4063 
4064 		/* Clear the stuff that is not needed for hints */
4065 		rip->ri_flags = 0;
4066 		rip->ri_commitcnt = 0;
4067 		rip->ri_transplant = 0;
4068 		rip->ri_mbip = (mddb_mb_ic_t *)NULL;
4069 		rip->ri_dtp = (mddb_dt_t *)NULL;
4070 		rip->ri_lbp = (mddb_lb_t *)NULL;
4071 		rip->ri_did_icp = (mddb_did_ic_t *)NULL;
4072 		rip->ri_devid = (ddi_devid_t)NULL;
4073 		rip->ri_old_devid = (ddi_devid_t)NULL;
4074 		rip->ri_next = (mddb_ri_t *)NULL;
4075 
4076 		trip = trip->ri_next;
4077 	}
4078 	return (nrip);
4079 }
4080 
4081 static void
4082 free_rip(mddb_ri_t **ripp)
4083 {
4084 	mddb_ri_t	*rip;
4085 	mddb_ri_t	*arip;
4086 
4087 	for (rip = *ripp; rip != (mddb_ri_t *)NULL; rip = arip) {
4088 		arip = rip->ri_next;
4089 		if (rip->ri_devid != (ddi_devid_t)NULL) {
4090 			ddi_devid_free(rip->ri_devid);
4091 			rip->ri_devid = (ddi_devid_t)NULL;
4092 		}
4093 		if (rip->ri_old_devid != (ddi_devid_t)NULL) {
4094 			ddi_devid_free(rip->ri_old_devid);
4095 			rip->ri_old_devid = (ddi_devid_t)NULL;
4096 		}
4097 		kmem_free((caddr_t)rip, sizeof (*rip));
4098 	}
4099 	*ripp = (mddb_ri_t *)NULL;
4100 }
4101 
4102 /*
4103  * this routine selects the correct replica to use
4104  * the rules are as follows
4105  *	1.	if all replica has same init time select highest commit count
4106  *	2.	if some but not all replicas are from another hostid discard
4107  *		them.
4108  *	3.	find which init time is present is most replicas
4109  *	4.	discard all replicas which do not match most init times
4110  *	5.	select replica with highest commit count
4111  */
4112 
4113 static mddb_lb_t *
4114 selectlocator(
4115 	mddb_set_t	*s
4116 )
4117 {
4118 	mddb_ri_t	*rip = s->s_rip;
4119 	mddb_ri_t	*r, *r1;
4120 	mddb_lb_t	*lbp;
4121 	struct timeval32 *tp = (struct timeval32 *)NULL;
4122 	int		different;
4123 	int		same;
4124 	int		count;
4125 	int		maxcount;
4126 	set_t		setno = s->s_setno;
4127 	size_t		sz;
4128 	int		mn_set = 0;
4129 
4130 	/* Clear the ri_transplant flag on all the rip entries. */
4131 	/* Set ri_commitcnt to locator's commitcnt - if available */
4132 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4133 		r->ri_transplant = 0;
4134 		if (r->ri_lbp != (mddb_lb_t *)NULL) {
4135 			r->ri_commitcnt = r->ri_lbp->lb_commitcnt;
4136 			/* If any locators have MN bit set, set flag */
4137 			if (r->ri_lbp->lb_flags & MDDB_MNSET)
4138 				mn_set = 1;
4139 		}
4140 	}
4141 
4142 	/*
4143 	 * A data tag is being used, so use it to limit the selection first.
4144 	 * Data tags not used in MN diskset.
4145 	 */
4146 	if ((mn_set == 0) && (md_get_setstatus(setno) & MD_SET_USETAG)) {
4147 		mddb_dt_t	*dtp = (mddb_dt_t *)md_set[setno].s_dtp;
4148 
4149 		/*
4150 		 * now toss any locators that have a different data tag
4151 		 */
4152 		for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4153 			if (r->ri_lbp == (mddb_lb_t *)NULL)
4154 				continue;
4155 
4156 			if (r->ri_dtp != (mddb_dt_t *)NULL) {
4157 				/* If same tag, keep it */
4158 				if (dtl_cmp(&dtp->dt_dtag,
4159 				    &r->ri_dtp->dt_dtag) == 0)
4160 					continue;
4161 			}
4162 
4163 			if (r->ri_dtp != (mddb_dt_t *)NULL) {
4164 				kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4165 				r->ri_dtp = (mddb_dt_t *)NULL;
4166 			}
4167 
4168 			mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4169 			if (!(md_get_setstatus(setno) &
4170 			    MD_SET_REPLICATED_IMPORT)) {
4171 				if (r->ri_old_devid != (ddi_devid_t)NULL) {
4172 					sz = ddi_devid_sizeof(r->ri_old_devid);
4173 					kmem_free((caddr_t)r->ri_old_devid, sz);
4174 					r->ri_old_devid = (ddi_devid_t)NULL;
4175 				}
4176 			}
4177 
4178 			kmem_free((caddr_t)r->ri_lbp,
4179 			    dbtob(r->ri_lbp->lb_blkcnt));
4180 			r->ri_lbp = (mddb_lb_t *)NULL;
4181 
4182 			r->ri_transplant = 1;
4183 		}
4184 
4185 		/* Tag used, clear the bit */
4186 		md_clr_setstatus(s->s_setno, MD_SET_USETAG);
4187 
4188 		if (md_get_setstatus(s->s_setno) & MD_SET_TAGDATA) {
4189 			/*
4190 			 * Get rid of the list of tags.
4191 			 */
4192 			dtl_freel(&s->s_dtlp);
4193 
4194 			/*
4195 			 * Re-create the list with the tag used.
4196 			 */
4197 			(void) dtl_addl(s, &dtp->dt_dtag);
4198 		}
4199 	}
4200 
4201 	/*
4202 	 * scan to see if all replicas have same time
4203 	 */
4204 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4205 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4206 			continue;
4207 		if (tp == NULL) {
4208 			tp = &r->ri_lbp->lb_inittime;
4209 			continue;
4210 		}
4211 		/* CSTYLED */
4212 		if (timercmp(tp, &r->ri_lbp->lb_inittime, !=))
4213 			break;
4214 	}
4215 
4216 	/*
4217 	 * if r == NULL then they were all them same. Choose highest
4218 	 * commit count
4219 	 */
4220 	if (r == (mddb_ri_t *)NULL)
4221 		goto out;
4222 
4223 	/*
4224 	 * If here, a bogus replica is present and at least 1 lb_inittime
4225 	 * did not match.
4226 	 */
4227 
4228 	/*
4229 	 * look and see if any but not all are from different id
4230 	 */
4231 
4232 	different = 0;
4233 	same = 0;
4234 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4235 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4236 			continue;
4237 		if (cmpidentifier(s, &r->ri_lbp->lb_ident))
4238 			different = 1;
4239 		else
4240 			same = 1;
4241 	}
4242 
4243 	/*
4244 	 * now go through and throw out different if there are some
4245 	 * that are the same
4246 	 */
4247 	if (different != 0 && same != 0) {
4248 		for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4249 			if (r->ri_lbp == (mddb_lb_t *)NULL)
4250 				continue;
4251 
4252 			if (!cmpidentifier(s, &r->ri_lbp->lb_ident))
4253 				continue;
4254 
4255 			if (r->ri_dtp != (mddb_dt_t *)NULL) {
4256 				kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4257 				r->ri_dtp = (mddb_dt_t *)NULL;
4258 			}
4259 
4260 			mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4261 			if (!(md_get_setstatus(setno) &
4262 			    MD_SET_REPLICATED_IMPORT)) {
4263 				if (r->ri_old_devid != (ddi_devid_t)NULL) {
4264 					sz = ddi_devid_sizeof(r->ri_old_devid);
4265 					kmem_free((caddr_t)r->ri_old_devid, sz);
4266 					r->ri_old_devid = (ddi_devid_t)NULL;
4267 				}
4268 			}
4269 
4270 			kmem_free((caddr_t)r->ri_lbp,
4271 			    dbtob(r->ri_lbp->lb_blkcnt));
4272 			r->ri_lbp = (mddb_lb_t *)NULL;
4273 
4274 			r->ri_transplant = 1;
4275 		}
4276 	}
4277 
4278 	/*
4279 	 * go through and pick highest. Use n square because it is
4280 	 * simple and 40 some is max possible
4281 	 */
4282 	maxcount = 0;
4283 	lbp = (mddb_lb_t *)NULL;
4284 	for (r1 = rip; r1 != (mddb_ri_t *)NULL; r1 = r1->ri_next) {
4285 		if (r1->ri_lbp == (mddb_lb_t *)NULL)
4286 			continue;
4287 		count = 0;
4288 		for (r = r1; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4289 			if (r->ri_lbp == (mddb_lb_t *)NULL)
4290 				continue;
4291 			if (timercmp(&r1->ri_lbp->lb_inittime, /* CSTYLED */
4292 			    &r->ri_lbp->lb_inittime, ==))
4293 				count++;
4294 		}
4295 		if (count > maxcount) {
4296 			maxcount = count;
4297 			lbp = r1->ri_lbp;
4298 		}
4299 	}
4300 
4301 	/*
4302 	 * now go though and toss any that are of a different time stamp
4303 	 */
4304 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4305 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4306 			continue;
4307 		if (timercmp(&lbp->lb_inittime, /* CSTYLED */
4308 		    &r->ri_lbp->lb_inittime, ==))
4309 			continue;
4310 
4311 		if (r->ri_dtp != (mddb_dt_t *)NULL) {
4312 			kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4313 			r->ri_dtp = (mddb_dt_t *)NULL;
4314 		}
4315 
4316 		mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4317 		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
4318 			if (r->ri_old_devid != (ddi_devid_t)NULL) {
4319 				sz = ddi_devid_sizeof(r->ri_old_devid);
4320 				kmem_free((caddr_t)r->ri_old_devid, sz);
4321 				r->ri_old_devid = (ddi_devid_t)NULL;
4322 			}
4323 		}
4324 
4325 		kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
4326 		r->ri_lbp = (mddb_lb_t *)NULL;
4327 
4328 		r->ri_transplant = 1;
4329 	}
4330 
4331 out:
4332 	/*
4333 	 * Find the locator with the highest commit count, and make it the
4334 	 * "chosen" one.
4335 	 */
4336 	lbp = (mddb_lb_t *)NULL;
4337 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4338 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4339 			continue;
4340 
4341 		if (lbp == NULL) {
4342 			lbp = r->ri_lbp;
4343 			continue;
4344 		}
4345 
4346 		if (r->ri_lbp->lb_commitcnt > lbp->lb_commitcnt)
4347 			lbp = r->ri_lbp;
4348 	}
4349 
4350 	/* Toss all locator blocks, except the "chosen" one. */
4351 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4352 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4353 			continue;
4354 
4355 		/* Get rid of all dtp's */
4356 		if (r->ri_dtp != (mddb_dt_t *)NULL) {
4357 			kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4358 			r->ri_dtp = (mddb_dt_t *)NULL;
4359 		}
4360 
4361 		if (r->ri_lbp == lbp)
4362 			continue;
4363 
4364 		/* Get rid of extra locator devid block info */
4365 		mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4366 		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
4367 			if (r->ri_old_devid != (ddi_devid_t)NULL) {
4368 				sz = ddi_devid_sizeof(r->ri_old_devid);
4369 				kmem_free((caddr_t)r->ri_old_devid, sz);
4370 				r->ri_old_devid = (ddi_devid_t)NULL;
4371 			}
4372 		}
4373 
4374 		/* Get rid of extra locators */
4375 		kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
4376 		r->ri_lbp = (mddb_lb_t *)NULL;
4377 	}
4378 	return (lbp);
4379 }
4380 
4381 static void
4382 locator2cfgloc(
4383 	mddb_lb_t		*lbp,
4384 	mddb_cfg_loc_t		*clp,
4385 	int			li,
4386 	side_t			sideno,
4387 	mddb_did_ic_t		*did_icp
4388 )
4389 {
4390 	mddb_drvnm_t		*dn;
4391 	mddb_locator_t		*lp = &lbp->lb_locators[li];
4392 	mddb_sidelocator_t	*slp;
4393 	mddb_mnsidelocator_t	*mnslp;
4394 	mddb_did_info_t		*did_info;
4395 	int 			i, sz, szalloc;
4396 	int			mn_set = 0;
4397 	mddb_mnlb_t		*mnlbp;
4398 
4399 	if (lbp->lb_flags & MDDB_MNSET) {
4400 		mn_set = 1;
4401 		mnlbp = (mddb_mnlb_t *)lbp;
4402 		for (i = 0; i < MD_MNMAXSIDES; i++) {
4403 			mnslp = &mnlbp->lb_mnsidelocators[i][li];
4404 			if (mnslp->mnl_sideno == sideno)
4405 				break;
4406 		}
4407 		if (i == MD_MNMAXSIDES)
4408 			return;
4409 	} else {
4410 		slp = &lbp->lb_sidelocators[sideno][li];
4411 	}
4412 
4413 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4414 	    did_info = &(did_icp->did_ic_blkp->blk_info[li]);
4415 	    if (did_info->info_flags & MDDB_DID_EXISTS) {
4416 		sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]);
4417 		if (clp->l_devid_flags & MDDB_DEVID_SPACE) {
4418 			/* copy device id from mddb to cfg_loc structure */
4419 			szalloc = clp->l_devid_sz;
4420 			if (sz <= szalloc) {
4421 				for (i = 0; i < sz; i++) {
4422 					((char *)(uintptr_t)clp->l_devid)[i] =
4423 					((char *)did_icp->did_ic_devid[li])[i];
4424 				}
4425 				clp->l_devid_flags |= MDDB_DEVID_VALID;
4426 				(void) strcpy(clp->l_minor_name,
4427 					did_info->info_minor_name);
4428 			} else {
4429 				clp->l_devid_flags |= MDDB_DEVID_NOSPACE;
4430 			}
4431 		} else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) {
4432 			clp->l_devid_flags = MDDB_DEVID_SZ;
4433 			clp->l_devid_sz = sz;
4434 		}
4435 	    }
4436 	}
4437 
4438 	/*
4439 	 * Even if a devid exists, use the dev, drvnm and mnum in the locators
4440 	 * and sidelocators.  During startup, the dev, drvnm and mnum in
4441 	 * these structures may not match the devid (the locators and
4442 	 * sidelocators will be updated to match the devid by the routine
4443 	 * load_old_replicas).  Using out-of-sync values won't cause any
4444 	 * problems since ridev will re-derive these from the devid and mnum.
4445 	 * After startup, the dev, drvnm and mnum in these structures have
4446 	 * been updated and can be used.
4447 	 */
4448 
4449 	clp->l_blkno = lp->l_blkno;
4450 	clp->l_flags = lp->l_flags;
4451 	clp->l_dev = lp->l_dev;
4452 
4453 	if (mn_set) {
4454 		dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
4455 		clp->l_mnum = mnslp->mnl_mnum;
4456 	} else {
4457 		dn = &lbp->lb_drvnm[slp->l_drvnm_index];
4458 		clp->l_mnum = slp->l_mnum;
4459 	}
4460 	(void) strncpy(clp->l_driver, dn->dn_data, MD_MAXDRVNM);
4461 }
4462 
4463 /*
4464  * Find the index into the mnsidelocator where entry will go.
4465  * Then index can be fed into both splitname2locatorblocks and
4466  * cfgloc2locator so that those entries can be kept in sync.
4467  *
4468  * Returns:
4469  *	-1 if failed to find unused slot or if a traditional diskset
4470  *	index, if successful  (0 <= index <= MD_MNMAXSIDES)
4471  */
4472 static int
4473 checklocator(
4474 	mddb_lb_t		*lbp,
4475 	int			li,
4476 	side_t			sideno
4477 )
4478 {
4479 	uchar_t			i;
4480 	mddb_mnsidelocator_t	*mnslp;
4481 	mddb_mnlb_t		*mnlbp;
4482 	int			index = -1;
4483 
4484 	if (lbp->lb_flags & MDDB_MNSET) {
4485 		/*
4486 		 * Checking side locator structure.  First, check if
4487 		 * there is already an entry for this side.  If so,
4488 		 * then use that entry.  Otherwise, find an entry
4489 		 * that has a sideno of 0.
4490 		 */
4491 		mnlbp = (mddb_mnlb_t *)lbp;
4492 		for (i = 0; i < MD_MNMAXSIDES; i++) {
4493 			mnslp = &mnlbp->lb_mnsidelocators[i][li];
4494 			if (mnslp->mnl_sideno == sideno) {
4495 				/* Found a match - stop looking */
4496 				index = i;
4497 				break;
4498 			} else if ((mnslp->mnl_sideno == 0) && (index == -1)) {
4499 				/* Set first empty slot, but keep looking */
4500 				index = i;
4501 			}
4502 		}
4503 		/* Didn't find empty slot or previously used slot */
4504 		if ((i == MD_MNMAXSIDES) && (index == -1)) {
4505 			return (-1);
4506 		}
4507 		return (index);
4508 	} else
4509 		return (0);
4510 }
4511 
4512 /*
4513  * Takes locator information (driver name, minor number, sideno) and
4514  * stores it in the locator block.
4515  * For traditional diskset, the sideno is the index into the sidelocator
4516  * array in the locator block.
4517  * For the MN diskset, the sideno is the nodeid which can be any number,
4518  * so the index passed in is the index into the mnsidelocator array
4519  * in the locator block.
4520  */
4521 static int
4522 cfgloc2locator(
4523 	mddb_lb_t		*lbp,
4524 	mddb_cfg_loc_t		*clp,
4525 	int			li,
4526 	side_t			sideno,
4527 	int			index	/* Only useful in MNsets when > 1 */
4528 )
4529 {
4530 	uchar_t			i;
4531 	mddb_sidelocator_t	*slp;
4532 	mddb_mnsidelocator_t	*mnslp;
4533 	mddb_set_t		*s;
4534 	int			mn_set = 0;
4535 	mddb_mnlb_t		*mnlbp;
4536 
4537 	if (lbp->lb_flags & MDDB_MNSET) {
4538 		mnlbp = (mddb_mnlb_t *)lbp;
4539 		mn_set = 1;
4540 		/*
4541 		 * Index will be the slot that has the given sideno or
4542 		 * the first empty slot if no match is found.
4543 		 * This was pre-checked out in check locator.
4544 		 */
4545 		mnslp = &mnlbp->lb_mnsidelocators[index][li];
4546 	} else {
4547 		slp = &lbp->lb_sidelocators[sideno][li];
4548 	}
4549 
4550 	/*
4551 	 * Look for the driver name
4552 	 */
4553 	for (i = 0; i < MDDB_DRVNMCNT; i++) {
4554 		if (lbp->lb_drvnm[i].dn_len == 0)
4555 			continue;
4556 		if (strncmp(lbp->lb_drvnm[i].dn_data, clp->l_driver,
4557 		    MD_MAXDRVNM) == 0)
4558 			break;
4559 	}
4560 
4561 	/*
4562 	 * Didn't find one, add a new one
4563 	 */
4564 	if (i == MDDB_DRVNMCNT) {
4565 		for (i = 0; i < MDDB_DRVNMCNT; i++) {
4566 			if (lbp->lb_drvnm[i].dn_len == 0)
4567 				break;
4568 		}
4569 		if (i == MDDB_DRVNMCNT)
4570 			return (1);
4571 		(void) strncpy(lbp->lb_drvnm[i].dn_data, clp->l_driver,
4572 		    MD_MAXDRVNM);
4573 		lbp->lb_drvnm[i].dn_len = (uchar_t)strlen(clp->l_driver);
4574 	}
4575 
4576 	/* Fill in the drvnm index */
4577 	if (mn_set) {
4578 		mnslp->mnl_drvnm_index = i;
4579 		mnslp->mnl_mnum = clp->l_mnum;
4580 		mnslp->mnl_sideno = sideno;
4581 	} else {
4582 		slp->l_drvnm_index = i;
4583 		slp->l_mnum = clp->l_mnum;
4584 	}
4585 
4586 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4587 		/*
4588 		 * This device id could already be associated with this index
4589 		 * if this is not the first side added to the set.
4590 		 * If device id is 0, there is no device id for this device.
4591 		 */
4592 		if ((ddi_devid_t)(uintptr_t)clp->l_devid == 0)
4593 			return (0);
4594 		s = (mddb_set_t *)md_set[lbp->lb_setno].s_db;
4595 		if (mddb_devid_add(s, li, (ddi_devid_t)(uintptr_t)clp->l_devid,
4596 		    clp->l_minor_name)) {
4597 			return (1);
4598 		}
4599 	}
4600 
4601 	return (0);
4602 }
4603 
4604 /*
4605  * See if there are mediator hosts and try to use the data.
4606  */
4607 static int
4608 mediate(
4609 	mddb_set_t	*s
4610 )
4611 {
4612 	mddb_lb_t	*lbp = s->s_lbp;
4613 	med_data_lst_t	*meddlp = NULL;
4614 	med_data_lst_t	*tmeddlp = NULL;
4615 	med_data_t	*meddp;
4616 	int		medok = 0;
4617 	int		medacc = 0;
4618 	uint_t		maxcc;
4619 	int		golden = 0;
4620 	int		err = 1;
4621 	set_t		setno = s->s_setno;
4622 
4623 	/* Do not have a mediator, then the state is stale */
4624 	if (s->s_med.n_cnt == 0)
4625 		return (err);
4626 
4627 	/* Contact the mediator hosts for the data */
4628 	meddlp = get_med_host_data(&s->s_med, s->s_setname, setno);
4629 
4630 	/* No mediator data, stale */
4631 	if (meddlp == NULL)
4632 		return (err);
4633 
4634 	/* Mark all the mediator data that is not for this set as errored */
4635 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4636 		struct timeval32 tmptime;
4637 		meddp = tmeddlp->mdl_med;
4638 
4639 		/* Count the number of mediators contacted */
4640 		medacc++;
4641 
4642 		/* Paranoid check */
4643 		if (meddp->med_dat_sn != setno)
4644 			meddp->med_dat_fl |= MED_DFL_ERROR;
4645 
4646 		TIMEVAL_TO_TIMEVAL32(&tmptime, &meddp->med_dat_id);
4647 
4648 		/*CSTYLED*/
4649 		if (timercmp(&tmptime, &lbp->lb_ident.createtime, !=))
4650 			meddp->med_dat_fl |= MED_DFL_ERROR;
4651 	}
4652 
4653 	/* Get the max commitcount */
4654 	maxcc = 0;
4655 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4656 		meddp = tmeddlp->mdl_med;
4657 		if (meddp->med_dat_fl & MED_DFL_ERROR)
4658 			continue;
4659 		if (meddp->med_dat_cc > maxcc)
4660 			maxcc = meddp->med_dat_cc;
4661 	}
4662 
4663 	/* Now mark the records that don't have the highest cc as errored */
4664 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4665 		meddp = tmeddlp->mdl_med;
4666 		if (meddp->med_dat_fl & MED_DFL_ERROR)
4667 			continue;
4668 		if (meddp->med_dat_cc != maxcc)
4669 			meddp->med_dat_fl |= MED_DFL_ERROR;
4670 	}
4671 
4672 	/* Now mark the records that don't match the lb commitcnt as errored */
4673 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4674 		meddp = tmeddlp->mdl_med;
4675 		if (meddp->med_dat_fl & MED_DFL_ERROR)
4676 			continue;
4677 		if (meddp->med_dat_cc != lbp->lb_commitcnt)
4678 			meddp->med_dat_fl |= MED_DFL_ERROR;
4679 	}
4680 
4681 	/* Is there a "golden" copy and how many valid mediators */
4682 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4683 		meddp = tmeddlp->mdl_med;
4684 		if (meddp->med_dat_fl & MED_DFL_ERROR)
4685 			continue;
4686 
4687 		if (meddp->med_dat_fl & MED_DFL_GOLDEN)
4688 			golden++;
4689 
4690 		medok++;
4691 	}
4692 
4693 	/* No survivors, stale */
4694 	if (medok == 0)
4695 		goto out;
4696 
4697 	/* No mediator quorum and no golden copies, stale */
4698 	if (medacc < ((s->s_med.n_cnt / 2) + 1) && ! golden) {
4699 		/* Skip odd numbers, no exact 50% */
4700 		if (s->s_med.n_cnt & 1)
4701 			goto out;
4702 		/* Have 50%, allow an accept */
4703 		if (medacc == (s->s_med.n_cnt / 2))
4704 			md_set_setstatus(setno, MD_SET_ACCOK);
4705 		goto out;
4706 	}
4707 
4708 	/* We either have a quorum or a golden copy, or both */
4709 	err = 0;
4710 
4711 out:
4712 	if (meddlp) {
4713 		for (/* void */; meddlp != NULL; meddlp = tmeddlp) {
4714 			tmeddlp = meddlp->mdl_nx;
4715 			kmem_free(meddlp->mdl_med, sizeof (med_data_t));
4716 			kmem_free(meddlp, sizeof (med_data_lst_t));
4717 		}
4718 	}
4719 
4720 	return (err);
4721 }
4722 
/*
 *	1. read masterblks and locator blocks for all known database locations
 *		a. keep track of which have good master blks
 *		b. keep track of which have good locators
 *
 */
4729 static int
4730 get_mbs_n_lbs(
4731 	mddb_set_t	*s,
4732 	int		*write_lb
4733 )
4734 {
4735 	mddb_lb_t	*lbp = NULL;		/* pointer to locator block */
4736 						/* May be cast to mddb_mnlb_t */
4737 						/* if accessing sidenames in */
4738 						/* MN set */
4739 	mddb_did_ic_t	*did_icp = NULL;	/* ptr to Device ID incore */
4740 	mddb_did_blk_t	*did_blkp = 0;
4741 	int		did_blkp_sz = 0;
4742 	mddb_did_db_t	*did_dbp;
4743 	mddb_did_info_t	*did_info;
4744 	caddr_t		did_block;
4745 	mddb_ri_t	*rip;
4746 	mddb_dtag_lst_t	*dtlp;
4747 	mddb_locator_t	*lp;
4748 	daddr_t		physblk;
4749 	int		li;
4750 	uint_t		blk;
4751 	md_dev64_t	dev;
4752 	caddr_t		buffer;
4753 	uint_t		lb_blkcnt;
4754 	int		retval = 0;
4755 	int		err = 0;
4756 	int		lb_ok = 0;
4757 	int		lb_total = 0;
4758 	int		lb_tagged = 0;
4759 	int		lb_tags;
4760 	set_t		setno = s->s_setno;
4761 	int		cont_flag, i;
4762 	mddb_did_db_t	*did_dbp1, *did_dbp2;
4763 	int		mn_set = 0;
4764 	mddb_cfg_loc_t	*cl;
4765 
4766 	/*
4767 	 * read in master blocks and locator block for all known locators.
4768 	 * lb_blkcnt will be set correctly for MN set later once getmasters
4769 	 * has determined that the set is a MN set.
4770 	 */
4771 	lb_blkcnt = ((setno == MD_LOCAL_SET) ?
4772 			MDDB_LOCAL_LBCNT : MDDB_LBCNT);
4773 
4774 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
4775 		rip->ri_flags &= (MDDB_F_PTCHED | MDDB_F_IOCTL |
4776 		    MDDB_F_EMASTER);
4777 		rip->ri_lbp = (mddb_lb_t *)NULL;
4778 		rip->ri_did_icp = (mddb_did_ic_t *)NULL;
4779 
4780 		/*
4781 		 * Translated dev is only used in calls to getmasters and
4782 		 * getblks which expect a translated (aka miniroot) dev.
4783 		 */
4784 		dev = md_xlate_targ_2_mini(rip->ri_dev);
4785 		if (dev == NODEV64) {
4786 			/* Set error flag that getmasters would have set */
4787 			/* if getmasters had been allowed to fail */
4788 			rip->ri_flags |= MDDB_F_EMASTER;
4789 		}
4790 
4791 		/*
4792 		 * Invalid device id on system (due to failed or
4793 		 * removed device) or invalid devt during upgrade
4794 		 * (due to powered off device) will cause this
4795 		 * replica to be marked in error and not used.
4796 		 */
4797 		if (rip->ri_flags & MDDB_F_EMASTER)
4798 			continue;
4799 
4800 		/* get all master blocks, does mddb_devopen() */
4801 		rip->ri_mbip = getmasters(s, dev, rip->ri_blkno,
4802 		    &rip->ri_flags, &mn_set);
4803 
4804 		/* if invalid master block - try next replica */
4805 		if (! rip->ri_mbip)
4806 			continue;
4807 
4808 		/*
4809 		 * If lbp alloc'd to wrong size - reset it.
4810 		 * If MN set, lb_blkcnt must be MDDB_MNLBCNT.
4811 		 * If a traditional set, lb_blkcnt must NOT be MDDB_MNLBCNT.
4812 		 */
4813 		if (lbp) {
4814 			if (((mn_set) && (lb_blkcnt != MDDB_MNLBCNT)) ||
4815 			    ((!mn_set) && (lb_blkcnt == MDDB_MNLBCNT))) {
4816 				kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
4817 				lbp = (mddb_lb_t *)NULL;
4818 			}
4819 		}
4820 
4821 		if (lbp == (mddb_lb_t *)NULL) {
4822 			/* If a MN set, set lb_blkcnt for MN loc blk size */
4823 			if (mn_set)
4824 				lb_blkcnt = MDDB_MNLBCNT;
4825 			lbp = (mddb_lb_t *)kmem_zalloc(dbtob(lb_blkcnt),
4826 			    KM_SLEEP);
4827 		}
4828 
4829 		/*
4830 		 * Read in all the sectors for the locator block
4831 		 * NOTE: Need to use getblks, rather than readblklst.
4832 		 *	because it is too early and things are
4833 		 *	NOT set up yet for read*()'s
4834 		 */
4835 		buffer = (caddr_t)lbp;
4836 		for (blk = 0; blk < lb_blkcnt; blk++) {
4837 			physblk = getphysblk(blk, rip->ri_mbip);
4838 			err = getblks(s, buffer, dev, physblk,
4839 			    btodb(MDDB_BSIZE), 0);
4840 			if (err) {
4841 				rip->ri_flags |= err;
4842 				break;
4843 			}
4844 			buffer += MDDB_BSIZE;
4845 		}
4846 
4847 		if (err)
4848 			continue;
4849 
4850 		/* Verify the locator block */
4851 		if (blk != lb_blkcnt)
4852 			continue;
4853 		if (lbp->lb_magic != MDDB_MAGIC_LB)
4854 			continue;
4855 		if (lbp->lb_blkcnt != lb_blkcnt)
4856 			continue;
4857 		if (mn_set) {
4858 			/* If a MN set, check for MNLB revision in lb. */
4859 			if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
4860 				continue;
4861 		} else {
4862 			/* If not a MN set, check for LB revision in lb. */
4863 			if (revchk(MDDB_REV_LB, lbp->lb_revision))
4864 				continue;
4865 		}
4866 		if (crcchk(lbp, &lbp->lb_checksum, dbtob(lb_blkcnt), NULL))
4867 			continue;
4868 
4869 		/*
4870 		 * With the addition of MultiNode Disksets, we must make sure
4871 		 * to verify that this is the correct set.  A node could
4872 		 * have been out of the config for awhile and this disk could
4873 		 * have been moved to a different diskset and we don't want
4874 		 * to accidentally start the wrong set.
4875 		 *
4876 		 * We don't do this check if we're in the middle of
4877 		 * importing a set.
4878 		 */
4879 		if (!(md_get_setstatus(s->s_setno) &
4880 		    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
4881 		    (lbp->lb_setno != s->s_setno))
4882 			continue;
4883 
4884 		rip->ri_flags |= MDDB_F_LOCACC;
4885 
4886 		/*
4887 		 * a commit count of zero means this locator has been deleted
4888 		 */
4889 		if (lbp->lb_commitcnt == 0)
4890 			continue;
4891 
4892 		/*
4893 		 * If replica is in the device ID style and md_devid_destroy
4894 		 * flag is set, turn off device id style.  This is only to be
4895 		 * used in a catastrophic failure case.  Examples would be
4896 		 * where the device id of all drives in the system
4897 		 * (especially the mirror'd root drives) had been changed
4898 		 * by firmware upgrade or by a patch to an existing disk
4899 		 * driver.  Another example would be in the case of non-unique
4900 		 * device ids due to a bug.  The device id would be valid on
4901 		 * the system, but would return the wrong dev_t.
4902 		 */
4903 		if ((lbp->lb_flags & MDDB_DEVID_STYLE) && md_devid_destroy) {
4904 			lbp->lb_flags &= ~MDDB_DEVID_STYLE;
4905 			lbp->lb_didfirstblk = 0;
4906 			lbp->lb_didblkcnt = 0;
4907 			*write_lb = 1;
4908 		}
4909 
4910 
4911 		/*
4912 		 * If replica is in device ID style, read in device ID
4913 		 * block and verify device ID block information.
4914 		 */
4915 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4916 
4917 			/* Read in device ID block */
4918 			if (did_icp == NULL) {
4919 				did_icp = (mddb_did_ic_t *)
4920 					kmem_zalloc(sizeof (mddb_did_ic_t),
4921 					    KM_SLEEP);
4922 			} else {
4923 				/* Reuse did_icp, but clear out data */
4924 				if (did_icp->did_ic_blkp !=
4925 				    (mddb_did_blk_t *)NULL) {
4926 					kmem_free((caddr_t)did_icp->did_ic_blkp,
4927 					    did_blkp_sz);
4928 					did_blkp = (mddb_did_blk_t *)NULL;
4929 					did_icp->did_ic_blkp =
4930 					    (mddb_did_blk_t *)NULL;
4931 				}
4932 				if (did_icp->did_ic_dbp !=
4933 					(mddb_did_db_t *)NULL) {
4934 					did_dbp1 = did_icp->did_ic_dbp;
4935 					while (did_dbp1) {
4936 					    did_dbp2 = did_dbp1->db_next;
4937 					    kmem_free((caddr_t)did_dbp1->db_ptr,
4938 						dbtob(did_dbp1->db_blkcnt));
4939 					    kmem_free((caddr_t)did_dbp1,
4940 						sizeof (mddb_did_db_t));
4941 					    did_dbp1 = did_dbp2;
4942 					}
4943 					did_icp->did_ic_dbp =
4944 						(mddb_did_db_t *)NULL;
4945 				}
4946 				for (i = 0; i < MDDB_NLB; i++) {
4947 					did_icp->did_ic_devid[i] =
4948 						(ddi_devid_t)NULL;
4949 				}
4950 			}
4951 
4952 			/* Can't reuse blkp since size could be different */
4953 			if (did_blkp != (mddb_did_blk_t *)NULL) {
4954 				kmem_free(did_blkp, did_blkp_sz);
4955 			}
4956 			did_blkp_sz = (int)dbtob(lbp->lb_didblkcnt);
4957 			did_blkp = (mddb_did_blk_t *)kmem_zalloc(did_blkp_sz,
4958 			    KM_SLEEP);
4959 			did_icp->did_ic_blkp = did_blkp;
4960 			buffer = (caddr_t)did_blkp;
4961 			for (blk = lbp->lb_didfirstblk;
4962 			    blk < (lbp->lb_didblkcnt + lbp->lb_didfirstblk);
4963 			    blk++) {
4964 				physblk = getphysblk(blk, rip->ri_mbip);
4965 				err = getblks(s, buffer, dev, physblk,
4966 				    btodb(MDDB_BSIZE), 0);
4967 				if (err) {
4968 					rip->ri_flags |= err;
4969 					break;
4970 				}
4971 				buffer += MDDB_BSIZE;
4972 			}
4973 			if (err)
4974 				continue;
4975 
4976 			/* Verify the Device ID block */
4977 			if (blk != (lbp->lb_didblkcnt + lbp->lb_didfirstblk))
4978 				continue;
4979 			if (did_blkp->blk_magic != MDDB_MAGIC_DI)
4980 				continue;
4981 			if (lbp->lb_didblkcnt != MDDB_DID_BLOCKS)
4982 				continue;
4983 			if (revchk(MDDB_REV_DI, did_blkp->blk_revision))
4984 				continue;
4985 			if (crcchk(did_blkp, &did_blkp->blk_checksum,
4986 				dbtob(lbp->lb_didblkcnt), NULL))
4987 				continue;
4988 
4989 			/*
4990 			 * Check if device ID block is out of sync with the
4991 			 * Locator Block by checking if the locator block
4992 			 * commitcnt does not match the device id block
4993 			 * commitcnt.  If an 'out of sync' condition
4994 			 * exists, discard this replica since it has
4995 			 * inconsistent data and can't be used in
4996 			 * determining the best replica.
4997 			 *
4998 			 * An 'out of sync' condition could happen if old
4999 			 * SDS code was running with new devid style replicas
5000 			 * or if a failure occurred between the writing of
5001 			 * the locator block's commitcnt and the device
5002 			 * id block's commitcnt.
5003 			 *
5004 			 * If old SDS code had been running, the upgrade
5005 			 * process should detect this situation and
5006 			 * have removed all of the device id information
5007 			 * via the md_devid_destroy flag in md.conf.
5008 			 */
5009 			if (did_blkp->blk_commitcnt !=
5010 			    lbp->lb_commitcnt) {
5011 				continue;
5012 			}
5013 		}
5014 
5015 
5016 		/*
5017 		 * If replica is still in device ID style, read in all
5018 		 * of the device IDs, verify the checksum of the device IDs.
5019 		 */
5020 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5021 			/*
5022 			 * Reset valid bit in device id info block flags. This
5023 			 * flag is stored on disk, but the valid bit is reset
5024 			 * when reading in the replica.  If the corresponding
5025 			 * device id is valid (aka meaning that the system
5026 			 * knows about this device id), the valid bit will
5027 			 * be set at a later time.  The valid bit for this
5028 			 * replica's device ID will be set in this routine.
5029 			 * The valid bits for the rest of the device id's
5030 			 * will be set after the 'best' replica has
5031 			 * been selected in routine load_old_replicas.
5032 			 * Reset updated bit in device id info block flags.
5033 			 * This flag is also stored on disk, reset when read
5034 			 * in and set when the locators and side locators
5035 			 * have been updated to match this valid device
5036 			 * id information.
5037 			 */
5038 		    for (li = 0; li < lbp->lb_loccnt; li++) {
5039 			did_info = &did_blkp->blk_info[li];
5040 			if (did_info->info_flags & MDDB_DID_EXISTS)
5041 				did_info->info_flags &=
5042 					~(MDDB_DID_VALID | MDDB_DID_UPDATED);
5043 		    }
5044 
5045 		    cont_flag = 0;
5046 		    for (li = 0; li < lbp->lb_loccnt; li++) {
5047 			did_info = &did_blkp->blk_info[li];
5048 			did_block = (caddr_t)NULL;
5049 			if (did_info->info_flags & MDDB_DID_EXISTS) {
5050 			    /* Check if block has already been read in */
5051 			    did_dbp = did_icp->did_ic_dbp;
5052 			    while (did_dbp != 0) {
5053 				if (did_dbp->db_firstblk ==
5054 				    did_info->info_firstblk)
5055 					break;
5056 				else
5057 					did_dbp = did_dbp->db_next;
5058 			    }
5059 			    /* if block not found, read it in */
5060 			    if (did_dbp == NULL) {
5061 				did_block = (caddr_t)(kmem_zalloc(dbtob
5062 					    (did_info->info_blkcnt), KM_SLEEP));
5063 				buffer = (caddr_t)did_block;
5064 				for (blk = did_info->info_firstblk;
5065 				    blk < (did_info->info_firstblk +
5066 				    did_info->info_blkcnt); blk++) {
5067 					physblk = getphysblk(blk, rip->ri_mbip);
5068 					err = getblks(s, buffer, dev, physblk,
5069 					    btodb(MDDB_BSIZE), 0);
5070 					if (err) {
5071 						rip->ri_flags |= err;
5072 						break;
5073 					}
5074 					buffer += MDDB_BSIZE;
5075 				}
5076 				if (err) {
5077 				    kmem_free(did_block,
5078 					dbtob(did_info->info_blkcnt));
5079 					did_block = (caddr_t)NULL;
5080 				    cont_flag = 1;
5081 				    break;
5082 				}
5083 
5084 				/*
5085 				 * Block read in - alloc Disk Block area
5086 				 */
5087 				did_dbp = (mddb_did_db_t *)kmem_zalloc(
5088 				    sizeof (mddb_did_db_t), KM_SLEEP);
5089 				did_dbp->db_ptr = did_block;
5090 				did_dbp->db_firstblk = did_info->info_firstblk;
5091 				did_dbp->db_blkcnt = did_info->info_blkcnt;
5092 
5093 				/* Add to front of dbp list */
5094 				did_dbp->db_next = did_icp->did_ic_dbp;
5095 				did_icp->did_ic_dbp = did_dbp;
5096 			    }
5097 			    /* Check validity of devid in block */
5098 			    if (crcchk(((char *)did_dbp->db_ptr +
5099 				did_info->info_offset),
5100 				&did_info->info_checksum,
5101 				did_info->info_length, NULL)) {
5102 				    cont_flag = 1;
5103 				    break;
5104 			    }
5105 
5106 			    /* Block now pointed to by did_dbp */
5107 			    did_icp->did_ic_devid[li] = (ddi_devid_t)
5108 				((char *)did_dbp->db_ptr +
5109 				did_info->info_offset);
5110 			}
5111 		    }
5112 		    if (cont_flag)
5113 			continue;
5114 		}
5115 
5116 		/*
5117 		 * All blocks containing devids are now in core.
5118 		 */
5119 
5120 		/*
5121 		 * If we're doing a replicated import (also known as
5122 		 * remote copy import), the device id in the locator
5123 		 * block is incorrect and we need to fix it up here
5124 		 * alongwith the l_dev otherwise we run into lots of
5125 		 * trouble later on.
5126 		 */
5127 		if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
5128 			mddb_ri_t	*trip;
5129 			for (li = 0; li < lbp->lb_loccnt; li++) {
5130 				did_info = &did_blkp->blk_info[li];
5131 				lp = &lbp->lb_locators[li];
5132 
5133 				if (lp->l_flags & MDDB_F_DELETED)
5134 					continue;
5135 
5136 				if (!(did_info->info_flags & MDDB_DID_EXISTS))
5137 					continue;
5138 
5139 				if (did_icp->did_ic_devid[li] == NULL)
5140 					continue;
5141 
5142 				for (trip = s->s_rip; trip != NULL;
5143 				    trip = trip->ri_next) {
5144 					if (trip->ri_old_devid == NULL)
5145 						continue;
5146 					if (ddi_devid_compare(
5147 					    trip->ri_old_devid,
5148 					    did_icp->did_ic_devid[li]) != 0) {
5149 						continue;
5150 					}
5151 
5152 					/* update l_dev and side mnum */
5153 					lp->l_dev = md_cmpldev(trip->ri_dev);
5154 					lbp->lb_sidelocators[0][li].l_mnum =
5155 					    md_getminor(trip->ri_dev);
5156 				}
5157 			}
5158 		}
5159 
5160 		/*
5161 		 * If there is a valid devid, verify that this locator
5162 		 * block has information about itself by checking the
5163 		 * device ID, minor_name and block
5164 		 * number from this replica's incore data structure
5165 		 * against the locator block information that has just
5166 		 * been read in from disk.
5167 		 *
5168 		 * If not a valid devid, verify that this locator block
5169 		 * has information about itself by checking the minor
5170 		 * number, block number and driver name from this
5171 		 * replica's incore data structure against the locator
5172 		 * block information that has just been read in from disk.
5173 		 */
5174 		if ((rip->ri_devid != NULL) &&
5175 		    (lbp->lb_flags & MDDB_DEVID_STYLE)) {
5176 			/*
5177 			 * This locator block MUST have locator (replica)
5178 			 * information about itself.  Check against devid,
5179 			 * slice part of minor number, and block number.
5180 			 */
5181 			for (li = 0; li < lbp->lb_loccnt; li++) {
5182 				did_info = &did_blkp->blk_info[li];
5183 				lp = &lbp->lb_locators[li];
5184 				if (lp->l_flags & MDDB_F_DELETED)
5185 					continue;
5186 
5187 				if (!(did_info->info_flags & MDDB_DID_EXISTS))
5188 					continue;
5189 
5190 				if (((md_get_setstatus(setno) &
5191 				    MD_SET_REPLICATED_IMPORT)) &&
5192 				    (rip->ri_old_devid != (ddi_devid_t)NULL)) {
5193 					if (ddi_devid_compare(rip->ri_old_devid,
5194 					    did_icp->did_ic_devid[li]) != 0)
5195 					    continue;
5196 				} else {
5197 					if (ddi_devid_compare(rip->ri_devid,
5198 					    did_icp->did_ic_devid[li]) != 0)
5199 					    continue;
5200 				}
5201 
5202 				if (strcmp(rip->ri_minor_name,
5203 				    did_info->info_minor_name) != 0)
5204 					continue;
5205 
5206 				if (lp->l_blkno == rip->ri_blkno)
5207 					break;
5208 			}
5209 		} else {
5210 			/*
5211 			 * This locator block MUST have locator (replica)
5212 			 * information about itself.
5213 			 */
5214 			if (!mn_set) {
5215 			    for (li = 0; li < lbp->lb_loccnt; li++) {
5216 				mddb_drvnm_t		*dn;
5217 				mddb_sidelocator_t	*slp;
5218 
5219 				lp = &lbp->lb_locators[li];
5220 				slp = &lbp->lb_sidelocators[s->s_sideno][li];
5221 				if (lp->l_flags & MDDB_F_DELETED)
5222 					continue;
5223 				if (slp->l_mnum != md_getminor(rip->ri_dev))
5224 					continue;
5225 				if (lp->l_blkno != rip->ri_blkno)
5226 					continue;
5227 				dn = &lbp->lb_drvnm[slp->l_drvnm_index];
5228 				if (strncmp(dn->dn_data, rip->ri_driver,
5229 				    MD_MAXDRVNM) == 0)
5230 				break;
5231 			    }
5232 			} else {
5233 			    for (li = 0; li < lbp->lb_loccnt; li++) {
5234 				mddb_drvnm_t		*dn;
5235 				mddb_mnsidelocator_t	*mnslp;
5236 				mddb_mnlb_t		*mnlbp;
5237 				int			i;
5238 
5239 				/*
5240 				 * Check all possible locators locking for
5241 				 * match to the currently read-in locator,
5242 				 * must match on:
5243 				 *	- blkno
5244 				 *	- side locator for this node's side
5245 				 *	- side locator minor number
5246 				 *	- side locator driver name
5247 				 */
5248 
5249 				/* Looking at sidelocs - cast lbp -> mnlbp */
5250 				mnlbp = (mddb_mnlb_t *)lbp;
5251 				lp = &mnlbp->lb_locators[li];
5252 				if (lp->l_flags & MDDB_F_DELETED)
5253 					continue;
5254 				if (lp->l_blkno != rip->ri_blkno)
5255 					continue;
5256 
5257 				for (i = 0; i < MD_MNMAXSIDES; i++) {
5258 				    mnslp = &mnlbp->lb_mnsidelocators[i][li];
5259 				    if (mnslp->mnl_sideno == s->s_sideno) {
5260 					break;
5261 				    }
5262 				}
5263 				/* No matching side found */
5264 				if (i == MD_MNMAXSIDES)
5265 					continue;
5266 				if (mnslp->mnl_mnum != md_getminor(rip->ri_dev))
5267 					continue;
5268 				dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
5269 				if (strncmp(dn->dn_data, rip->ri_driver,
5270 				    MD_MAXDRVNM) == 0)
5271 					break;
5272 			    }
5273 			}
5274 		}
5275 
5276 		/*
5277 		 * Didn't find ourself in this locator block it means
5278 		 * the locator block is a stale transplant. Probably from
5279 		 * a user doing a dd.
5280 		 */
5281 		if (li == lbp->lb_loccnt)
5282 			continue;
5283 
5284 		/*
5285 		 * Keep track of the number of accessed and valid
5286 		 * locator blocks.
5287 		 */
5288 		lb_ok++;
5289 
5290 		/*
5291 		 * Read the tag in, skips invalid or blank tags.
5292 		 * Only valid tags allocate storage
5293 		 * Data tags are not used in MN disksets.
5294 		 */
5295 		if ((!mn_set) && (! dt_read(s, lbp, rip))) {
5296 			/*
5297 			 * Keep track of the number of tagged
5298 			 * locator blocks.
5299 			 */
5300 			lb_tagged++;
5301 
5302 			/* Keep a list of unique tags. */
5303 			(void) dtl_addl(s, &rip->ri_dtp->dt_dtag);
5304 		}
5305 
5306 		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
5307 			/*
5308 			 * go through locator block and add any other
5309 			 * locations of the data base.
5310 			 * For the replicated import case, this was done earlier
5311 			 * and we really don't need or want to do so again
5312 			 */
5313 			cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
5314 			for (li = 0; li < lbp->lb_loccnt; li++) {
5315 				lp = &lbp->lb_locators[li];
5316 				if (lp->l_flags & MDDB_F_DELETED)
5317 					continue;
5318 
5319 				cl->l_devid_flags = MDDB_DEVID_GETSZ;
5320 				cl->l_devid = (uint64_t)0;
5321 				cl->l_devid_sz = 0;
5322 				cl->l_old_devid = (uint64_t)0;
5323 				cl->l_old_devid_sz = 0;
5324 				cl->l_minor_name[0] = '\0';
5325 				locator2cfgloc(lbp, cl, li, s->s_sideno,
5326 				    did_icp);
5327 
5328 				if (cl->l_devid_flags & MDDB_DEVID_SZ) {
5329 					if ((cl->l_devid = (uintptr_t)kmem_alloc
5330 					    (cl->l_devid_sz, KM_SLEEP))
5331 					    == NULL) {
5332 						continue;
5333 					} else {
5334 						cl->l_devid_flags =
5335 						    MDDB_DEVID_SPACE;
5336 					}
5337 				}
5338 				locator2cfgloc(lbp, cl, li, s->s_sideno,
5339 				    did_icp);
5340 
5341 				(void) ridev(&s->s_rip, cl, &lp->l_dev, 0);
5342 
5343 				if (cl->l_devid_flags & MDDB_DEVID_SPACE)
5344 					kmem_free((caddr_t)(uintptr_t)
5345 					    cl->l_devid, cl->l_devid_sz);
5346 			}
5347 			kmem_free(cl, sizeof (mddb_cfg_loc_t));
5348 		}
5349 
5350 		/* Save LB for later */
5351 		rip->ri_lbp = lbp;
5352 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5353 			rip->ri_did_icp = did_icp;
5354 			did_icp = (mddb_did_ic_t *)NULL;
5355 			did_blkp = (mddb_did_blk_t *)NULL;
5356 		} else
5357 			rip->ri_did_icp = NULL;
5358 		lbp = (mddb_lb_t *)NULL;
5359 	}
5360 
5361 	if (lbp != (mddb_lb_t *)NULL)
5362 		kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
5363 
5364 	if (did_icp != (mddb_did_ic_t *)NULL) {
5365 		if (did_icp->did_ic_blkp != (mddb_did_blk_t *)NULL) {
5366 			kmem_free((caddr_t)did_icp->did_ic_blkp, did_blkp_sz);
5367 			did_blkp = (mddb_did_blk_t *)NULL;
5368 		}
5369 		if (did_icp->did_ic_dbp != (mddb_did_db_t *)NULL) {
5370 			mddb_did_db_t	*did_dbp1, *did_dbp2;
5371 
5372 			did_dbp1 = did_icp->did_ic_dbp;
5373 			while (did_dbp1) {
5374 				did_dbp2 = did_dbp1->db_next;
5375 				kmem_free((caddr_t)did_dbp1->db_ptr,
5376 				    dbtob(did_dbp1->db_blkcnt));
5377 				kmem_free((caddr_t)did_dbp1,
5378 				    sizeof (mddb_did_db_t));
5379 				did_dbp1 = did_dbp2;
5380 			}
5381 		}
5382 		kmem_free((caddr_t)did_icp, sizeof (mddb_did_ic_t));
5383 	}
5384 
5385 	if (did_blkp != (mddb_did_blk_t *)NULL) {
5386 		kmem_free((caddr_t)did_blkp, did_blkp_sz);
5387 	}
5388 
5389 	/* No locator blocks were ok */
5390 	if (lb_ok == 0)
5391 		goto out;
5392 
5393 	/* No tagged data was found - will be 0 for MN diskset */
5394 	if (lb_tagged == 0)
5395 		goto out;
5396 
5397 	/* Find the highest non-deleted replica count */
5398 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5399 		int		lb_tot = 0;
5400 
5401 		if (rip->ri_mbip == (mddb_mb_ic_t *)NULL)
5402 			continue;
5403 
5404 		if (rip->ri_lbp == (mddb_lb_t *)NULL)
5405 			continue;
5406 
5407 		for (li = 0; li < rip->ri_lbp->lb_loccnt; li++) {
5408 			lp = &rip->ri_lbp->lb_locators[li];
5409 			if (lp->l_flags & MDDB_F_DELETED)
5410 				continue;
5411 			lb_tot++;
5412 		}
5413 
5414 		if (lb_tot > lb_total)
5415 			lb_total = lb_tot;
5416 	}
5417 
5418 	/* Count the number of unique tags */
5419 	for (lb_tags = 0, dtlp = s->s_dtlp; dtlp != NULL; dtlp = dtlp->dtl_nx)
5420 		lb_tags++;
5421 
5422 	/* Should have at least one tag at this point */
5423 	ASSERT(lb_tags > 0);
5424 
5425 
5426 	/*
5427 	 * If the number of tagged locators is not the same as the number of
5428 	 * OK locators OR more than one tag exists, then make sure the
5429 	 * selected tag will be written out later.
5430 	 */
5431 	if ((lb_tagged - lb_ok) != 0 || lb_tags > 1)
5432 		md_set_setstatus(setno, MD_SET_TAGDATA);
5433 
5434 	/* Only a single tag, take the tagged data */
5435 	if (lb_tags == 1) {
5436 		dt_setup(s, &s->s_dtlp->dtl_dt);
5437 		md_set_setstatus(setno, MD_SET_USETAG);
5438 		goto out;
5439 	}
5440 
5441 	/* Multiple tags, not selecting a tag, tag mode is on */
5442 	if (! (md_get_setstatus(setno) & MD_SET_USETAG))
5443 		retval = MDDB_E_TAGDATA;
5444 
5445 out:
5446 
5447 	return (retval);
5448 }
5449 
5450 /*
5451  *	1. Select a locator.
5452  *	2. check if enough locators now have current copies
5453  *	3. read in database from one of latest
5454  *	4. if known to have latest make all database the same
5455  *	5. if configuration has changed rewrite locators
5456  *
5457  * Parameters:
5458  * 	s - pointer to mddb_set structure
5459  *	flag - used in MN disksets to tell if this node is being joined to
5460  *		a diskset that is in the STALE state.  If the flag is
5461  *		MDDB_MN_STALE, then this node should be marked in the STALE
5462  *		state even if > 50% mddbs are available.  (The diskset can
5463  *		only change from STALE->OK if all nodes withdraw from the
5464  *		MN diskset and then rejoin).
5465  */
5466 static int
5467 load_old_replicas(
5468 	mddb_set_t	*s,
5469 	int		flag
5470 )
5471 {
5472 	mddb_lb_t	*lbp = NULL;
5473 	mddb_mnlb_t	*mnlbp = NULL;
5474 	mddb_ri_t	*rip;
5475 	mddb_locator_t	*lp;
5476 	mddb_db_t	*dbp;
5477 	mddb_de_ic_t	*dep;
5478 	int		li;
5479 	int		alc;
5480 	int		lc;
5481 	int		tlc;
5482 	int		retval = 0;
5483 	caddr_t		p;
5484 	size_t		maxrecsize;
5485 	set_t		setno = s->s_setno;
5486 	mddb_did_db_t	*did_dbp1;
5487 	mddb_did_info_t	*did_info;
5488 	mddb_did_ic_t	*did_icp = NULL;
5489 	md_dev64_t	*newdev;
5490 	mddb_sidelocator_t	*slp = 0;
5491 	mddb_mnsidelocator_t	*mnslp = 0;
5492 	uchar_t		i;
5493 	char		*name;
5494 	ddi_devid_t	ret_devid;
5495 	md_dev64_t	dev;
5496 	uint_t		len, sz;
5497 	char		*minor_name;
5498 	int		write_lb = 0;
5499 	int		rval;
5500 	int		stale_rtn = 0;
5501 
5502 	/* The only error path out of get_mbs_n_lbs() is MDDB_E_TAGDATA */
5503 	if (retval = get_mbs_n_lbs(s, &write_lb))
5504 		goto errout;
5505 
5506 	if ((lbp = s->s_lbp = selectlocator(s)) == NULL) {
5507 		retval = MDDB_E_NOLOCBLK;
5508 		goto errout;
5509 	}
5510 
5511 	/* If a multi-node set, then set md_set.s_status flag */
5512 	if (lbp->lb_flags & MDDB_MNSET) {
5513 		md_set_setstatus(setno, MD_SET_MNSET);
5514 		/*
5515 		 * If data tag area had been allocated before set type was
5516 		 * known - free it now.
5517 		 */
5518 		if (md_set[setno].s_dtp) {
5519 			kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
5520 			md_set[setno].s_dtp = NULL;
5521 		}
5522 	}
5523 
5524 	/*
5525 	 * If the replica is in devid format, setup the devid incore ptr.
5526 	 */
5527 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5528 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5529 			if (rip->ri_lbp == s->s_lbp) {
5530 				did_icp = s->s_did_icp = rip->ri_did_icp;
5531 				break;
5532 			}
5533 		}
5534 		/*
5535 		 * If no devid incore info found - something has gone
5536 		 * wrong so errout.
5537 		 */
5538 		if (rip == NULL) {
5539 			retval = MDDB_E_NODEVID;
5540 			goto errout;
5541 		}
5542 
5543 		/*
5544 		 * Add all blocks containing devids to free list.
5545 		 * Then remove addresses that actually contain devids.
5546 		 */
5547 		did_dbp1 = did_icp->did_ic_dbp;
5548 		while (did_dbp1) {
5549 			if (mddb_devid_free_add(s, did_dbp1->db_firstblk,
5550 				0, dbtob(did_dbp1->db_blkcnt))) {
5551 				retval = MDDB_E_NOSPACE;
5552 				goto errout;
5553 			}
5554 
5555 			did_dbp1 = did_dbp1->db_next;
5556 		}
5557 		for (li = 0; li < lbp->lb_loccnt; li++) {
5558 			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5559 			if (!(did_info->info_flags & MDDB_DID_EXISTS))
5560 				continue;
5561 
5562 			if (mddb_devid_free_delete(s, did_info->info_firstblk,
5563 			    did_info->info_offset, did_info->info_length)) {
5564 				/* unable to find disk block */
5565 				retval = MDDB_E_NODEVID;
5566 				goto errout;
5567 			}
5568 		}
5569 	}
5570 
5571 	/*
5572 	 * create mddb_mbaray, count all locators and active locators.
5573 	 */
5574 	alc = 0;
5575 	lc = 0;
5576 	for (li = 0; li < lbp->lb_loccnt; li++) {
5577 		ddi_devid_t	li_devid;
5578 
5579 		lp = &lbp->lb_locators[li];
5580 
5581 		if (lp->l_flags & MDDB_F_DELETED)
5582 			continue;
5583 
5584 		/* Count non-deleted replicas */
5585 		lc++;
5586 
5587 		/*
5588 		 * Use the devid of this locator to compare with the rip
5589 		 * list.  The scenario to watch out for here is that this
5590 		 * locator could be on a disk that is dead and there could
5591 		 * be a valid entry in the rip list for a different disk
5592 		 * that has been moved to the dead disks dev_t.  We don't
5593 		 * want to match with the moved disk.
5594 		 */
5595 		li_devid = NULL;
5596 		(void) mddb_devid_get(s, li, &li_devid, &minor_name);
5597 
5598 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5599 			if (match_mddb(rip, li_devid, minor_name,
5600 			    md_expldev(lp->l_dev), lp->l_blkno)) {
5601 				break;
5602 			}
5603 		}
5604 		if (rip == NULL) {
5605 			/*
5606 			 * If rip not found, then mark error in master block
5607 			 * so that no writes are later attempted to this
5608 			 * replica.  rip may not be setup if ridev
5609 			 * failed due to un-found driver name.
5610 			 */
5611 			lp->l_flags |= MDDB_F_EMASTER;
5612 			continue;
5613 		}
5614 
5615 		s->s_mbiarray[li] = rip->ri_mbip;
5616 
5617 		lp->l_flags &= MDDB_F_ACTIVE;
5618 		lp->l_flags |= (int)rip->ri_flags;
5619 
5620 		if (rip->ri_transplant)
5621 			lp->l_flags &= ~MDDB_F_ACTIVE;
5622 
5623 		if (lp->l_flags & MDDB_F_LOCACC)
5624 			alc++;
5625 	}
5626 
5627 	/* Save on a divide - calculate 50% + 1 up front */
5628 	tlc = ((lc + 1) / 2);
5629 
5630 	if (alc > tlc) {		/* alc > tlc		- OK */
5631 		md_clr_setstatus(setno, MD_SET_STALE);
5632 	} else if (alc < tlc) {		/* alc < tlc		- stale */
5633 		md_set_setstatus(setno, MD_SET_STALE);
5634 	} else if (lc & 1) {		/* alc == tlc && odd	- OK */
5635 		md_clr_setstatus(setno, MD_SET_STALE);
5636 	} else {			/* alc == tlc && even	- ? */
5637 		/* Can do an accept, and are */
5638 		if (md_get_setstatus(setno) & (MD_SET_ACCOK | MD_SET_ACCEPT)) {
5639 			md_clr_setstatus(setno, MD_SET_STALE);
5640 		} else {		/* possibly has a mediator */
5641 			if (mediate(s)) {
5642 				md_set_setstatus(setno, MD_SET_STALE);
5643 			} else {
5644 				md_clr_setstatus(setno, MD_SET_STALE);
5645 			}
5646 		}
5647 
5648 		/*
5649 		 * The mirrored_root_flag allows the sysadmin to decide to
5650 		 * start the local set in a read/write (non-stale) mode
5651 		 * when there are only 50% available mddbs on the system and
5652 		 * when the root file system is on a mirror.  This is useful
5653 		 * in a 2 disk system where 1 disk failure would cause an mddb
5654 		 * quorum failure and subsequent boot failures since the root
5655 		 * filesystem would be in a read-only state.
5656 		 */
5657 		if (mirrored_root_flag == 1 && setno == 0 &&
5658 		    svm_bootpath[0] != 0) {
5659 			md_clr_setstatus(setno, MD_SET_STALE);
5660 		} else {
5661 			if (md_get_setstatus(setno) & MD_SET_STALE) {
5662 				/* Allow half mode - CAREFUL! */
5663 				if (mddb_allow_half)
5664 					md_clr_setstatus(setno, MD_SET_STALE);
5665 			}
5666 		}
5667 
5668 		/*
5669 		 * In a MN diskset,
5670 		 *	- if 50% mddbs are unavailable and this
5671 		 *		has been marked STALE above
5672 		 * 	- master node isn't in the STALE state
5673 		 *	- this node isn't the master node (this node
5674 		 *		isn't the first node to join the set)
5675 		 * then clear the STALE state and set TOOFEW.
5676 		 *
5677 		 * If this node is the master node and set was marked STALE,
5678 		 * then the set stays STALE.
5679 		 *
5680 		 * If this node is not the master and this node's state is
5681 		 * STALE and the master node is not marked STALE,
5682 		 * then master node must be in the TOOFEW state or the
5683 		 * master is panic'ing.  A MN diskset can only be placed into
5684 		 * the STALE state by having the first node join the set
5685 		 * with <= 50% mddbs.  There's no way for a MN diskset to
5686 		 * transition between STALE and not-STALE states unless all
5687 		 * nodes are withdrawn from the diskset or all nodes in the
5688 		 * diskset are rebooted at the same time.
5689 		 *
5690 		 * So, mark this node's state as TOOFEW instead of STALE.
5691 		 */
5692 		if (((md_get_setstatus(setno) & (MD_SET_MNSET | MD_SET_STALE))
5693 		    == (MD_SET_MNSET | MD_SET_STALE)) &&
5694 		    ((flag & MDDB_MN_STALE) == 0) &&
5695 		    (!(md_set[setno].s_am_i_master))) {
5696 			md_clr_setstatus(setno, MD_SET_STALE);
5697 			md_set_setstatus(setno, MD_SET_TOOFEW);
5698 		}
5699 	}
5700 
5701 	/*
5702 	 * If a MN set is marked STALE on the other nodes,
5703 	 * mark it stale here.  Override all other considerations
5704 	 * such as a mediator or > 50% mddbs available.
5705 	 */
5706 	if (md_get_setstatus(setno) & MD_SET_MNSET) {
5707 		if (flag & MDDB_MN_STALE)
5708 			md_set_setstatus(setno, MD_SET_STALE);
5709 	}
5710 
5711 	/*
5712 	 * read a good copy of the locator names
5713 	 * if an error occurs reading what is suppose
5714 	 * to be a good copy continue looking for another
5715 	 * good copy
5716 	 */
5717 	s->s_lnp = NULL;
5718 	for (li = 0; li < lbp->lb_loccnt; li++) {
5719 		lp = &lbp->lb_locators[li];
5720 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
5721 		    (lp->l_flags & MDDB_F_EMASTER))
5722 			continue;
5723 
5724 		/* Find rip entry for this locator if one exists */
5725 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5726 			if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
5727 			    lp->l_blkno))
5728 				break;
5729 		}
5730 
5731 		if (rip == NULL) {
5732 			continue;
5733 		}
5734 
5735 		/*
5736 		 * Use the rip commitcnt since the commitcnt in lbp could
5737 		 * been cleared by selectlocator.  Looking for a replica with
5738 		 * the same commitcnt as the 'golden' copy in order to
5739 		 * get the same data.
5740 		 */
5741 		if (rip->ri_commitcnt != lbp->lb_commitcnt) {
5742 			continue;
5743 		}
5744 
5745 		/*
5746 		 * Now have a copy of the database that is equivalent
5747 		 * to the chosen locator block with respect to
5748 		 * inittime, identifier and commitcnt.   Trying the
5749 		 * equivalent databases in the order that they were
5750 		 * written will provide the most up to date data.
5751 		 */
5752 		lp->l_flags |= readlocnames(s, li);
5753 		if (s->s_lnp)
5754 			break;
5755 	}
5756 
5757 	if (s->s_lnp == NULL) {
5758 		retval = MDDB_E_NOLOCNMS;
5759 		goto errout;
5760 	}
5761 
5762 	/*
5763 	 * read a good copy of the data base
5764 	 * if an error occurs reading what is suppose
5765 	 * to be a good copy continue looking for another
5766 	 * good copy
5767 	 */
5768 
5769 	s->s_dbp = NULL;
5770 	for (li = 0; li < lbp->lb_loccnt; li++) {
5771 		lp = &lbp->lb_locators[li];
5772 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
5773 		    (lp->l_flags & MDDB_F_EMASTER))
5774 			continue;
5775 
5776 		/* Find rip entry for this locator if one exists */
5777 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5778 			if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
5779 			    lp->l_blkno))
5780 				break;
5781 		}
5782 
5783 		if (rip == NULL) {
5784 			continue;
5785 		}
5786 
5787 		/*
5788 		 * Use the rip commitcnt since the commitcnt in lbp could
5789 		 * been cleared by selectlocator.  Looking for a replica with
5790 		 * the same commitcnt as the 'golden' copy in order to
5791 		 * get the same data.
5792 		 */
5793 		if (rip->ri_commitcnt != lbp->lb_commitcnt) {
5794 			continue;
5795 		}
5796 
5797 		/*
5798 		 * Now have a copy of the database that is equivalent
5799 		 * to the chosen locator block with respect to
5800 		 * inittime, identifier and commitcnt.   Trying the
5801 		 * equivalent databases in the order that they were
5802 		 * written will provide the most up to date data.
5803 		 */
5804 		lp->l_flags |= readcopy(s, li);
5805 
5806 		if (s->s_dbp)
5807 			break;
5808 	}
5809 
5810 	if (s->s_dbp == NULL) {
5811 		retval = MDDB_E_NODIRBLK;
5812 		goto errout;
5813 	}
5814 
5815 	lp->l_flags |= MDDB_F_MASTER;
5816 	lp->l_flags |= MDDB_F_UP2DATE;
5817 
5818 	/*
5819 	 * go through and find largest record;
5820 	 * Also fixup the user data area's
5821 	 */
5822 	maxrecsize = MAX(MDDB_BSIZE, s->s_databuffer_size);
5823 
5824 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
5825 		for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
5826 			if (dep->de_flags & MDDB_F_OPT)
5827 				getoptrecord(s, dep);
5828 			else {
5829 				allocuserdata(dep);
5830 				maxrecsize = MAX(dep->de_recsize, maxrecsize);
5831 			}
5832 
5833 	if (maxrecsize > s->s_databuffer_size) {
5834 		p = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
5835 		if (s->s_databuffer_size)
5836 			kmem_free(s->s_databuffer, s->s_databuffer_size);
5837 		s->s_databuffer = p;
5838 		s->s_databuffer_size = maxrecsize;
5839 	}
5840 
5841 	/* If we can clear the tag data record, do it now. */
5842 	/* Data tags not supported on MN sets */
5843 	if ((md_get_setstatus(setno) & MD_SET_CLRTAG) &&
5844 	    (!(md_get_setstatus(setno) & MD_SET_MNSET)))
5845 		dt_setup(s, NULL);
5846 
5847 	/* This will return non-zero if STALE or TOOFEW */
5848 	/* This will write out chosen replica image to all replicas */
5849 	stale_rtn = selectreplicas(s, MDDB_SCANALL);
5850 
5851 	if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
5852 		ddi_devid_t	devidptr;
5853 
5854 		/*
5855 		 * ignore the return value from selectreplicas because we
5856 		 * may have a STALE or TOOFEW set in the case of a partial
5857 		 * replicated diskset. We will fix that up later.
5858 		 */
5859 
5860 		lbp = s->s_lbp;
5861 		for (li = 0; li < lbp->lb_loccnt; li++) {
5862 			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5863 
5864 			if (did_info->info_flags & MDDB_DID_EXISTS) {
5865 				devidptr = s->s_did_icp->did_ic_devid[li];
5866 				lp = &lbp->lb_locators[li];
5867 				for (rip = s->s_rip; rip != NULL;
5868 				    rip = rip->ri_next) {
5869 					if (rip->ri_old_devid == 0)
5870 						continue;
5871 					if (ddi_devid_compare(rip->ri_old_devid,
5872 					    devidptr) != 0) {
5873 						continue;
5874 					}
5875 					if (update_locatorblock(s,
5876 					    md_expldev(lp->l_dev),
5877 					    rip->ri_devid, rip->ri_old_devid)) {
5878 						goto errout;
5879 					}
5880 				}
5881 			}
5882 		}
5883 	} else {
5884 		if (stale_rtn)
5885 			goto errout;
5886 	}
5887 
5888 	/*
5889 	 * If the replica is in device id style - validate the device id's,
5890 	 * if present, in the locator block devid area.
5891 	 */
5892 	newdev = kmem_zalloc(sizeof (md_dev64_t) * MDDB_NLB, KM_SLEEP);
5893 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5894 		for (li = 0; li < lbp->lb_loccnt; li++) {
5895 			newdev[li] = 0;
5896 			lp = &lbp->lb_locators[li];
5897 			if (lp->l_flags & MDDB_F_DELETED)
5898 				continue;
5899 			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5900 			dev = md_expldev(lp->l_dev);
5901 			if (did_info->info_flags & MDDB_DID_EXISTS) {
5902 				/* Validate device id on current system */
5903 				newdev[li] = dev;
5904 				if (mddb_devid_validate(
5905 					did_icp->did_ic_devid[li],
5906 					&(newdev[li]),
5907 					did_info->info_minor_name) == 0) {
5908 					/* Set valid flag */
5909 					did_info->info_flags |= MDDB_DID_VALID;
5910 				} else {
5911 					lp->l_flags |= MDDB_F_EMASTER;
5912 				}
5913 			} else if (!(MD_UPGRADE)) {
5914 				/*
5915 				 * If a device doesn't have a device id,
5916 				 * check if there is now a device ID
5917 				 * associated with device.  If one exists,
5918 				 * add it to the locator block devid area.
5919 				 * If there's not enough space to add it,
5920 				 * print a warning.
5921 				 * Don't do this during upgrade.
5922 				 */
5923 				dev_t ddi_dev = md_dev64_to_dev(dev);
5924 				if (ddi_lyr_get_devid(ddi_dev, &ret_devid) ==
5925 				    DDI_SUCCESS) {
5926 					if (ddi_lyr_get_minor_name(ddi_dev,
5927 					    S_IFBLK, &minor_name)
5928 					    == DDI_SUCCESS) {
5929 						if (mddb_devid_add(s, li,
5930 						    ret_devid, minor_name)) {
5931 							cmn_err(CE_WARN,
5932 							"Not enough space in"
5933 							" metadevice state"
5934 							" database\n");
5935 							cmn_err(CE_WARN,
5936 							"to add relocation"
5937 							" information for"
5938 							" device:\n");
5939 							cmn_err(CE_WARN,
5940 							" major = %d, "
5941 							" minor = %d\n",
5942 							getmajor(ddi_dev),
5943 							getminor(ddi_dev));
5944 						} else {
5945 						    write_lb = 1;
5946 						}
5947 						kmem_free(minor_name,
5948 						    strlen(minor_name) + 1);
5949 					}
5950 					ddi_devid_free(ret_devid);
5951 				}
5952 			}
5953 		}
5954 
5955 		/*
5956 		 * If a device has a valid device id and if the dev_t
5957 		 * associated with the device id has changed, update the
5958 		 * driver name, minor num and dev_t in the local and side
5959 		 * locators to match the dev_t that the system currently
5960 		 * associates with the device id.
5961 		 *
5962 		 * Don't do this during upgrade.
5963 		 */
5964 		if (!(MD_UPGRADE)) {
5965 		    for (li = 0; li < lbp->lb_loccnt; li++) {
5966 			lp = &lbp->lb_locators[li];
5967 			if (lp->l_flags & MDDB_F_DELETED)
5968 				continue;
5969 			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5970 			if ((did_info->info_flags & MDDB_DID_VALID) &&
5971 			    !(did_info->info_flags & MDDB_DID_UPDATED)) {
5972 				if (lbp->lb_flags & MDDB_MNSET) {
5973 					int 	j;
5974 					int	index = -1;
5975 					mnlbp = (mddb_mnlb_t *)lbp;
5976 					for (j = 0; j < MD_MNMAXSIDES; j++) {
5977 					    mnslp = &mnlbp->
5978 						lb_mnsidelocators[j][li];
5979 					    if (mnslp->mnl_sideno ==
5980 						s->s_sideno)
5981 						break;
5982 					    if (mnslp->mnl_sideno == 0)
5983 						index = j;
5984 					}
5985 					if (j == MD_MNMAXSIDES) {
5986 					    /* No match found; take empty */
5987 					    mnslp = &mnlbp->
5988 						lb_mnsidelocators[index][li];
5989 					    write_lb = 1;
5990 					    mnslp->mnl_mnum =
5991 						md_getminor(newdev[li]);
5992 					} else if (mnslp->mnl_mnum !=
5993 					    md_getminor(newdev[li])) {
5994 						write_lb = 1;
5995 						mnslp->mnl_mnum =
5996 						    md_getminor(newdev[li]);
5997 					}
5998 				} else {
5999 					slp = &lbp->
6000 					    lb_sidelocators[s->s_sideno][li];
6001 					if (slp->l_mnum !=
6002 					    md_getminor(newdev[li])) {
6003 						write_lb = 1;
6004 						slp->l_mnum =
6005 						    md_getminor(newdev[li]);
6006 					}
6007 				}
6008 				name = ddi_major_to_name(
6009 						md_getmajor(newdev[li]));
6010 				if (lbp->lb_flags & MDDB_MNSET) {
6011 					i = mnslp->mnl_drvnm_index;
6012 				} else {
6013 					i = slp->l_drvnm_index;
6014 				}
6015 				if (strncmp(lbp->lb_drvnm[i].dn_data, name,
6016 					lbp->lb_drvnm[i].dn_len) != 0) {
6017 					/* Driver name has changed */
6018 					len = strlen(name);
6019 					/* Look for the driver name */
6020 					for (i = 0; i < MDDB_DRVNMCNT; i++) {
6021 						if (lbp->lb_drvnm[i].dn_len
6022 						    != len)
6023 							continue;
6024 						if (strncmp(
6025 						    lbp->lb_drvnm[i].dn_data,
6026 						    name, len) == 0)
6027 							break;
6028 					}
6029 					/* Didn't find one, add it */
6030 					if (i == MDDB_DRVNMCNT) {
6031 					    for (i = 0; i < MDDB_DRVNMCNT;
6032 						i++) {
6033 						if (lbp->lb_drvnm[i].dn_len
6034 						    == 0)
6035 							break;
6036 					    }
6037 					    if (i == MDDB_DRVNMCNT) {
6038 						cmn_err(CE_WARN,
6039 						    "Unable to update driver"
6040 						    " name for dev:  "
6041 						    "major = %d, "
6042 						    "minor = %d\n",
6043 						    md_getmajor(newdev[li]),
6044 						    md_getminor(newdev[li]));
6045 						continue;
6046 					    }
6047 					    (void) strncpy(
6048 						lbp->lb_drvnm[i].dn_data,
6049 						name, MD_MAXDRVNM);
6050 					    lbp->lb_drvnm[i].dn_len =
6051 						(uchar_t)strlen(name);
6052 					}
6053 					/* Fill in the drvnm index */
6054 					if (lbp->lb_flags & MDDB_MNSET) {
6055 						mnslp->mnl_drvnm_index = i;
6056 					} else {
6057 						slp->l_drvnm_index = i;
6058 					}
6059 					write_lb = 1;
6060 				}
6061 				did_info->info_flags |= MDDB_DID_UPDATED;
6062 			}
6063 		}
6064 	    }
6065 	}
6066 	kmem_free(newdev, sizeof (md_dev64_t) * MDDB_NLB);
6067 
6068 	/*
6069 	 * If locator block has been changed by get_mbs_n_lbs,
6070 	 * by addition of new device id, by updated minor name or
6071 	 * by updated driver name - write out locator block.
6072 	 */
6073 	if (write_lb) {
6074 		rval = push_lb(s);
6075 		(void) upd_med(s, "load_old_replicas(0)");
6076 		if (rval)
6077 			goto errout;
6078 	}
6079 
6080 	/*
6081 	 * If the tag was moved, allocated, or a BADTAG was seen for some other
6082 	 * reason, then make sure tags are written to all the replicas.
6083 	 * Data tags not supported on MN sets.
6084 	 */
6085 	if (!(md_get_setstatus(setno) & MD_SET_MNSET)) {
6086 		if (! (lc = dt_alloc_if_needed(s))) {
6087 			for (li = 0; li < lbp->lb_loccnt; li++) {
6088 				lp = &lbp->lb_locators[li];
6089 
6090 				if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
6091 				    (lp->l_flags & MDDB_F_EMASTER))
6092 					continue;
6093 
6094 				if (lp->l_flags & MDDB_F_BADTAG) {
6095 					lc = 1;
6096 					break;
6097 				}
6098 			}
6099 		}
6100 
6101 		if (lc) {
6102 			md_set_setstatus(setno, MD_SET_TAGDATA);
6103 			md_clr_setstatus(setno, MD_SET_BADTAG);
6104 			(void) selectreplicas(s, MDDB_SCANALL);
6105 		}
6106 	}
6107 
6108 errout:
6109 
6110 	/* Free extraneous rip components. */
6111 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
6112 		/* Get rid of lbp's and dtp's */
6113 
6114 		if (rip->ri_lbp != lbp) {
6115 			if (rip->ri_dtp != (mddb_dt_t *)NULL) {
6116 				kmem_free((caddr_t)rip->ri_dtp, MDDB_DT_BYTES);
6117 				rip->ri_dtp = (mddb_dt_t *)NULL;
6118 			}
6119 
6120 			if (rip->ri_devid != (ddi_devid_t)NULL) {
6121 				sz = (int)ddi_devid_sizeof(rip->ri_devid);
6122 				kmem_free((caddr_t)rip->ri_devid, sz);
6123 				rip->ri_devid = (ddi_devid_t)NULL;
6124 			}
6125 			if (rip->ri_old_devid != (ddi_devid_t)NULL) {
6126 				sz = (int)ddi_devid_sizeof(rip->ri_old_devid);
6127 				kmem_free((caddr_t)rip->ri_old_devid, sz);
6128 				rip->ri_old_devid = (ddi_devid_t)NULL;
6129 			}
6130 
6131 			if (rip->ri_lbp != (mddb_lb_t *)NULL) {
6132 				mddb_devid_icp_free(&rip->ri_did_icp,
6133 				    rip->ri_lbp);
6134 
6135 				kmem_free((caddr_t)rip->ri_lbp,
6136 				    dbtob(rip->ri_lbp->lb_blkcnt));
6137 				rip->ri_lbp = (mddb_lb_t *)NULL;
6138 			}
6139 		}
6140 
6141 		if (lbp != NULL) {
6142 			for (li = 0; li < lbp->lb_loccnt; li++) {
6143 				lp = &lbp->lb_locators[li];
6144 				if (lp->l_flags & MDDB_F_DELETED)
6145 					continue;
6146 				if (rip->ri_dev == md_expldev(lp->l_dev) &&
6147 				    rip->ri_blkno == lp->l_blkno)
6148 					break;
6149 			}
6150 			if (li < lbp->lb_loccnt)
6151 				continue;
6152 		}
6153 
6154 		/*
6155 		 * Get rid of mbp's:
6156 		 *	if lbp, those out of lb_loccnt bounds
6157 		 *	if !lbp,  all of them.
6158 		 */
6159 		if (rip->ri_mbip) {
6160 			md_dev64_t dev64 = md_xlate_targ_2_mini(rip->ri_dev);
6161 			if (dev64 != NODEV64) {
6162 				mddb_devclose(dev64);
6163 				free_mbipp(&rip->ri_mbip);
6164 			}
6165 		}
6166 		/*
6167 		 * Turn off MDDB_F_EMASTER flag in a diskset since diskset
6168 		 * code always ends up calling ridev for all replicas
6169 		 * before calling load_old_replicas.  ridev will reset
6170 		 * MDDB_F_EMASTER flag if flag was due to unresolved devid.
6171 		 */
6172 		if (setno != MD_LOCAL_SET)
6173 			rip->ri_flags &= ~MDDB_F_EMASTER;
6174 	}
6175 	return (retval);
6176 }
6177 
6178 /*
6179  * Given the devt from the md.conf info, get the devid for the device.
6180  */
6181 static void
6182 lookup_db_devid(mddb_cfg_loc_t *cl)
6183 {
6184 	dev_t		ldev;
6185 	ddi_devid_t	devid;
6186 	char		*minor;
6187 
6188 	if (ddi_name_to_major(cl->l_driver) == (major_t)-1) {
6189 		cmn_err(CE_NOTE, "mddb: unknown major name '%s'", cl->l_driver);
6190 		return;
6191 	}
6192 
6193 	ldev = makedevice(ddi_name_to_major(cl->l_driver), cl->l_mnum);
6194 	if (ddi_lyr_get_devid(ldev, &devid) != DDI_SUCCESS) {
6195 		cmn_err(CE_NOTE, "mddb: unable to get devid for '%s', 0x%x",
6196 		    cl->l_driver, cl->l_mnum);
6197 		return;
6198 	}
6199 
6200 	if (ddi_lyr_get_minor_name(ldev, S_IFBLK, &minor) != DDI_SUCCESS) {
6201 		cmn_err(CE_NOTE, "mddb: unable to get minor name 0x%x",
6202 		    cl->l_mnum);
6203 		return;
6204 	}
6205 
6206 	cl->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | MDDB_DEVID_SZ;
6207 	cl->l_devid_sz = (int)ddi_devid_sizeof(devid);
6208 	cl->l_devid = (uint64_t)(uintptr_t)devid;
6209 	(void) strlcpy(cl->l_minor_name, minor, MDDB_MINOR_NAME_MAX);
6210 
6211 	kmem_free(minor, strlen(minor) + 1);
6212 }
6213 
6214 /*
6215  * grab driver name, minor, block and devid out of
6216  * strings like "driver:minor:block:devid"
6217  */
6218 static int
6219 parse_db_loc(
6220 	char		*str,
6221 	mddb_cfg_loc_t	*clp
6222 )
6223 {
6224 	char		*p, *e;
6225 	char		*minor_name;
6226 	ddi_devid_t	ret_devid;
6227 
6228 	clp->l_dev = 0;
6229 	p = clp->l_driver;
6230 	e = p + sizeof (clp->l_driver) - 1;
6231 	while ((*str != ':') && (*str != '\0') && (p < e))
6232 		*p++ = *str++;
6233 	*p = '\0';
6234 	if (*str++ != ':')
6235 		return (-1);
6236 	clp->l_mnum = 0;
6237 	while (ISNUM(*str)) {
6238 		clp->l_mnum *= 10;
6239 		clp->l_mnum += *str++ - '0';
6240 	}
6241 	if (*str++ != ':')
6242 		return (-1);
6243 	clp->l_blkno = 0;
6244 	while (ISNUM(*str)) {
6245 		clp->l_blkno *= 10;
6246 		clp->l_blkno += *str++ - '0';
6247 	}
6248 	if (*str++ != ':')
6249 		return (-1);
6250 
6251 	/*
6252 	 * If the md_devid_destroy flag is set, ignore the device ids.
6253 	 * This is only to used in a catastrophic failure case.  Examples
6254 	 * would be where the device id of all drives in the system
6255 	 * (especially the mirror'd root drives) had been changed
6256 	 * by firmware upgrade or by a patch to an existing disk
6257 	 * driver.  Another example would be in the case of non-unique
6258 	 * device ids due to a bug.  The device id would be valid on
6259 	 * the system, but would return the wrong dev_t.
6260 	 */
6261 	if (md_devid_destroy) {
6262 		clp->l_devid_flags = 0;
6263 		clp->l_devid = (uint64_t)NULL;
6264 		clp->l_devid_sz = 0;
6265 		clp->l_old_devid = (uint64_t)NULL;
6266 		clp->l_old_devid_sz = 0;
6267 		clp->l_minor_name[0] = '\0';
6268 		return (0);
6269 	}
6270 
6271 	if (ddi_devid_str_decode(str,
6272 	    (ddi_devid_t *)&ret_devid, &minor_name) == DDI_FAILURE)
6273 		return (-1);
6274 
6275 	clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
6276 	clp->l_devid_flags = 0;
6277 	clp->l_old_devid = (uint64_t)NULL;
6278 	clp->l_old_devid_sz = 0;
6279 
6280 	/* If no device id associated with device, just return */
6281 	if ((ddi_devid_t)(uintptr_t)clp->l_devid == (ddi_devid_t)NULL) {
6282 		clp->l_devid_sz = 0;
6283 		clp->l_minor_name[0] = '\0';
6284 		if (strcmp(str, "id0") == 0 && md_devid_destroy == 0 &&
6285 		    md_keep_repl_state == 0) {
6286 			/*
6287 			 * No devid in md.conf; we're in recovery mode so
6288 			 * lookup the devid for the device as specified by
6289 			 * the devt in md.conf.
6290 			 */
6291 			lookup_db_devid(clp);
6292 		}
6293 		return (0);
6294 	}
6295 
6296 	clp->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID |
6297 	    MDDB_DEVID_SZ;
6298 	clp->l_devid_sz = (int)ddi_devid_sizeof(
6299 	    (ddi_devid_t)(uintptr_t)clp->l_devid);
6300 	(void) strcpy(clp->l_minor_name, minor_name);
6301 	kmem_free(minor_name, strlen(minor_name) + 1);
6302 
6303 	return (0);
6304 }
6305 
6306 /*
6307  * grab driver name, minor, and block out of
6308  * strings like "driver:minor:block:devid driver:minor:block:devid ..."
6309  */
6310 static void
6311 parse_db_string(
6312 	char		*str
6313 )
6314 {
6315 	char		*p, *e;
6316 	mddb_cfg_loc_t	*cl;
6317 	char		restore_space;
6318 
6319 	/* CSTYLED */
6320 	cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
6321 	for (p = str; (*p != '\0'); ) {
6322 		for (; ((*p != '\0') && (ISWHITE(*p))); ++p)
6323 			;
6324 		if (*p == '\0')
6325 			break;
6326 		for (e = p; ((*e != '\0') && (! ISWHITE(*e))); ++e)
6327 			;
6328 		/*
6329 		 * Only give parse_db_loc 1 entry, so stuff a null into
6330 		 * the string if we're not at the end.  We need to save this
6331 		 * char and restore it after call.
6332 		 */
6333 		restore_space = '\0';
6334 		if (*e != '\0') {
6335 			restore_space = *e;
6336 			*e = '\0';
6337 		}
6338 		if (parse_db_loc(p, cl) != 0) {
6339 			cmn_err(CE_NOTE, "mddb: parsing error on '%s'", p);
6340 		} else {
6341 			(void) ridev(
6342 			    &((mddb_set_t *)md_set[MD_LOCAL_SET].s_db)->s_rip,
6343 			    cl, NULL, MDDB_F_PTCHED);
6344 			if (cl->l_devid_flags & MDDB_DEVID_SPACE) {
6345 				kmem_free((caddr_t)(uintptr_t)cl->l_devid,
6346 				    cl->l_devid_sz);
6347 			}
6348 		}
6349 		if (restore_space != '\0') {
6350 			*e = restore_space;
6351 		}
6352 		p = e;
6353 	}
6354 	kmem_free(cl, sizeof (mddb_cfg_loc_t));
6355 }
6356 
6357 /*
6358  * grab database locations supplied by md.conf as properties
6359  */
6360 static void
6361 parse_db_strings(void)
6362 {
6363 	int		bootlist_id;
6364 	int		proplen;
6365 	/*
6366 	 * size of _bootlist_name should match uses of line and entry in
6367 	 * libmeta meta_systemfile_append_mddb routine (meta_systemfile.c)
6368 	 */
6369 	char 		_bootlist_name[MDDB_BOOTLIST_MAX_LEN];
6370 	char		*bootlist_name;
6371 	caddr_t		prop;
6372 
6373 /*
6374  * Step through the bootlist properties one at a time by forming the
6375  * correct name, fetching the property, parsing the property and
6376  * then freeing the memory.  If a property does not exist or returns
6377  * some form of error just ignore it.  There is no guarantee that
6378  * the properties will always exist in sequence, for example
6379  * mddb_bootlist1 may exist and mddb_bootlist2 may not exist with
6380  * mddb_bootlist3 existing.
6381  */
6382 	bootlist_name = &_bootlist_name[0];
6383 	for (bootlist_id = 0; bootlist_id < md_maxbootlist; bootlist_id++) {
6384 
6385 		proplen = 0;
6386 		(void) sprintf(bootlist_name, "mddb_bootlist%d", bootlist_id);
6387 
6388 		if (ddi_getlongprop(DDI_DEV_T_ANY, md_devinfo,
6389 		    DDI_PROP_CANSLEEP, bootlist_name, (caddr_t)&prop,
6390 		    &proplen) != DDI_PROP_SUCCESS)
6391 			continue;
6392 
6393 		if (proplen <= 0)
6394 			continue;
6395 
6396 		if (md_init_debug)
6397 			cmn_err(CE_NOTE, "%s is %s", bootlist_name, prop);
6398 
6399 		parse_db_string(prop);
6400 		kmem_free(prop, proplen);
6401 	}
6402 }
6403 
6404 static int
6405 initit(
6406 	set_t		setno,
6407 	int		flag
6408 )
6409 {
6410 	int		i;
6411 	mddb_set_t	*s;
6412 	mddb_lb_t	*lbp;		/* pointer to locator block */
6413 	mddb_ln_t	*lnp;		/* pointer to locator names */
6414 	mddb_db_t	*dbp;		/* pointer to directory block */
6415 	mddb_did_blk_t	*did_blkp;	/* pointer to Device ID block */
6416 	mddb_did_ic_t	*did_icp;	/* pointer to Device ID incore area */
6417 	mddb_bf_t	*bfp;
6418 	side_t		sideno;
6419 	side_t		maxsides;
6420 	mddb_block_t	lb_blkcnt;
6421 	int		retval = 0;
6422 	md_dev64_t	dev;
6423 	mddb_mnlb_t	*mnlbp;
6424 	int		devid_flag;
6425 
6426 	/* single thread's all loads/unloads of set's */
6427 	mutex_enter(&mddb_lock);
6428 	mutex_enter(SETMUTEX(setno));
6429 
6430 	if (((mddb_set_t *)md_set[setno].s_db) == NULL) {
6431 		mutex_exit(SETMUTEX(setno));
6432 		mutex_exit(&mddb_lock);
6433 		return (MDDB_E_NOTNOW);
6434 	}
6435 
6436 	s = (mddb_set_t *)md_set[setno].s_db;
6437 
6438 	single_thread_start(s);
6439 
6440 	/*
6441 	 * init is already underway, block. Return success.
6442 	 */
6443 	if (s->s_lbp) {
6444 		single_thread_end(s);
6445 		mutex_exit(SETMUTEX(setno));
6446 		mutex_exit(&mddb_lock);
6447 		return (0);
6448 	}
6449 
6450 	uniqtime32(&s->s_inittime);
6451 
6452 	/* grab database locations patched by /etc/system */
6453 	if (setno == MD_LOCAL_SET)
6454 		parse_db_strings();
6455 
6456 	s->s_mbiarray = (mddb_mb_ic_t **)kmem_zalloc(
6457 	    sizeof (mddb_mb_ic_t *) * mddb_maxcopies, KM_SLEEP);
6458 
6459 	s->s_zombie = 0;
6460 	s->s_staledeletes = 0;
6461 	s->s_optcmtcnt = 0;
6462 	s->s_opthavelck = 0;
6463 	s->s_optwantlck = 0;
6464 	s->s_optwaiterr = 0;
6465 	s->s_opthungerr = 0;
6466 
6467 	/*
6468 	 * KEEPTAG can never be set for a MN diskset since no tags are
6469 	 * allowed to be stored in a MN diskset.  No way to check
6470 	 * if this is a MN diskset or not at this point since the mddb
6471 	 * hasn't been read in from disk yet.  (flag will only have
6472 	 * MUTLINODE bit set if a new set is being created.)
6473 	 */
6474 	if (! (md_get_setstatus(s->s_setno) & MD_SET_KEEPTAG))
6475 		dt_setup(s, NULL);
6476 
6477 	md_clr_setstatus(s->s_setno, MD_SET_TOOFEW);
6478 
6479 	for (i = 0; i <	mddb_maxbufheaders; i++) {
6480 		bfp = (mddb_bf_t *)kmem_zalloc(sizeof (*bfp), KM_SLEEP);
6481 		sema_init(&bfp->bf_buf.b_io, 0, NULL,
6482 		    SEMA_DEFAULT, NULL);
6483 		sema_init(&bfp->bf_buf.b_sem, 0, NULL,
6484 		    SEMA_DEFAULT, NULL);
6485 		bfp->bf_buf.b_offset = -1;
6486 		freebuffer(s, bfp);
6487 	}
6488 
6489 	retval = load_old_replicas(s, flag);
6490 	/* If 0 return value - success */
6491 	if (! retval) {
6492 		single_thread_end(s);
6493 		mutex_exit(SETMUTEX(setno));
6494 		mutex_exit(&mddb_lock);
6495 		return (0);
6496 	}
6497 
6498 	/*
6499 	 * If here, then the load_old_replicas() failed
6500 	 */
6501 
6502 
6503 	/* If the database was supposed to exist. */
6504 	if (flag & MDDB_MUSTEXIST) {
6505 		if (s->s_mbiarray != (mddb_mb_ic_t **)NULL) {
6506 			for (i = 0; i < mddb_maxcopies;	 i++) {
6507 				if (! s->s_mbiarray[i])
6508 					continue;
6509 				dev = md_expldev(
6510 					s->s_lbp->lb_locators[i].l_dev);
6511 				dev = md_xlate_targ_2_mini(dev);
6512 				if (dev != NODEV64) {
6513 					mddb_devclose(dev);
6514 					free_mbipp(&s->s_mbiarray[i]);
6515 				}
6516 			}
6517 
6518 			kmem_free((caddr_t)s->s_mbiarray,
6519 				sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
6520 			s->s_mbiarray = NULL;
6521 		}
6522 
6523 		if (s->s_lnp != (mddb_ln_t *)NULL) {
6524 			kmem_free((caddr_t)s->s_lnp,
6525 			    dbtob(s->s_lbp->lb_lnblkcnt));
6526 			s->s_lnp = (mddb_ln_t *)NULL;
6527 		}
6528 
6529 		mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);
6530 
6531 		if (s->s_lbp != (mddb_lb_t *)NULL) {
6532 			kmem_free((caddr_t)s->s_lbp,
6533 			    dbtob(s->s_lbp->lb_blkcnt));
6534 			s->s_lbp = (mddb_lb_t *)NULL;
6535 		}
6536 
6537 		while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
6538 			kmem_free((caddr_t)bfp, sizeof (*bfp));
6539 
6540 		single_thread_end(s);
6541 		mutex_exit(SETMUTEX(setno));
6542 		mutex_exit(&mddb_lock);
6543 
6544 		if (retval == MDDB_E_TAGDATA)
6545 			return (retval);
6546 
6547 		/* Want a bit more detailed error messages */
6548 		if (mddb_db_err_detail)
6549 			return (retval);
6550 
6551 		return (MDDB_E_NODB);
6552 	}
6553 
6554 
6555 	/*
6556 	 * MDDB_NOOLDOK set - Creating a new database, so do
6557 	 * more initialization.
6558 	 */
6559 
6560 	lb_blkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
6561 				MDDB_LOCAL_LBCNT : MDDB_LBCNT);
6562 	if (flag & MDDB_MULTINODE) {
6563 		lb_blkcnt = MDDB_MNLBCNT;
6564 	}
6565 
6566 	if (s->s_lbp == NULL)
6567 		s->s_lbp = (mddb_lb_t *)kmem_alloc(dbtob(lb_blkcnt), KM_SLEEP);
6568 	lbp = s->s_lbp;
6569 
6570 	bzero((caddr_t)lbp, dbtob(lb_blkcnt));
6571 	lbp->lb_setno = setno;
6572 	lbp->lb_magic = MDDB_MAGIC_LB;
6573 	if (flag & MDDB_MULTINODE) {
6574 		lbp->lb_revision = MDDB_REV_MNLB;
6575 	} else {
6576 		lbp->lb_revision = MDDB_REV_LB;
6577 	}
6578 	lbp->lb_inittime = s->s_inittime;
6579 	if (flag & MDDB_MULTINODE) {
6580 		mnlbp = (mddb_mnlb_t *)lbp;
6581 		for (i = 0; i < MDDB_NLB; i++) {
6582 			for (sideno = 0; sideno < MD_MNMAXSIDES; sideno++) {
6583 				mddb_mnsidelocator_t	*mnslp;
6584 				mnslp = &mnlbp->lb_mnsidelocators[sideno][i];
6585 				mnslp->mnl_mnum = NODEV32;
6586 				mnslp->mnl_sideno = 0;
6587 				mnslp->mnl_drvnm_index = 0;
6588 			}
6589 		}
6590 	} else {
6591 		maxsides = ((setno == MD_LOCAL_SET) ? 1 : MD_MAXSIDES);
6592 		for (i = 0; i < MDDB_NLB; i++) {
6593 			for (sideno = 0; sideno < maxsides; sideno++) {
6594 				mddb_sidelocator_t	*slp;
6595 				slp = &lbp->lb_sidelocators[sideno][i];
6596 				slp->l_mnum = NODEV32;
6597 			}
6598 		}
6599 	}
6600 	lbp->lb_blkcnt = lb_blkcnt;
6601 
6602 	/* lb starts on block 0 */
6603 	/* locator names starts after locator block */
6604 	lbp->lb_lnfirstblk = lb_blkcnt;
6605 	if (flag & MDDB_MULTINODE) {
6606 		lbp->lb_lnblkcnt = (mddb_block_t)MDDB_MNLNCNT;
6607 	} else {
6608 		lbp->lb_lnblkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
6609 		    MDDB_LOCAL_LNCNT : MDDB_LNCNT);
6610 	}
6611 
6612 	if (flag & MDDB_MULTINODE) {
6613 		/* Creating a multinode diskset */
6614 		md_set_setstatus(setno, MD_SET_MNSET);
6615 		lbp->lb_flags |= MDDB_MNSET;
6616 	}
6617 
6618 	/* Data portion of mddb located after locator names */
6619 	lbp->lb_dbfirstblk = lbp->lb_lnfirstblk + lbp->lb_lnblkcnt;
6620 
6621 	/* the btodb that follows is converting the directory block size */
6622 	/* Data tag part of mddb located after first block of mddb data */
6623 	lbp->lb_dtfirstblk = (mddb_block_t)(lbp->lb_dbfirstblk +
6624 						btodb(MDDB_BSIZE));
6625 	/* Data tags are not used in MN diskset - so set count to 0 */
6626 	if (flag & MDDB_MULTINODE)
6627 		lbp->lb_dtblkcnt = (mddb_block_t)0;
6628 	else
6629 		lbp->lb_dtblkcnt = (mddb_block_t)MDDB_DT_BLOCKS;
6630 
6631 
6632 	lnp = (mddb_ln_t *)kmem_zalloc(dbtob(lbp->lb_lnblkcnt), KM_SLEEP);
6633 	lnp->ln_magic = MDDB_MAGIC_LN;
6634 	if (flag & MDDB_MULTINODE) {
6635 		lnp->ln_revision = MDDB_REV_MNLN;
6636 	} else {
6637 		lnp->ln_revision = MDDB_REV_LN;
6638 	}
6639 	s->s_lnp = lnp;
6640 
6641 	/*
6642 	 * Set up Device ID portion of Locator Block.
6643 	 * Do not set locator to device id style if
6644 	 * md_devid_destroy is 1 and md_keep_repl_state is 1
6645 	 * (destroy all device id data and keep replica in
6646 	 * non device id mode).
6647 	 *
6648 	 * This is logically equivalent to set locator to
6649 	 * device id style if md_devid_destroy is 0 or
6650 	 * md_keep_repl_state is 0.
6651 	 *
6652 	 * In SunCluster environment, device id mode is disabled
6653 	 * which means diskset will be run in non-devid mode.  For
6654 	 * localset, the behavior will remain intact and run in
6655 	 * device id mode.
6656 	 *
6657 	 * In multinode diskset devids are turned off.
6658 	 */
6659 	devid_flag = 1;
6660 	if (cluster_bootflags & CLUSTER_CONFIGURED)
6661 		if (setno != MD_LOCAL_SET)
6662 			devid_flag = 0;
6663 	if (flag & MDDB_MULTINODE)
6664 		devid_flag = 0;
6665 	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
6666 		devid_flag = 0;
6667 	/*
6668 	 * if we weren't devid style before and md_keep_repl_state=1
6669 	 * we need to stay non-devid
6670 	 */
6671 	if (((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) &&
6672 	    (md_keep_repl_state == 1))
6673 		devid_flag = 0;
6674 	if (devid_flag) {
6675 		lbp->lb_didfirstblk = lbp->lb_dtfirstblk +
6676 			lbp->lb_dtblkcnt;
6677 		lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
6678 		lbp->lb_flags |= MDDB_DEVID_STYLE;
6679 
6680 		did_icp = (mddb_did_ic_t *)kmem_zalloc
6681 			(sizeof (mddb_did_ic_t), KM_SLEEP);
6682 		did_blkp = (mddb_did_blk_t *)
6683 			kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP);
6684 		did_blkp->blk_magic = MDDB_MAGIC_DI;
6685 		did_blkp->blk_revision = MDDB_REV_DI;
6686 		did_icp->did_ic_blkp = did_blkp;
6687 		s->s_did_icp = did_icp;
6688 	}
6689 
6690 	setidentifier(s, &lbp->lb_ident);
6691 	uniqtime32(&lbp->lb_timestamp);
6692 	dbp = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
6693 	dbp->db_magic = MDDB_MAGIC_DB;
6694 	dbp->db_revision = MDDB_REV_DB;
6695 	uniqtime32(&dbp->db_timestamp);
6696 	dbp->db_nextblk = 0;
6697 	dbp->db_firstentry = NULL;
6698 	dbp->db_blknum = lbp->lb_dbfirstblk;
6699 	dbp->db_recsum = MDDB_GLOBAL_XOR;
6700 	s->s_dbp = dbp;
6701 	single_thread_end(s);
6702 	mutex_exit(SETMUTEX(setno));
6703 	mutex_exit(&mddb_lock);
6704 	return (0);
6705 }
6706 
6707 mddb_set_t *
6708 mddb_setenter(
6709 	set_t		setno,
6710 	int		flag,
6711 	int		*errorcodep
6712 )
6713 {
6714 	mddb_set_t	*s;
6715 	int		err = 0;
6716 	size_t		sz = sizeof (void *) * MD_MAXUNITS;
6717 
6718 	mutex_enter(SETMUTEX(setno));
6719 	if (! md_set[setno].s_db) {
6720 		mutex_exit(SETMUTEX(setno));
6721 		if (errorcodep != NULL)
6722 			*errorcodep = MDDB_E_NOTOWNER;
6723 		return (NULL);
6724 	}
6725 
6726 	/* Allocate s_un and s_ui arrays if not already present. */
6727 	if (md_set[setno].s_un == NULL) {
6728 		md_set[setno].s_un = kmem_zalloc(sz, KM_NOSLEEP);
6729 		if (md_set[setno].s_un == NULL) {
6730 			mutex_exit(SETMUTEX(setno));
6731 			if (errorcodep != NULL)
6732 				*errorcodep = MDDB_E_NOTOWNER;
6733 			return (NULL);
6734 		}
6735 	}
6736 	if (md_set[setno].s_ui == NULL) {
6737 		md_set[setno].s_ui = kmem_zalloc(sz, KM_NOSLEEP);
6738 		if (md_set[setno].s_ui == NULL) {
6739 			mutex_exit(&md_set[setno].s_dbmx);
6740 			kmem_free(md_set[setno].s_un, sz);
6741 			md_set[setno].s_un = NULL;
6742 			if (errorcodep != NULL)
6743 				*errorcodep = MDDB_E_NOTOWNER;
6744 			return (NULL);
6745 		}
6746 	}
6747 	s = (mddb_set_t *)md_set[setno].s_db;
6748 	if (s->s_lbp)
6749 		return (s);
6750 
6751 	if (flag & MDDB_NOINIT)
6752 		return (s);
6753 
6754 	/*
6755 	 * Release the set mutex - it will be acquired and released in
6756 	 * initit after acquiring the mddb_lock.  This is done to assure
6757 	 * that mutexes are always acquired in the same order to prevent
6758 	 * possible deadlock
6759 	 */
6760 	mutex_exit(SETMUTEX(setno));
6761 
6762 	if ((err = initit(setno, flag)) != 0) {
6763 		if (errorcodep != NULL)
6764 			*errorcodep = err;
6765 		return (NULL);
6766 	}
6767 
6768 	mutex_enter(SETMUTEX(setno));
6769 	return ((mddb_set_t *)md_set[setno].s_db);
6770 }
6771 
6772 /*
6773  * Release the set lock for a given set.
6774  *
6775  * In a MN diskset, this routine may send messages to the rpc.mdcommd
6776  * in order to have the slave nodes re-parse parts of the mddb.
6777  * Messages are only sent if the global ioctl lock is not held.
6778  *
6779  * With the introduction of multi-threaded ioctls, there is no way
6780  * to determine which thread(s) are holding the ioctl lock.  So, if
6781  * the ioctl lock is held (by process X) process X will send the
6782  * messages to the slave nodes when process X releases the ioctl lock.
6783  */
6784 void
6785 mddb_setexit(
6786 	mddb_set_t	*s
6787 )
6788 {
6789 	md_mn_msg_mddb_parse_t		*mddb_parse_msg;
6790 	md_mn_kresult_t			*kresult;
6791 	mddb_lb_t			*lbp = s->s_lbp;
6792 	int				i;
6793 	int				rval = 1;
6794 
6795 	/*
6796 	 * If not a MN diskset OR
6797 	 * a MN diskset but this node isn't master,
6798 	 * then release the mutex.
6799 	 */
6800 	if (!(MD_MNSET_SETNO(s->s_setno)) ||
6801 	    ((MD_MNSET_SETNO(s->s_setno)) &&
6802 	    (!md_set[s->s_setno].s_am_i_master))) {
6803 		mutex_exit(SETMUTEX(s->s_setno));
6804 		return;
6805 	}
6806 
6807 	/*
6808 	 * If global ioctl lock is held, then send no messages,
6809 	 * just release mutex and return.
6810 	 *
6811 	 */
6812 	if (md_status & MD_GBL_IOCTL_LOCK) {
6813 		mutex_exit(SETMUTEX(s->s_setno));
6814 		return;
6815 	}
6816 
6817 	/*
6818 	 * This thread is not holding the ioctl lock, so drop the set
6819 	 * lock, send messages to slave nodes to reparse portions
6820 	 * of the mddb and return.
6821 	 *
6822 	 * If the block parse flag is set, do not send parse messages.
6823 	 * This flag is set when master is adding a new mddb that would
6824 	 * cause parse messages to be sent to the slaves, but the slaves
6825 	 * don't have knowledge of the new mddb yet since the mddb add
6826 	 * operation hasn't been run on the slave nodes yet.  When the
6827 	 * master unblocks the parse flag, the parse messages will be
6828 	 * generated.
6829 	 *
6830 	 * If s_mn_parseflags_sending is non-zero, then another thread
6831 	 * is already currently sending a parse message, so just release
6832 	 * the mutex and return.  If an mddb change occurred that results
6833 	 * in a parse message to be generated, the thread that is currently
6834 	 * sending a parse message would generate the additional parse message.
6835 	 *
6836 	 * If s_mn_parseflags_sending is zero and parsing is not blocked,
6837 	 * then loop until s_mn_parseflags is 0 (until there are no more
6838 	 * messages to send).
6839 	 * While s_mn_parseflags is non-zero,
6840 	 * 	put snapshot of parse_flags in s_mn_parseflags_sending
6841 	 * 	set s_mn_parseflags to zero
6842 	 *	release mutex
6843 	 *	send message
6844 	 *	re-grab mutex
6845 	 *	set s_mn_parseflags_sending to zero
6846 	 */
6847 	mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t),
6848 		KM_SLEEP);
6849 	while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
6850 	    (s->s_mn_parseflags & MDDB_PARSE_MASK) &&
6851 	    (!(md_get_setstatus(s->s_setno) & MD_SET_MNPARSE_BLK))) {
6852 		/* Grab snapshot of parse flags */
6853 		s->s_mn_parseflags_sending = s->s_mn_parseflags;
6854 		s->s_mn_parseflags = 0;
6855 
6856 		mutex_exit(SETMUTEX(s->s_setno));
6857 
6858 		/*
6859 		 * Send the message to the slaves to re-parse
6860 		 * the indicated portions of the mddb. Send the status
6861 		 * of the 50 mddbs in this set so that slaves know which
6862 		 * mddbs that the master node thinks are 'good'.
6863 		 * Otherwise, slave may reparse, but from wrong replica.
6864 		 */
6865 		mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending;
6866 		for (i = 0; i < MDDB_NLB; i++) {
6867 			mddb_parse_msg->msg_lb_flags[i] =
6868 				lbp->lb_locators[i].l_flags;
6869 		}
6870 		kresult = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
6871 		while (rval != 0) {
6872 			rval = mdmn_ksend_message(s->s_setno,
6873 				MD_MN_MSG_MDDB_PARSE, 0,
6874 				(char *)mddb_parse_msg,
6875 				sizeof (mddb_parse_msg), kresult);
6876 			if (rval != 0)
6877 				cmn_err(CE_WARN, "mddb_setexit: Unable to send "
6878 					"mddb update message to other nodes in "
6879 					"diskset %s\n", s->s_setname);
6880 		}
6881 		kmem_free(kresult, sizeof (md_mn_kresult_t));
6882 
6883 		/*
6884 		 * Re-grab mutex to clear sending field and to
6885 		 * see if another parse message needs to be generated.
6886 		 */
6887 		mutex_enter(SETMUTEX(s->s_setno));
6888 		s->s_mn_parseflags_sending = 0;
6889 	}
6890 	kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
6891 	mutex_exit(SETMUTEX(s->s_setno));
6892 }
6893 
6894 static void
6895 mddb_setexit_no_parse(
6896 	mddb_set_t	*s
6897 )
6898 {
6899 	mutex_exit(SETMUTEX(s->s_setno));
6900 }
6901 
6902 uint_t
6903 mddb_lb_did_convert(mddb_set_t *s, uint_t doit, uint_t *blk_cnt)
6904 {
6905 	uint_t			li;
6906 	mddb_lb_t		*lbp = s->s_lbp;
6907 	mddb_locator_t		*lp;
6908 	ddi_devid_t		ret_devid;
6909 	uint_t			devid_len;
6910 	dev_t			ddi_dev;
6911 	mddb_did_ic_t		*did_icp;
6912 	mddb_did_blk_t		*did_blkp;
6913 	char			*minor_name;
6914 	size_t			sz;
6915 	int			retval;
6916 	int			err;
6917 	md_dev64_t		dev64; /* tmp var to make code look better */
6918 
6919 
6920 	/* Need disk block(s) to hold mddb_did_blk_t */
6921 	*blk_cnt = MDDB_DID_BLOCKS;
6922 
6923 	if (doit) {
6924 		/*
6925 		 * Alloc mddb_did_blk_t disk block and fill in header area.
6926 		 * Don't fill in did magic number until end of routine so
6927 		 * if machine panics in the middle of conversion, the
6928 		 * device id information will be thrown away at the
6929 		 * next snarfing of this set.
6930 		 * Need to set DEVID_STYLE so that mddb_devid_add will
6931 		 * function properly.
6932 		 */
6933 		/* grab the mutex */
6934 		if ((mddb_setenter(s->s_setno, MDDB_NOINIT, &err)) == NULL) {
6935 			return (1);
6936 		}
6937 		single_thread_start(s);
6938 		lbp->lb_didfirstblk = getfreeblks(s, MDDB_DID_BLOCKS);
6939 		if (lbp->lb_didfirstblk == 0) {
6940 			single_thread_end(s);
6941 			mddb_setexit(s);
6942 			return (1);
6943 		}
6944 		lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
6945 		did_icp = (mddb_did_ic_t *)kmem_zalloc(sizeof (mddb_did_ic_t),
6946 		    KM_SLEEP);
6947 		did_blkp = (mddb_did_blk_t *)kmem_zalloc(MDDB_DID_BYTES,
6948 		    KM_SLEEP);
6949 
6950 		did_blkp->blk_revision = MDDB_REV_DI;
6951 		did_icp->did_ic_blkp = did_blkp;
6952 		s->s_did_icp = did_icp;
6953 		lbp->lb_flags |= MDDB_DEVID_STYLE;
6954 	}
6955 
6956 	/* Fill in information in mddb_did_info_t array */
6957 	for (li = 0; li < lbp->lb_loccnt; li++) {
6958 		lp = &lbp->lb_locators[li];
6959 		if (lp->l_flags & MDDB_F_DELETED)
6960 			continue;
6961 
6962 		dev64 = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
6963 		ddi_dev = md_dev64_to_dev(dev64);
6964 		if (ddi_dev == NODEV) {
6965 			/*
6966 			 * No translation available for replica.
6967 			 * Could fail conversion to device id replica,
6968 			 * but instead will just continue with next
6969 			 * replica in list.
6970 			 */
6971 			continue;
6972 		}
6973 		if (ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) {
6974 			/*
6975 			 * Just count each devid as at least 1 block.  This
6976 			 * is conservative since several device id's may fit
6977 			 * into 1 disk block, but it's better to overestimate
6978 			 * the number of blocks needed than to underestimate.
6979 			 */
6980 			devid_len = (int)ddi_devid_sizeof(ret_devid);
6981 			*blk_cnt += btodb(devid_len + (MDDB_BSIZE - 1));
6982 			if (doit) {
6983 				if (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK,
6984 				    &minor_name) == DDI_SUCCESS) {
6985 					if (mddb_devid_add(s, li, ret_devid,
6986 					    minor_name)) {
6987 						cmn_err(CE_WARN,
6988 						"Not enough space in metadb"
6989 						" to add device id for"
6990 						"  dev: major = %d, "
6991 						"minor = %d\n",
6992 						getmajor(ddi_dev),
6993 						getminor(ddi_dev));
6994 					}
6995 					sz = strlen(minor_name) + 1;
6996 					kmem_free(minor_name, sz);
6997 				}
6998 			}
6999 			ddi_devid_free(ret_devid);
7000 		}
7001 	}
7002 
7003 	if (doit) {
7004 		did_blkp->blk_magic = MDDB_MAGIC_DI;
7005 		retval = push_lb(s);
7006 		(void) upd_med(s, "mddb_lb_did_convert(0)");
7007 		single_thread_end(s);
7008 		mddb_setexit(s);
7009 		if (retval != 0)
7010 			return (1);
7011 	}
7012 
7013 	return (0);
7014 }
7015 
7016 static mddb_set_t *
7017 init_set(
7018 	mddb_config_t	*cp,
7019 	int		flag,
7020 	int		*errp
7021 )
7022 {
7023 	mddb_set_t	*s;
7024 	char		*setname = NULL;
7025 	set_t		setno = MD_LOCAL_SET;
7026 	side_t		sideno = 0;
7027 	struct timeval32 *created = NULL;
7028 
7029 	if (cp != NULL) {
7030 		setname = cp->c_setname;
7031 		setno = cp->c_setno;
7032 		sideno = cp->c_sideno;
7033 		created = &cp->c_timestamp;
7034 	}
7035 
7036 	if (setno >= MD_MAXSETS)
7037 		return ((mddb_set_t *)NULL);
7038 
7039 	if (md_set[setno].s_db)
7040 		return (mddb_setenter(setno, flag, errp));
7041 
7042 	s = (mddb_set_t *)kmem_zalloc(sizeof (*s), KM_SLEEP);
7043 
7044 	cv_init(&s->s_buf_cv, NULL, CV_DEFAULT, NULL);
7045 	cv_init(&s->s_single_thread_cv, NULL, CV_DEFAULT, NULL);
7046 	cv_init(&s->s_optqueuing_cv, NULL, CV_DEFAULT, NULL);
7047 	cv_init(&s->s_opthungerr_cv, NULL, CV_DEFAULT, NULL);
7048 	cv_init(&s->s_optwantlck_cv, NULL, CV_DEFAULT, NULL);
7049 
7050 	s->s_setno = setno;
7051 	s->s_sideno = sideno;
7052 	if (setno == MD_LOCAL_SET) {
7053 		(void) strcpy(s->s_ident.serial, hw_serial);
7054 	} else {
7055 		s->s_ident.createtime = *created;
7056 		s->s_setname = (char *)kmem_alloc(strlen(setname) + 1,
7057 		    KM_SLEEP);
7058 		(void) strcpy(s->s_setname, setname);
7059 	}
7060 
7061 	/* have a config struct,  copy mediator information */
7062 	if (cp != NULL)
7063 		s->s_med = cp->c_med;		/* structure assignment */
7064 
7065 	md_set[setno].s_db = (void *) s;
7066 
7067 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_TAKEOVER, SVM_TAG_SET, setno, NODEV64);
7068 
7069 	return (mddb_setenter(setno, flag, errp));
7070 }
7071 
7072 void
7073 mddb_unload_set(
7074 	set_t		setno
7075 )
7076 {
7077 
7078 	mddb_set_t	*s;
7079 	mddb_db_t	*dbp, *adbp = NULL;
7080 	mddb_de_ic_t	*dep, *dep2;
7081 	mddb_bf_t	*bfp;
7082 	int		i;
7083 	md_dev64_t	dev;
7084 
7085 	if ((s = mddb_setenter(setno, MDDB_NOINIT, NULL)) == NULL)
7086 		return;
7087 
7088 	single_thread_start(s);
7089 
7090 	s->s_opthavequeuinglck = 0;
7091 	s->s_optwantqueuinglck = 0;
7092 
7093 	for (dbp = s->s_dbp; dbp != 0; dbp = adbp) {
7094 		for (dep = dbp->db_firstentry; dep != NULL; dep = dep2) {
7095 			if (dep->de_rb_userdata != NULL) {
7096 				if (dep->de_icreqsize)
7097 					kmem_free(dep->de_rb_userdata_ic,
7098 					    dep->de_icreqsize);
7099 				else
7100 					kmem_free(dep->de_rb_userdata,
7101 					    dep->de_reqsize);
7102 			}
7103 			kmem_free((caddr_t)dep->de_rb, dep->de_recsize);
7104 			dep2 = dep->de_next;
7105 			kmem_free((caddr_t)dep, sizeofde(dep));
7106 		}
7107 		adbp = dbp->db_next;
7108 		kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
7109 	}
7110 	s->s_dbp = (mddb_db_t *)NULL;
7111 
7112 	free_rip(&s->s_rip);
7113 
7114 	for (i = 0; i < mddb_maxcopies;	 i++) {
7115 		if (! s->s_mbiarray)
7116 			break;
7117 
7118 		if (! s->s_mbiarray[i])
7119 			continue;
7120 
7121 		dev = md_expldev(s->s_lbp->lb_locators[i].l_dev);
7122 		dev = md_xlate_targ_2_mini(dev);
7123 		if (dev != NODEV64) {
7124 			mddb_devclose(dev);
7125 			free_mbipp(&s->s_mbiarray[i]);
7126 		}
7127 	}
7128 
7129 	if (s->s_mbiarray) {
7130 		kmem_free((caddr_t)s->s_mbiarray,
7131 		    sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
7132 		s->s_mbiarray = (mddb_mb_ic_t **)NULL;
7133 	}
7134 
7135 	if (s->s_lnp) {
7136 		kmem_free((caddr_t)s->s_lnp, dbtob(s->s_lbp->lb_lnblkcnt));
7137 		s->s_lnp = (mddb_ln_t *)NULL;
7138 	}
7139 
7140 	if (s->s_lbp) {
7141 		mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);
7142 		kmem_free((caddr_t)s->s_lbp, dbtob(s->s_lbp->lb_blkcnt));
7143 		s->s_lbp = (mddb_lb_t *)NULL;
7144 	}
7145 
7146 	if (s->s_freebitmap) {
7147 		kmem_free((caddr_t)s->s_freebitmap, s->s_freebitmapsize);
7148 		s->s_freebitmap = NULL;
7149 		s->s_freebitmapsize = 0;
7150 	}
7151 
7152 	while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
7153 		kmem_free((caddr_t)bfp, sizeof (*bfp));
7154 
7155 	if (s->s_databuffer_size) {
7156 		kmem_free(s->s_databuffer, s->s_databuffer_size);
7157 		s->s_databuffer_size = 0;
7158 	}
7159 
7160 	if (s->s_setname != NULL)
7161 		kmem_free((caddr_t)s->s_setname, strlen(s->s_setname)+1);
7162 
7163 	/* Data tags not supported on MN sets. */
7164 	if (!(md_get_setstatus(setno) & MD_SET_MNSET))
7165 		dtl_freel(&s->s_dtlp);
7166 
7167 	md_set[setno].s_db = NULL;
7168 	ASSERT(s->s_singlelockwanted == 0);
7169 	kmem_free(s, sizeof (mddb_set_t));
7170 
7171 	/* Take care of things setup in the md_set array */
7172 	if (! (md_get_setstatus(setno) & MD_SET_KEEPTAG)) {
7173 		if (md_set[setno].s_dtp) {
7174 			kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
7175 			md_set[setno].s_dtp = NULL;
7176 		}
7177 	}
7178 
7179 	md_clr_setstatus(setno, MD_SET_ACCOK | MD_SET_ACCEPT |
7180 				MD_SET_TAGDATA | MD_SET_USETAG |
7181 				MD_SET_TOOFEW | MD_SET_STALE |
7182 				MD_SET_OWNERSHIP | MD_SET_BADTAG |
7183 				MD_SET_CLRTAG | MD_SET_MNSET |
7184 				MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK |
7185 				MD_SET_MN_MIR_STATE_RC | MD_SET_IMPORT |
7186 				MD_SET_REPLICATED_IMPORT);
7187 
7188 	mutex_exit(SETMUTEX(setno));
7189 }
7190 
7191 /*
7192  * returns 0 if name can be put into locator block
7193  * returns 1 if locator block prefixes are all used
7194  *
7195  * Takes splitname (suffix, prefix, sideno) and
7196  * stores it in the locator name structure.
7197  * For traditional diskset, the sideno is the index into the suffixes
7198  * array in the locator name structure.
7199  * For the MN diskset, the sideno is the nodeid which can be any number,
7200  * so the index passed in is the index into the mnsuffixes array
7201  * in the locator structure.  This index was computed by the
7202  * routine checklocator which basically checked the locator block
7203  * mnside locator structure.
7204  */
7205 static int
7206 splitname2locatorblock(
7207 	md_splitname	*spn,
7208 	mddb_ln_t	*lnp,
7209 	int		li,
7210 	side_t		sideno,
7211 	int		index
7212 )
7213 {
7214 	uchar_t			i;
7215 	md_name_suffix		*sn;
7216 	md_mnname_suffix_t	*mnsn;
7217 	mddb_mnln_t		*mnlnp;
7218 
7219 	for (i = 0; i < MDDB_PREFIXCNT; i++) {
7220 		if (lnp->ln_prefixes[i].pre_len != SPN_PREFIX(spn).pre_len)
7221 			continue;
7222 		if (bcmp(lnp->ln_prefixes[i].pre_data, SPN_PREFIX(spn).pre_data,
7223 		    SPN_PREFIX(spn).pre_len) == 0)
7224 			break;
7225 	}
7226 	if (i == MDDB_PREFIXCNT) {
7227 		for (i = 0; i < MDDB_PREFIXCNT; i++) {
7228 			if (lnp->ln_prefixes[i].pre_len == 0)
7229 				break;
7230 		}
7231 		if (i == MDDB_PREFIXCNT)
7232 			return (1);
7233 		bcopy(SPN_PREFIX(spn).pre_data, lnp->ln_prefixes[i].pre_data,
7234 		    SPN_PREFIX(spn).pre_len);
7235 		lnp->ln_prefixes[i].pre_len = SPN_PREFIX(spn).pre_len;
7236 	}
7237 
7238 	if (lnp->ln_revision == MDDB_REV_MNLN) {
7239 		/* If a MN diskset, use index */
7240 		mnlnp = (mddb_mnln_t *)lnp;
7241 		mnsn = &mnlnp->ln_mnsuffixes[index][li];
7242 		mnsn->mn_ln_sideno = sideno;
7243 		mnsn->mn_ln_suffix.suf_len = SPN_SUFFIX(spn).suf_len;
7244 		mnsn->mn_ln_suffix.suf_prefix = i;
7245 		bcopy(SPN_SUFFIX(spn).suf_data,
7246 		    mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_len);
7247 	} else {
7248 		sn = &lnp->ln_suffixes[sideno][li];
7249 		sn->suf_len = SPN_SUFFIX(spn).suf_len;
7250 		sn->suf_prefix = i;
7251 		bcopy(SPN_SUFFIX(spn).suf_data, sn->suf_data,
7252 		    SPN_SUFFIX(spn).suf_len);
7253 	}
7254 	return (0);
7255 }
7256 
7257 /*
7258  * Find the locator name for the given sideno and convert the locator name
7259  * information into a splitname structure.
7260  */
7261 void
7262 mddb_locatorblock2splitname(
7263 	mddb_ln_t	*lnp,
7264 	int		li,
7265 	side_t		sideno,
7266 	md_splitname	*spn
7267 )
7268 {
7269 	int			iprefix;
7270 	md_name_suffix		*sn;
7271 	md_mnname_suffix_t	*mnsn;
7272 	int			i;
7273 	mddb_mnln_t		*mnlnp;
7274 
7275 	if (lnp->ln_revision == MDDB_REV_MNLN) {
7276 		mnlnp = (mddb_mnln_t *)lnp;
7277 		for (i = 0; i < MD_MNMAXSIDES; i++) {
7278 			mnsn = &mnlnp->ln_mnsuffixes[i][li];
7279 			if (mnsn->mn_ln_sideno == sideno)
7280 				break;
7281 		}
7282 		if (i == MD_MNMAXSIDES)
7283 			return;
7284 
7285 		SPN_SUFFIX(spn).suf_len = mnsn->mn_ln_suffix.suf_len;
7286 		bcopy(mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_data,
7287 			SPN_SUFFIX(spn).suf_len);
7288 		iprefix = mnsn->mn_ln_suffix.suf_prefix;
7289 	} else {
7290 		sn = &lnp->ln_suffixes[sideno][li];
7291 		SPN_SUFFIX(spn).suf_len = sn->suf_len;
7292 		bcopy(sn->suf_data, SPN_SUFFIX(spn).suf_data,
7293 			SPN_SUFFIX(spn).suf_len);
7294 		iprefix = sn->suf_prefix;
7295 	}
7296 	SPN_PREFIX(spn).pre_len = lnp->ln_prefixes[iprefix].pre_len;
7297 	bcopy(lnp->ln_prefixes[iprefix].pre_data, SPN_PREFIX(spn).pre_data,
7298 	    SPN_PREFIX(spn).pre_len);
7299 }
7300 
7301 static int
7302 getdeldev(
7303 	mddb_config_t	*cp,
7304 	int		command,
7305 	md_error_t	*ep
7306 )
7307 {
7308 	mddb_set_t	*s;
7309 	mddb_lb_t	*lbp;
7310 	mddb_locator_t	*locators;
7311 	uint_t		loccnt;
7312 	mddb_mb_ic_t	*mbip;
7313 	mddb_block_t	blk;
7314 	int		err = 0;
7315 	int		i, j;
7316 	int		li;
7317 	uint_t		commitcnt;
7318 	set_t		setno = cp->c_setno;
7319 	uint_t		set_status;
7320 	md_dev64_t	dev;
7321 	int		flags = MDDB_MUSTEXIST;
7322 
7323 	cp->c_dbmax = MDDB_NLB;
7324 
7325 	/*
7326 	 * Data checking
7327 	 */
7328 	if (setno >= md_nsets || cp->c_id < 0 ||
7329 		cp->c_id > cp->c_dbmax) {
7330 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
7331 	}
7332 
7333 	if (cp->c_flags & MDDB_C_STALE)
7334 		flags |= MDDB_MN_STALE;
7335 
7336 	if ((s = mddb_setenter(setno, flags, &err)) == NULL)
7337 		return (mddbstatus2error(ep, err, NODEV32, setno));
7338 
7339 	cp->c_flags = 0;
7340 
7341 	lbp = s->s_lbp;
7342 	loccnt = lbp->lb_loccnt;
7343 	locators = lbp->lb_locators;
7344 
7345 	/* shorthand */
7346 	set_status = md_get_setstatus(setno);
7347 
7348 	if (set_status & MD_SET_STALE)
7349 		cp->c_flags |= MDDB_C_STALE;
7350 
7351 	if (set_status & MD_SET_TOOFEW)
7352 		cp->c_flags |= MDDB_C_TOOFEW;
7353 
7354 	cp->c_sideno = s->s_sideno;
7355 
7356 	cp->c_dbcnt = 0;
7357 	/*
7358 	 * go through and count active entries
7359 	 */
7360 	for (i = 0; i < loccnt;	 i++) {
7361 		if (locators[i].l_flags & MDDB_F_DELETED)
7362 			continue;
7363 		cp->c_dbcnt++;
7364 	}
7365 
7366 	/*
7367 	 * add the ability to accept a locator block index
7368 	 * which is not relative to previously deleted replicas.  This
7369 	 * is for support of MD_DEBUG=STAT in metastat since it asks for
7370 	 * replica information specifically for each of the mirror resync
7371 	 * records.  MDDB_CONFIG_SUBCMD uses one of the pad spares in
7372 	 * the mddb_config_t type.
7373 	 */
7374 	if (cp->c_subcmd == MDDB_CONFIG_ABS) {
7375 		if (cp->c_id < 0 || cp->c_id > cp->c_dbmax) {
7376 			mddb_setexit(s);
7377 			return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
7378 						setno));
7379 		}
7380 		li = cp->c_id;
7381 	} else {
7382 		if (cp->c_id >= cp->c_dbcnt) {
7383 			mddb_setexit(s);
7384 			return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
7385 						setno));
7386 		}
7387 
7388 		/* CSTYLED */
7389 		for (li = 0, j = 0; /* void */; li++) {
7390 			if (locators[li].l_flags & MDDB_F_DELETED)
7391 				continue;
7392 			j++;
7393 			if (j > cp->c_id)
7394 				break;
7395 		}
7396 	}
7397 
7398 	if (command == MDDB_ENDDEV) {
7399 		daddr_t ib = 0, jb;
7400 
7401 		blk = 0;
7402 		if ((s != NULL) && s->s_mbiarray[li]) {
7403 			mbip = s->s_mbiarray[li];
7404 			while ((jb = getphysblk(blk++, mbip)) > 0) {
7405 				if (jb > ib)
7406 					ib = jb;
7407 			}
7408 			cp->c_dbend = (int)ib;
7409 		} else {
7410 			cp->c_dbend = 0;
7411 		}
7412 	}
7413 
7414 	locator2cfgloc(lbp, &cp->c_locator, li, s->s_sideno, s->s_did_icp);
7415 	mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno, &cp->c_devname);
7416 
7417 	if (command != MDDB_DELDEV) {
7418 		mddb_setexit(s);
7419 		return (0);
7420 	}
7421 
7422 	/* Currently don't allow addition/deletion of sides during upgrade */
7423 	if (MD_UPGRADE) {
7424 		cmn_err(CE_WARN,
7425 		    "Deletion of replica not allowed during upgrade.\n");
7426 		mddb_setexit(s);
7427 		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
7428 	}
7429 
7430 	/*
7431 	 * If here, replica delete in progress.
7432 	 */
7433 	single_thread_start(s);
7434 
7435 	if ((! (locators[li].l_flags & MDDB_F_EMASTER)) &&
7436 	    (locators[li].l_flags & MDDB_F_ACTIVE)) {
7437 		commitcnt = lbp->lb_commitcnt;
7438 		lbp->lb_commitcnt = 0;
7439 		setidentifier(s, &lbp->lb_ident);
7440 		crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
7441 		/*
7442 		 * Don't need to write out device id area, since locator
7443 		 * block on this replica is being deleted by setting the
7444 		 * commitcnt to 0.
7445 		 */
7446 		(void) writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
7447 			MDDB_WR_ONLY_MASTER);
7448 		lbp->lb_commitcnt = commitcnt;
7449 	}
7450 
7451 	if (s->s_mbiarray[li])
7452 		free_mbipp(&s->s_mbiarray[li]);
7453 
7454 	if (! (locators[li].l_flags & MDDB_F_EMASTER)) {
7455 		dev = md_expldev(locators[li].l_dev);
7456 		dev = md_xlate_targ_2_mini(dev);
7457 		if (dev != NODEV64)
7458 			mddb_devclose(dev);
7459 	}
7460 
7461 	s->s_mbiarray[li] = 0;
7462 	lbp->lb_locators[li].l_flags = MDDB_F_DELETED;
7463 
7464 	/* Only support data tags for traditional and local sets */
7465 	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
7466 	    (!(lbp->lb_flags & MDDB_MNSET)) &&
7467 	    setno != MD_LOCAL_SET)
7468 		if (set_dtag(s, ep))
7469 			mdclrerror(ep);
7470 
7471 	/* Write data tags to all accessible devices */
7472 	/* Only support data tags for traditional and local sets */
7473 	if (!(lbp->lb_flags & MDDB_MNSET)) {
7474 		(void) dt_write(s);
7475 	}
7476 
7477 	/* Delete device id of deleted replica */
7478 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
7479 		(void) mddb_devid_delete(s, li);
7480 	}
7481 	/* write new locator to all devices */
7482 	err = writelocall(s);
7483 
7484 	(void) upd_med(s, "getdeldev(0)");
7485 
7486 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_REPLICA, setno,
7487 	    md_expldev(locators[li].l_dev));
7488 
7489 	computefreeblks(s); /* recompute always it may be larger */
7490 	cp->c_dbcnt--;
7491 	err |= fixoptrecords(s);
7492 	if (err) {
7493 		if (writeretry(s)) {
7494 			single_thread_end(s);
7495 			mddb_setexit(s);
7496 			return (mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno));
7497 		}
7498 	}
7499 
7500 	single_thread_end(s);
7501 	mddb_setexit(s);
7502 	return (0);
7503 }
7504 
7505 static int
7506 getdriver(
7507 	mddb_cfg_loc_t	*clp
7508 )
7509 {
7510 	major_t		majordev;
7511 
7512 	/*
7513 	 * Data checking
7514 	 */
7515 	if (clp->l_dev <= 0)
7516 		return (EINVAL);
7517 
7518 	majordev = getmajor(expldev(clp->l_dev));
7519 
7520 	if (ddi_major_to_name(majordev) == (char *)NULL)
7521 		return (EINVAL);
7522 
7523 	if (MD_UPGRADE)
7524 		(void) strcpy(clp->l_driver, md_targ_major_to_name(majordev));
7525 	else
7526 		(void) strcpy(clp->l_driver, ddi_major_to_name(majordev));
7527 	return (0);
7528 }
7529 
7530 /*
7531  * update_valid_replica - updates the locator block namespace (prefix
7532  * 	and/or suffix) with new pathname and devname.
7533  *	RETURN
7534  *		1	Error
7535  *		0	Success
7536  */
7537 static int
7538 update_valid_replica(
7539 	side_t		side,
7540 	mddb_locator_t	*lp,
7541 	mddb_set_t	*s,
7542 	int		li,
7543 	char		*devname,
7544 	char		*pathname,
7545 	md_dev64_t	devt
7546 )
7547 {
7548 	uchar_t		pre_len, suf_len;
7549 	md_name_suffix	*sn;
7550 	mddb_ln_t	*lnp;
7551 	uchar_t		pre_index;
7552 	uchar_t		i;
7553 
7554 	if (md_expldev(lp->l_dev) != devt) {
7555 		return (0);
7556 	}
7557 
7558 	if (pathname[strlen(pathname) - 1] == '/')
7559 		pathname[strlen(pathname) - 1] = '\0';
7560 
7561 	pre_len = (uchar_t)strlen(pathname);
7562 	suf_len = (uchar_t)strlen(devname);
7563 
7564 	if ((pre_len > MD_MAXPREFIX) || (suf_len > MD_MAXSUFFIX))
7565 		return (1);
7566 
7567 	lnp = s->s_lnp;
7568 
7569 	/*
7570 	 * Future note:  Need to do something here for the MN diskset case
7571 	 * when device ids are supported in disksets.
7572 	 * Can't add until merging devids_in_diskset code into code base
7573 	 * Currently only called with side of 0.
7574 	 */
7575 
7576 	sn = &lnp->ln_suffixes[side][li];
7577 
7578 	/*
7579 	 * Check if prefix (Ex: /dev/dsk) needs to be changed.
7580 	 * If new prefix is the same as the previous prefix - no change.
7581 	 *
7582 	 * If new prefix is not the same, check if new prefix
7583 	 * matches an existing one.  If so, use that one.
7584 	 *
7585 	 * If new prefix doesn't exist, add a new prefix.  If not enough
7586 	 * space, return failure.
7587 	 */
7588 	pre_index = sn->suf_prefix;
7589 	/* Check if new prefix is the same as the old prefix. */
7590 	if ((lnp->ln_prefixes[pre_index].pre_len != pre_len) ||
7591 	    (bcmp(lnp->ln_prefixes[pre_index].pre_data, pathname,
7592 	    pre_len) != 0)) {
7593 		/* Check if new prefix is an already known prefix. */
7594 		for (i = 0; i < MDDB_PREFIXCNT; i++) {
7595 			if (lnp->ln_prefixes[i].pre_len != pre_len) {
7596 				continue;
7597 			}
7598 			if (bcmp(lnp->ln_prefixes[i].pre_data, pathname,
7599 			    pre_len) == 0) {
7600 				break;
7601 			}
7602 		}
7603 		/* If no match found for new prefix - add the new prefix */
7604 		if (i == MDDB_PREFIXCNT) {
7605 			for (i = 0; i < MDDB_PREFIXCNT; i++) {
7606 				if (lnp->ln_prefixes[i].pre_len == 0)
7607 					break;
7608 			}
7609 			/* No space to add new prefix - return failure */
7610 			if (i == MDDB_PREFIXCNT) {
7611 				return (1);
7612 			}
7613 			bcopy(pathname, lnp->ln_prefixes[i].pre_data, pre_len);
7614 			lnp->ln_prefixes[i].pre_len = pre_len;
7615 		}
7616 		sn->suf_prefix = i;
7617 	}
7618 
7619 	/* Now, update the suffix (Ex: c0t0d0s0) if needed */
7620 	if ((sn->suf_len != suf_len) ||
7621 	    (bcmp(sn->suf_data, devname, suf_len) != 0)) {
7622 		bcopy(devname, sn->suf_data, suf_len);
7623 		sn->suf_len = suf_len;
7624 	}
7625 	return (0);
7626 }
7627 
7628 
7629 /*
7630  * md_update_locator_namespace - If in devid style and active and the devid's
7631  *		exist and are valid update the locator namespace pathname
7632  *		and devname.
7633  *	RETURN
7634  *		1	Error
7635  *		0	Success
7636  */
7637 int
7638 md_update_locator_namespace(
7639 	set_t		setno,		/* which set to get name from */
7640 	side_t		side,
7641 	char		*dname,
7642 	char		*pname,
7643 	md_dev64_t	devt
7644 )
7645 {
7646 	mddb_set_t	*s;
7647 	mddb_lb_t	*lbp;
7648 	int		li;
7649 	uint_t		flg;
7650 	int		err = 0;
7651 	mddb_ln_t	*lnp;
7652 
7653 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
7654 		return (1);
7655 	single_thread_start(s);
7656 	lbp = s->s_lbp;
7657 	/* must be DEVID_STYLE */
7658 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
7659 		for (li = 0; li < lbp->lb_loccnt; li++) {
7660 			mddb_locator_t *lp = &lbp->lb_locators[li];
7661 
7662 			if (lp->l_flags & MDDB_F_DELETED) {
7663 				continue;
7664 			}
7665 
7666 			/* replica also must be active */
7667 			if (lp->l_flags & MDDB_F_ACTIVE) {
7668 				flg = s->s_did_icp->did_ic_blkp->
7669 				    blk_info[li].info_flags;
7670 				/* only update if did exists and is valid */
7671 				if ((flg & MDDB_DID_EXISTS) &&
7672 				    (flg & MDDB_DID_VALID)) {
7673 					if (update_valid_replica(side, lp, s,
7674 					    li, dname, pname, devt)) {
7675 						err = 1;
7676 						goto out;
7677 					}
7678 				}
7679 			}
7680 		}
7681 	}
7682 	lnp = s->s_lnp;
7683 	uniqtime32(&lnp->ln_timestamp);
7684 	if (lbp->lb_flags & MDDB_MNSET)
7685 		lnp->ln_revision = MDDB_REV_MNLN;
7686 	else
7687 		lnp->ln_revision = MDDB_REV_LN;
7688 	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
7689 	err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
7690 		lbp->lb_lnblkcnt, 0);
7691 	/*
7692 	 * If a MN diskset and this is the master, set the PARSE_LOCNM
7693 	 * flag in the mddb_set structure to show that the locator
7694 	 * names have changed.
7695 	 */
7696 
7697 	if ((lbp->lb_flags & MDDB_MNSET) &&
7698 	    (md_set[s->s_setno].s_am_i_master)) {
7699 		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
7700 	}
7701 out:
7702 	single_thread_end(s);
7703 	mddb_setexit(s);
7704 	if (err)
7705 		return (1);
7706 	return (0);
7707 }
7708 
7709 /*
7710  * update_locatorblock - for active entries in the locator block, check
7711  *		the devt to see if it matches the given devt. If so, and
7712  *		there is an associated device id which is not the same
7713  *		as the passed in devid, delete old devid and add a new one.
7714  *
7715  *		During import of replicated disksets, old_didptr contains
7716  *		the original disk's device id.  Use this device id in
7717  *		addition to the devt to determine if an entry is a match
7718  *		and should be updated with the new device id of the
7719  *		replicated disk.  Specifically, this is the case being handled:
7720  *
7721  *		Original_disk	Replicated_disk	Disk_Available_During_Import
7722  *		c1t1d0		c1t3d0		no - so old name c1t1d0 shown
7723  *		c1t2d0		c1t1d0		yes - name is c1t1d0
7724  *		c1t3d0		c1t2d0		yes - name is c1t2d0
7725  *
7726  *		Can't just match on devt since devt for the first and third
7727  *		disks will be the same, but the original disk's device id
7728  *		is known and can be used to distinguish which disk's
7729  *		replicated device id should be updated.
7730  *	RETURN
7731  *		MDDB_E_NODEVID
7732  *		MDDB_E_NOLOCBLK
7733  *		1	Error
7734  *		0	Success
7735  */
7736 static int
7737 update_locatorblock(
7738 	mddb_set_t	*s,
7739 	md_dev64_t	dev,
7740 	ddi_devid_t	didptr,
7741 	ddi_devid_t	old_didptr
7742 )
7743 {
7744 	mddb_lb_t	*lbp = NULL;
7745 	mddb_locator_t	*lp;
7746 	int		li;
7747 	uint_t		flg;
7748 	ddi_devid_t	devid_ptr;
7749 	int		retval = 0;
7750 	char		*minor_name;
7751 	int		repl_import_flag;
7752 
7753 	/* Set replicated flag if this is a replicated import */
7754 	repl_import_flag = md_get_setstatus(s->s_setno) &
7755 	    MD_SET_REPLICATED_IMPORT;
7756 
7757 	lbp = s->s_lbp;
7758 	/* find replicas that haven't been deleted */
7759 	for (li = 0; li < lbp->lb_loccnt; li++) {
7760 		lp = &lbp->lb_locators[li];
7761 
7762 		if ((lp->l_flags & MDDB_F_DELETED)) {
7763 			continue;
7764 		}
7765 		/*
7766 		 * check to see if locator devt matches given dev
7767 		 * and if there is a device ID associated with it
7768 		 */
7769 		flg = s->s_did_icp->did_ic_blkp-> blk_info[li].info_flags;
7770 		if ((md_expldev(lp->l_dev) == dev) &&
7771 		    (flg & MDDB_DID_EXISTS)) {
7772 			if (flg & MDDB_DID_VALID) {
7773 				continue; /* cont to nxt active entry */
7774 			}
7775 			devid_ptr = s->s_did_icp->did_ic_devid[li];
7776 			if (devid_ptr == NULL) {
7777 				return (MDDB_E_NODEVID);
7778 			}
7779 
7780 			/*
7781 			 * During a replicated import the old_didptr
7782 			 * must match the current devid before the
7783 			 * devid can be updated.
7784 			 */
7785 			if (repl_import_flag) {
7786 				if (ddi_devid_compare(devid_ptr,
7787 				    old_didptr) != 0)
7788 					continue;
7789 			}
7790 
7791 			if (ddi_devid_compare(devid_ptr, didptr) != 0) {
7792 				/*
7793 				 * devid's not equal so
7794 				 * delete and add
7795 				 */
7796 				if (ddi_lyr_get_minor_name(
7797 				    md_dev64_to_dev(dev),
7798 				    S_IFBLK, &minor_name) == DDI_SUCCESS) {
7799 					(void) mddb_devid_delete(s, li);
7800 					(void) mddb_devid_add(s, li, didptr,
7801 					    minor_name);
7802 					kmem_free(minor_name,
7803 					    strlen(minor_name)+1);
7804 					break;
7805 				} else {
7806 					retval = 1;
7807 					goto err_out;
7808 				}
7809 			}
7810 		}
7811 	} /* end for */
7812 	retval = push_lb(s);
7813 	(void) upd_med(s, "update_locatorblock(0)");
7814 err_out:
7815 	return (retval);
7816 }
7817 
7818 static int
7819 update_mb_devid(
7820 	mddb_set_t	*s,
7821 	mddb_ri_t	*rip,
7822 	ddi_devid_t	devidptr
7823 )
7824 {
7825 	mddb_mb_ic_t	*mbip;
7826 	mddb_mb_t	*mb = NULL;
7827 	daddr_t		blkno;
7828 	md_dev64_t	device;
7829 	uint_t		sz;
7830 	int		mb2free = 0;
7831 	int		err = 0;
7832 
7833 
7834 	/*
7835 	 * There is case where a disk may not have mddb,
7836 	 * and only has dummy mddb which contains
7837 	 * a valid devid we like to update and in this
7838 	 * case, the rip_lbp will be NULL but we still
7839 	 * like to update the devid embedded in the
7840 	 * dummy mb block.
7841 	 *
7842 	 */
7843 	if (rip->ri_mbip != (mddb_mb_ic_t *)NULL) {
7844 		mbip = rip->ri_mbip;
7845 		mb = &mbip->mbi_mddb_mb;
7846 	} else {
7847 		/*
7848 		 * Done if it is non-replicated set
7849 		 */
7850 		if (devidptr != (ddi_devid_t)NULL) {
7851 			mb = (mddb_mb_t *)kmem_zalloc(MDDB_BSIZE,
7852 				KM_SLEEP);
7853 			mb->mb_magic = MDDB_MAGIC_DU;
7854 			mb->mb_revision = MDDB_REV_MB;
7855 			mb2free = 1;
7856 		} else {
7857 			goto out;
7858 		}
7859 	}
7860 
7861 	blkno = rip->ri_blkno;
7862 	device = rip->ri_dev;
7863 	/*
7864 	 * Replace the mb_devid with the new/valid one
7865 	 */
7866 	if (devidptr != (ddi_devid_t)NULL) {
7867 		/*
7868 		 * Zero out what we have previously
7869 		 */
7870 		if (mb->mb_devid_len)
7871 			bzero(mb->mb_devid, mb->mb_devid_len);
7872 		sz = ddi_devid_sizeof(devidptr);
7873 		bcopy((char *)devidptr, (char *)mb->mb_devid, sz);
7874 		mb->mb_devid_len = sz;
7875 	}
7876 
7877 	mb->mb_setno = s->s_setno;
7878 	uniqtime32(&mb->mb_timestamp);
7879 	crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL);
7880 	/*
7881 	 * putblks will
7882 	 *
7883 	 *	- drop the s_dbmx lock
7884 	 *	- biowait
7885 	 *	- regain the s_dbmx lock
7886 	 *
7887 	 * Need to update this if we wants to handle
7888 	 * mb_next != NULL which it is unlikely will happen
7889 	 */
7890 	err = putblks(s, (caddr_t)mb, blkno, 1, device, 0);
7891 
7892 	if (mb2free) {
7893 		kmem_free(mb, MDDB_BSIZE);
7894 	}
7895 out:
7896 	return (err);
7897 }
7898 
7899 static int
7900 setdid(
7901 	mddb_config_t		*cp
7902 )
7903 {
7904 	ddi_devid_t		devidp;
7905 	dev_t			ddi_dev;
7906 	mddb_set_t		*s;
7907 	int			err = 0;
7908 	mddb_ri_t		*rip;
7909 
7910 	/*
7911 	 * Data integrity check
7912 	 */
7913 	if (cp->c_setno >= md_nsets || cp->c_devt <= 0)
7914 		return (EINVAL);
7915 
7916 	if ((md_get_setstatus(cp->c_setno) & MD_SET_STALE))
7917 		return (0);
7918 
7919 	ddi_dev = md_dev64_to_dev(cp->c_devt);
7920 	if (ddi_lyr_get_devid(ddi_dev, &devidp) != DDI_SUCCESS) {
7921 		return (-1);
7922 	}
7923 	if (devidp == NULL) {
7924 		return (-1);
7925 	}
7926 
7927 	if ((s = mddb_setenter(cp->c_setno, MDDB_MUSTEXIST, &err)) == NULL)
7928 		return (-1);
7929 	single_thread_start(s);
7930 
7931 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
7932 		if (rip->ri_lbp == (mddb_lb_t *)NULL)
7933 			continue;
7934 		/*
7935 		 * We only update what is asked
7936 		 */
7937 		if (rip->ri_dev == cp->c_devt) {
7938 			if (update_mb_devid(s, rip, devidp) != 0) {
7939 				err = -1;
7940 				goto out;
7941 			}
7942 		}
7943 	}
7944 
7945 	if (update_locatorblock(s, cp->c_devt, devidp, NULL)) {
7946 		err = -1;
7947 		goto out;
7948 	}
7949 
7950 out:
7951 	single_thread_end(s);
7952 	mddb_setexit(s);
7953 	ddi_devid_free(devidp);
7954 	return (err);
7955 }
7956 
7957 static int
7958 delnewside(
7959 	mddb_config_t		*cp,
7960 	int			command,
7961 	md_error_t		*ep
7962 )
7963 {
7964 	mddb_set_t		*s;
7965 	int			li;
7966 	mddb_lb_t		*lbp;		/* pointer to locator block */
7967 	mddb_ln_t		*lnp;		/* pointer to locator names */
7968 	mddb_mnln_t		*mnlnp;		/* pointer to locator names */
7969 	mddb_locator_t		*lp;
7970 	mddb_sidelocator_t	*slp;
7971 	mddb_cfg_loc_t		*clp;
7972 	int			err = 0;
7973 	set_t			setno = cp->c_setno;
7974 	ddi_devid_t		devid;
7975 	ddi_devid_t		ret_devid = NULL;
7976 	char			*minor_name;
7977 	uint_t			use_devid = 0;
7978 	dev_t			ddi_dev;
7979 	md_mnname_suffix_t	*mnsn;
7980 	mddb_mnlb_t		*mnlbp;
7981 	mddb_mnsidelocator_t	*mnslp;
7982 
7983 	/* Currently don't allow addition/deletion of sides during upgrade */
7984 	if (MD_UPGRADE) {
7985 		cmn_err(CE_WARN,
7986 		    "Addition and deletion of sides not allowed"
7987 		    " during upgrade. \n");
7988 		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
7989 	}
7990 
7991 	/*
7992 	 * Data integrity check
7993 	 */
7994 	if (setno >= md_nsets || cp->c_locator.l_dev <= 0)
7995 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
7996 
7997 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
7998 		return (mddbstatus2error(ep, err, NODEV32, setno));
7999 
8000 	single_thread_start(s);
8001 	clp = &cp->c_locator;
8002 
8003 	lbp = s->s_lbp;
8004 
8005 	if (lbp->lb_setno != setno) {
8006 		single_thread_end(s);
8007 		mddb_setexit(s);
8008 		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
8009 	}
8010 
8011 	/*
8012 	 * Find this device/blkno pair
8013 	 */
8014 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
8015 		ddi_dev = md_dev64_to_dev(clp->l_dev);
8016 		if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) &&
8017 		    (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, &minor_name)
8018 		    == DDI_SUCCESS)) {
8019 			if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) {
8020 				clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
8021 				use_devid = 1;
8022 				(void) strcpy(clp->l_minor_name, minor_name);
8023 			}
8024 			kmem_free(minor_name, strlen(minor_name)+1);
8025 		}
8026 		if (use_devid != 1 && ret_devid != NULL)
8027 			ddi_devid_free(ret_devid);
8028 	}
8029 	for (li = 0; li < lbp->lb_loccnt; li++) {
8030 		lp = &lbp->lb_locators[li];
8031 		if (lp->l_flags & MDDB_F_DELETED)
8032 			continue;
8033 		if (use_devid) {
8034 			if ((mddb_devid_get(s, li, &devid, &minor_name)) == 0)
8035 				continue;
8036 			if ((ddi_devid_compare(devid,
8037 			    (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) &&
8038 			    (strcmp(clp->l_minor_name, minor_name) == 0) &&
8039 			    ((daddr_t)lp->l_blkno == clp->l_blkno)) {
8040 				break;
8041 			}
8042 		} else {
8043 			if (lp->l_dev == clp->l_dev &&
8044 			    (daddr_t)lp->l_blkno == clp->l_blkno) {
8045 				break;
8046 			}
8047 		}
8048 	}
8049 
8050 	if (li == lbp->lb_loccnt) {
8051 		if (use_devid)
8052 			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8053 		single_thread_end(s);
8054 		mddb_setexit(s);
8055 		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
8056 	}
8057 
8058 	lnp = s->s_lnp;
8059 	if (command == MDDB_NEWSIDE) {
8060 		int 	index = 0;
8061 		/*
8062 		 * If a MN diskset, need to find the index where the new
8063 		 * locator information is to be stored in the mnsidelocator
8064 		 * field of the locator block so that the locator name can
8065 		 * be stored at the same array index in the mnsuffixes
8066 		 * field of the locator names structure.
8067 		 */
8068 		if (lbp->lb_flags & MDDB_MNSET) {
8069 			if ((index = checklocator(lbp, li,
8070 			    cp->c_sideno)) == -1) {
8071 				if (use_devid) {
8072 					ddi_devid_free((ddi_devid_t)
8073 					    (uintptr_t)clp->l_devid);
8074 				}
8075 				single_thread_end(s);
8076 				mddb_setexit(s);
8077 				return (mdmddberror(ep, MDE_DB_TOOSMALL,
8078 					NODEV32, setno));
8079 			}
8080 		}
8081 
8082 		/*
8083 		 * Store the locator name before the sidelocator information
8084 		 * in case a panic occurs between these 2 steps.  Must have
8085 		 * the locator name information in order to print reasonable
8086 		 * error information.
8087 		 */
8088 		if (splitname2locatorblock(&cp->c_devname, lnp, li,
8089 		    cp->c_sideno, index)) {
8090 			if (use_devid)
8091 				ddi_devid_free(
8092 				    (ddi_devid_t)(uintptr_t)clp->l_devid);
8093 			single_thread_end(s);
8094 			mddb_setexit(s);
8095 			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
8096 						setno));
8097 		}
8098 
8099 		if (cfgloc2locator(lbp, clp, li, cp->c_sideno, index)) {
8100 			if (use_devid)
8101 				ddi_devid_free(
8102 				    (ddi_devid_t)(uintptr_t)clp->l_devid);
8103 			single_thread_end(s);
8104 			mddb_setexit(s);
8105 			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
8106 						setno));
8107 		}
8108 	}
8109 
8110 	if (use_devid)
8111 		ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8112 
8113 	if (command == MDDB_DELSIDE) {
8114 		int i;
8115 		for (i = 0; i < lbp->lb_loccnt; i++) {
8116 			if (lbp->lb_flags & MDDB_MNSET) {
8117 				int	j;
8118 				mnlbp = (mddb_mnlb_t *)lbp;
8119 				for (j = 0; j < MD_MNMAXSIDES; j++) {
8120 				    mnslp = &mnlbp->lb_mnsidelocators[j][i];
8121 				    if (mnslp->mnl_sideno == cp->c_sideno)
8122 					break;
8123 				}
8124 				if (j < MD_MNMAXSIDES) {
8125 					mnslp->mnl_mnum = NODEV32;
8126 					mnslp->mnl_sideno = 0;
8127 					mnlnp = (mddb_mnln_t *)lnp;
8128 					mnsn = &(mnlnp->ln_mnsuffixes[j][i]);
8129 					bzero((caddr_t)mnsn,
8130 						sizeof (md_mnname_suffix_t));
8131 				}
8132 			} else {
8133 				slp = &lbp->lb_sidelocators[cp->c_sideno][i];
8134 				bzero((caddr_t)&lnp->ln_suffixes
8135 				    [cp->c_sideno][i], sizeof (md_name_suffix));
8136 				slp->l_mnum = NODEV32;
8137 			}
8138 		}
8139 	}
8140 
8141 	/* write new locator names to all devices */
8142 	uniqtime32(&lnp->ln_timestamp);
8143 	if (lbp->lb_flags & MDDB_MNSET)
8144 		lnp->ln_revision = MDDB_REV_MNLN;
8145 	else
8146 		lnp->ln_revision = MDDB_REV_LN;
8147 	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
8148 	err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
8149 		lbp->lb_lnblkcnt, 0);
8150 	/*
8151 	 * If a MN diskset and this is the master, set the PARSE_LOCNM
8152 	 * flag in the mddb_set structure to show that the locator
8153 	 * names have changed.
8154 	 */
8155 
8156 	if ((lbp->lb_flags & MDDB_MNSET) &&
8157 	    (md_set[s->s_setno].s_am_i_master)) {
8158 		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
8159 	}
8160 	if (err) {
8161 		if (writeretry(s)) {
8162 			single_thread_end(s);
8163 			mddb_setexit(s);
8164 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8165 		}
8166 	}
8167 
8168 	uniqtime32(&lbp->lb_timestamp);
8169 	/* write new locator to all devices */
8170 	err = writelocall(s);
8171 
8172 	(void) upd_med(s, "delnewside(0)");
8173 
8174 	computefreeblks(s); /* recompute always it may be larger */
8175 	if (err) {
8176 		if (writeretry(s)) {
8177 			single_thread_end(s);
8178 			mddb_setexit(s);
8179 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8180 		}
8181 	}
8182 
8183 	single_thread_end(s);
8184 	mddb_setexit(s);
8185 
8186 	return (0);
8187 }
8188 
8189 static int
8190 newdev(
8191 	mddb_config_t	*cp,
8192 	int		command,
8193 	md_error_t	*ep
8194 )
8195 {
8196 	mddb_set_t	*s;
8197 	mddb_mb_ic_t	*mbip, *mbip1;
8198 	int		i, j;
8199 	int		li;
8200 	mddb_lb_t	*lbp;		/* pointer to locator block */
8201 	mddb_ln_t	*lnp;		/* pointer to locator names */
8202 	mddb_locator_t	*lp;
8203 	mddb_cfg_loc_t	*clp;
8204 	int		err = 0;
8205 	set_t		setno = cp->c_setno;
8206 	ddi_devid_t	devid2;
8207 	ddi_devid_t	ret_devid = NULL;
8208 	char		*minor_name;
8209 	uint_t		use_devid = 0;
8210 	dev_t		ddi_dev;
8211 	int		old_flags;
8212 	int		flags;
8213 	int		mn_set = 0;
8214 	int		index;
8215 
8216 
8217 	/* Currently don't allow addition of new replica during upgrade */
8218 	if (MD_UPGRADE) {
8219 		cmn_err(CE_WARN,
8220 		    "Addition of new replica not allowed during upgrade.\n");
8221 		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8222 	}
8223 
8224 	/*
8225 	 * Data integrity check
8226 	 */
8227 	if (setno >= md_nsets || cp->c_locator.l_dev <= 0)
8228 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
8229 
8230 	/* Determine the flag settings for multinode sets */
8231 	flags = MDDB_NOOLDOK;
8232 	if (cp->c_multi_node)
8233 		flags |= MDDB_MULTINODE;
8234 
8235 	if ((s = mddb_setenter(setno, flags, &err)) == NULL) {
8236 		if (err != MDDB_E_NOTOWNER)
8237 			return (mddbstatus2error(ep, err, NODEV32, setno));
8238 		s = init_set(cp, flags, &err);
8239 		if (s == NULL)
8240 			return (mddbstatus2error(ep, err, NODEV32, setno));
8241 	}
8242 
8243 	single_thread_start(s);
8244 
8245 	/* shorthand */
8246 	clp = &cp->c_locator;
8247 
8248 	/* shorthand */
8249 	lbp = s->s_lbp;
8250 
8251 	if (lbp->lb_setno != setno) {
8252 		single_thread_end(s);
8253 		mddb_setexit(s);
8254 		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
8255 	}
8256 
8257 	/*
8258 	 * See if this device/blkno pair is already a replica
8259 	 */
8260 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
8261 		ddi_dev = expldev(clp->l_dev);
8262 		if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) &&
8263 		    (ddi_lyr_get_minor_name(ddi_dev,
8264 		    S_IFBLK, &minor_name) == DDI_SUCCESS)) {
8265 			if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) {
8266 				clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
8267 				use_devid = 1;
8268 				(void) strcpy(clp->l_minor_name, minor_name);
8269 			}
8270 			kmem_free(minor_name, strlen(minor_name)+1);
8271 		}
8272 		if (use_devid != 1 && ret_devid != NULL)
8273 			ddi_devid_free(ret_devid);
8274 	}
8275 
8276 	for (i = 0; i < lbp->lb_loccnt;	 i++) {
8277 		lp = &lbp->lb_locators[i];
8278 		if (lp->l_flags & MDDB_F_DELETED)
8279 			continue;
8280 		if (use_devid) {
8281 			if ((mddb_devid_get(s, i, &devid2, &minor_name)) == 0)
8282 				continue;
8283 			if ((ddi_devid_compare(devid2,
8284 			    (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) &&
8285 			    (strcmp(clp->l_minor_name, minor_name) == 0) &&
8286 			    ((daddr_t)lp->l_blkno == clp->l_blkno)) {
8287 				if (command == MDDB_NEWDEV) {
8288 					ddi_devid_free((ddi_devid_t)(uintptr_t)
8289 						clp->l_devid);
8290 					single_thread_end(s);
8291 					mddb_setexit(s);
8292 					return (mdmddberror(ep,
8293 						MDE_DB_EXISTS, NODEV32, setno));
8294 				}
8295 			}
8296 		} else {
8297 			if (lp->l_dev == clp->l_dev &&
8298 			    (daddr_t)lp->l_blkno == clp->l_blkno) {
8299 				if (command == MDDB_NEWDEV) {
8300 					single_thread_end(s);
8301 					mddb_setexit(s);
8302 					return (mdmddberror(ep,
8303 						MDE_DB_EXISTS, NODEV32, setno));
8304 				}
8305 			}
8306 		}
8307 	}
8308 
8309 	/*
8310 	 * Really is a new replica, go get the master blocks
8311 	 */
8312 	mbip = getmasters(s, md_expldev(clp->l_dev), clp->l_blkno,
8313 	    (uint_t *)0, &mn_set);
8314 	if (! mbip) {
8315 		if (use_devid)
8316 			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8317 		single_thread_end(s);
8318 		mddb_setexit(s);
8319 		return (mdmddberror(ep, MDE_DB_MASTER, NODEV32, setno));
8320 	}
8321 
8322 	/*
8323 	 * Compute free blocks in replica.
8324 	 */
8325 	computefreeblks(s);
8326 
8327 	/*
8328 	 * Check if this is large enough
8329 	 */
8330 	for (mbip1 = mbip, i = 0; mbip1 != NULL; mbip1 = mbip1->mbi_next)
8331 		i += mbip1->mbi_mddb_mb.mb_blkcnt;
8332 	for (j = i; j < s->s_totalblkcnt; j++) {
8333 		if (blkcheck(s, j)) {
8334 			while (mbip) {
8335 				mbip1 = mbip->mbi_next;
8336 				kmem_free((caddr_t)mbip, MDDB_IC_BSIZE);
8337 				mbip = mbip1;
8338 			}
8339 			if (use_devid)
8340 				ddi_devid_free(
8341 				    (ddi_devid_t)(uintptr_t)clp->l_devid);
8342 			mddb_devclose(md_expldev(clp->l_dev));
8343 			single_thread_end(s);
8344 			mddb_setexit(s);
8345 			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
8346 						setno));
8347 		}
8348 	}
8349 
8350 	/* Look for a deleted slot */
8351 	for (li = 0; li < lbp->lb_loccnt; li++) {
8352 		lp = &lbp->lb_locators[li];
8353 		if (lp->l_flags & MDDB_F_DELETED)
8354 			break;
8355 	}
8356 
8357 	/* If no deleted slots, add a new one */
8358 	if (li == lbp->lb_loccnt) {
8359 		/* Already have the max replicas, bail */
8360 		if (lbp->lb_loccnt == MDDB_NLB) {
8361 			if (use_devid)
8362 				ddi_devid_free((ddi_devid_t)(uintptr_t)
8363 				    clp->l_devid);
8364 			mddb_devclose(md_expldev(clp->l_dev));
8365 			single_thread_end(s);
8366 			mddb_setexit(s);
8367 			return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32,
8368 			    setno));
8369 		}
8370 		lbp->lb_loccnt++;
8371 		lp = &lbp->lb_locators[li];
8372 	}
8373 
8374 	/* Initialize the new or deleted slot */
8375 	old_flags = lp->l_flags;
8376 	lp->l_dev = clp->l_dev;
8377 	lp->l_blkno = (daddr32_t)clp->l_blkno;
8378 	lp->l_flags = clp->l_flags;
8379 
8380 	/* shorthand */
8381 	lnp = s->s_lnp;
8382 
8383 	index = 0;
8384 	if ((lbp->lb_flags & MDDB_MNSET) || (flags & MDDB_MULTINODE)) {
8385 		/*
8386 		 * If a MN diskset, need to find the index where the new
8387 		 * locator information is to be stored in the mnsidelocator
8388 		 * field of the locator block so that the locator name can
8389 		 * be stored at the same array index in the mnsuffixes
8390 		 * field of the locator names structure.
8391 		 */
8392 		lbp->lb_flags |= MDDB_MNSET;
8393 		if ((index = checklocator(lbp, li, s->s_sideno)) == -1) {
8394 			if (use_devid)
8395 				ddi_devid_free((ddi_devid_t)(uintptr_t)clp->
8396 				    l_devid);
8397 			lp->l_flags = old_flags;
8398 			lbp->lb_loccnt--;
8399 			mddb_devclose(md_expldev(clp->l_dev));
8400 			single_thread_end(s);
8401 			mddb_setexit(s);
8402 			return (mdmddberror(ep, MDE_DB_TOOSMALL,
8403 				NODEV32, setno));
8404 		}
8405 	}
8406 	/*
8407 	 * Store the locator name before the sidelocator information
8408 	 * in case a panic occurs between these 2 steps.  Must have
8409 	 * the locator name information in order to print reasonable
8410 	 * error information.
8411 	 */
8412 	if (splitname2locatorblock(&cp->c_devname, lnp, li,
8413 	    s->s_sideno, index)) {
8414 		if (use_devid)
8415 			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8416 		lp->l_flags = old_flags;
8417 		lbp->lb_loccnt--;
8418 		mddb_devclose(md_expldev(clp->l_dev));
8419 		single_thread_end(s);
8420 		mddb_setexit(s);
8421 		return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno));
8422 	}
8423 
8424 	/*
8425 	 * Compute free blocks in replica before calling cfgloc2locator
8426 	 * since cfgloc2locator may attempt to alloc an unused block
8427 	 * to store the device id.
8428 	 * mbiarray needs to be setup before calling computefreeblks.
8429 	 */
8430 	s->s_mbiarray[li] = mbip;
8431 	computefreeblks(s);
8432 
8433 	if (cfgloc2locator(lbp, clp, li, s->s_sideno, index)) {
8434 		if (use_devid)
8435 			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8436 		lp->l_flags = old_flags;
8437 		lbp->lb_loccnt--;
8438 		s->s_mbiarray[li] = 0;
8439 		mddb_devclose(md_expldev(clp->l_dev));
8440 		single_thread_end(s);
8441 		mddb_setexit(s);
8442 		return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno));
8443 	}
8444 
8445 	if (use_devid)
8446 		ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8447 
8448 	uniqtime32(&lbp->lb_timestamp);
8449 	lp->l_flags = MDDB_F_ACTIVE;
8450 
8451 	/* write db copy to new device */
8452 	err = writecopy(s, li, MDDB_WRITECOPY_ALL);
8453 	lp->l_flags |= MDDB_F_UP2DATE;
8454 
8455 	/* write new locator names to all devices */
8456 	uniqtime32(&lnp->ln_timestamp);
8457 	if (lbp->lb_flags & MDDB_MNSET)
8458 		lnp->ln_revision = MDDB_REV_MNLN;
8459 	else
8460 		lnp->ln_revision = MDDB_REV_LN;
8461 	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
8462 	err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
8463 		lbp->lb_lnblkcnt, 0);
8464 	/*
8465 	 * If a MN diskset and this is the master, set the PARSE_LOCNM
8466 	 * flag in the mddb_set structure to show that the locator
8467 	 * names have changed.
8468 	 */
8469 
8470 	if ((lbp->lb_flags & MDDB_MNSET) &&
8471 	    (md_set[s->s_setno].s_am_i_master)) {
8472 		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
8473 	}
8474 	if (err) {
8475 		if (writeretry(s)) {
8476 			single_thread_end(s);
8477 			mddb_setexit(s);
8478 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8479 		}
8480 	}
8481 
8482 	/* Data tags not supported on MN sets */
8483 	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
8484 	    (!(lbp->lb_flags & MDDB_MNSET)) &&
8485 	    setno != MD_LOCAL_SET)
8486 		if (set_dtag(s, ep))
8487 			mdclrerror(ep);
8488 
8489 	/* Write data tags to all accessible devices */
8490 	/* Data tags not supported on MN sets */
8491 	if (!(lbp->lb_flags & MDDB_MNSET)) {
8492 		(void) dt_write(s);
8493 	}
8494 
8495 	/* write new locator to all devices */
8496 	err = writelocall(s);
8497 
8498 	(void) upd_med(s, "newdev(0)");
8499 
8500 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_REPLICA, setno,
8501 	    md_expldev(clp->l_dev));
8502 
8503 	computefreeblks(s); /* recompute always it may be smaller */
8504 	if (err) {
8505 		if (writeretry(s)) {
8506 			single_thread_end(s);
8507 			mddb_setexit(s);
8508 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8509 		}
8510 	}
8511 
8512 	single_thread_end(s);
8513 	mddb_setexit(s);
8514 
8515 	return (0);
8516 }
8517 
8518 #ifdef DEBUG
8519 static void
8520 mddb_check_set(
8521 	set_t	setno
8522 )
8523 {
8524 	mddb_set_t	*s;
8525 	mddb_db_t	*dbp;
8526 	mddb_de_ic_t	*dep;
8527 	mddb_rb32_t	*rbp;
8528 
8529 	if (! md_set[setno].s_db)
8530 		return;
8531 
8532 	s = (mddb_set_t *)md_set[setno].s_db;
8533 
8534 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8535 		for (dep = dbp->db_firstentry;
8536 		    dep != NULL; dep = dep->de_next) {
8537 			rbp = dep->de_rb;
8538 			ASSERT(rbp->rb_magic == MDDB_MAGIC_RB);
8539 			if (dep->de_rb_userdata)
8540 				ASSERT((uintptr_t)dep->de_rb_userdata > 2000);
8541 		}
8542 	}
8543 }
8544 #endif /* DEBUG */
8545 
8546 /*
8547  * Exported Entry Points
8548  */
8549 #ifdef DEBUG
8550 void
8551 mddb_check(void)
8552 {
8553 	int	i;
8554 
8555 	for (i = 0; i < md_nsets; i++) {
8556 		if (! md_set[i].s_db)
8557 			return;
8558 
8559 		mddb_check_set(i);
8560 	}
8561 
8562 }
8563 #endif /* DEBUG */
8564 
8565 int
8566 mddb_configure(
8567 	mddb_cfgcmd_t	command,
8568 	mddb_config_t	*cp
8569 )
8570 {
8571 	mddb_set_t	*s;
8572 	md_error_t	*ep = &cp->c_mde;
8573 	int		flag = 0;
8574 	int		err = 0;
8575 	set_t		setno = cp->c_setno;
8576 
8577 	mdclrerror(ep);
8578 
8579 	switch (command) {
8580 	    case MDDB_NEWDEV:
8581 		err = newdev(cp, command, ep);
8582 		break;
8583 
8584 	    case MDDB_NEWSIDE:
8585 	    case MDDB_DELSIDE:
8586 		err = delnewside(cp, command, ep);
8587 		break;
8588 
8589 	    case MDDB_GETDEV:
8590 	    case MDDB_DELDEV:
8591 	    case MDDB_ENDDEV:
8592 		err = getdeldev(cp, command, ep);
8593 		break;
8594 
8595 	    case MDDB_GETDRVRNAME:
8596 		err = getdriver(&cp->c_locator);
8597 		break;
8598 
8599 	    case MDDB_USEDEV:
8600 		/*
8601 		 * Note: must allow USEDEV ioctl during upgrade to support
8602 		 * auto-take disksets.
8603 		 *
8604 		 * Also during the set import if the md_devid_destroy
8605 		 * flag is set then error out
8606 		 */
8607 
8608 		if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy)
8609 			return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
8610 
8611 		if (setno >= md_nsets)
8612 			return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
8613 
8614 		if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
8615 			if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
8616 				err = mddbstatus2error(ep, err, NODEV32, setno);
8617 				break;
8618 			}
8619 		}
8620 		if (setno == MD_LOCAL_SET)
8621 			flag = MDDB_F_IOCTL;
8622 		if (cp->c_locator.l_old_devid) {
8623 			md_set_setstatus(setno, MD_SET_REPLICATED_IMPORT);
8624 		}
8625 		err = ridev(&s->s_rip, &cp->c_locator, NULL, flag);
8626 		mddb_setexit(s);
8627 		break;
8628 
8629 	    case MDDB_RELEASESET:
8630 		mutex_enter(&mddb_lock);
8631 		mddb_unload_set(cp->c_setno);
8632 		mutex_exit(&mddb_lock);
8633 		break;
8634 
8635 	    case MDDB_SETDID:
8636 		err = setdid(cp);
8637 		break;
8638 
8639 	    default:
8640 		err = mdmddberror(ep, MDE_DB_INVALID, NODEV32, cp->c_setno);
8641 	}
8642 
8643 	return (err);
8644 }
8645 
8646 int
8647 mddb_getoptloc(
8648 	mddb_optloc_t		*ol
8649 )
8650 {
8651 	mddb_set_t		*s;
8652 	mddb_db_t		*dbp;
8653 	mddb_de_ic_t		*dep;
8654 	mddb_recid_t		id;
8655 	set_t			setno;
8656 
8657 	ol->li[0] = -1;
8658 	ol->li[1] = -1;
8659 
8660 	id = ol->recid;
8661 	setno = DBSET(id);
8662 	if (setno >= md_nsets)
8663 		return (EINVAL);
8664 
8665 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL)) == NULL)
8666 		return (0);
8667 
8668 	id = DBID(id);
8669 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8670 		for (dep = dbp->db_firstentry;
8671 		    dep != NULL; dep = dep->de_next) {
8672 			if (dep->de_recid != id)
8673 				continue;
8674 			ol->li[0] = dep->de_optinfo[0].o_li;
8675 			ol->li[1] = dep->de_optinfo[1].o_li;
8676 			mddb_setexit(s);
8677 			return (0);
8678 		}
8679 	}
8680 	mddb_setexit(s);
8681 	return (0);
8682 }
8683 
8684 void
8685 mddb_init(void)
8686 {
8687 	mddb_set_t	*s;
8688 
8689 	mutex_init(&mddb_lock, NULL, MUTEX_DEFAULT, NULL);
8690 	if ((s = init_set(NULL, MDDB_NOINIT, NULL)) != NULL)
8691 		mddb_setexit(s);
8692 }
8693 
8694 
8695 void
8696 mddb_unload(void)
8697 {
8698 	int	i;
8699 
8700 	mutex_enter(&mddb_lock);
8701 
8702 	for (i = 0; i < md_nsets; i++) {
8703 		md_clr_setstatus(i, MD_SET_KEEPTAG);
8704 		mddb_unload_set(i);
8705 	}
8706 
8707 	crcfreetab();
8708 
8709 	mutex_exit(&mddb_lock);
8710 }
8711 
/*
 * Create a new, zero-filled record in the database of set `setno'.
 *
 * The routine picks the lowest unused record id, finds (or appends) a
 * directory block with room for the new directory entry, allocates the
 * data blocks, and writes the record followed by the affected directory
 * block(s).
 *
 * Returns the full record id (set number and id combined by MAKERECID)
 * on success, otherwise one of:
 *	MDDB_E_INVALID	neither MD_CRO_32BIT nor MD_CRO_64BIT was given,
 *			or the record needs more blocks than one
 *			directory entry may describe
 *	MDDB_E_NOTNOW	checkstate() refused the operation, no free
 *			record id is left, or the writes failed and
 *			writeretry() could not recover
 *	MDDB_E_NOSPACE	not enough free blocks in the database
 * or the error code supplied by mddb_setenter() when the set cannot be
 * entered.
 */
mddb_recid_t
mddb_createrec(
	size_t		usersize,	 /* size of db record */
	mddb_type_t	type,		 /* type1 of db record */
	uint_t		type2,		 /* type2 of db record */
	md_create_rec_option_t	options, /* options for this creation  */
	set_t		setno		 /* set number to create record in */
)
{
	mddb_set_t	*s;
	mddb_db_t	*dbp, *prevdbp, *newdbp;
	mddb_db32_t	*db32p;
	mddb_de_ic_t	*dep;
	/* LINTED variable unused - used for sizeof calculations */
	mddb_de32_t	*de32p;
	mddb_rb32_t	*rbp;
	size_t		recsize;
	ulong_t		blkcnt;
	ulong_t		maxblocks;
	size_t		desize, desize_ic;
	size_t		used;
	mddb_recid_t	newid;
	caddr_t		tmppnt;
	int		i, err = 0;
	void		*userdata;
	uint_t		flag_type;

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

	/*
	 * everyone is supposed to specify if it's a
	 * 32 bit or a 64 bit record
	 */
	if ((options &(MD_CRO_32BIT|MD_CRO_64BIT)) == 0) {
		return (MDDB_E_INVALID);
	}

	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
		return (err);

	if (checkstate(s, MDDB_PROBE)) {
		mddb_setexit(s);
		return (MDDB_E_NOTNOW);
	}

	/*
	 * On-disk record size: record-block header plus the user data,
	 * rounded up to whole database blocks.
	 */
	recsize = roundup((sizeof (*rbp) - sizeof (rbp->rb_data)) +
				usersize, MDDB_BSIZE);
	blkcnt = btodb(recsize);

	/*
	 * Upper bound on the blocks of a single record: either the
	 * mddb_maxblocks override, or as many block numbers as fit in a
	 * directory block that holds just this one entry.
	 */
	if (mddb_maxblocks)
		maxblocks = mddb_maxblocks;
	else
		maxblocks = (MDDB_BSIZE -
			(sizeof (*db32p) + sizeof (*de32p) -
			sizeof (de32p->de32_blks))) / sizeof (mddb_block_t);

	if (blkcnt > maxblocks) {
		mddb_setexit(s);
		return (MDDB_E_INVALID);
	}
	/*
	 * allocate record block
	 * and new directory block so to avoid sleeping
	 * after starting single_thread
	 */
	rbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
	/* only non-optimized records get a separate user-data buffer */
	if ((options & MD_CRO_OPTIMIZE) == 0)
		userdata = kmem_zalloc(usersize, KM_SLEEP);
	newdbp = (mddb_db_t *)kmem_zalloc(sizeof (*newdbp), KM_SLEEP);

	/*
	 * if this is the largest record allocate new buffer for
	 * checkcopy();
	 */
	if (recsize > s->s_databuffer_size) {
		tmppnt = (caddr_t)kmem_zalloc(recsize, KM_SLEEP);
		/*
		 * re-test the size in case we slept in the allocation
		 * above and some other task bumped the max record size
		 */
		if (recsize > s->s_databuffer_size) {
			if (s->s_databuffer_size)
				kmem_free(s->s_databuffer,
				    s->s_databuffer_size);
			s->s_databuffer = tmppnt;
			s->s_databuffer_size = recsize;
		} else {
			kmem_free(tmppnt, recsize);
		}
	}

	single_thread_start(s);

	/*
	 * Find the lowest record id not used by any directory entry.
	 * The inner loops leave dbp non-NULL exactly when `newid' is
	 * already taken, which makes the do/while try the next id.
	 */
	newid = 0;
	do {
		newid++;
		/* id space exhausted: undo the allocations and give up */
		if (DBID(newid) == 0) {
			kmem_free((caddr_t)newdbp, sizeof (*newdbp));
			kmem_free((caddr_t)rbp, ((size_t)recsize));
			if ((options & MD_CRO_OPTIMIZE) == 0)
				kmem_free(userdata, usersize);
			single_thread_end(s);
			mddb_setexit(s);
			return (MDDB_E_NOTNOW);
		}

		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
			for (dep = dbp->db_firstentry; dep;
			    dep = dep->de_next) {
				if (dep->de_recid == newid)
					break;
			}
			if (dep != NULL)
				break;
		}
	} while (dbp);

	/* size of the new entry in the on-disk (32 bit) directory format */
	desize = (sizeof (*de32p) - sizeof (de32p->de32_blks)) +
			(sizeof (mddb_block_t) * blkcnt);

	/*
	 * see if a directory block exists which will hold this entry
	 */
	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		/* add up the on-disk space this directory block uses */
		used = sizeof (*db32p);
		for (dep = dbp->db_firstentry;
		    dep != NULL; dep = dep->de_next) {
			used += sizeof (*de32p) - sizeof (de32p->de32_blks);
			used += sizeof (mddb_block_t) * dep->de_blkcount;
		}
		if ((used + desize) < MDDB_BSIZE)
			break;
	}
	if (dbp) {
		/* an existing directory block has room; spare not needed */
		kmem_free((caddr_t)newdbp, sizeof (*newdbp));
		if (blkcnt > s->s_freeblkcnt) {
			kmem_free((caddr_t)rbp, ((size_t)recsize));
			if ((options & MD_CRO_OPTIMIZE) == 0)
				kmem_free(userdata, usersize);
			single_thread_end(s);
			mddb_setexit(s);
			return (MDDB_E_NOSPACE);
		}
		prevdbp = NULL;
	} else {
		/*
		 * need to add directory block
		 */
		/* one extra free block is needed for the directory block */
		if ((blkcnt + 1) > s->s_freeblkcnt) {
			kmem_free((caddr_t)newdbp, sizeof (*newdbp));
			kmem_free((caddr_t)rbp, ((size_t)recsize));
			if ((options & MD_CRO_OPTIMIZE) == 0)
				kmem_free(userdata, usersize);
			single_thread_end(s);
			mddb_setexit(s);
			return (MDDB_E_NOSPACE);
		}
		/* append the preallocated block to the end of the chain */
		for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next);
		dbp->db_next = newdbp;
		bzero((caddr_t)dbp->db_next, sizeof (*newdbp));
		dbp->db_nextblk = getfreeblks(s, 1);
		dbp->db_next->db_blknum = dbp->db_nextblk;
		/* remember the old tail: it must be rewritten as well */
		prevdbp = dbp;
		dbp = dbp->db_next;
		dbp->db_nextblk = 0;
		dbp->db_firstentry = NULL;
		dbp->db_recsum = 0;
		dbp->db_magic = MDDB_MAGIC_DB;
	}
	/*
	 * ready to add record
	 */
	/* size of the in-core directory entry, including its block list */
	desize_ic = (sizeof (*dep) - sizeof (dep->de_blks)) +
			(sizeof (mddb_block_t) * blkcnt);
	/* link the new entry at the tail of the directory block's list */
	if (dbp->db_firstentry) {
		for (dep = dbp->db_firstentry; dep->de_next;
		    dep = dep->de_next);
		dep->de_next = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
		dep = dep->de_next;
	} else {
		dep = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
		dbp->db_firstentry = dep;
	}
	bzero((caddr_t)dep, desize_ic);
	dep->de_recid = newid;
	/*
	 * Optimized records have an owner node associated with them in
	 * a MN diskset.  The owner is only set on a node that is actively
	 * writing to that record.  The other nodes will show that record
	 * as having an invalid owner.  The owner for an optimized record
	 * is used during fixoptrecord to determine which node should
	 * write out the record when the replicas associated with that
	 * optimized record have been changed.
	 */
	if (MD_MNSET_SETNO(s->s_setno)) {
		dep->de_owner_nodeid = MD_MN_INVALID_NID;
	}
	dep->de_type1 =	type;
	dep->de_type2 = type2;
	dep->de_reqsize = usersize;
	dep->de_recsize = recsize;
	dep->de_blkcount = blkcnt;
	/*
	 * Translate the creation option into the directory entry flag.
	 * Only an exact match of one option below sets a flag; any
	 * other combination leaves de_flags at zero.
	 */
	flag_type = options &
	    (MD_CRO_OPTIMIZE | MD_CRO_STRIPE | MD_CRO_MIRROR | MD_CRO_RAID |
		MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG |
		MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG);
	switch (flag_type) {
	case MD_CRO_OPTIMIZE:
		dep->de_flags = MDDB_F_OPT;
		/* fill in both optimized-record location slots */
		getoptdev(s, dep, 0);
		getoptdev(s, dep, 1);
		break;
	case MD_CRO_STRIPE:
		dep->de_flags = MDDB_F_STRIPE;
		break;
	case MD_CRO_MIRROR:
		dep->de_flags = MDDB_F_MIRROR;
		break;
	case MD_CRO_RAID:
		dep->de_flags = MDDB_F_RAID;
		break;
	case MD_CRO_SOFTPART:
		dep->de_flags = MDDB_F_SOFTPART;
		break;
	case MD_CRO_TRANS_MASTER:
		dep->de_flags = MDDB_F_TRANS_MASTER;
		break;
	case MD_CRO_TRANS_LOG:
		dep->de_flags = MDDB_F_TRANS_LOG;
		break;
	case MD_CRO_HOTSPARE:
		dep->de_flags = MDDB_F_HOTSPARE;
		break;
	case MD_CRO_HOTSPARE_POOL:
		dep->de_flags = MDDB_F_HOTSPARE_POOL;
		break;
	case MD_CRO_CHANGELOG:
		dep->de_flags = MDDB_F_CHANGELOG;
		break;
	}
	/*
	 * try to get all blocks consecutive. If not possible
	 * just get them one at a time
	 */
	dep->de_blks[0] = getfreeblks(s, blkcnt);
	if (dep->de_blks[0]) {
		for (i = 1; i < blkcnt; i++)
			dep->de_blks[i] = dep->de_blks[0] + i;
	} else {
		for (i = 0; i < blkcnt;	 i++)
			dep->de_blks[i] = getfreeblks(s, 1);
	}
	dep->de_rb = rbp;
	bzero((caddr_t)rbp, recsize);
	rbp->rb_magic = MDDB_MAGIC_RB;

	/* Do we have to create an old style (32 bit) record?  */
	if (options & MD_CRO_32BIT) {
		if (options & MD_CRO_FN)
			rbp->rb_revision = MDDB_REV_RBFN;
		else
			rbp->rb_revision = MDDB_REV_RB;
	} else {
		if (options & MD_CRO_FN)
			rbp->rb_revision = MDDB_REV_RB64FN;
		else
			rbp->rb_revision = MDDB_REV_RB64;
	}

	/* set de_rb_userdata for non optimization records */
	if ((options & MD_CRO_OPTIMIZE) == 0) {
		dep->de_rb_userdata = userdata;
	}

	uniqtime32(&rbp->rb_timestamp);
	/* Generate the crc for this record */
	rec_crcgen(s, dep, rbp);
	tmppnt = (caddr_t)rbp;
	/*
	 * the following code writes new records to all instances of
	 * the data base. Writing one block at a time to each instance
	 * is safe because they are not yet in a directory entry which
	 * has been written to the data base
	 */
	err = 0;
	if ((options & MD_CRO_OPTIMIZE) == 0) {
		for (i = 0; i < blkcnt;	 i++) {
			err |= writeall(s, (caddr_t)tmppnt,
				dep->de_blks[i], 1, 0);
			tmppnt += MDDB_BSIZE;
		}
	} else {
		if ((MD_MNSET_SETNO(s->s_setno)) &&
		    md_set[s->s_setno].s_am_i_master) {
		/*
		 * If a MN diskset then only master writes out newly
		 * created optimized record.
		 */
			err |= writeoptrecord(s, dep);
		}
	}
	uniqtime32(&dbp->db_timestamp);
	dbp->db_revision = MDDB_REV_DB;
	/* Don't include opt resync and change log records in global XOR */
	if (!(dep->de_flags & MDDB_F_OPT) &&
	    !(dep->de_flags & MDDB_F_CHANGELOG))
		dbp->db_recsum ^= rbp->rb_checksum;
	/* write the directory block in its on-disk (32 bit) form */
	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
	create_db32rec(db32p, dbp);
	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
	err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0);
	/*
	 * If a directory block was appended, the previous tail now
	 * points at it and has to be rewritten too.
	 */
	if (prevdbp) {
		dbp = prevdbp;
		uniqtime32(&dbp->db_timestamp);
		dbp->db_revision = MDDB_REV_DB;
		create_db32rec(db32p, dbp);
		crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
		err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0);
	}
	kmem_free((caddr_t)db32p, MDDB_BSIZE);
	if (err) {
		if (writeretry(s)) {
			/* could not commit: record the id in s_zombie */
			s->s_zombie = newid;
			single_thread_end(s);
			mddb_setexit(s);
			return (MDDB_E_NOTNOW);
		}
	}
	single_thread_end(s);
	mddb_setexit(s);

	ASSERT((newid & MDDB_SETMASK) == 0);
	return (MAKERECID(setno, newid));
}
9050 
9051 int
9052 mddb_deleterec(
9053 	mddb_recid_t	id
9054 )
9055 {
9056 	mddb_set_t	*s;
9057 	mddb_db_t	*dbp;
9058 	mddb_db32_t	*db32p;
9059 	mddb_de_ic_t	*dep, *dep1;
9060 	int		i;
9061 
9062 #if defined(_ILP32) && !defined(lint)
9063 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
9064 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
9065 #endif
9066 
9067 	s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL);
9068 	ASSERT(s != NULL);
9069 
9070 	id = DBID(id);
9071 	if (checkstate(s, MDDB_PROBE)) {
9072 		mddb_setexit(s);
9073 		return (MDDB_E_NOTNOW);
9074 	}
9075 
9076 	ASSERT(s->s_lbp != NULL);
9077 	single_thread_start(s);
9078 
9079 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9080 		dep1 = NULL;
9081 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
9082 			if (dep->de_recid == id)
9083 				break;
9084 			dep1 = dep;
9085 		}
9086 		if (dep != NULL)
9087 			break;
9088 	}
9089 	/*
9090 	 * no such record
9091 	 */
9092 	if (dep == NULL) {
9093 		single_thread_end(s);
9094 		ASSERT(s->s_staledeletes != 0);
9095 		s->s_staledeletes--;
9096 		mddb_setexit(s);
9097 		return (0);
9098 	}
9099 
9100 	if (!(dep->de_flags & MDDB_F_OPT) &&
9101 	    !(dep->de_flags & MDDB_F_CHANGELOG)) {
9102 		dbp->db_recsum ^= dep->de_rb->rb_checksum;
9103 		dbp->db_recsum ^= dep->de_rb->rb_checksum_fiddle;
9104 	}
9105 
9106 	if (dep->de_rb_userdata != NULL) {
9107 		if (dep->de_icreqsize)
9108 			kmem_free(dep->de_rb_userdata_ic, dep->de_icreqsize);
9109 		else
9110 			kmem_free(dep->de_rb_userdata, dep->de_reqsize);
9111 	}
9112 
9113 	kmem_free((caddr_t)dep->de_rb, dep->de_recsize);
9114 
9115 	for (i = 0; i < dep->de_blkcount; i++)
9116 		blkfree(s, dep->de_blks[i]);
9117 	if (dep1)
9118 		dep1->de_next = dep->de_next;
9119 	else
9120 		dbp->db_firstentry = dep->de_next;
9121 
9122 	kmem_free(dep, sizeofde(dep));
9123 
9124 	uniqtime32(&dbp->db_timestamp);
9125 	dbp->db_revision = MDDB_REV_DB;
9126 	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
9127 	create_db32rec(db32p, dbp);
9128 	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
9129 	if (writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0)) {
9130 		if (writeretry(s)) {
9131 			/*
9132 			 * staledelete is used to mark deletes which failed.
9133 			 * its only use is to not panic when the user retries
9134 			 * the delete once the database is active again
9135 			 */
9136 			single_thread_end(s);
9137 			s->s_staledeletes++;
9138 			kmem_free((caddr_t)db32p, MDDB_BSIZE);
9139 			mddb_setexit(s);
9140 			return (MDDB_E_NOTNOW);
9141 		}
9142 	}
9143 	single_thread_end(s);
9144 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
9145 	mddb_setexit(s);
9146 	return (0);
9147 }
9148 
9149 mddb_recid_t
9150 mddb_getnextrec(
9151 	mddb_recid_t		id,
9152 	mddb_type_t		typ,
9153 	uint_t			type2
9154 )
9155 {
9156 	mddb_set_t		*s;
9157 	mddb_db_t		*dbp;
9158 	mddb_de_ic_t		*dep;
9159 	int			searching, err;
9160 	set_t			setno;
9161 
9162 	setno = DBSET(id);
9163 	id = DBID(id);
9164 	searching = id;
9165 
9166 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
9167 		return (err);
9168 
9169 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9170 		for (dep = dbp->db_firstentry;
9171 		    dep != NULL; dep = dep->de_next) {
9172 			if (searching) {
9173 				if (dep->de_recid == id)
9174 					searching = 0;
9175 			} else {
9176 				if ((typ == MDDB_ALL || dep->de_type1 == typ) &&
9177 				    (type2 == 0 || dep->de_type2 == type2)) {
9178 					id = dep->de_recid;
9179 					mddb_setexit(s);
9180 					ASSERT((id & MDDB_SETMASK) == 0);
9181 					return (MAKERECID(setno, id));
9182 				}
9183 			}
9184 		}
9185 	}
9186 
9187 	mddb_setexit(s);
9188 
9189 	if (searching)
9190 		return (MDDB_E_NORECORD);
9191 	return (0);
9192 }
9193 
9194 void *
9195 mddb_getrecaddr(
9196 	mddb_recid_t		id
9197 )
9198 {
9199 	mddb_set_t		*s;
9200 	mddb_db_t		*dbp;
9201 	mddb_de_ic_t		*dep;
9202 	void			*rval;
9203 
9204 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9205 		return (NULL);
9206 
9207 	id = DBID(id);
9208 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9209 		for (dep = dbp->db_firstentry;
9210 		    dep != NULL; dep = dep->de_next) {
9211 			if (dep->de_recid != id)
9212 				continue;
9213 			if (dep->de_rb_userdata)
9214 				rval = (void *)dep->de_rb_userdata;
9215 			else
9216 				rval = (void *)dep->de_rb->rb_data;
9217 			mddb_setexit(s);
9218 			return (rval);
9219 		}
9220 	}
9221 
9222 	mddb_setexit(s);
9223 	return (NULL);
9224 }
9225 
9226 
9227 mddb_de_ic_t *
9228 mddb_getrecdep(
9229 	mddb_recid_t		id
9230 )
9231 {
9232 	mddb_set_t		*s;
9233 	mddb_db_t		*dbp;
9234 	mddb_de_ic_t		*dep;
9235 
9236 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9237 		return (NULL);
9238 
9239 	id = DBID(id);
9240 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9241 		for (dep = dbp->db_firstentry;
9242 		    dep != NULL; dep = dep->de_next) {
9243 			if (dep->de_recid != id)
9244 				continue;
9245 			mddb_setexit(s);
9246 			return (dep);
9247 		}
9248 	}
9249 
9250 	mddb_setexit(s);
9251 	return (NULL);
9252 }
9253 
9254 void *
9255 mddb_getrecaddr_resize(
9256 	mddb_recid_t		id,
9257 	size_t			icsize,
9258 	off_t			off
9259 )
9260 {
9261 	mddb_set_t		*s;
9262 	mddb_db_t		*dbp;
9263 	mddb_de_ic_t		*dep;
9264 	void			*rval = NULL;
9265 
9266 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9267 		return (NULL);
9268 
9269 	id = DBID(id);
9270 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9271 		for (dep = dbp->db_firstentry;
9272 		    dep != NULL; dep = dep->de_next) {
9273 			if (dep->de_recid != id)
9274 				continue;
9275 			if (dep->de_rb_userdata)
9276 				rval = (void *)dep->de_rb_userdata;
9277 			else
9278 				rval = (void *)dep->de_rb->rb_data;
9279 			break;
9280 		}
9281 		if (rval != NULL)
9282 			break;
9283 	}
9284 
9285 	if (rval == NULL) {
9286 		mddb_setexit(s);
9287 		return (NULL);
9288 	}
9289 
9290 	if (dep->de_rb_userdata) {
9291 		caddr_t nud;
9292 
9293 		if (dep->de_icreqsize || (dep->de_reqsize >= icsize)) {
9294 			mddb_setexit(s);
9295 			return (rval);
9296 		}
9297 		ASSERT((dep->de_reqsize + off) <= icsize);
9298 		nud = kmem_zalloc(icsize, KM_SLEEP);
9299 		bcopy(dep->de_rb_userdata, nud + off, dep->de_reqsize);
9300 		kmem_free(dep->de_rb_userdata, dep->de_reqsize);
9301 		dep->de_rb_userdata = nud + off;
9302 		dep->de_rb_userdata_ic = nud;
9303 		dep->de_icreqsize = icsize;
9304 		rval = nud;
9305 	} else {
9306 		size_t recsize;
9307 		/* LINTED variable unused - used for sizeof calculations */
9308 		mddb_rb32_t *nrbp;
9309 
9310 		recsize = roundup((sizeof (*nrbp) - sizeof (nrbp->rb_data)) +
9311 				icsize, MDDB_BSIZE);
9312 		if (dep->de_recsize < recsize)
9313 			cmn_err(CE_PANIC, "mddb_getrecaddr_resize: only "
9314 				"nonoptimized records can be resized\n");
9315 	}
9316 
9317 	mddb_setexit(s);
9318 	return (rval);
9319 }
9320 
9321 int
9322 mddb_getrecprivate(
9323 	mddb_recid_t		id
9324 )
9325 {
9326 	mddb_set_t		*s;
9327 	mddb_db_t		*dbp;
9328 	mddb_de_ic_t		*dep;
9329 	int			err = 0;
9330 	int			private;
9331 
9332 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9333 		return (err);
9334 
9335 	id = DBID(id);
9336 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9337 		for (dep = dbp->db_firstentry;
9338 		    dep != NULL; dep = dep->de_next) {
9339 			if (dep->de_recid != id)
9340 				continue;
9341 			private = (int)dep->de_rb->rb_private;
9342 			mddb_setexit(s);
9343 			return (private);
9344 		}
9345 	}
9346 
9347 	mddb_setexit(s);
9348 	return (MDDB_E_NORECORD);
9349 }
9350 
9351 void
9352 mddb_setrecprivate(
9353 	mddb_recid_t		id,
9354 	uint_t			private
9355 )
9356 {
9357 	mddb_set_t		*s;
9358 	mddb_db_t		*dbp;
9359 	mddb_de_ic_t		*dep;
9360 
9361 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) {
9362 		ASSERT(0);
9363 		return;
9364 	}
9365 
9366 	id = DBID(id);
9367 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9368 		for (dep = dbp->db_firstentry;
9369 		    dep != NULL; dep = dep->de_next) {
9370 			if (dep->de_recid != id)
9371 				continue;
9372 			dep->de_rb->rb_private = private;
9373 			mddb_setexit(s);
9374 			return;
9375 		}
9376 	}
9377 
9378 	mddb_setexit(s);
9379 	ASSERT(0);
9380 }
9381 
9382 mddb_type_t
9383 mddb_getrectype1(
9384 	mddb_recid_t		id
9385 )
9386 {
9387 	mddb_set_t		*s;
9388 	mddb_db_t		*dbp;
9389 	mddb_de_ic_t		*dep;
9390 	int			err = 0;
9391 	mddb_type_t		rval;
9392 
9393 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9394 		return (err);
9395 
9396 	id = DBID(id);
9397 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9398 		for (dep = dbp->db_firstentry;
9399 		    dep != NULL; dep = dep->de_next) {
9400 			if (dep->de_recid != id)
9401 				continue;
9402 			rval = dep->de_type1;
9403 			mddb_setexit(s);
9404 			return (rval);
9405 		}
9406 	}
9407 
9408 	mddb_setexit(s);
9409 	return (MDDB_E_NORECORD);
9410 }
9411 
9412 int
9413 mddb_getrectype2(
9414 	mddb_recid_t		id
9415 )
9416 {
9417 	mddb_set_t		*s;
9418 	mddb_db_t		*dbp;
9419 	mddb_de_ic_t		*dep;
9420 	int			err = 0;
9421 	int			rval;
9422 
9423 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9424 		return (err);
9425 
9426 	id = DBID(id);
9427 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9428 		for (dep = dbp->db_firstentry;
9429 		    dep != NULL; dep = dep->de_next) {
9430 			if (dep->de_recid != id)
9431 				continue;
9432 			rval = (int)dep->de_type2;
9433 			mddb_setexit(s);
9434 			return (rval);
9435 		}
9436 	}
9437 
9438 	mddb_setexit(s);
9439 	return (MDDB_E_NORECORD);
9440 }
9441 
9442 int
9443 mddb_getrecsize(
9444 	mddb_recid_t		id
9445 )
9446 {
9447 	mddb_set_t		*s;
9448 	mddb_db_t		*dbp;
9449 	mddb_de_ic_t		*dep;
9450 	int			err = 0;
9451 	int			rval;
9452 
9453 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9454 		return (err);
9455 
9456 	id = DBID(id);
9457 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9458 		for (dep = dbp->db_firstentry;
9459 		    dep != NULL; dep = dep->de_next) {
9460 			if (dep->de_recid != id)
9461 				continue;
9462 			rval = (int)dep->de_reqsize;
9463 			mddb_setexit(s);
9464 			return (rval);
9465 		}
9466 	}
9467 
9468 	mddb_setexit(s);
9469 	return (MDDB_E_NORECORD);
9470 }
9471 
9472 
9473 mddb_recstatus_t
9474 mddb_getrecstatus(
9475 	mddb_recid_t		id
9476 )
9477 {
9478 	mddb_set_t		*s;
9479 	mddb_db_t		*dbp;
9480 	mddb_de_ic_t		*dep;
9481 	int			err = 0;
9482 	mddb_recstatus_t	e_err;
9483 
9484 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9485 		return ((mddb_recstatus_t)err);
9486 
9487 	id = DBID(id);
9488 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9489 		for (dep = dbp->db_firstentry;
9490 		    dep != NULL; dep = dep->de_next) {
9491 			if (dep->de_recid == id)
9492 				break;
9493 		}
9494 		if (dep)
9495 			break;
9496 	}
9497 
9498 	e_err = MDDB_OK;
9499 
9500 	if (! dep)
9501 		e_err = MDDB_NORECORD;
9502 	else if (! dep->de_rb->rb_commitcnt)
9503 		e_err = MDDB_NODATA;
9504 	else if (md_get_setstatus(s->s_setno) & MD_SET_STALE)
9505 		e_err = MDDB_STALE;
9506 
9507 	mddb_setexit(s);
9508 	return (e_err);
9509 }
9510 
9511 /*
9512  * Commit given record to disk.
9513  * If committing an optimized record, do not call
9514  * with md ioctl lock held.
9515  */
9516 int
9517 mddb_commitrec(
9518 	mddb_recid_t	id
9519 )
9520 {
9521 	mddb_set_t			*s;
9522 	mddb_db_t			*dbp;
9523 	mddb_de_ic_t			*dep;
9524 	mddb_recid_t			ids[2];
9525 	mddb_rb32_t			*rbp;
9526 	static int			err = 0;
9527 	md_mn_msg_mddb_optrecerr_t	*msg_recerr;
9528 	md_mn_kresult_t			*kres;
9529 	mddb_lb_t			*lbp;
9530 	mddb_mnlb_t			*mnlbp;
9531 	mddb_locator_t			*lp;
9532 	mddb_mnsidelocator_t		*mnslp;
9533 	mddb_drvnm_t			*dn;
9534 	int				li;
9535 	md_replica_recerr_t		*recerr;
9536 	int				i, j;
9537 	int				rval;
9538 	int				hit_err = 0;
9539 
9540 	s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL);
9541 	ASSERT(s != NULL);
9542 
9543 	if (checkstate(s, MDDB_PROBE)) {
9544 		mddb_setexit(s);
9545 		return (MDDB_E_NOTNOW);
9546 	}
9547 
9548 	if (DBID(id) == 0) {
9549 		mddb_setexit(s);
9550 		return (0);
9551 	}
9552 
9553 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9554 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
9555 			if (dep->de_recid == DBID(id))
9556 				break;
9557 		}
9558 		if (dep)
9559 			break;
9560 	}
9561 
9562 	if (dep == NULL) {
9563 		mddb_setexit(s);
9564 		return (MDDB_E_NORECORD);
9565 	}
9566 
9567 	if (! (dep->de_flags & MDDB_F_OPT)) {
9568 		ids[0] = id;
9569 		ids[1] = 0;
9570 		mddb_setexit(s);
9571 		return (mddb_commitrecs(ids));
9572 	}
9573 
9574 	/*
9575 	 * following code allows multiple processes to be doing
9576 	 * optimization commits in parallel.
9577 	 * NOTE: if lots of optimization commits then the lock
9578 	 * will not get released until it winds down
9579 	 */
9580 	if (s->s_optwaiterr) {
9581 		while (s->s_optwaiterr) {
9582 			s->s_opthungerr = 1;
9583 			cv_wait(&s->s_opthungerr_cv, SETMUTEX(s->s_setno));
9584 		}
9585 		if (checkstate(s, MDDB_PROBE)) {
9586 			mddb_setexit(s);
9587 			return (MDDB_E_NOTNOW);
9588 		}
9589 	}
9590 	if (s->s_optcmtcnt++ == 0) {
9591 		single_thread_start(s);
9592 		s->s_opthavelck = 1;
9593 		if (s->s_optwantlck) {
9594 			cv_broadcast(&s->s_optwantlck_cv);
9595 			s->s_optwantlck = 0;
9596 		}
9597 	} else {
9598 		while (! s->s_opthavelck) {
9599 			s->s_optwantlck = 1;
9600 			cv_wait(&s->s_optwantlck_cv, SETMUTEX(s->s_setno));
9601 		}
9602 	}
9603 
9604 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9605 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
9606 			if (dep->de_recid == DBID(id))
9607 				break;
9608 		}
9609 		if (dep)
9610 			break;
9611 	}
9612 
9613 	if (dep == NULL) {
9614 		if (! (--s->s_optcmtcnt)) {
9615 			single_thread_end(s);
9616 			s->s_opthavelck = 0;
9617 		}
9618 		mddb_setexit(s);
9619 		return (MDDB_E_NORECORD);
9620 	}
9621 
9622 	rbp = dep->de_rb;
9623 	rbp->rb_commitcnt++;
9624 	uniqtime32(&rbp->rb_timestamp);
9625 	/* Generate the crc for this record */
9626 	rec_crcgen(s, dep, rbp);
9627 
9628 	if (writeoptrecord(s, dep)) {
9629 		if (MD_MNSET_SETNO(s->s_setno)) {
9630 			hit_err = 1;
9631 		}
9632 		s->s_optwaiterr++;
9633 	}
9634 	if (MD_MNSET_SETNO(s->s_setno)) {
9635 		/* If last thread out, release single_thread_start */
9636 		if (! (--s->s_optcmtcnt)) {
9637 			single_thread_end(s);
9638 			s->s_opthavelck = 0;
9639 		}
9640 		/*
9641 		 * If this thread had a writeoptrecords failure, then
9642 		 * need to send message to master.
9643 		 * But, multiple threads could all be running on the
9644 		 * same single_thread_start, so serialize the threads
9645 		 * by making each thread grab single_thread_start.
9646 		 *
9647 		 * After return from sending message to master message,
9648 		 * replicas associated with optimized record will havei
9649 		 * been changed (via a callback from the master to all
9650 		 * nodes), so retry call to writeoptrecord.
9651 		 * This code is replacing the call to writeretry that
9652 		 * occurs for the local and traditional disksets.
9653 		 */
9654 		if (hit_err) {
9655 			single_thread_start(s);
9656 			/*
9657 			 * If > 50% of replicas are alive then continue
9658 			 * to send message to master until writeoptrecord
9659 			 * succeeds.  For now, assume that minor name,
9660 			 * major number on this node is the same as on
9661 			 * the master node.  Once devids are turned on
9662 			 * for MN disksets, can send devid.
9663 			 */
9664 			kres = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
9665 			msg_recerr = kmem_zalloc(
9666 			    sizeof (md_mn_msg_mddb_optrecerr_t), KM_SLEEP);
9667 			while (!(md_get_setstatus(s->s_setno) &
9668 			    MD_SET_TOOFEW)) {
9669 				bzero((caddr_t)msg_recerr,
9670 				    sizeof (md_mn_msg_mddb_optrecerr_t));
9671 				lbp = s->s_lbp;
9672 				mnlbp = (mddb_mnlb_t *)lbp;
9673 				for (i = 0; i < 2; i++) {
9674 				    li = dep->de_optinfo[i].o_li;
9675 				    lp = &lbp->lb_locators[li];
9676 				    for (j = 0; j < MD_MNMAXSIDES; j++) {
9677 					mnslp =
9678 					    &mnlbp->lb_mnsidelocators[j][li];
9679 					if (mnslp->mnl_sideno == s->s_sideno)
9680 					    break;
9681 				    }
9682 				    if (j == MD_MNMAXSIDES)
9683 					continue;
9684 
9685 				    dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
9686 				    recerr = &msg_recerr->msg_recerr[i];
9687 				    recerr->r_li = li;
9688 				    recerr->r_flags =
9689 					dep->de_optinfo[i].o_flags;
9690 				    recerr->r_blkno = lp->l_blkno;
9691 				    recerr->r_mnum = md_getminor(lp->l_dev);
9692 				    (void) strncpy(recerr->r_driver_name,
9693 					dn->dn_data, MD_MAXDRVNM);
9694 				}
9695 
9696 				/* Release locks */
9697 				single_thread_end(s);
9698 				mutex_exit(SETMUTEX(s->s_setno));
9699 
9700 				/*
9701 				 * Send message to master about optimized
9702 				 * record failure.  After return, master
9703 				 * should have marked failed replicas
9704 				 * and sent parse message to slaves causing
9705 				 * slaves to have fixed up the optimized
9706 				 * record.
9707 				 * On return from ksend_message, retry
9708 				 * the write since this node should have fixed
9709 				 * the optimized resync records it owns.
9710 				 */
9711 				rval = mdmn_ksend_message(s->s_setno,
9712 					MD_MN_MSG_MDDB_OPTRECERR,
9713 					MD_MSGF_NO_BCAST,
9714 					(char *)msg_recerr,
9715 					sizeof (md_mn_msg_mddb_optrecerr_t),
9716 					kres);
9717 				if (!MDMN_KSEND_MSG_OK(rval, kres)) {
9718 					cmn_err(CE_WARN, "mddb_commitrec: "
9719 						"Unable to send optimized "
9720 						"resync record failure "
9721 						"message to other nodes in "
9722 						"diskset %s\n", s->s_setname);
9723 					mdmn_ksend_show_error(rval, kres,
9724 					    "MD_MN_MSG_MDDB_OPTRECERR");
9725 				}
9726 
9727 				/* Regrab locks */
9728 				mutex_enter(SETMUTEX(s->s_setno));
9729 				single_thread_start(s);
9730 
9731 				/* Start over in case mddb changed */
9732 				for (dbp = s->s_dbp; dbp != NULL;
9733 				    dbp = dbp->db_next) {
9734 					for (dep = dbp->db_firstentry; dep;
9735 					    dep = dep->de_next) {
9736 						if (dep->de_recid == DBID(id))
9737 							break;
9738 					}
9739 					if (dep)
9740 						break;
9741 				}
9742 				if (dep) {
9743 					rbp = dep->de_rb;
9744 					rbp->rb_commitcnt++;
9745 					uniqtime32(&rbp->rb_timestamp);
9746 					/* Generate the crc for this record */
9747 					rec_crcgen(s, dep, rbp);
9748 
9749 					/*
9750 					 * If writeoptrecord succeeds, then
9751 					 * break out.
9752 					 */
9753 					if (!(writeoptrecord(s, dep)))
9754 						break;
9755 				}
9756 			}
9757 			kmem_free(kres, sizeof (md_mn_kresult_t));
9758 			kmem_free(msg_recerr,
9759 				sizeof (md_mn_msg_mddb_optrecerr_t));
9760 
9761 			/* Resync record should be fixed - if possible */
9762 			s->s_optwaiterr--;
9763 			if (s->s_optwaiterr == 0) {
9764 				/* All errors have been handled */
9765 				if (s->s_opthungerr) {
9766 					s->s_opthungerr = 0;
9767 					cv_broadcast(&s->s_opthungerr_cv);
9768 				}
9769 			}
9770 			single_thread_end(s);
9771 			mddb_setexit(s);
9772 			if (md_get_setstatus(s->s_setno) & MD_SET_TOOFEW) {
9773 				return (MDDB_E_NOTNOW);
9774 			} else {
9775 				return (0);
9776 			}
9777 		}
9778 	} else {
9779 		/* If set is a traditional or local set */
9780 		if (! (--s->s_optcmtcnt)) {
9781 			err = 0;
9782 			if (s->s_optwaiterr) {
9783 				err = writeretry(s);
9784 				s->s_optwaiterr = 0;
9785 				if (s->s_opthungerr) {
9786 					s->s_opthungerr = 0;
9787 					cv_broadcast(&s->s_opthungerr_cv);
9788 				}
9789 			}
9790 			single_thread_end(s);
9791 			s->s_opthavelck = 0;
9792 			mddb_setexit(s);
9793 			if (err)
9794 				return (MDDB_E_NOTNOW);
9795 			return (0);
9796 		}
9797 		if (s->s_optwaiterr) {
9798 			while (s->s_optwaiterr) {
9799 				s->s_opthungerr = 1;
9800 				cv_wait(&s->s_opthungerr_cv,
9801 				    SETMUTEX(s->s_setno));
9802 			}
9803 			if (checkstate(s, MDDB_NOPROBE)) {
9804 				mddb_setexit(s);
9805 				return (MDDB_E_NOTNOW);
9806 			}
9807 		}
9808 	}
9809 
9810 	mddb_setexit(s);
9811 	return (0);
9812 }
9813 
9814 int
9815 mddb_commitrecs(
9816 	mddb_recid_t	ids[]
9817 )
9818 {
9819 	mddb_set_t	*s;
9820 	mddb_db_t	*dbp;
9821 	mddb_de_ic_t	*dep;
9822 	mddb_rb32_t	*rbp;
9823 	mddb_rb32_t	*saverbp;
9824 	mddb_lb_t	*lbp;
9825 	int		li;
9826 	uint_t		checksum;
9827 	mddb_recid_t	*idp;
9828 	int		err = 0;
9829 	set_t		setno;
9830 
9831 	if (panicstr)
9832 		cmn_err(CE_PANIC, "md: mddb: commit not allowed");
9833 
9834 	/*
9835 	 * scan through and make sure ids are from the same set
9836 	 */
9837 	setno = DBSET(ids[0]);
9838 	for (idp = ids; *idp != NULL; idp++)
9839 		ASSERT(DBSET(*idp) == setno);
9840 
9841 	s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL);
9842 
9843 	if (checkstate(s, MDDB_PROBE)) {
9844 		mddb_setexit(s);
9845 		return (MDDB_E_NOTNOW);
9846 	}
9847 
9848 	ASSERT(s->s_lbp != NULL);
9849 	err = 0;
9850 
9851 	if (! ids[0]) {
9852 		mddb_setexit(s);
9853 		return (0);
9854 	}
9855 
9856 	single_thread_start(s);
9857 	/*
9858 	 * scan through and make sure ids all exist
9859 	 */
9860 	for (idp = ids; *idp != NULL; idp++) {
9861 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9862 			for (dep = dbp->db_firstentry; dep;
9863 			    dep = dep->de_next) {
9864 				if (dep->de_recid == DBID(*idp))
9865 					break;
9866 			}
9867 			if (dep != NULL)
9868 				break;
9869 		}
9870 		if (dep == NULL) {
9871 			single_thread_end(s);
9872 			mddb_setexit(s);
9873 			return (MDDB_E_NORECORD);
9874 		}
9875 	}
9876 
9877 	/*
9878 	 * scan through records fix commit counts and
9879 	 * zero fiddles and update time stamp and rechecksum record
9880 	 */
9881 	checksum = 0;
9882 	idp = ids;
9883 	saverbp = NULL;
9884 	while (*idp) {
9885 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9886 			for (dep = dbp->db_firstentry; dep;
9887 			    dep = dep->de_next) {
9888 				if (dep->de_recid == DBID(*idp))
9889 					break;
9890 			}
9891 			if (dep != NULL)
9892 				break;
9893 		}
9894 		rbp = dep->de_rb;
9895 		ASSERT(! (dep->de_flags & MDDB_F_OPT));
9896 
9897 		getuserdata(setno, dep);
9898 		/* Don't do fiddles for CHANGE LOG records */
9899 		if (!(dep->de_flags & MDDB_F_CHANGELOG)) {
9900 			checksum ^= rbp->rb_checksum_fiddle;
9901 			rbp->rb_checksum_fiddle = 0;
9902 			checksum ^= rbp->rb_checksum;
9903 			saverbp = rbp;
9904 		}
9905 		rbp->rb_commitcnt++;
9906 		uniqtime32(&rbp->rb_timestamp);
9907 		/* Generate the crc for this record */
9908 		rec_crcgen(s, dep, rbp);
9909 
9910 		/* Don't do fiddles for CHANGE LOG records */
9911 		if (!(dep->de_flags & MDDB_F_CHANGELOG)) {
9912 			checksum ^= rbp->rb_checksum;
9913 		}
9914 		idp++;
9915 	}
9916 
9917 	if (saverbp)
9918 		saverbp->rb_checksum_fiddle = checksum;
9919 
9920 	/*
9921 	 * If this is a MN set but we are not the master, then we are not
9922 	 * supposed to update the mddb on disk. So we finish at this point.
9923 	 */
9924 	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
9925 	    (md_set[setno].s_am_i_master == 0)) {
9926 		single_thread_end(s);
9927 		mddb_setexit(s);
9928 		return (0);
9929 	}
9930 
9931 	lbp = s->s_lbp;
9932 	for (li = 0; li < lbp->lb_loccnt; li++) {
9933 		if (! (lbp->lb_locators[li].l_flags & MDDB_F_ACTIVE))
9934 			continue;
9935 
9936 		idp = ids;
9937 		while (*idp) {
9938 			for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9939 				dep = dbp->db_firstentry;
9940 				while (dep && (dep->de_recid != DBID(*idp)))
9941 					dep = dep->de_next;
9942 				if (dep != NULL)
9943 					break;
9944 			}
9945 			rbp = dep->de_rb;
9946 			err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
9947 			    dep->de_blkcount, li, (mddb_bf_t **)0,
9948 			    MDDB_WR_ONLY_MASTER);
9949 			if (err)
9950 				break;
9951 			idp++;
9952 		}
9953 		if (err)
9954 			break;
9955 	}
9956 	if (err) {
9957 		if (writeretry(s)) {
9958 			single_thread_end(s);
9959 			mddb_setexit(s);
9960 			return (MDDB_E_NOTNOW);
9961 		}
9962 	}
9963 	single_thread_end(s);
9964 	mddb_setexit(s);
9965 	return (0);
9966 }
9967 
9968 mddb_recid_t
9969 mddb_makerecid(
9970 	set_t		setno,
9971 	mddb_recid_t	id
9972 )
9973 {
9974 	return (MAKERECID(setno, id));
9975 }
9976 
9977 set_t
9978 mddb_getsetnum(
9979 	mddb_recid_t	id
9980 )
9981 {
9982 	return (DBSET(id));
9983 }
9984 
9985 char *
9986 mddb_getsetname(
9987 	set_t	setno
9988 )
9989 {
9990 	return (((mddb_set_t *)md_set[setno].s_db)->s_setname);
9991 }
9992 
9993 side_t
9994 mddb_getsidenum(
9995 	set_t	setno
9996 )
9997 {
9998 	if (md_set[setno].s_db)
9999 		return (((mddb_set_t *)md_set[setno].s_db)->s_sideno);
10000 	return (0);
10001 }
10002 
10003 int
10004 mddb_ownset(
10005 	set_t	setno
10006 )
10007 {
10008 	if ((md_get_setstatus(setno) & MD_SET_TAGDATA) && md_set[setno].s_db)
10009 		return (1);
10010 
10011 	if (md_set[setno].s_db && ((mddb_set_t *)md_set[setno].s_db)->s_lbp)
10012 		return (1);
10013 
10014 	return (0);
10015 }
10016 
10017 /*ARGSUSED*/
10018 int
10019 getmed_ioctl(mddb_med_parm_t *medpp, int mode)
10020 {
10021 	mddb_set_t	*s;
10022 	int		err = 0;
10023 	set_t		setno = medpp->med_setno;
10024 	md_error_t	*ep = &medpp->med_mde;
10025 
10026 	mdclrerror(ep);
10027 
10028 	if (setno >= md_nsets)
10029 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10030 
10031 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10032 		return (0);
10033 
10034 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
10035 		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
10036 
10037 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10038 		return (mddbstatus2error(ep, err, NODEV32, setno));
10039 
10040 	medpp->med = s->s_med;			/* structure assignment */
10041 
10042 	mddb_setexit(s);
10043 
10044 	return (0);
10045 }
10046 
10047 int
10048 setmed_ioctl(mddb_med_parm_t *medpp, int mode)
10049 {
10050 
10051 	mddb_set_t	*s;
10052 	int		err = 0;
10053 	set_t		setno = medpp->med_setno;
10054 	md_error_t	*ep = &medpp->med_mde;
10055 
10056 	mdclrerror(ep);
10057 
10058 	if ((mode & FWRITE) == 0)
10059 		return (mdsyserror(ep, EACCES));
10060 
10061 	/*
10062 	 * This should be the only thing that prevents LOCAL sets from having
10063 	 * mediators, at least in the kernel, userland needs to have some code
10064 	 * written.
10065 	 */
10066 	if (setno == MD_LOCAL_SET)
10067 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10068 
10069 	if (setno >= md_nsets)
10070 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10071 
10072 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10073 		return (0);
10074 
10075 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
10076 		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
10077 
10078 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10079 		return (mddbstatus2error(ep, err, NODEV32, setno));
10080 
10081 	s->s_med = medpp->med;			/* structure assignment */
10082 
10083 	mddb_setexit(s);
10084 
10085 	return (0);
10086 }
10087 
10088 int
10089 updmed_ioctl(mddb_med_upd_parm_t *medpp, int mode)
10090 {
10091 
10092 	mddb_set_t	*s;
10093 	int		err = 0;
10094 	set_t		setno = medpp->med_setno;
10095 	md_error_t	*ep = &medpp->med_mde;
10096 
10097 	mdclrerror(ep);
10098 
10099 	if ((mode & FWRITE) == 0)
10100 		return (mdsyserror(ep, EACCES));
10101 
10102 	if (setno >= md_nsets)
10103 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10104 
10105 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10106 		return (0);
10107 
10108 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
10109 		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
10110 
10111 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10112 		return (mddbstatus2error(ep, err, NODEV32, setno));
10113 
10114 	single_thread_start(s);
10115 	(void) upd_med(s, "updmed_ioctl()");
10116 	single_thread_end(s);
10117 
10118 	mddb_setexit(s);
10119 
10120 	return (0);
10121 }
10122 
10123 int
10124 take_set(mddb_config_t *cp, int mode)
10125 {
10126 	int			err = 0;
10127 	mddb_med_upd_parm_t	medup;
10128 	set_t			setno = cp->c_setno;
10129 	md_error_t		*ep = &cp->c_mde;
10130 	int			snarf_ok = 0;
10131 
10132 	if (md_get_setstatus(setno) & MD_SET_SNARFED)
10133 		return (0);
10134 
10135 	err = mddb_configure(MDDB_GETDEV, cp);
10136 	if (! err && mdisok(ep)) {
10137 		if (md_snarf_db_set(setno, ep) != 0)
10138 			goto out;
10139 		snarf_ok = 1;
10140 	}
10141 
10142 	/*
10143 	 * Clear replicated import flag since this is
10144 	 * used during the take of a diskset with
10145 	 * previously unresolved replicated disks.
10146 	 */
10147 	if (md_get_setstatus(setno) &
10148 	    MD_SET_REPLICATED_IMPORT) {
10149 		md_clr_setstatus(setno, MD_SET_REPLICATED_IMPORT);
10150 	}
10151 
10152 	if (! err && mdisok(ep)) {
10153 		if (! cp->c_flags) {
10154 			medup.med_setno = setno;
10155 			mdclrerror(&medup.med_mde);
10156 
10157 			err = updmed_ioctl(&medup, mode);
10158 			if (! mdisok(&medup.med_mde))
10159 				(void) mdstealerror(ep, &medup.med_mde);
10160 		}
10161 	}
10162 
10163 out:
10164 	/*
10165 	 * In the case that the snarf failed, the diskset is
10166 	 * left with s_db set, but s_lbp not set.  The node is not
10167 	 * an owner of the set and won't be allowed to release the
10168 	 * diskset in order to cleanup.  With s_db set, any call to the
10169 	 * GETDEV or ENDDEV ioctl (done by libmeta routine metareplicalist)
10170 	 * will cause the diskset to be loaded.  So, cleanup the diskset so
10171 	 * that an inadvertent start of the diskset doesn't happen later.
10172 	 */
10173 	if ((snarf_ok == 0) && md_set[setno].s_db &&
10174 	    (((mddb_set_t *)md_set[setno].s_db)->s_lbp == 0)) {
10175 		mutex_enter(&mddb_lock);
10176 		mddb_unload_set(setno);
10177 		mutex_exit(&mddb_lock);
10178 	}
10179 	return (err);
10180 }
10181 
10182 /*ARGSUSED*/
10183 int
10184 release_set(mddb_config_t *cp, int mode)
10185 {
10186 	int			err = 0;
10187 	set_t			setno = cp->c_setno;
10188 	md_error_t		*ep = &cp->c_mde;
10189 
10190 	/*
10191 	 * Data integrity check
10192 	 */
10193 	if (setno >= md_nsets)
10194 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10195 
10196 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
10197 	md_haltsnarf_enter(setno);
10198 	/*
10199 	 * Attempt to mark set as HOLD. If it is marked as HOLD, this means
10200 	 * that the mirror code is currently searching all mirrors for a
10201 	 * errored component that needs a hotspare. While this search is in
10202 	 * progress, we cannot release the set and thgerefore we return EBUSY.
10203 	 * Once we have set HOLD, the mirror function (check_4_hotspares) will
10204 	 * block before the search until the set is released.
10205 	 */
10206 	if (md_holdset_testandenter(setno) != 0) {
10207 		md_haltsnarf_exit(setno);
10208 		rw_exit(&md_unit_array_rw.lock);
10209 		return (EBUSY);
10210 	}
10211 
10212 	if ((err = md_halt_set(setno, MD_HALT_ALL)) == 0)
10213 		err = mddb_configure(MDDB_RELEASESET, cp);
10214 
10215 	md_holdset_exit(setno);
10216 	md_haltsnarf_exit(setno);
10217 	rw_exit(&md_unit_array_rw.lock);
10218 
10219 	if (! err && mdisok(ep)) {
10220 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RELEASE, SVM_TAG_SET, setno,
10221 		    NODEV64);
10222 	}
10223 
10224 	return (err);
10225 }
10226 
10227 int
10228 gettag_ioctl(mddb_dtag_get_parm_t *dtgpp, int mode)
10229 {
10230 	mddb_set_t	*s;
10231 	int		err = 0;
10232 	mddb_dtag_lst_t	*dtlp;
10233 	set_t		setno = dtgpp->dtgp_setno;
10234 	md_error_t	*ep = &dtgpp->dtgp_mde;
10235 
10236 	mdclrerror(ep);
10237 
10238 	if ((mode & FREAD) == 0)
10239 		return (mdsyserror(ep, EACCES));
10240 
10241 	if (setno >= md_nsets)
10242 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10243 
10244 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10245 		return (0);
10246 
10247 	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL)
10248 		return (mddbstatus2error(ep, err, NODEV32, setno));
10249 
10250 	/*
10251 	 * Data tags not supported on MN sets so return invalid operation.
10252 	 * This ioctl could be called before the mddb has been read in so
10253 	 * the set status may not yet be set to MNSET, so code following
10254 	 * this check must handle a MN diskset properly.
10255 	 */
10256 	if (md_get_setstatus(setno) & MD_SET_MNSET) {
10257 		mddb_setexit(s);
10258 		return (mderror(ep, MDE_INVAL_MNOP));
10259 	}
10260 
10261 	/* s_dtlp is NULL for MN diskset */
10262 	dtlp = s->s_dtlp;
10263 	while (dtlp != NULL) {
10264 		if (dtgpp->dtgp_dt.dt_id == 0 ||
10265 		    dtgpp->dtgp_dt.dt_id == dtlp->dtl_dt.dt_id) {
10266 			bcopy((caddr_t)&dtlp->dtl_dt, (caddr_t)&dtgpp->dtgp_dt,
10267 			    sizeof (mddb_dtag_t));
10268 			break;
10269 		}
10270 		dtlp = dtlp->dtl_nx;
10271 	}
10272 
10273 	/* Walked the whole list and id not found, return error */
10274 	if (dtlp == (mddb_dtag_lst_t *)NULL) {
10275 		mddb_setexit(s);
10276 		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
10277 	}
10278 
10279 	mddb_setexit(s);
10280 
10281 	return (0);
10282 }
10283 
10284 int
10285 usetag_ioctl(mddb_dtag_use_parm_t *dtupp, int mode)
10286 {
10287 	mddb_set_t	*s;
10288 	int		err = 0;
10289 	mddb_config_t	*cp;
10290 	mddb_ri_t	*trip = NULL;
10291 	mddb_dtag_t	*dtagp = NULL;
10292 	set_t		setno = dtupp->dtup_setno;
10293 	md_error_t	*ep = &dtupp->dtup_mde;
10294 
10295 	mdclrerror(ep);
10296 
10297 	if ((mode & FWRITE) == 0)
10298 		return (mdsyserror(ep, EACCES));
10299 
10300 	if (setno >= md_nsets)
10301 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10302 
10303 	if (dtupp->dtup_id < 0)
10304 		return (mdsyserror(ep, EINVAL));
10305 	else if (dtupp->dtup_id == 0)
10306 		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
10307 
10308 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10309 		return (0);
10310 
10311 	if ((md_get_setstatus(setno) & MD_SET_TAGDATA) == 0)
10312 		return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));
10313 
10314 	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL)
10315 		return (mddbstatus2error(ep, err, NODEV32, setno));
10316 
10317 	/*
10318 	 * Data tags not supported on MN sets so return invalid operation.
10319 	 * This ioctl could be called before the mddb has been read in so
10320 	 * the set status may not yet be set to MNSET, so code following
10321 	 * this check must handle a MN diskset properly.
10322 	 */
10323 	if (md_get_setstatus(setno) & MD_SET_MNSET) {
10324 		mddb_setexit(s);
10325 		return (mderror(ep, MDE_INVAL_MNOP));
10326 	}
10327 
10328 	/* Validate and find the id requested - nothing found if MN diskset */
10329 	if ((dtagp = dtl_findl(s, dtupp->dtup_id)) == NULL) {
10330 		mddb_setexit(s);
10331 		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
10332 	}
10333 
10334 	/* Usetag is only valid when more than one tag exists */
10335 	if (dtl_cntl(s) < 2) {
10336 		mddb_setexit(s);
10337 		return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));
10338 	}
10339 
10340 	/* Put the selected tag in place */
10341 	dt_setup(s, dtagp);
10342 
10343 	cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP);
10344 
10345 	/* Save the hint information */
10346 	trip = save_rip(s);
10347 
10348 	cp->c_timestamp = s->s_ident.createtime;	/* struct assignment */
10349 	cp->c_setno = setno;
10350 	cp->c_sideno = s->s_sideno;
10351 	(void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME);
10352 	cp->c_setname[MD_MAX_SETNAME] = '\0';
10353 	cp->c_med = s->s_med;				/* struct assignment */
10354 
10355 	mddb_setexit(s);
10356 
10357 	s = NULL;
10358 
10359 	/* shorthand */
10360 	setno = cp->c_setno;
10361 
10362 	/* Let unload know not to free the tag */
10363 	md_set_setstatus(setno, MD_SET_KEEPTAG);
10364 
10365 	/* Release the set */
10366 	if (err = release_set(cp, mode))
10367 		goto out;
10368 
10369 	if (! mdisok(&cp->c_mde)) {
10370 		(void) mdstealerror(ep, &cp->c_mde);
10371 		err = 1;
10372 		goto out;
10373 	}
10374 
10375 	/* Re-init set using the saved mddb_config_t structure */
10376 	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
10377 		if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
10378 			err = mddbstatus2error(ep, err, NODEV32, setno);
10379 			goto out;
10380 		}
10381 	}
10382 
10383 	ASSERT(s->s_rip == (mddb_ri_t *)NULL);
10384 
10385 	/* use the saved rip structure */
10386 	s->s_rip = trip;
10387 	trip = (mddb_ri_t *)NULL;
10388 
10389 	/* Let the take code know a tag is being used */
10390 	md_set_setstatus(setno, MD_SET_USETAG);
10391 
10392 	mddb_setexit(s);
10393 
10394 	s = NULL;
10395 
10396 	/* Take the set */
10397 	if (err = take_set(cp, mode))
10398 		goto out;
10399 
10400 	if (! mdisok(&cp->c_mde))
10401 		(void) mdstealerror(ep, &cp->c_mde);
10402 
10403 out:
10404 	md_clr_setstatus(setno, (MD_SET_USETAG | MD_SET_KEEPTAG));
10405 
10406 	kmem_free(cp, sizeof (mddb_config_t));
10407 
10408 	if (trip)
10409 		free_rip(&trip);
10410 
10411 	if (s)
10412 		mddb_setexit(s);
10413 
10414 	return (err);
10415 }
10416 
10417 int
10418 accept_ioctl(mddb_accept_parm_t *accpp, int mode)
10419 {
10420 	mddb_set_t	*s;
10421 	int		err = 0;
10422 	mddb_config_t	*cp;
10423 	mddb_ri_t	*trip = NULL;
10424 	set_t		setno = accpp->accp_setno;
10425 	md_error_t	*ep = &accpp->accp_mde;
10426 
10427 	mdclrerror(ep);
10428 
10429 	if ((mode & FWRITE) == 0)
10430 		return (mdsyserror(ep, EACCES));
10431 
10432 	if (setno >= md_nsets)
10433 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10434 
10435 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10436 		return (0);
10437 
10438 	if ((md_get_setstatus(setno) & MD_SET_ACCOK) == 0)
10439 		return (mdmddberror(ep, MDE_DB_ACCNOTOK, NODEV32, setno));
10440 
10441 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10442 		return (mddbstatus2error(ep, err, NODEV32, setno));
10443 
10444 	/*
10445 	 * Data tags not supported on MN sets so return invalid operation.
10446 	 * mddb is guaranteed to be incore at this point, so this
10447 	 * check will catch all MN disksets.
10448 	 */
10449 	if (md_get_setstatus(setno) & MD_SET_MNSET) {
10450 		mddb_setexit(s);
10451 		return (mderror(ep, MDE_INVAL_MNOP));
10452 	}
10453 
10454 	cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP);
10455 
10456 	trip = save_rip(s);
10457 
10458 	cp->c_timestamp = s->s_ident.createtime;	/* struct assignment */
10459 	cp->c_setno = setno;
10460 	cp->c_sideno = s->s_sideno;
10461 	(void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME);
10462 	cp->c_setname[MD_MAX_SETNAME] = '\0';
10463 	cp->c_med = s->s_med;				/* struct assignment */
10464 
10465 	/* Tag the data */
10466 	if (err = set_dtag(s, ep)) {
10467 		err = mdsyserror(ep, err);
10468 		goto out;
10469 	}
10470 
10471 	/* If we had a BADTAG, it will be re-written, so clear the bit. */
10472 	if (md_get_setstatus(setno) & MD_SET_BADTAG)
10473 		md_clr_setstatus(setno, MD_SET_BADTAG);
10474 
10475 	if (err = dt_write(s)) {
10476 		err = mdsyserror(ep, err);
10477 		goto out;
10478 	}
10479 
10480 	mddb_setexit(s);
10481 
10482 	s = NULL;
10483 
10484 	/* shorthand */
10485 	setno = cp->c_setno;
10486 
10487 	/* Clear the keeptag */
10488 	md_clr_setstatus(setno, MD_SET_KEEPTAG);
10489 
10490 	/* Release the set */
10491 	if (err = release_set(cp, mode))
10492 		goto out;
10493 
10494 	if (! mdisok(&cp->c_mde)) {
10495 		(void) mdstealerror(ep, &cp->c_mde);
10496 		goto out;
10497 	}
10498 
10499 	/* Re-init set using the saved mddb_config_t structure */
10500 	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
10501 		if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
10502 			err = mddbstatus2error(ep, err, NODEV32, setno);
10503 			goto out;
10504 		}
10505 	}
10506 
10507 	ASSERT(s->s_rip == (mddb_ri_t *)NULL);
10508 
10509 	/* Free the allocated rip structure */
10510 	if (s->s_rip != (mddb_ri_t *)NULL)
10511 		free_rip(&s->s_rip);
10512 
10513 	/* use the saved rip structure */
10514 	s->s_rip = trip;
10515 	trip = (mddb_ri_t *)NULL;
10516 
10517 	/* Let the set init code know an accept is in progress */
10518 	md_set_setstatus(setno, MD_SET_ACCEPT);
10519 
10520 	mddb_setexit(s);
10521 
10522 	s = NULL;
10523 
10524 	/* Take the set */
10525 	if (err = take_set(cp, mode))
10526 		goto out;
10527 
10528 	if (! mdisok(&cp->c_mde))
10529 		(void) mdstealerror(ep, &cp->c_mde);
10530 
10531 out:
10532 	md_clr_setstatus(setno, (MD_SET_ACCOK | MD_SET_ACCEPT));
10533 
10534 	kmem_free(cp, sizeof (mddb_config_t));
10535 
10536 	if (trip)
10537 		free_rip(&trip);
10538 
10539 	if (s)
10540 		mddb_setexit(s);
10541 
10542 	return (err);
10543 }
10544 
10545 /*
10546  * mddb_getinvlb_devid - cycles through the locator block and determines
10547  *		if the device id's for any of the replica disks are invalid.
10548  *		If so, it returns the diskname in the ctdptr.
10549  *	RETURN
10550  *		-1	Error
10551  *		cnt	number of invalid device id's
10552  */
10553 int
10554 mddb_getinvlb_devid(
10555 	set_t	setno,
10556 	int	count,
10557 	int	size,
10558 	char	**ctdptr
10559 )
10560 {
10561 	mddb_set_t	*s;
10562 	int		err = 0;
10563 	mddb_lb_t	*lbp;
10564 	int		li;
10565 	mddb_did_blk_t	*did_blk;
10566 	mddb_did_info_t	*did_info;
10567 	int		len;
10568 	int		cnt = 0;
10569 	char		*cptr;
10570 	md_name_suffix	*sn;
10571 	int		i, dont_add_it;
10572 	char		*tmpctd, *diskname;
10573 	char		*tmpname;
10574 
10575 	cptr = *ctdptr;
10576 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
10577 		return (-1);
10578 	}
10579 
10580 	single_thread_start(s);
10581 	lbp = s->s_lbp;
10582 
10583 	if (lbp->lb_setno != setno) {
10584 		single_thread_end(s);
10585 		mddb_setexit(s);
10586 		return (-1);
10587 	}
10588 
10589 	/* check for lb being devid style */
10590 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
10591 		did_blk = s->s_did_icp->did_ic_blkp;
10592 		for (li = 0; li < lbp->lb_loccnt; li++) {
10593 			did_info = &(did_blk->blk_info[li]);
10594 			/* Only if devid exists and isn't valid */
10595 			if ((did_info->info_flags & MDDB_DID_EXISTS) &&
10596 			    !(did_info->info_flags & MDDB_DID_VALID)) {
10597 				/*
10598 				 * if we count more invalid did's than
10599 				 * was passed in there's an error somewhere
10600 				 */
10601 				if (cnt++ > count) {
10602 					single_thread_end(s);
10603 					mddb_setexit(s);
10604 					return (-1);
10605 				}
10606 
10607 				/*
10608 				 * Future note: Need to do something here
10609 				 * for the MN diskset case when device ids
10610 				 * are supported in disksets.
10611 				 * Can't add until merging devids_in_diskset
10612 				 * code into code base.
10613 				 */
10614 
10615 				sn = &s->s_lnp->ln_suffixes[0][li];
10616 				/*
10617 				 * check to make sure length of device name is
10618 				 * not greater than computed first time through
10619 				 */
10620 				len = sn->suf_len;
10621 				if (len > size) {
10622 					single_thread_end(s);
10623 					mddb_setexit(s);
10624 					return (-1);
10625 				}
10626 				tmpctd = *ctdptr;
10627 				/* strip off slice part */
10628 				diskname = md_strdup(sn->suf_data);
10629 				tmpname = strrchr(diskname, 's');
10630 				*tmpname = '\0';
10631 				dont_add_it = 0;
10632 				/* look to see if diskname is already in list */
10633 				for (i = 0; i < (cnt-1); i++) {
10634 					if (strcmp(diskname, tmpctd) == 0) {
10635 						/* already there, don't add */
10636 						dont_add_it = 1;
10637 						break;
10638 					}
10639 					/* point to next diskname in list */
10640 					tmpctd += size;
10641 				}
10642 				if (dont_add_it == 0) {
10643 					/* add diskname to list */
10644 					(void) strcpy(cptr, diskname);
10645 					cptr += size;
10646 				}
10647 				kmem_free(diskname, strlen(sn->suf_data) + 1);
10648 			}
10649 		}
10650 	}
10651 	/* null terminate the list */
10652 	*cptr = '\0';
10653 	/*
10654 	 * need to save the new pointer so that calling routine can continue
10655 	 * to add information onto the end.
10656 	 */
10657 	*ctdptr = cptr;
10658 	single_thread_end(s);
10659 	mddb_setexit(s);
10660 	return (cnt);
10661 }
10662 
10663 /*
10664  * mddb_validate_lb - count the number of lb's with invalid device id's. Keep
10665  *		track of length of longest devicename.
10666  *	RETURN
10667  *		-1	error
10668  *		 cnt	number of lb's with invalid devid's
10669  */
10670 int
10671 mddb_validate_lb(
10672 	set_t	setno,
10673 	int	*rmaxsz
10674 )
10675 {
10676 	mddb_set_t	*s;
10677 	int		err = 0;
10678 	mddb_lb_t	*lbp;
10679 	int		li;
10680 	mddb_did_blk_t	*did_blk;
10681 	mddb_did_info_t	*did_info;
10682 	int		len;
10683 	int		cnt = 0;
10684 
10685 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10686 		return (-1);
10687 
10688 	single_thread_start(s);
10689 	lbp = s->s_lbp;
10690 
10691 	if (lbp->lb_setno != setno) {
10692 		single_thread_end(s);
10693 		mddb_setexit(s);
10694 		return (-1);
10695 	}
10696 
10697 	/* lb must be in devid style */
10698 	if ((lbp->lb_flags & MDDB_DEVID_STYLE) == 0)
10699 		goto mvl_out;
10700 
10701 	did_blk = s->s_did_icp->did_ic_blkp;
10702 	for (li = 0; li < lbp->lb_loccnt; li++) {
10703 		char		*minor_name;
10704 		mddb_locator_t	*lp;
10705 		dev_t		ddi_dev;
10706 		ddi_devid_t	devid;
10707 		ddi_devid_t	rtn_devid = NULL;
10708 		int		get_rval;
10709 
10710 		did_info = &(did_blk->blk_info[li]);
10711 		if (((did_info->info_flags & MDDB_DID_EXISTS) == 0) ||
10712 		    (did_info->info_flags & MDDB_DID_VALID))
10713 			continue;
10714 
10715 		/* Here we know, did exists but isn't valid */
10716 
10717 		lp = &lbp->lb_locators[li];
10718 		ddi_dev = expldev(lp->l_dev);
10719 		get_rval = mddb_devid_get(s, li, &devid, &minor_name);
10720 		ASSERT(get_rval == 1);
10721 		if ((ddi_lyr_get_devid(ddi_dev, &rtn_devid) == DDI_SUCCESS) &&
10722 		    (ddi_devid_compare(rtn_devid, devid) == 0)) {
10723 			did_info->info_flags = MDDB_DID_VALID |
10724 						MDDB_DID_EXISTS |
10725 						MDDB_DID_UPDATED;
10726 		} else {
10727 			cnt++;
10728 			/*
10729 			 * Future note: Need to do something here
10730 			 * for the MN diskset case when device ids
10731 			 * are supported in disksets.
10732 			 * Can't add until merging devids_in_diskset
10733 			 * code into code base.
10734 			 */
10735 			len = (&s->s_lnp->ln_suffixes[0][li])-> suf_len;
10736 			if (*rmaxsz < len)
10737 				*rmaxsz = len;
10738 		}
10739 		if (rtn_devid != NULL)
10740 			ddi_devid_free(rtn_devid);
10741 	}
10742 
10743 mvl_out:
10744 
10745 	if (push_lb(s) != 0)
10746 		cnt = -1;
10747 	(void) upd_med(s, "mddb_validate_lb(0)");
10748 	single_thread_end(s);
10749 	mddb_setexit(s);
10750 	return (cnt);
10751 }
10752 
10753 int
10754 check_active_locators()
10755 {
10756 	mddb_set_t	*s;
10757 	mddb_lb_t	*lbp;
10758 	int		li;
10759 	int		active = 0;
10760 
10761 	mutex_enter(&mddb_lock);
10762 	/* there is nothing here..so we can unload */
10763 	if ((mddb_set_t *)md_set[MD_LOCAL_SET].s_db == NULL) {
10764 		mutex_exit(&mddb_lock);
10765 		return (0);
10766 	}
10767 	s = (mddb_set_t *)md_set[MD_LOCAL_SET].s_db;
10768 	lbp = s->s_lbp;
10769 	if (lbp == NULL) {
10770 		mutex_exit(&mddb_lock);
10771 		return (0);
10772 	}
10773 
10774 	for (li = 0; li < lbp->lb_loccnt; li++) {
10775 		mddb_locator_t *lp = &lbp->lb_locators[li];
10776 		if (lp->l_flags & MDDB_F_ACTIVE) {
10777 			active = 1;
10778 			break;
10779 		}
10780 	}
10781 	mutex_exit(&mddb_lock);
10782 	return (active);
10783 }
10784 
10785 /*
10786  * regetoptrecord:
10787  * --------------
10788  *	Update the in-core optimized resync record contents by re-reading the
10789  *	record from the on-disk metadb.
10790  *	The contents of the resync record will be overwritten by calling this
10791  *	routine. This means that callers that require the previous contents to
10792  *	be preserved must save the data before calling this routine.
10793  *	Return values:
10794  *	0 - successfully read in resync record from a mddb
10795  *	1 - failure.  Unable to read resync record from either mddb.
10796  */
10797 static int
10798 regetoptrecord(
10799 	mddb_set_t	*s,
10800 	mddb_de_ic_t	*dep
10801 )
10802 {
10803 	mddb_lb_t	*lbp;
10804 	mddb_locator_t	*lp;
10805 	mddb_rb32_t	*rbp, *crbp;
10806 	int		li;
10807 	int		i;
10808 	int		err = 0;
10809 	size_t		recsize;
10810 
10811 #if defined(_ILP32) && !defined(lint)
10812 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
10813 #endif
10814 
10815 	recsize = dep->de_recsize;
10816 	crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
10817 
10818 	single_thread_start(s);
10819 	rbp = dep->de_rb;
10820 
10821 	dep->de_optinfo[0].o_flags |= MDDB_F_EDATA;
10822 	dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
10823 
10824 	lbp = s->s_lbp;
10825 
10826 	for (i = 0; i < 2; i++) {
10827 		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
10828 			continue;
10829 		li = dep->de_optinfo[i].o_li;
10830 		lp = &lbp->lb_locators[li];
10831 
10832 		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
10833 		    (lp->l_flags & MDDB_F_EMASTER))
10834 			continue;
10835 
10836 		/*
10837 		 * re-read the optimized resync record with failfast set
10838 		 * since a failed disk could lead to a very long wait.
10839 		 */
10840 		err = readblklst(s, (caddr_t)rbp, dep->de_blks,
10841 		    dep->de_blkcount, li, B_FAILFAST);
10842 
10843 		if (err)
10844 			continue;
10845 
10846 		if (rbp->rb_magic != MDDB_MAGIC_RB)
10847 			continue;
10848 
10849 		if (revchk(MDDB_REV_RB, rbp->rb_revision))
10850 			continue;
10851 
10852 		/* Check the crc for this record */
10853 		if (rec_crcchk(s, dep, rbp)) {
10854 			continue;
10855 		}
10856 		dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE;
10857 
10858 		if (rbp == crbp) {
10859 			if (rbp->rb_checksum != crbp->rb_checksum)
10860 				dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
10861 			break;
10862 		}
10863 		rbp = crbp;
10864 	}
10865 
10866 	single_thread_end(s);
10867 
10868 	if (rbp == crbp) {
10869 		rbp->rb_private = 0;
10870 		kmem_free((caddr_t)crbp, recsize);
10871 		return (0);
10872 	}
10873 	uniqtime32(&rbp->rb_timestamp);
10874 	/* Generate the crc for this record */
10875 	rec_crcgen(s, dep, rbp);
10876 	kmem_free((caddr_t)crbp, recsize);
10877 	return (1);
10878 }
10879 
10880 /*
10881  * mddb_reread_rr:
10882  *	Re-read the resync record from the on-disk copy. This is required for
10883  *	multi-node support so that a new mirror-owner can determine if a resync
10884  *	operation is required to guarantee data integrity.
10885  *
10886  * Arguments:
10887  *	setno	Associated set
10888  *	id	Resync record ID
10889  *
10890  * Return Value:
10891  *	0	successful reread
10892  *	-1	invalid set (not multi-node or non-existant)
10893  *	>0	metadb state invalid, failed to reread
10894  */
10895 int
10896 mddb_reread_rr(
10897 	set_t		setno,
10898 	mddb_recid_t	id
10899 )
10900 {
10901 	mddb_set_t	*s;
10902 	int		err = 0;
10903 	mddb_db_t	*dbp;
10904 	mddb_de_ic_t	*dep;
10905 
10906 	if (setno >= md_nsets)
10907 		return (-1);
10908 
10909 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10910 		return (-1);
10911 
10912 	if ((setno == MD_LOCAL_SET) || !(s->s_lbp->lb_flags & MDDB_MNSET)) {
10913 		mddb_setexit(s);
10914 		return (-1);
10915 	}
10916 
10917 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
10918 		dep = dbp->db_firstentry;
10919 		while (dep && (dep->de_recid != DBID(id)))
10920 			dep = dep->de_next;
10921 		if (dep != NULL)
10922 			break;
10923 	}
10924 
10925 	if (dep != NULL) {
10926 		err = regetoptrecord(s, dep);
10927 	} else {
10928 		err = -1;
10929 	}
10930 	mddb_setexit(s);
10931 	return (err);
10932 }
10933 
10934 /*
10935  * Set owner associated with MN optimized resync record.
10936  *
10937  * Optimized records have an owner node associated with them in
10938  * a MN diskset.  The owner is only set on a node that is actively
10939  * writing to that record.  The other nodes will show that record
10940  * as having an invalid owner.  The owner for an optimized record
10941  * is used during fixoptrecord to determine which node should
10942  * write out the record when the replicas associated with that
10943  * optimized record have been changed.
10944  *
10945  * Called directly from mirror driver and not from an ioctl.
10946  *
10947  * Returns
10948  *	NULL if successful.
10949  *	MDDB_E_NORECORD if record not found.
10950  */
10951 int
10952 mddb_setowner(
10953 	mddb_recid_t		id,
10954 	md_mn_nodeid_t		owner
10955 )
10956 {
10957 	mddb_set_t		*s;
10958 	mddb_db_t		*dbp;
10959 	mddb_de_ic_t		*dep;
10960 	int			found = 0;
10961 
10962 
10963 	if (DBSET(id) >= md_nsets)
10964 		return (MDDB_E_NORECORD);
10965 
10966 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
10967 		return (MDDB_E_NORECORD);
10968 
10969 	id = DBID(id);
10970 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
10971 		for (dep = dbp->db_firstentry;
10972 		    dep != NULL; dep = dep->de_next) {
10973 			if (dep->de_recid != id)
10974 				continue;
10975 			dep->de_owner_nodeid = owner;
10976 			found = 1;
10977 			break;
10978 		}
10979 		if (found)
10980 			break;
10981 	}
10982 
10983 	mddb_setexit(s);
10984 
10985 	if (!found) {
10986 		return (MDDB_E_NORECORD);
10987 	}
10988 
10989 	return (NULL);
10990 }
10991 
10992 /*
10993  * mddb_parse re-reads portions of the mddb from disk given a list
10994  * of good replicas to read from and flags describing
10995  * which portion of the mddb to read in.
10996  *
10997  * Used in a MN diskset when the master has made a change to some part
10998  * of the mddb and wants to relay this information to the slaves.
10999  */
11000 int
11001 mddb_parse(mddb_parse_parm_t *mpp)
11002 {
11003 	mddb_set_t	*s;
11004 	int		err = 0;
11005 	mddb_locator_t	*lp, *old_lp;
11006 	mddb_lb_t	*lbp, *old_lbp;
11007 	int		rval = 0;
11008 	int		i, li;
11009 	int		found_good_one = 0;
11010 	mddb_ln_t	*lnp;
11011 	mddb_block_t	ln_blkcnt;
11012 	md_error_t	*ep = &mpp->c_mde;
11013 
11014 	if (mpp->c_setno >= md_nsets)
11015 		return (EINVAL);
11016 
11017 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11018 		return (0);
11019 
11020 	if ((s = mddb_setenter(mpp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
11021 		return (mddbstatus2error(ep, err, NODEV32, mpp->c_setno));
11022 	}
11023 
11024 	if (!(MD_MNSET_SETNO(mpp->c_setno))) {
11025 		mddb_setexit_no_parse(s);
11026 		return (EINVAL);
11027 	}
11028 
11029 	/*
11030 	 * Master node initiated this request, so there's no work for
11031 	 * the master node to do.
11032 	 */
11033 	if (md_set[mpp->c_setno].s_am_i_master) {
11034 		mddb_setexit_no_parse(s);
11035 		return (rval);
11036 	}
11037 
11038 	single_thread_start(s);
11039 
11040 	if (mpp->c_parse_flags & MDDB_PARSE_LOCBLK) {
11041 		lbp = 0;
11042 		for (i = 0; i < MDDB_NLB; i++) {
11043 			/* Walk through master's active list */
11044 			if (!(mpp->c_lb_flags[i] & MDDB_F_ACTIVE))
11045 				continue;
11046 			if (s->s_mbiarray[i] == NULL)
11047 				continue;
11048 
11049 			/* Assumes master blocks are already setup */
11050 			if (lbp == (mddb_lb_t *)NULL) {
11051 				lbp = (mddb_lb_t *)kmem_zalloc(
11052 					dbtob(MDDB_MNLBCNT), KM_SLEEP);
11053 			}
11054 			err |= readblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, i);
11055 
11056 			if (err)
11057 				continue;
11058 
11059 			if (lbp->lb_magic != MDDB_MAGIC_LB)
11060 				continue;
11061 			if (lbp->lb_blkcnt != MDDB_MNLBCNT)
11062 				continue;
11063 			if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
11064 				continue;
11065 			if (crcchk(lbp, &lbp->lb_checksum, dbtob(MDDB_MNLBCNT),
11066 			    NULL))
11067 				continue;
11068 			if (lbp->lb_setno != s->s_setno)
11069 				continue;
11070 			/*
11071 			 * a commit count of zero means this locator has
11072 			 * been deleted
11073 			 */
11074 			if (lbp->lb_commitcnt == 0) {
11075 				continue;
11076 			}
11077 			/* Found a good locator - keep it */
11078 			found_good_one = 1;
11079 			break;
11080 		}
11081 
11082 		/*
11083 		 * If found a good copy of the mddb, then read it into
11084 		 * this node's locator block.  Fix up the set's s_mbiarray
11085 		 * pointer (master block incore array pointer) to be
11086 		 * in sync with the newly read in locator block.  If a
11087 		 * new mddb was added, read in the master blocks associated
11088 		 * with the new mddb.  If an mddb was deleted, free the
11089 		 * master blocks associated with deleted mddb.
11090 		 */
11091 		if (found_good_one)  {
11092 			/* Compare old and new view of mddb locator blocks */
11093 			old_lbp = s->s_lbp;
11094 			for (li = 0; li < lbp->lb_loccnt; li++) {
11095 				int	mn_set;
11096 
11097 				lp = &lbp->lb_locators[li];
11098 				old_lp = &old_lbp->lb_locators[li];
11099 
11100 				/* If old and new views match, continue */
11101 				if ((lp->l_flags & MDDB_F_ACTIVE) ==
11102 				    (old_lp->l_flags & MDDB_F_ACTIVE))
11103 					continue;
11104 
11105 				if (lp->l_flags & MDDB_F_ACTIVE) {
11106 					/*
11107 					 * If new mddb has been added - delete
11108 					 * old mbiarray and get new one.
11109 					 *
11110 					 * When devids are supported, will
11111 					 * need to get dev from devid.
11112 					 */
11113 					if (s->s_mbiarray[li]) {
11114 						free_mbipp(&s->s_mbiarray[li]);
11115 					}
11116 					/*
11117 					 * If getmasters fails, getmasters
11118 					 * will set appropriate error flags.
11119 					 */
11120 					s->s_mbiarray[li] = getmasters(s,
11121 					    md_expldev(lp->l_dev), lp->l_blkno,
11122 					    (uint_t *)&(lp->l_flags), &mn_set);
11123 				} else if (lp->l_flags & MDDB_F_DELETED) {
11124 					/*
11125 					 * If old one has been deleted -
11126 					 * delete old mbiarray.
11127 					 */
11128 					if (s->s_mbiarray[li]) {
11129 						free_mbipp(&s->s_mbiarray[li]);
11130 					}
11131 				}
11132 			}
11133 
11134 			/* Free this node's old view of mddb locator blocks */
11135 			kmem_free((caddr_t)s->s_lbp,
11136 				dbtob(s->s_lbp->lb_blkcnt));
11137 			s->s_lbp = lbp;
11138 		} else {
11139 			if (lbp)
11140 				kmem_free(lbp, dbtob(MDDB_MNLBCNT));
11141 		}
11142 	}
11143 
11144 	if (mpp->c_parse_flags & MDDB_PARSE_LOCNM) {
11145 		lnp = s->s_lnp;
11146 		lbp = s->s_lbp;
11147 		ln_blkcnt = lbp->lb_lnblkcnt;
11148 		s->s_lnp = NULL; /* readlocnames does this anyway */
11149 		for (li = 0; li < lbp->lb_loccnt; li++) {
11150 			lp = &lbp->lb_locators[li];
11151 
11152 			if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
11153 			    (lp->l_flags & MDDB_F_EMASTER))
11154 				continue;
11155 
11156 			/* Successfully read the locator names */
11157 			if (readlocnames(s, li) == 0)
11158 				break;
11159 		}
11160 
11161 		if (li == lbp->lb_loccnt) {
11162 			/* Did not successfully read locnames; restore lnp */
11163 			s->s_lnp = lnp;
11164 		} else {
11165 			/* readlocnames successful, free old struct */
11166 			kmem_free((caddr_t)lnp, dbtob(ln_blkcnt));
11167 		}
11168 	}
11169 
11170 	if (mpp->c_parse_flags & MDDB_PARSE_OPTRECS) {
11171 		mddb_de_ic_t	*dep, *tdep, *first_dep, *dep2;
11172 		mddb_db_t	*dbp;
11173 		mddb_db32_t	*db32p;
11174 		mddb_de32_t	*de32p, *de32p2;
11175 		int		writeout;
11176 
11177 		lbp = s->s_lbp;
11178 		/*
11179 		 * Walk through directory block and directory entry incore
11180 		 * linked list looking for optimized resync records.
11181 		 * For each opt record found, re-read in directory block.
11182 		 * The directoy block consists of a number of directory
11183 		 * entries.  The directory entry for this opt record will
11184 		 * describe which 2 mddbs actually contain the resync record
11185 		 * since it could have been relocated by the master node
11186 		 * due to mddb failure or mddb deletion.  If this node
11187 		 * is the record owner for this opt record, then write out
11188 		 * the record to the 2 mddbs listed in the directory entry
11189 		 * if the mddbs locations are different than previously known.
11190 		 */
11191 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11192 			for (dep = dbp->db_firstentry; dep;
11193 			    dep = dep->de_next) {
11194 				/* Found an opt record */
11195 				if (dep->de_flags & MDDB_F_OPT)
11196 					break;
11197 			}
11198 			/* If no opt records found, go to next dbp */
11199 			if (dep == NULL)
11200 				continue;
11201 
11202 			/*
11203 			 * Reread directory block from disk since
11204 			 * master could have rewritten in during fixoptrecord.
11205 			 */
11206 			db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE,
11207 				KM_SLEEP);
11208 			create_db32rec(db32p, dbp);
11209 			for (li = 0; li < lbp->lb_loccnt; li++) {
11210 				lp = &lbp->lb_locators[li];
11211 
11212 				if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
11213 				    (lp->l_flags & MDDB_F_EMASTER))
11214 					continue;
11215 
11216 				err = readblks(s, (caddr_t)db32p,
11217 					db32p->db32_blknum, 1, li);
11218 				if (err)
11219 					continue;
11220 
11221 				/* Reverify db; go to next mddb if bad */
11222 				if ((db32p->db32_magic != MDDB_MAGIC_DB) ||
11223 				    (revchk(MDDB_REV_DB,
11224 					db32p->db32_revision)) ||
11225 				    (crcchk(db32p, &db32p->db32_checksum,
11226 					MDDB_BSIZE, NULL))) {
11227 					continue;
11228 				} else {
11229 					break;
11230 				}
11231 			}
11232 			/*
11233 			 * If all mddbs are unavailable then panic since
11234 			 * this slave cannot be allowed to continue out-of-sync
11235 			 * with the master node.  Since the optimized resync
11236 			 * records are written by all nodes, all nodes must
11237 			 * stay in sync with the master.
11238 			 *
11239 			 * This also handles the case when all storage
11240 			 * connectivity to a slave node has failed.  The
11241 			 * slave node will send an MDDB_OPTRECERR message to
11242 			 * the master node when the slave node has been unable
11243 			 * to write an optimized resync record to both
11244 			 * designated mddbs.  After the master has fixed the
11245 			 * optimized records to be on available mddbs, the
11246 			 * MDDB_PARSE message (with the flag MDDB_PARSE_OPTRECS)
11247 			 * is sent to all slave nodes.  If a slave node is
11248 			 * unable to access any mddb in order to read in the
11249 			 * relocated optimized resync record, then the slave
11250 			 * node must panic.
11251 			 */
11252 			if (li == lbp->lb_loccnt) {
11253 				kmem_free((caddr_t)db32p, MDDB_BSIZE);
11254 				cmn_err(CE_PANIC, "md: mddb: Node unable to "
11255 					"access any SVM state database "
11256 					"replicas for diskset %s\n",
11257 					s->s_setname);
11258 			}
11259 			/*
11260 			 * Setup temp copy of linked list of de's.
11261 			 * Already have an incore copy, but need to walk
11262 			 * the directory entry list contained in the
11263 			 * new directory block that was just read in above.
11264 			 * After finding the directory entry of an opt record
11265 			 * by walking the incore list, find the corresponding
11266 			 * entry in the temporary list and then update
11267 			 * the incore directory entry record with
11268 			 * the (possibly changed) mddb location stored
11269 			 * for the optimized resync records.
11270 			 */
11271 			de32p = (mddb_de32_t *)
11272 			    ((void *) ((caddr_t)
11273 			    (&db32p->db32_firstentry)
11274 			    + sizeof (db32p->db32_firstentry)));
11275 			tdep = (mddb_de_ic_t *)
11276 			    kmem_zalloc(sizeof (mddb_de_ic_t) -
11277 			    sizeof (mddb_block_t) +
11278 			    sizeof (mddb_block_t) *
11279 			    de32p->de32_blkcount, KM_SLEEP);
11280 			de32tode(de32p, tdep);
11281 			first_dep = tdep;
11282 			while (de32p && de32p->de32_next) {
11283 				de32p2 = nextentry(de32p);
11284 				dep2 = (mddb_de_ic_t *)kmem_zalloc(
11285 				    sizeof (mddb_de_ic_t) -
11286 				    sizeof (mddb_block_t) +
11287 				    sizeof (mddb_block_t) *
11288 				    de32p2->de32_blkcount, KM_SLEEP);
11289 				de32tode(de32p2, dep2);
11290 				tdep->de_next = dep2;
11291 				tdep = dep2;
11292 				de32p = de32p2;
11293 			}
11294 
11295 			/* Now, walk the incore directory entry list */
11296 			for (dep = dbp->db_firstentry; dep;
11297 			    dep = dep->de_next) {
11298 				if (! (dep->de_flags & MDDB_F_OPT))
11299 					continue;
11300 				/*
11301 				 * Found an opt record in the incore copy.
11302 				 * Find the corresponding entry in the temp
11303 				 * list.  If anything has changed in the
11304 				 * opt record info between the incore copy
11305 				 * and the temp copy, update the incore copy
11306 				 * and set a flag to writeout the opt record
11307 				 * to the new mddb locations.
11308 				 */
11309 				for (tdep = first_dep; tdep;
11310 				    tdep = tdep->de_next) {
11311 					if (dep->de_recid == tdep->de_recid) {
11312 					    writeout = 0;
11313 					    /* Check first mddb location */
11314 					    if ((dep->de_optinfo[0].o_li !=
11315 						tdep->de_optinfo[0].o_li) ||
11316 						(dep->de_optinfo[0].o_flags !=
11317 						tdep->de_optinfo[0].o_flags)) {
11318 						    dep->de_optinfo[0] =
11319 						    tdep->de_optinfo[0];
11320 						    writeout = 1;
11321 					    }
11322 					    /* Check second mddb location */
11323 					    if ((dep->de_optinfo[1].o_li !=
11324 						tdep->de_optinfo[1].o_li) ||
11325 						(dep->de_optinfo[1].o_flags !=
11326 						tdep->de_optinfo[1].o_flags)) {
11327 						    dep->de_optinfo[1] =
11328 						    tdep->de_optinfo[1];
11329 						    writeout = 1;
11330 					    }
11331 					    /* Record owner should rewrite it */
11332 					    if ((writeout) &&
11333 						(dep->de_owner_nodeid ==
11334 						md_set[mpp->c_setno].
11335 						s_nodeid)) {
11336 						    (void) writeoptrecord(s,
11337 							dep);
11338 					    }
11339 					    break;
11340 					}
11341 				}
11342 			}
11343 			/*
11344 			 * Update the incore checksum information for this
11345 			 * directory block to match the newly read in checksum.
11346 			 * This should have only changed if the incore and
11347 			 * temp directory entries differed, but it takes
11348 			 * more code to do the check than to just update
11349 			 * the information everytime.
11350 			 */
11351 			dbp->db_checksum = db32p->db32_checksum;
11352 
11353 			/* Now free everything */
11354 			tdep = first_dep;
11355 			while (tdep) {
11356 				dep2 = tdep->de_next;
11357 				kmem_free((caddr_t)tdep,
11358 				    sizeofde(tdep));
11359 				tdep = dep2;
11360 			}
11361 			kmem_free((caddr_t)db32p, MDDB_BSIZE);
11362 		}
11363 		rval = 0;
11364 	}
11365 out:
11366 	single_thread_end(s);
11367 	mddb_setexit_no_parse(s);
11368 	return (rval);
11369 }
11370 
11371 int
11372 mddb_block(mddb_block_parm_t *mbp)
11373 {
11374 	mddb_set_t	*s;
11375 	int		err = 0;
11376 	md_error_t	*ep = &mbp->c_mde;
11377 
11378 	if (mbp->c_setno >= md_nsets)
11379 		return (EINVAL);
11380 
11381 	/*
11382 	 * If the new_master flag is set for this setno we are in the middle
11383 	 * of a reconfig cycle, and blocking or unblocking is not needed.
11384 	 * Hence we can return success immediately
11385 	 */
11386 	if (md_get_setstatus(mbp->c_setno) & MD_SET_MN_NEWMAS_RC) {
11387 		return (0);
11388 	}
11389 
11390 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11391 		return (0);
11392 
11393 	if ((s = mddb_setenter(mbp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
11394 		return (mddbstatus2error(ep, err, NODEV32, mbp->c_setno));
11395 	}
11396 
11397 	if (!(MD_MNSET_SETNO(mbp->c_setno))) {
11398 		mddb_setexit_no_parse(s);
11399 		return (EINVAL);
11400 	}
11401 
11402 	single_thread_start(s);
11403 
11404 	if (mbp->c_blk_flags & MDDB_BLOCK_PARSE)
11405 		md_set_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK);
11406 
11407 	if (mbp->c_blk_flags & MDDB_UNBLOCK_PARSE)
11408 		md_clr_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK);
11409 
11410 	single_thread_end(s);
11411 	mddb_setexit_no_parse(s);
11412 	return (err);
11413 }
11414 
11415 /*
11416  * mddb_optrecfix marks up to 2 mddbs as failed and calls fixoptrecords
11417  * to relocate any optimized resync records to available mddbs.
11418  * This routine is only called on the master node.
11419  *
11420  * Used in a MN diskset when a slave node has failed to write an optimized
11421  * resync record.  The failed mddb information is sent to the master node
11422  * so the master can relocate the optimized records, if possible.  If the
11423  * failed mddb information has a mddb marked as failed that was previously
11424  * marked active on the master, the master sets its incore mddb state to
11425  * EWRITE and sets the PARSE_LOCBLK flag.  The master node then attempts
11426  * to relocate any optimized records on the newly failed mddbs by calling
11427  * fixoptrecords.  (fixoptrecords will set the PARSE_OPTRECS flag if any
11428  * optimized records are relocated.)
11429  *
11430  * When mddb_optrecfix is finished, the ioctl exit code will notice the PARSE
11431  * flags and will send a PARSE message to the slave nodes.  The PARSE_LOCBLK
11432  * flag causes the slave node to re-read in the locator block from disk.
11433  * The PARSE_OPTRECS flag causes the slave node to re-read in the directory
11434  * blocks and write out any optimized resync records that have been
11435  * relocated to a different mddb.
11436  */
11437 int
11438 mddb_optrecfix(mddb_optrec_parm_t *mop)
11439 {
11440 	mddb_set_t		*s;
11441 	int			err = 0;
11442 	mddb_lb_t		*lbp;
11443 	mddb_mnlb_t		*mnlbp;
11444 	mddb_locator_t		*lp;
11445 	int			li;
11446 	mddb_mnsidelocator_t	*mnslp;
11447 	mddb_drvnm_t		*dn;
11448 	int			i, j;
11449 	md_replica_recerr_t	*recerr;
11450 	md_error_t		*ep = &mop->c_mde;
11451 	int			something_changed = 0;
11452 	int			alc, lc;
11453 	int			setno;
11454 
11455 	setno = mop->c_setno;
11456 	if (mop->c_setno >= md_nsets)
11457 		return (EINVAL);
11458 
11459 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11460 		return (0);
11461 
11462 	if ((s = mddb_setenter(mop->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
11463 		return (mddbstatus2error(ep, err, NODEV32, mop->c_setno));
11464 	}
11465 
11466 	if (!(MD_MNSET_SETNO(mop->c_setno))) {
11467 		mddb_setexit(s);
11468 		return (EINVAL);
11469 	}
11470 
11471 	single_thread_start(s);
11472 	lbp = s->s_lbp;
11473 	mnlbp = (mddb_mnlb_t *)lbp;
11474 
11475 	/*
11476 	 * If slave node has seen an mddb failure, but the master node
11477 	 * hasn't encountered this failure, mark the mddb as failed on
11478 	 * the master node and set the something_changed flag to 1.
11479 	 */
11480 	for (i = 0; i < 2; i++) {
11481 		recerr = &mop->c_recerr[i];
11482 		if (recerr->r_flags & MDDB_F_EWRITE) {
11483 			li = recerr->r_li;
11484 			lp = &lbp->lb_locators[li];
11485 			for (j = 0; j < MD_MNMAXSIDES; j++) {
11486 				mnslp = &mnlbp->lb_mnsidelocators[j][li];
11487 				if (mnslp->mnl_sideno == s->s_sideno)
11488 					break;
11489 			}
11490 			/* Do quick check using li */
11491 			if (j != MD_MNMAXSIDES)
11492 				dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
11493 
11494 			if ((j != MD_MNMAXSIDES) &&
11495 			    (strncmp(dn->dn_data, recerr->r_driver_name,
11496 			    MD_MAXDRVNM) == 0) &&
11497 			    (recerr->r_blkno == lp->l_blkno) &&
11498 			    (recerr->r_mnum == mnslp->mnl_mnum)) {
11499 				if ((lp->l_flags & MDDB_F_ACTIVE) ||
11500 				    ((lp->l_flags & MDDB_F_EWRITE) == 0)) {
11501 					something_changed = 1;
11502 					lp->l_flags |= MDDB_F_EWRITE;
11503 					lp->l_flags &= ~MDDB_F_ACTIVE;
11504 				}
11505 			} else {
11506 			/*
11507 			 * Passed in li from slave does not match
11508 			 * the replica in the master's structures.
11509 			 * This could have occurred if a delete
11510 			 * mddb command was running when the
11511 			 * optimized resync record had a failure.
11512 			 * Search all replicas for this entry.
11513 			 * If no match, just ignore.
11514 			 * If a match, set replica in error.
11515 			 */
11516 			    for (li = 0; li < lbp->lb_loccnt; li++) {
11517 				lp = &lbp->lb_locators[li];
11518 				if (lp->l_flags & MDDB_F_DELETED)
11519 					continue;
11520 
11521 				for (j = 0; j < MD_MNMAXSIDES; j++) {
11522 					mnslp =
11523 					    &mnlbp->lb_mnsidelocators[j][li];
11524 					if (mnslp->mnl_sideno == s->s_sideno)
11525 						break;
11526 				}
11527 				if (j == MD_MNMAXSIDES)
11528 					continue;
11529 
11530 				dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
11531 				if ((strncmp(dn->dn_data, recerr->r_driver_name,
11532 				    MD_MAXDRVNM) == 0) &&
11533 				    (recerr->r_blkno == lp->l_blkno) &&
11534 				    (recerr->r_mnum == mnslp->mnl_mnum)) {
11535 					if ((lp->l_flags & MDDB_F_ACTIVE) ||
11536 					    ((lp->l_flags & MDDB_F_EWRITE)
11537 					    == 0)) {
11538 						something_changed = 1;
11539 						lp->l_flags |= MDDB_F_EWRITE;
11540 						lp->l_flags &= ~MDDB_F_ACTIVE;
11541 					}
11542 					break;
11543 				}
11544 			    }
11545 			}
11546 		}
11547 	}
11548 
11549 	/*
11550 	 * If this message changed nothing, then we're done since this
11551 	 * failure has already been handled.
11552 	 * If some mddb state has been changed, send a parse message to
11553 	 * the slave nodes so that the slaves will re-read the locator
11554 	 * block from disk.
11555 	 */
11556 	if (something_changed == 0) {
11557 		single_thread_end(s);
11558 		mddb_setexit(s);
11559 		return (0);
11560 	} else {
11561 		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
11562 	}
11563 
11564 	/*
11565 	 * Scan replicas setting MD_SET_TOOFEW if
11566 	 * 50% or more of the mddbs have seen errors.
11567 	 * Note: Don't call selectreplicas or writeretry
11568 	 * since these routines may end up setting the ACTIVE flag
11569 	 * on a failed mddb if the master is able to access the mddb
11570 	 * but the slave node couldn't.  Need to have the ACTIVE flag
11571 	 * turned off in order to relocate the optimized records to
11572 	 * mddbs that are (hopefully) available on all nodes.
11573 	 */
11574 	alc = 0;
11575 	lc = 0;
11576 	for (li = 0; li < lbp->lb_loccnt; li++) {
11577 		lp = &lbp->lb_locators[li];
11578 		if (lp->l_flags & MDDB_F_DELETED)
11579 			continue;
11580 		lc++;
11581 		if (! (lp->l_flags & MDDB_F_ACTIVE))
11582 			continue;
11583 		alc++;
11584 	}
11585 
11586 	/*
11587 	 * If more than 50% mddbs have failed, then don't relocate opt recs.
11588 	 * The node sending the mddb failure information will detect TOOFEW
11589 	 * and will panic when it attempts to re-write the optimized record.
11590 	 */
11591 	if (alc < ((lc + 1) / 2)) {
11592 		md_set_setstatus(setno, MD_SET_TOOFEW);
11593 		(void) push_lb(s);
11594 		(void) upd_med(s, "mddb_optrecfix(0)");
11595 		single_thread_end(s);
11596 		mddb_setexit(s);
11597 		return (0);
11598 	}
11599 
11600 	/* Attempt to relocate optimized records that are on failed mddbs */
11601 	(void) fixoptrecords(s);
11602 
11603 	/* Push changed locator block out to disk */
11604 	(void) push_lb(s);
11605 	(void) upd_med(s, "mddb_optrecfix(1)");
11606 
11607 	/* Recheck for TOOFEW after writing out locator blocks */
11608 	alc = 0;
11609 	lc = 0;
11610 	for (li = 0; li < lbp->lb_loccnt; li++) {
11611 		lp = &lbp->lb_locators[li];
11612 		if (lp->l_flags & MDDB_F_DELETED)
11613 			continue;
11614 		lc++;
11615 		if (! (lp->l_flags & MDDB_F_ACTIVE))
11616 			continue;
11617 		alc++;
11618 	}
11619 
11620 	/* If more than 50% mddbs have failed, then don't relocate opt recs */
11621 	if (alc < ((lc + 1) / 2)) {
11622 		md_set_setstatus(setno, MD_SET_TOOFEW);
11623 		single_thread_end(s);
11624 		mddb_setexit(s);
11625 		return (0);
11626 	}
11627 
11628 	single_thread_end(s);
11629 	mddb_setexit(s);
11630 	return (0);
11631 }
11632 
11633 /*
11634  * Check if incore mddb on master node matches ondisk mddb.
11635  * If not, master writes out incore view to all mddbs.
11636  * Have previously verified that master is an owner of the
11637  * diskset (master has snarfed diskset) and that diskset is
11638  * not stale.
11639  *
11640  * Meant to be called during reconfig cycle during change of master.
11641  * Previous master in diskset may have changed the mddb and
11642  * panic'd before relaying information to slave nodes.  New
11643  * master node just writes out its incore view of the mddb and
11644  * the replay of the change log will resync all the nodes.
11645  *
11646  * Only supported for MN disksets.
11647  *
11648  * Return values:
11649  *	0 - success
11650  *	non-zero - failure
11651  */
11652 int
11653 mddb_check_write_ioctl(mddb_config_t *info)
11654 {
11655 	int			err = 0;
11656 	set_t			setno = info->c_setno;
11657 	mddb_set_t		*s;
11658 	int			li;
11659 	mddb_locator_t		*lp;
11660 	mddb_lb_t		*lbp;
11661 	mddb_mnlb_t		*mnlbp_od;
11662 	mddb_ln_t		*lnp;
11663 	mddb_mnln_t		*mnlnp_od;
11664 	mddb_db_t		*dbp;
11665 	mddb_de_ic_t		*dep;
11666 	int			write_out_mddb;
11667 	md_error_t		*ep = &info->c_mde;
11668 	int			mddb_err = 0;
11669 	int			prev_li = 0;
11670 	int			rval = 0;
11671 	int			alc, lc;
11672 	int			mddbs_present = 0;
11673 
11674 	/* Verify that setno is in valid range */
11675 	if (setno >= md_nsets)
11676 		return (EINVAL);
11677 
11678 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11679 		return (0);
11680 
11681 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
11682 		return (mddbstatus2error(ep, err, NODEV32, setno));
11683 	}
11684 
11685 	/* Calling diskset must be a MN diskset */
11686 	if (!(MD_MNSET_SETNO(setno))) {
11687 		mddb_setexit(s);
11688 		return (EINVAL);
11689 	}
11690 
11691 	/* Re-verify that set is not stale */
11692 	if (md_get_setstatus(setno) & MD_SET_STALE) {
11693 		mddb_setexit(s);
11694 		return (mdmddberror(ep, MDE_DB_STALE,
11695 			NODEV32, setno));
11696 	}
11697 
11698 	lbp = s->s_lbp;
11699 	lnp = s->s_lnp;
11700 
11701 	/*
11702 	 * Previous master could have died during the write of data to
11703 	 * the mddbs so that the ondisk mddbs may not be consistent.
11704 	 * So, need to check the contents of the first and last active mddb
11705 	 * to see if the mddbs need to be rewritten.
11706 	 */
11707 	for (li = 0; li < lbp->lb_loccnt; li++) {
11708 		int	checkcopy_err;
11709 
11710 		lp = &lbp->lb_locators[li];
11711 		/* Find replica that is active */
11712 		if (lp->l_flags & MDDB_F_DELETED)
11713 			continue;
11714 		mddbs_present = 1;
11715 		if (! (lp->l_flags & MDDB_F_ACTIVE))
11716 			continue;
11717 		if (s->s_mbiarray[li] == NULL)
11718 			continue;
11719 		/* Check locator block */
11720 		mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT),
11721 		    KM_SLEEP);
11722 		/* read in on-disk locator block */
11723 		err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li);
11724 
11725 		/* If err, try next mddb */
11726 		if (err) {
11727 			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
11728 			continue;
11729 		}
11730 
11731 		/*
11732 		 * We resnarf all changelog entries for this set.
11733 		 * They may have been altered by the previous master
11734 		 */
11735 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11736 		    for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
11737 			if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) {
11738 				continue;
11739 			}
11740 			/* This has been alloc'ed while joining the set */
11741 			if (dep->de_rb) {
11742 				kmem_free(dep->de_rb, dep->de_recsize);
11743 				dep->de_rb = (mddb_rb32_t *)NULL;
11744 			}
11745 			if (dep->de_rb_userdata) {
11746 				kmem_free(dep->de_rb_userdata, dep->de_reqsize);
11747 				dep->de_rb_userdata = (caddr_t)NULL;
11748 			}
11749 
11750 			err = getrecord(s, dep, li);
11751 			if (err) {
11752 				/*
11753 				 * When we see on error while reading the
11754 				 * changelog entries, we move on to the next
11755 				 * mddb
11756 				 */
11757 				err = 1;
11758 				break; /* out of inner for-loop */
11759 			}
11760 			allocuserdata(dep);
11761 		    }
11762 		    if (err)
11763 			    break; /* out of outer for-loop */
11764 		}
11765 
11766 		/* If err, try next mddb */
11767 		if (err) {
11768 			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
11769 			continue;
11770 		}
11771 
11772 		/* Is incore locator block same as ondisk? */
11773 		if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
11774 									== 1) {
11775 			write_out_mddb = 1;
11776 			kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11777 			break;
11778 		}
11779 
11780 		kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11781 
11782 		/* If lb ok, check locator names */
11783 		mnlnp_od = (mddb_mnln_t *)kmem_zalloc(dbtob(MDDB_MNLNCNT),
11784 		    KM_SLEEP);
11785 		/* read in on-disk locator names */
11786 		err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
11787 			lbp->lb_lnblkcnt, li);
11788 
11789 		/* If err, try next mddb */
11790 		if (err) {
11791 			kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT));
11792 			continue;
11793 		}
11794 
11795 		/* Are incore locator names same as ondisk? */
11796 		if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
11797 									== 1) {
11798 			kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11799 			write_out_mddb = 1;
11800 			break;
11801 		}
11802 
11803 		kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11804 
11805 		/*
11806 		 * Check records in mddb.
11807 		 * If a read error is encountered, set the error flag and
11808 		 * continue to the next mddb.  Otherwise, if incore data is
11809 		 * different from ondisk, then set the flag to write out
11810 		 * the mddb and break out.
11811 		 */
11812 		checkcopy_err = checkcopy(s, li);
11813 		if (checkcopy_err == MDDB_F_EREAD) {
11814 			lp->l_flags |= MDDB_F_EREAD;
11815 			mddb_err = 1;
11816 			continue;
11817 		} else if (checkcopy_err == 1) {
11818 			write_out_mddb = 1;
11819 			break;
11820 		}
11821 		/*
11822 		 * Have found first active mddb and the data is the same as
11823 		 * incore - break out of loop
11824 		 */
11825 		write_out_mddb = 0;
11826 		break;
11827 	}
11828 
11829 	/*
11830 	 * Skip checking for last active mddb if:
11831 	 *	- already found a mismatch in the first active mddb
11832 	 *		(write_out_mddb is 1)  OR
11833 	 * 	- didn't find a readable mddb when looking for first
11834 	 *	  active mddb (there are mddbs present but all failed
11835 	 *	  when read was attempted).
11836 	 *
11837 	 * In either case, go to write_out_mddb label in order to attempt
11838 	 * to write out the data. If < 50% mddbs are available, panic.
11839 	 */
11840 	if ((write_out_mddb == 1) ||
11841 	    ((li == lbp->lb_loccnt) && mddbs_present)) {
11842 		write_out_mddb = 1;
11843 		goto write_out_mddb;
11844 	}
11845 
11846 	/*
11847 	 * Save which index was checked for the first active mddb.  If only 1
11848 	 * active mddb, don't want to recheck the same mddb when looking for
11849 	 * last active mddb.
11850 	 */
11851 	prev_li = li;
11852 
11853 	/*
11854 	 * Now, checking for last active mddb.  If found same index as before
11855 	 * (only 1 active mddb), then skip.
11856 	 */
11857 	for (li = (lbp->lb_loccnt - 1); li >= 0; li--) {
11858 		int	checkcopy_err;
11859 
11860 		lp = &lbp->lb_locators[li];
11861 		/* Find replica that is active */
11862 		if (! (lp->l_flags & MDDB_F_ACTIVE))
11863 			continue;
11864 		if (lp->l_flags & MDDB_F_DELETED)
11865 			continue;
11866 		if (s->s_mbiarray[li] == NULL)
11867 			continue;
11868 		/* If already checked mddb, bail out */
11869 		if (li == prev_li)
11870 			break;
11871 		/* Check locator block */
11872 		mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT),
11873 		    KM_SLEEP);
11874 		/* read in on-disk locator block */
11875 		err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li);
11876 
11877 		/* If err, try next mddb */
11878 		if (err) {
11879 			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
11880 			continue;
11881 		}
11882 
11883 
11884 		/* Is incore locator block same as ondisk? */
11885 		if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
11886 									== 1) {
11887 			kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11888 			write_out_mddb = 1;
11889 			break;
11890 		}
11891 
11892 		kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11893 
11894 		/* If lb ok, check locator names */
11895 		mnlnp_od = (mddb_mnln_t *)
11896 		    kmem_zalloc(dbtob(MDDB_MNLNCNT), KM_SLEEP);
11897 
11898 		/* read in on-disk locator names */
11899 		err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
11900 		    lbp->lb_lnblkcnt, li);
11901 
11902 		/* If err, try next mddb */
11903 		if (err) {
11904 			kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT));
11905 			continue;
11906 		}
11907 
11908 		/* Are incore locator names same as ondisk? */
11909 		if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
11910 									== 1) {
11911 			kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11912 			write_out_mddb = 1;
11913 			break;
11914 		}
11915 
11916 		kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11917 
11918 		/*
11919 		 * Check records in mddb.
11920 		 * If a read error is encountered, set the error flag and
11921 		 * continue to the next mddb.  Otherwise, if incore data is
11922 		 * different from ondisk, then set the flag to write out
11923 		 * the mddb and break out.
11924 		 */
11925 		checkcopy_err = checkcopy(s, li);
11926 		if (checkcopy_err == MDDB_F_EREAD) {
11927 			lp->l_flags |= MDDB_F_EREAD;
11928 			mddb_err = 1;
11929 			continue;
11930 		} else if (checkcopy_err == 1) {
11931 			write_out_mddb = 1;
11932 			break;
11933 		}
11934 		/*
11935 		 * Have found last active mddb and the data is the same as
11936 		 * incore - break out of loop
11937 		 */
11938 		write_out_mddb = 0;
11939 		break;
11940 	}
11941 
11942 	/*
11943 	 * If ondisk and incore versions of the mddb don't match, then
11944 	 * write out this node's incore version to disk.
11945 	 * Or, if unable to read a copy of the mddb, attempt to write
11946 	 * out a new one.
11947 	 */
11948 write_out_mddb:
11949 	if (write_out_mddb) {
11950 		/* Recompute free blocks based on incore information */
11951 		computefreeblks(s); /* set up free block bits */
11952 
11953 		/*
11954 		 * Write directory entries and record blocks.
11955 		 * Use flag MDDB_WRITECOPY_SYNC so that writecopy
11956 		 * routine won't write out change log records.
11957 		 */
11958 		for (li = 0; li < lbp->lb_loccnt; li++) {
11959 			lp = &lbp->lb_locators[li];
11960 			/* Don't write to inactive or deleted mddbs */
11961 			if (! (lp->l_flags & MDDB_F_ACTIVE))
11962 				continue;
11963 			if (lp->l_flags & MDDB_F_DELETED)
11964 				continue;
11965 			if (s->s_mbiarray[li] == NULL)
11966 				continue;
11967 			/* If encounter a write error, save it for later */
11968 			if (writecopy(s, li, MDDB_WRITECOPY_SYNC)) {
11969 				lp->l_flags |= MDDB_F_EWRITE;
11970 				mddb_err = 1;
11971 			}
11972 		}
11973 
11974 		/*
11975 		 * Write out locator blocks to all replicas.
11976 		 * push_lb will set MDDB_F_EWRITE on replicas that fail.
11977 		 */
11978 		if (push_lb(s))
11979 			mddb_err = 1;
11980 		(void) upd_med(s, "mddb_check_write_ioctl(0)");
11981 
11982 		/* Write out locator names to all replicas */
11983 		lnp = s->s_lnp;
11984 		uniqtime32(&lnp->ln_timestamp);
11985 		lnp->ln_revision = MDDB_REV_MNLN;
11986 		crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
11987 
11988 		/* writeall sets MDDB_F_EWRITE if writes fails to replica */
11989 		if (writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
11990 		    lbp->lb_lnblkcnt, 0))
11991 			mddb_err = 1;
11992 
11993 		/*
11994 		 * The writes to the replicas above would have set
11995 		 * the MDDB_F_EWRITE flags if any write error was
11996 		 * encountered.
11997 		 * If < 50% of the mddbs are available, panic.
11998 		 */
11999 		lc = alc = 0;
12000 		for (li = 0; li < lbp->lb_loccnt; li++) {
12001 			lp = &lbp->lb_locators[li];
12002 			if (lp->l_flags & MDDB_F_DELETED)
12003 				continue;
12004 			lc++;
12005 			/*
12006 			 * If mddb:
12007 			 *	- is not active (previously had an error)
12008 			 *	- had an error reading the master blocks  or
12009 			 *	- had an error in writing to the mddb
12010 			 * then don't count this mddb in the active count.
12011 			 */
12012 			if (! (lp->l_flags & MDDB_F_ACTIVE) ||
12013 			    (lp->l_flags & MDDB_F_EMASTER) ||
12014 			    (lp->l_flags & MDDB_F_EWRITE))
12015 				continue;
12016 			alc++;
12017 		}
12018 		if (alc < ((lc + 1) / 2)) {
12019 			cmn_err(CE_PANIC,
12020 			    "md: Panic due to lack of DiskSuite state\n"
12021 			    " database replicas. Fewer than 50%% of "
12022 			    "the total were available,\n so panic to "
12023 			    "ensure data integrity.");
12024 		}
12025 	}
12026 
12027 	/*
12028 	 * If encountered an error during checking or writing of
12029 	 * mddbs, call selectreplicas so that replica error can
12030 	 * be properly handled. This will involve another attempt
12031 	 * to write the mddb out to any mddb marked MDDB_F_EWRITE.
12032 	 * If mddb still fails, it will have the MDDB_F_ACTIVE bit
12033 	 * turned off. Set the MDDB_SCANALLSYNC flag so that
12034 	 * selectreplicas doesn't overwrite the change log entries.
12035 	 *
12036 	 * Set the PARSE_LOCBLK flag in the mddb_set structure to show
12037 	 * that the locator block has been changed.
12038 	 */
12039 	if (mddb_err) {
12040 		(void) selectreplicas(s, MDDB_SCANALLSYNC);
12041 		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
12042 	}
12043 
12044 write_out_end:
12045 	mddb_setexit(s);
12046 	return (rval);
12047 }
12048 
12049 /*
12050  * Set/reset/get set flags in set structure.
12051  * Used during reconfig cycle
12052  * Only supported for MN disksets.
12053  *
12054  * Return values:
12055  *	0 - success
12056  *	non-zero - failure
12057  */
12058 int
12059 mddb_setflags_ioctl(mddb_setflags_config_t *info)
12060 {
12061 	set_t			setno = info->sf_setno;
12062 
12063 	/* Verify that setno is in valid range */
12064 	if (setno >= md_nsets)
12065 		return (EINVAL);
12066 
12067 	/*
12068 	 * When setting the flags, the set may not
12069 	 * be snarfed yet. So, don't check for SNARFED or MNset
12070 	 * and don't call mddb_setenter.
12071 	 * In order to discourage bad ioctl calls,
12072 	 * verify that magic field in structure is set correctly.
12073 	 */
12074 	if (info->sf_magic != MDDB_SETFLAGS_MAGIC)
12075 		return (EINVAL);
12076 
12077 	switch (info->sf_flags) {
12078 	case MDDB_NM_SET:
12079 		if (info->sf_setflags & MD_SET_MN_NEWMAS_RC)
12080 			md_set_setstatus(setno, MD_SET_MN_NEWMAS_RC);
12081 		if (info->sf_setflags & MD_SET_MN_START_RC)
12082 			md_set_setstatus(setno, MD_SET_MN_START_RC);
12083 		if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC)
12084 			md_set_setstatus(setno, MD_SET_MN_MIR_STATE_RC);
12085 		break;
12086 
12087 	case MDDB_NM_RESET:
12088 		if (info->sf_setflags & MD_SET_MN_NEWMAS_RC)
12089 			md_clr_setstatus(setno, MD_SET_MN_NEWMAS_RC);
12090 		if (info->sf_setflags & MD_SET_MN_START_RC)
12091 			md_clr_setstatus(setno, MD_SET_MN_START_RC);
12092 		if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC)
12093 			md_clr_setstatus(setno, MD_SET_MN_MIR_STATE_RC);
12094 		break;
12095 
12096 	case MDDB_NM_GET:
12097 		info->sf_setflags = md_get_setstatus(setno) &
12098 		    (MD_SET_MN_NEWMAS_RC|MD_SET_MN_START_RC|
12099 		    MD_SET_MN_MIR_STATE_RC);
12100 		break;
12101 	}
12102 
12103 	return (0);
12104 }
12105 
12106 /*
12107  * md_update_minor
12108  *
12109  * This function updates the minor in the namespace entry for an
12110  * underlying metadevice.  The function is called in mod_imp_set
12111  * where mod is sp, stripe, mirror and raid.
12112  *
12113  */
12114 int
12115 md_update_minor(
12116 	set_t	setno,
12117 	side_t	side,
12118 	mdkey_t	key
12119 )
12120 {
12121 	struct nm_next_hdr	*nh;
12122 	struct nm_name		*n;
12123 	char			*shn;
12124 	int			retval = 1;
12125 
12126 	/*
12127 	 * Load the devid name space if it exists
12128 	 */
12129 	(void) md_load_namespace(setno, NULL, NM_DEVID);
12130 	if (! md_load_namespace(setno, NULL, 0L)) {
12131 		/*
12132 		 * Unload the devid namespace
12133 		 */
12134 		(void) md_unload_namespace(setno, NM_DEVID);
12135 		return (0);
12136 	}
12137 
12138 	rw_enter(&nm_lock.lock, RW_READER);
12139 
12140 	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
12141 		retval = 0;
12142 		goto out;
12143 	}
12144 
12145 	/*
12146 	 * Look up the key
12147 	 */
12148 	if ((n = lookup_entry(nh, setno, side, key, NODEV64, 0L)) != NULL) {
12149 		/*
12150 		 * Find the entry, update its n_minor if metadevice
12151 		 */
12152 		if ((shn = (char *)getshared_name(setno, n->n_drv_key, 0L))
12153 		    == NULL) {
12154 			retval = 0;
12155 			goto out;
12156 		}
12157 
12158 		if (strcmp(shn, "md") == 0) {
12159 			n->n_minor = MD_MKMIN(setno, MD_MIN2UNIT(n->n_minor));
12160 		}
12161 	}
12162 
12163 out:
12164 	rw_exit(&nm_lock.lock);
12165 	return (retval);
12166 }
12167 
12168 /*
12169  * md_update_top_device_minor
12170  *
12171  * This function updates the minor in the namespace entry for a top
12172  * level metadevice.  The function is called in mod_imp_set where
12173  * mod is sp, stripe, mirror and raid.
12174  *
12175  */
12176 int
12177 md_update_top_device_minor(
12178 	set_t	setno,
12179 	side_t	side,
12180 	md_dev64_t dev
12181 )
12182 {
12183 	struct nm_next_hdr	*nh;
12184 	struct nm_name		*n;
12185 	char			*shn;
12186 	int			retval = 1;
12187 
12188 	/*
12189 	 * Load the devid name space if it exists
12190 	 */
12191 	(void) md_load_namespace(setno, NULL, NM_DEVID);
12192 	if (! md_load_namespace(setno, NULL, 0L)) {
12193 		/*
12194 		 * Unload the devid namespace
12195 		 */
12196 		(void) md_unload_namespace(setno, NM_DEVID);
12197 		return (0);
12198 	}
12199 
12200 	rw_enter(&nm_lock.lock, RW_READER);
12201 
12202 	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
12203 		retval = 0;
12204 		goto out;
12205 	}
12206 
12207 	/*
12208 	 * Look up the key
12209 	 */
12210 	if ((n = lookup_entry(nh, setno, side, MD_KEYWILD, dev, 0L)) != NULL) {
12211 		/*
12212 		 * Find the entry, update its n_minor if metadevice
12213 		 */
12214 		if ((shn = (char *)getshared_name(setno, n->n_drv_key, 0L))
12215 		    == NULL) {
12216 			retval = 0;
12217 			goto out;
12218 		}
12219 
12220 		if (strcmp(shn, "md") == 0) {
12221 			n->n_minor = MD_MKMIN(setno, MD_MIN2UNIT(n->n_minor));
12222 		}
12223 	}
12224 
12225 out:
12226 	rw_exit(&nm_lock.lock);
12227 	return (retval);
12228 }
12229 
12230 static void
12231 md_imp_nm(
12232 	mddb_set_t	*s
12233 )
12234 {
12235 	mddb_db_t		*dbp;
12236 	mddb_de_ic_t		*dep;
12237 	struct nm_rec_hdr	*hdr;
12238 	struct nm_header	*hhdr;
12239 	set_t			setno = s->s_setno;
12240 
12241 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
12242 		for (dep = dbp->db_firstentry; dep != NULL;
12243 		    dep = dep->de_next) {
12244 			switch (dep->de_type1) {
12245 
12246 			case MDDB_NM_HDR:
12247 			case MDDB_DID_NM_HDR:
12248 
12249 				hhdr = (struct nm_header *)
12250 				    dep->de_rb_userdata;
12251 
12252 				hdr = &hhdr->h_names;
12253 				if (hdr->r_next_recid > 0) {
12254 					hdr->r_next_recid = MAKERECID(setno,
12255 					    DBID(hdr->r_next_recid));
12256 				}
12257 
12258 				hdr = &hhdr->h_shared;
12259 				if (hdr->r_next_recid > 0) {
12260 					hdr->r_next_recid = MAKERECID(setno,
12261 					    DBID(hdr->r_next_recid));
12262 				}
12263 				break;
12264 
12265 			case MDDB_NM:
12266 			case MDDB_DID_NM:
12267 			case MDDB_SHR_NM:
12268 			case MDDB_DID_SHR_NM:
12269 
12270 				hdr = (struct nm_rec_hdr *)
12271 				    dep->de_rb_userdata;
12272 
12273 				if (hdr->r_next_recid > 0) {
12274 					hdr->r_next_recid = MAKERECID
12275 					    (setno, DBID(hdr->r_next_recid));
12276 				}
12277 				break;
12278 
12279 			default:
12280 				break;
12281 			}
12282 		}
12283 	}
12284 }
12285 
12286 static int
12287 update_db_rec(
12288 	mddb_set_t	*s
12289 )
12290 {
12291 	mddb_db_t	*dbp;
12292 	mddb_de_ic_t	*dep;
12293 	mddb_recid_t	ids[2];
12294 
12295 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
12296 		for (dep = dbp->db_firstentry; dep != NULL;
12297 		    dep = dep->de_next) {
12298 			if (! (dep->de_flags & MDDB_F_OPT)) {
12299 				ids[0] = MAKERECID(s->s_setno, dep->de_recid);
12300 				ids[1] = 0;
12301 				if (mddb_commitrecs(ids)) {
12302 					return (MDDB_E_NORECORD);
12303 				}
12304 			}
12305 		}
12306 	}
12307 	return (0);
12308 }
12309 
12310 static int
12311 update_mb(
12312 	mddb_set_t	*s
12313 )
12314 {
12315 	mddb_ri_t	*rip;
12316 	int	err = 0;
12317 
12318 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
12319 		if (rip->ri_flags & MDDB_F_EMASTER)
12320 			/* disk is powered off or not there */
12321 			continue;
12322 
12323 		if (md_get_setstatus(s->s_setno) &
12324 			MD_SET_REPLICATED_IMPORT) {
12325 			/*
12326 			 * It is a replicated set
12327 			 */
12328 			if (rip->ri_devid == (ddi_devid_t)NULL) {
12329 				return (-1);
12330 			}
12331 			err = update_mb_devid(s, rip, rip->ri_devid);
12332 		} else {
12333 			/*
12334 			 * It is a non-replicated set
12335 			 * and there is no need to update
12336 			 * devid
12337 			 */
12338 			err = update_mb_devid(s, rip, NULL);
12339 		}
12340 
12341 		if (err)
12342 			return (err);
12343 	}
12344 
12345 	return (0);
12346 }
12347 
12348 static int
12349 update_setname(
12350 	set_t	setno
12351 )
12352 {
12353 	struct nm_next_hdr	*nh;
12354 	struct nm_shared_name	*shn, *new_shn;
12355 	char			*prefix = "/dev/md/";
12356 	char			*shrname;
12357 	int			len;
12358 	mdkey_t			o_key;
12359 	uint32_t		o_count, o_data;
12360 	mddb_recid_t		recid, ids[3];
12361 	int			err = 0;
12362 	mddb_set_t		*dbp;
12363 
12364 	/* Import setname */
12365 	dbp = (mddb_set_t *)md_set[setno].s_db;
12366 	len = strlen(prefix) + strlen(dbp->s_setname) + strlen("/dsk/") + 1;
12367 	shrname = kmem_zalloc(len, KM_SLEEP);
12368 	(void) sprintf(shrname, "%s%s%s", prefix, dbp->s_setname, "/dsk/");
12369 
12370 	rw_enter(&nm_lock.lock, RW_WRITER);
12371 	if ((nh = get_first_record(setno, 0, NM_SHARED)) == NULL) {
12372 		/*
12373 		 * No namespace is okay
12374 		 */
12375 		err = 0;
12376 		goto out;
12377 	}
12378 
12379 	if ((shn = (struct nm_shared_name *)lookup_shared_entry(nh,
12380 	    0, prefix, NULL, NM_SHARED | NM_IMP_SHARED)) == NULL) {
12381 		/*
12382 		 * No metadevice is okay
12383 		 */
12384 		err = 0;
12385 		goto out;
12386 	}
12387 
12388 	/*
12389 	 * We have it, go ahead and update the namespace.
12390 	 */
12391 	o_key = shn->sn_key;
12392 	o_count = shn->sn_count;
12393 	o_data = shn->sn_data;
12394 
12395 	if (remove_shared_entry(nh, o_key, NULL, 0L | NM_IMP_SHARED |
12396 	    NM_NOCOMMIT)) {
12397 		err = MDDB_E_NORECORD;
12398 		goto out;
12399 	}
12400 	if ((new_shn = (struct nm_shared_name *)alloc_entry(
12401 	    nh, md_set[setno].s_nmid, len, NM_SHARED |
12402 	    NM_NOCOMMIT, &recid)) == NULL) {
12403 		err = MDDB_E_NORECORD;
12404 		goto out;
12405 	}
12406 
12407 	new_shn->sn_key = o_key;
12408 	new_shn->sn_count = o_count;
12409 	new_shn->sn_data = o_data;
12410 	new_shn->sn_namlen = (ushort_t)len;
12411 	(void) strcpy(new_shn->sn_name, shrname);
12412 
12413 	ids[0] = recid;
12414 	ids[1] = md_set[setno].s_nmid;
12415 	ids[2] = 0;
12416 	err = mddb_commitrecs(ids);
12417 
12418 out:
12419 	if (shrname)
12420 		kmem_free(shrname, len);
12421 	rw_exit(&nm_lock.lock);
12422 	return (err);
12423 }
12424 
12425 /*
12426  * Returns 0 on success.
12427  * Returns -1 on failure with ep filled in.
12428  */
12429 static int
12430 md_imp_db(
12431 	set_t		setno,
12432 	int		stale_flag,
12433 	md_error_t	*ep
12434 )
12435 {
12436 	mddb_set_t	*s;
12437 	int		err = 0;
12438 	mddb_dt_t	*dtp;
12439 	mddb_lb_t	*lbp;
12440 	int		i;
12441 	int		loccnt;
12442 
12443 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
12444 		return (mddbstatus2error(ep, err, NODEV32, setno));
12445 	}
12446 
12447 	/* Update dt */
12448 	if ((dtp = (mddb_dt_t *)md_set[setno].s_dtp) != NULL) {
12449 		crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);
12450 	}
12451 
12452 	if ((err = dt_write(s)) != 0) {
12453 		err = mdsyserror(ep, err);
12454 		mddb_setexit(s);
12455 		return (err);
12456 	}
12457 
12458 	/*
12459 	 * Update lb, no need to update the mediator because
12460 	 * the diskset will only exist on the importing node
12461 	 * and as such a mediator adds no value.
12462 	 */
12463 
12464 	/* Update lb */
12465 	if (stale_flag & MD_IMP_STALE_SET) {
12466 		lbp = s->s_lbp;
12467 		loccnt = lbp->lb_loccnt;
12468 		for (i = 0; i < loccnt; i++) {
12469 			mddb_locator_t	*lp = &lbp->lb_locators[i];
12470 			md_dev64_t	ndev = md_expldev(lp->l_dev);
12471 			ddi_devid_t	devid_ptr;
12472 
12473 			devid_ptr = s->s_did_icp->did_ic_devid[i];
12474 			if (devid_ptr == NULL) {
12475 				/*
12476 				 * Already deleted, go to next one.
12477 				 */
12478 				continue;
12479 			}
12480 			if (mddb_devid_validate((ddi_devid_t)devid_ptr, &ndev,
12481 			    NULL)) {
12482 				/* disk unavailable, mark deleted */
12483 				lp->l_flags = MDDB_F_DELETED;
12484 				/* then remove the device id from the list */
12485 				free_mbipp(&s->s_mbiarray[i]);
12486 				s->s_mbiarray[i] = 0;
12487 				(void) mddb_devid_delete(s, i);
12488 			}
12489 		}
12490 		md_clr_setstatus(setno, MD_SET_STALE);
12491 	}
12492 
12493 	if ((err = writelocall(s)) != 0) {
12494 		err = mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno);
12495 		mddb_setexit(s);
12496 		return (err);
12497 	}
12498 
12499 	mddb_setexit(s);
12500 
12501 	/* Update db records */
12502 	if ((err = update_db_rec(s)) != 0) {
12503 		return (mddbstatus2error(ep, err, NODEV32, setno));
12504 	}
12505 
12506 	/* Update setname embedded in the namespace */
12507 	if ((err = update_setname(setno)) != 0)
12508 		return (mddbstatus2error(ep, err, NODEV32, setno));
12509 
12510 	return (err);
12511 }
12512 
12513 static void
12514 md_dr_add(
12515 	md_set_record	*sr,
12516 	md_drive_record	*dr
12517 )
12518 {
12519 	md_drive_record	*drv;
12520 
12521 	if (sr->sr_driverec == 0) {
12522 		sr->sr_driverec = dr->dr_selfid;
12523 		return;
12524 	}
12525 
12526 	for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec);
12527 	    drv->dr_nextrec != 0;
12528 	    drv = (md_drive_record *)mddb_getrecaddr(drv->dr_nextrec))
12529 		;
12530 	drv->dr_nextrec = dr->dr_selfid;
12531 }
12532 
12533 static void
12534 md_setup_recids(
12535 	md_set_record	*sr,
12536 	mddb_recid_t	**ids,
12537 	size_t		size
12538 )
12539 {
12540 	md_drive_record	*drv;
12541 	int		cnt;
12542 	mddb_recid_t	*recids;
12543 
12544 	recids = (mddb_recid_t *)kmem_zalloc(sizeof (mddb_recid_t)
12545 	    * size, KM_SLEEP);
12546 	recids[0] = sr->sr_selfid;
12547 	cnt = 1;
12548 
12549 	for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec);
12550 	    /* CSTYLED */
12551 	    drv != NULL;) {
12552 		recids[cnt++] = drv->dr_selfid;
12553 		if (drv->dr_nextrec != 0)
12554 			drv = (md_drive_record *)mddb_getrecaddr
12555 			    (drv->dr_nextrec);
12556 		else
12557 			drv = NULL;
12558 	}
12559 	recids[cnt] = 0;
12560 	*ids = &recids[0];
12561 }
12562 
12563 /*
12564  * The purpose of this function is to replace the old_devid with the
12565  * new_devid in the given namespace.   This is used for importing
12566  * remotely replicated drives.
12567  */
12568 int
12569 md_update_namespace_rr_did(
12570 	mddb_config_t	*cp
12571 )
12572 {
12573 	set_t			setno = cp->c_setno;
12574 	struct nm_next_hdr	*nh;
12575 	mdkey_t			key = MD_KEYWILD;
12576 	side_t			side = MD_SIDEWILD;
12577 	mddb_recid_t		recids[3];
12578 	struct did_min_name	*n;
12579 	struct nm_next_hdr	*did_shr_nh;
12580 	struct did_shr_name	*shr_n;
12581 	mdkey_t			ent_did_key;
12582 	uint32_t		ent_did_count;
12583 	uint32_t		ent_did_data;
12584 	size_t			ent_size, size;
12585 	ddi_devid_t		devid = NULL;
12586 	struct did_shr_name	*shn;
12587 	size_t			offset;
12588 	struct nm_next_hdr	*this_did_shr_nh;
12589 	void			*old_devid, *new_devid;
12590 
12591 	if (!(md_get_setstatus(setno) & MD_SET_NM_LOADED))
12592 		return (EIO);
12593 
12594 	old_devid = (void *)(uintptr_t)cp->c_locator.l_old_devid;
12595 	new_devid = (void *)(uintptr_t)cp->c_locator.l_devid;
12596 
12597 	/*
12598 	 * It is okay if we dont have any configuration
12599 	 */
12600 	offset = (sizeof (struct devid_shr_rec) - sizeof (struct did_shr_name));
12601 	if ((nh = get_first_record(setno, 0, NM_DEVID | NM_NOTSHARED))
12602 	    == NULL) {
12603 		return (0);
12604 	}
12605 	while ((key = md_getnextkey(setno, side, key, NULL)) != MD_KEYWILD) {
12606 		/* check out every entry in the namespace */
12607 		if ((n = (struct did_min_name *)lookup_entry(nh, setno,
12608 		    side, key, NODEV64, NM_DEVID)) == NULL) {
12609 			continue;
12610 		} else {
12611 			did_shr_nh = get_first_record(setno, 0, NM_DEVID |
12612 			    NM_SHARED);
12613 			if (did_shr_nh == NULL) {
12614 				return (ENOENT);
12615 			}
12616 			this_did_shr_nh = did_shr_nh->nmn_nextp;
12617 			shr_n = (struct did_shr_name *)lookup_shared_entry(
12618 			    did_shr_nh, n->min_devid_key, (char *)0,
12619 			    &recids[0], NM_DEVID);
12620 			if (shr_n == NULL) {
12621 				return (ENOENT);
12622 			}
12623 			rw_enter(&nm_lock.lock, RW_WRITER);
12624 			devid = (ddi_devid_t)shr_n->did_devid;
12625 			/* find this devid in the incore replica  */
12626 			if (ddi_devid_compare(devid, old_devid) == 0) {
12627 				/*
12628 				 * found the corresponding entry
12629 				 * update with new devid
12630 				 */
12631 				/* first remove old devid info */
12632 				ent_did_key = shr_n ->did_key;
12633 				ent_did_count = shr_n->did_count;
12634 				ent_did_data = shr_n->did_data;
12635 				ent_size = DID_SHR_NAMSIZ(shr_n);
12636 				size = ((struct nm_rec_hdr *)
12637 				    this_did_shr_nh->nmn_record)->
12638 				    r_used_size - offset - ent_size;
12639 				if (size == 0) {
12640 					(void) bzero(shr_n, ent_size);
12641 				} else {
12642 					(void) ovbcopy((caddr_t)shr_n +
12643 					    ent_size, shr_n, size);
12644 					(void) bzero((caddr_t)shr_n +
12645 					    size, ent_size);
12646 				}
12647 				((struct nm_rec_hdr *)this_did_shr_nh->
12648 				    nmn_record)->r_used_size -=
12649 				    ent_size;
12650 				/* add in new devid info */
12651 				if ((shn = (struct did_shr_name *)
12652 				    alloc_entry(did_shr_nh,
12653 				    md_set[setno].s_did_nmid,
12654 				    cp->c_locator.l_devid_sz,
12655 				    NM_DEVID | NM_SHARED | NM_NOCOMMIT,
12656 				    &recids[0])) == NULL) {
12657 						rw_exit(&nm_lock.lock);
12658 						return (ENOMEM);
12659 					}
12660 					shn->did_key = ent_did_key;
12661 					shn->did_count = ent_did_count;
12662 					ent_did_data |= NM_DEVID_VALID;
12663 					shn->did_data = ent_did_data;
12664 					shn->did_size = ddi_devid_sizeof(
12665 					    new_devid);
12666 					bcopy((void *)new_devid, (void *)
12667 					    shn->did_devid, shn->did_size);
12668 					recids[1] = md_set[setno].s_nmid;
12669 					recids[2] = 0;
12670 					mddb_commitrecs_wrapper(recids);
12671 			}
12672 			rw_exit(&nm_lock.lock);
12673 		}
12674 	}
12675 
12676 	return (0);
12677 }
12678 
12679 /*
12680  * namespace is loaded before this is called.
12681  * This function is a wrapper for md_update_namespace_rr_did.
12682  *
12683  * md_update_namespace_rr_did may be called twice if attempting to
12684  * resolve a replicated device id during the take of a diskset - once
12685  * for the diskset namespace and a second time for the local namespace.
12686  * The local namespace would need to be updated when a drive has been
12687  * found during a take of the diskset that hadn't been resolved during
12688  * the import (aka partial replicated import).
12689  *
12690  * If being called during the import of the diskset (IMPORT flag set)
12691  * md_update_namespace_rr_did will only be called once with the disket
12692  * namespace.
12693  */
12694 int
12695 md_update_nm_rr_did_ioctl(
12696 	mddb_config_t	*cp
12697 )
12698 {
12699 	int	rval = 0;
12700 
12701 	/* If update of diskset namespace fails, stop and return failure */
12702 	if ((rval = md_update_namespace_rr_did(cp)) != 0)
12703 		return (rval);
12704 
12705 	if (cp->c_flags & MDDB_C_IMPORT)
12706 		return (0);
12707 
12708 	/* If update of local namespace fails, return failure */
12709 	cp->c_setno = MD_LOCAL_SET;
12710 	rval = md_update_namespace_rr_did(cp);
12711 	return (rval);
12712 }
12713 
12714 /*ARGSUSED*/
12715 int
12716 md_imp_snarf_set(
12717 	mddb_config_t	*cp
12718 )
12719 {
12720 	set_t		setno;
12721 	int		stale_flag;
12722 	mddb_set_t	*s;
12723 	int		i, err = 0;
12724 	md_ops_t	*ops;
12725 	md_error_t	*ep = &cp->c_mde;
12726 
12727 	setno = cp->c_setno;
12728 	stale_flag = cp->c_flags;
12729 
12730 	mdclrerror(ep);
12731 	if (setno >= md_nsets) {
12732 		return (mdsyserror(ep, EINVAL));
12733 	}
12734 
12735 	md_haltsnarf_enter(setno);
12736 	if (md_get_setstatus(setno) & MD_SET_IMPORT) {
12737 		goto out;
12738 	}
12739 
12740 	/* Set the bit first otherwise load_old_replicas can fail */
12741 	md_set_setstatus(setno, MD_SET_IMPORT);
12742 
12743 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
12744 		err = mddbstatus2error(ep, err, NODEV32, setno);
12745 		goto out;
12746 	}
12747 
12748 	/*
12749 	 * Upon completion of load_old_replicas, the old setno is
12750 	 * restored from the disk so we need to reset
12751 	 */
12752 	s->s_lbp->lb_setno = setno;
12753 
12754 	/*
12755 	 * Fixup the NM records before loading namespace
12756 	 */
12757 	(void) md_imp_nm(s);
12758 	mddb_setexit(s);
12759 
12760 	/*
12761 	 * Load the devid name space if it exists
12762 	 * and ask each module to fixup unit records
12763 	 */
12764 	if (!md_load_namespace(setno, NULL, NM_DEVID)) {
12765 		err = mdsyserror(ep, ENOENT);
12766 		goto cleanup;
12767 	}
12768 	if (!md_load_namespace(setno, NULL, 0L)) {
12769 		(void) md_unload_namespace(setno, NM_DEVID);
12770 		err = mdsyserror(ep, ENOENT);
12771 		goto cleanup;
12772 	}
12773 
12774 	do {
12775 		i = 0;
12776 		for (ops = md_opslist; ops != NULL; ops = ops->md_next)
12777 			if (ops->md_imp_set != NULL)
12778 				i += ops->md_imp_set(setno);
12779 	} while (i);
12780 
12781 	/*
12782 	 * Fixup
12783 	 *	(1) locator block
12784 	 *	(2) locator name block if necessary
12785 	 *	(3) master block
12786 	 *	(4) directory block
12787 	 * calls appropriate writes to push changes out
12788 	 */
12789 	if ((err = md_imp_db(setno, stale_flag, ep)) != 0) {
12790 		goto cleanup;
12791 	}
12792 
12793 	/*
12794 	 * Don't unload namespace if importing a replicated diskset.
12795 	 * Namespace will be unloaded with an explicit RELEASE_SET ioctl.
12796 	 */
12797 	if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) {
12798 		md_haltsnarf_exit(setno);
12799 		return (err);
12800 	}
12801 
12802 cleanup:
12803 	/*
12804 	 * Halt the set
12805 	 */
12806 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
12807 	(void) md_halt_set(setno, MD_HALT_ALL);
12808 	rw_exit(&md_unit_array_rw.lock);
12809 
12810 	/*
12811 	 * Unload the namespace for the imported set
12812 	 */
12813 	mutex_enter(&mddb_lock);
12814 	mddb_unload_set(setno);
12815 	mutex_exit(&mddb_lock);
12816 
12817 out:
12818 	md_haltsnarf_exit(setno);
12819 	md_clr_setstatus(setno, MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);
12820 	return (err);
12821 }
12822 #endif	/* MDDB_FAKE */
12823