xref: /onnv-gate/usr/src/uts/common/io/lvm/md/md_mddb.c (revision 11130:ce5c27fd996f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/conf.h>
29 #include <sys/time.h>
30 #include <sys/uio.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/systeminfo.h>
34 #include <sys/sysmacros.h>
35 #include <sys/buf.h>
36 #include <sys/kmem.h>
37 #include <sys/file.h>
38 #include <sys/open.h>
39 #include <sys/debug.h>
40 #include <sys/stat.h>
41 #include <sys/lvm/mdvar.h>
42 #include <sys/lvm/md_crc.h>
43 #include <sys/lvm/md_convert.h>
44 #include <sys/types.h>
45 #include <sys/kmem.h>
46 #include <sys/lvm/mdmn_commd.h>
47 #include <sys/cladm.h>
48 
49 mhd_mhiargs_t	defmhiargs = {
50 	1000,
51 	{ 6000, 6000, 30000 }
52 };
53 
54 #define	MDDB
55 
56 #include <sys/lvm/mdvar.h>
57 #include <sys/lvm/mdmed.h>
58 #include <sys/lvm/md_names.h>
59 #include <sys/cred.h>
60 #include <sys/ddi.h>
61 #include <sys/sunddi.h>
62 #include <sys/esunddi.h>
63 
64 #include <sys/sysevent/eventdefs.h>
65 #include <sys/sysevent/svm.h>
66 
67 extern char svm_bootpath[];
68 
69 int			md_maxbootlist = MAXBOOTLIST;
70 static ulong_t		mddb_maxblocks = 0;	/* tune for small records */
71 static int		mddb_maxbufheaders = 50;
72 static uint_t		mddb_maxcopies = MDDB_NLB;
73 
74 /*
75  * If this is set, more detailed messages about DB init will be given, instead
76  * of just the MDE_DB_NODB.
77  */
78 static int		mddb_db_err_detail = 0;
79 
80 /*
81  * This lock is used to single-thread load/unload of all sets
82  */
83 static kmutex_t		mddb_lock;
84 
85 /*
86  * You really do NOT want to change this boolean.
87  * It can be VERY dangerous to do so.  Loss of
88  * data may occur. USE AT YOUR OWN RISK!!!!
89  */
90 static int		mddb_allow_half = 0;
91 /*
92  * For mirrored root allow reboot with only half the replicas available
93  * Flag inserted for Santa Fe project.
94  */
95 int mirrored_root_flag;
96 
/*
 * Character classification helpers.  Both macros evaluate their argument
 * more than once, so do not pass an expression with side effects.
 */
#define	ISWHITE(c)	\
	((c) == ' ' || (c) == '\t' || (c) == '\r' || (c) == '\n')
#define	ISNUM(c)	((c) >= '0' && (c) <= '9')

/* Address of the s_dbmx mutex in the md_set[] entry for setno. */
#define	SETMUTEX(setno)	(&md_set[(setno)].s_dbmx)
102 
103 extern md_krwlock_t	md_unit_array_rw;	/* md.c */
104 extern set_t		md_nsets;		/* md.c */
105 extern int		md_nmedh;		/* md.c */
106 extern md_set_t		md_set[];		/* md.c */
107 extern int		(*mdv_strategy_tstpnt)(buf_t *, int, void*);
108 extern dev_info_t	*md_devinfo;
109 extern int		md_init_debug;
110 extern int		md_status;
111 extern md_ops_t		*md_opslist;
112 extern md_krwlock_t	nm_lock;
113 
114 static int 		update_locatorblock(mddb_set_t *s, md_dev64_t dev,
115 				ddi_devid_t didptr, ddi_devid_t old_didptr);
116 
/*
 * Record checksum helpers, both thin wrappers around rec_crcfunc():
 *	rec_crcgen - generate the crc checksum for a record block; the
 *		     value returned by rec_crcfunc() is discarded.
 *	rec_crcchk - check the crc checksum for a record block and
 *		     yield the value returned by rec_crcfunc().
 */
#define	REC_CRCGEN	0
#define	REC_CRCCHK	1
#define	rec_crcgen(s, dep, rbp)	(void) rec_crcfunc(s, dep, rbp, REC_CRCGEN)
#define	rec_crcchk(s, dep, rbp)	rec_crcfunc(s, dep, rbp, REC_CRCCHK)
128 
129 /*
130  * During upgrade, SVM basically runs with the devt from the target
131  * being upgraded.  Translations are made from the target devt to the
132  * miniroot devt when writing data out to the disk.  This is done by
133  * the following routines:
134  *	wrtblklst
135  *	writeblks
136  *	readblklst
137  *	readblks
138  *	dt_read
139  *
140  * The following routines are used by the routines listed above and
141  * expect a translated (aka miniroot) devt:
142  *	getblks
143  * 	getmasters
144  *
145  * Also, when calling any system routines, such as ddi_lyr_get_devid,
146  * the translated (aka miniroot) devt must be used.
147  *
148  * By the same token, the major number and major name conversion operations
149  * need to use the name_to_major file from the target system instead
150  * of the name_to_major file on the miniroot.  So, calls to
151  * ddi_name_to_major must be replaced with calls to md_targ_name_to_major
152  * when running on an upgrade.  Same is true with calls to
153  * ddi_major_to_name.
154  */
155 
156 
157 #ifndef MDDB_FAKE
158 
159 static int
160 mddb_rwdata(
161 	mddb_set_t	*s,	/* incore db set structure */
162 	int		flag,	/* B_ASYNC, B_FAILFAST or 0 passed in here */
163 	buf_t		*bp
164 )
165 {
166 	int		err = 0;
167 
168 	bp->b_flags = (flag | B_BUSY) & (~B_ASYNC);
169 
170 	mutex_exit(SETMUTEX(s->s_setno));
171 	if (mdv_strategy_tstpnt == NULL ||
172 	    (*mdv_strategy_tstpnt)(bp, 0, NULL) == 0)
173 		(void) bdev_strategy(bp);
174 
175 	if (flag & B_ASYNC) {
176 		mutex_enter(SETMUTEX(s->s_setno));
177 		return (0);
178 	}
179 
180 	err = biowait(bp);
181 	mutex_enter(SETMUTEX(s->s_setno));
182 	return (err);
183 }
184 
185 static void
186 setidentifier(
187 	mddb_set_t	*s,
188 	identifier_t	*ident
189 )
190 {
191 	if (s->s_setno == MD_LOCAL_SET)
192 		(void) strcpy(&ident->serial[0], s->s_ident.serial);
193 	else
194 		ident->createtime = s->s_ident.createtime;
195 }
196 
197 static int
198 cmpidentifier(
199 	mddb_set_t	*s,
200 	identifier_t	*ident
201 )
202 {
203 	if (s->s_setno == MD_LOCAL_SET)
204 		return (strcmp(ident->serial, s->s_ident.serial));
205 	else
206 		return (timercmp(&ident->createtime,
207 		    /*CSTYLED*/
208 		    &s->s_ident.createtime, !=));
209 }
210 
211 static int
212 mddb_devopen(
213 	md_dev64_t	dev
214 )
215 {
216 	dev_t		ddi_dev = md_dev64_to_dev(dev);
217 
218 	if (dev_lopen(&ddi_dev, FREAD|FWRITE, OTYP_LYR, kcred) == 0)
219 		return (0);
220 	return (1);
221 }
222 
223 static void
224 mddb_devclose(
225 	md_dev64_t	dev
226 )
227 {
228 	(void) dev_lclose(md_dev64_to_dev(dev), FREAD|FWRITE, OTYP_LYR, kcred);
229 }
230 
231 /*
232  * stripe_skip_ts
233  *
234  * Returns a list of fields to be skipped in the stripe record structure.
235  * These fields are ms_timestamp in the component structure.
236  * Used to skip these fields when calculating the checksum.
237  */
238 static crc_skip_t *
239 stripe_skip_ts(void *un, uint_t revision)
240 {
241 	struct ms_row32_od	*small_mdr;
242 	struct ms_row		*big_mdr;
243 	uint_t			row, comp, ncomps, compoff;
244 	crc_skip_t		*skip;
245 	crc_skip_t		*skip_prev;
246 	crc_skip_t		skip_start = {0, 0, 0};
247 	ms_unit_t		*big_un;
248 	ms_unit32_od_t		*small_un;
249 	uint_t			rb_off = offsetof(mddb_rb32_t, rb_data[0]);
250 
251 	switch (revision) {
252 	case MDDB_REV_RB:
253 	case MDDB_REV_RBFN:
254 		small_un = (ms_unit32_od_t *)un;
255 		skip_prev = &skip_start;
256 
257 		if (small_un->un_nrows == 0)
258 			return (NULL);
259 		/*
260 		 * walk through all rows to find the total number
261 		 * of components
262 		 */
263 		small_mdr   = &small_un->un_row[0];
264 		ncomps = 0;
265 		for (row = 0; (row < small_un->un_nrows); row++) {
266 			ncomps += small_mdr[row].un_ncomp;
267 		}
268 
269 		/* Now walk through the components */
270 		compoff = small_un->un_ocomp + rb_off;
271 		for (comp = 0; (comp < ncomps); ++comp) {
272 			uint_t	mdcp = compoff +
273 			    (comp * sizeof (ms_comp32_od_t));
274 			skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
275 			    KM_SLEEP);
276 			skip->skip_offset = mdcp +
277 			    offsetof(ms_comp32_od_t, un_mirror.ms_timestamp);
278 			skip->skip_size = sizeof (md_timeval32_t);
279 			skip_prev->skip_next = skip;
280 			skip_prev = skip;
281 		}
282 		break;
283 	case MDDB_REV_RB64:
284 	case MDDB_REV_RB64FN:
285 		big_un = (ms_unit_t *)un;
286 		skip_prev = &skip_start;
287 
288 		if (big_un->un_nrows == 0)
289 			return (NULL);
290 		/*
291 		 * walk through all rows to find the total number
292 		 * of components
293 		 */
294 		big_mdr   = &big_un->un_row[0];
295 		ncomps = 0;
296 		for (row = 0; (row < big_un->un_nrows); row++) {
297 			ncomps += big_mdr[row].un_ncomp;
298 		}
299 
300 		/* Now walk through the components */
301 		compoff = big_un->un_ocomp + rb_off;
302 		for (comp = 0; (comp < ncomps); ++comp) {
303 			uint_t	mdcp = compoff +
304 			    (comp * sizeof (ms_comp_t));
305 			skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
306 			    KM_SLEEP);
307 			skip->skip_offset = mdcp +
308 			    offsetof(ms_comp_t, un_mirror.ms_timestamp);
309 			skip->skip_size = sizeof (md_timeval32_t);
310 			skip_prev->skip_next = skip;
311 			skip_prev = skip;
312 		}
313 		break;
314 	}
315 	/* Return the start of the list of fields to skip */
316 	return (skip_start.skip_next);
317 }
318 
319 /*
320  * mirror_skip_ts
321  *
322  * Returns a list of fields to be skipped in the mirror record structure.
323  * This includes un_last_read and sm_timestamp for each submirror
324  * Used to skip these fields when calculating the checksum.
325  */
326 static crc_skip_t *
327 mirror_skip_ts(uint_t revision)
328 {
329 	int		i;
330 	crc_skip_t	*skip;
331 	crc_skip_t	*skip_prev;
332 	crc_skip_t	skip_start = {0, 0, 0};
333 	uint_t		rb_off = offsetof(mddb_rb32_t, rb_data[0]);
334 
335 	skip_prev = &skip_start;
336 
337 	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
338 	switch (revision) {
339 	case MDDB_REV_RB:
340 	case MDDB_REV_RBFN:
341 		skip->skip_offset = offsetof(mm_unit32_od_t,
342 		    un_last_read) + rb_off;
343 		break;
344 	case MDDB_REV_RB64:
345 	case MDDB_REV_RB64FN:
346 		skip->skip_offset = offsetof(mm_unit_t,
347 		    un_last_read) + rb_off;
348 		break;
349 	}
350 	skip->skip_size = sizeof (int);
351 	skip_prev->skip_next = skip;
352 	skip_prev = skip;
353 
354 	for (i = 0; i < NMIRROR; i++) {
355 		skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
356 		switch (revision) {
357 		case MDDB_REV_RB:
358 		case MDDB_REV_RBFN:
359 			skip->skip_offset = offsetof(mm_unit32_od_t,
360 			    un_sm[i].sm_timestamp) + rb_off;
361 			break;
362 		case MDDB_REV_RB64:
363 		case MDDB_REV_RB64FN:
364 			skip->skip_offset = offsetof(mm_unit_t,
365 			    un_sm[i].sm_timestamp) + rb_off;
366 			break;
367 		}
368 		skip->skip_size = sizeof (md_timeval32_t);
369 		skip_prev->skip_next = skip;
370 		skip_prev = skip;
371 	}
372 	/* Return the start of the list of fields to skip */
373 	return (skip_start.skip_next);
374 }
375 
376 /*
377  * hotspare_skip_ts
378  *
379  * Returns a list of the timestamp fields in the hotspare record structure.
380  * Used to skip these fields when calculating the checksum.
381  */
382 static crc_skip_t *
383 hotspare_skip_ts(uint_t revision)
384 {
385 	crc_skip_t	*skip;
386 	uint_t		rb_off = offsetof(mddb_rb32_t, rb_data[0]);
387 
388 	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
389 	switch (revision) {
390 	case MDDB_REV_RB:
391 	case MDDB_REV_RBFN:
392 		skip->skip_offset = offsetof(hot_spare32_od_t, hs_timestamp) +
393 		    rb_off;
394 		break;
395 	case MDDB_REV_RB64:
396 	case MDDB_REV_RB64FN:
397 		skip->skip_offset = offsetof(hot_spare_t, hs_timestamp) +
398 		    rb_off;
399 		break;
400 	}
401 	skip->skip_size = sizeof (md_timeval32_t);
402 	return (skip);
403 }
404 
405 /*
406  * rec_crcfunc
407  *
408  * Calculate or check the checksum for a record
409  * Calculate the crc if check == 0, Check the crc if check == 1
410  *
411  * Record block may be written by different nodes in a multi-owner diskset
412  * (in case of master change), the function rec_crcchk excludes timestamp
413  * fields in crc computation of record data.
414  * Otherwise, timestamp fields will cause each node to have a different
415  * checksum for same record block causing the exclusive-or of all record block
416  * checksums and data block record sums to be non-zero after new master writes
417  * at least one record block.
418  */
419 static uint_t
420 rec_crcfunc(
421 	mddb_set_t	*s,
422 	mddb_de_ic_t	*dep,
423 	mddb_rb32_t	*rbp,
424 	int		check
425 )
426 {
427 	crc_skip_t	*skip;
428 	crc_skip_t	*skip_tail;
429 	mddb_type_t	type = dep->de_type1;
430 	uint_t		ret;
431 
432 	/*
433 	 * Generate a list of the areas to be skipped when calculating
434 	 * the checksum.
435 	 * First skip rb_checksum, rb_private and rb_userdata.
436 	 */
437 	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
438 	skip->skip_offset = offsetof(mddb_rb32_t, rb_checksum_fiddle);
439 	skip->skip_size = 3 * sizeof (uint_t);
440 	skip_tail = skip;
441 	if (MD_MNSET_SETNO(s->s_setno)) {
442 		/* For a MN set, skip rb_timestamp */
443 		skip_tail = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
444 		    KM_SLEEP);
445 		skip_tail->skip_offset = offsetof(mddb_rb32_t, rb_timestamp);
446 		skip_tail->skip_size = sizeof (md_timeval32_t);
447 		skip->skip_next = skip_tail;
448 
449 		/* Now add a list of timestamps to be skipped */
450 		if (type >= MDDB_FIRST_MODID) {
451 			switch (dep->de_flags) {
452 				case MDDB_F_STRIPE:
453 					skip_tail->skip_next =
454 					    stripe_skip_ts((void *)rbp->rb_data,
455 					    rbp->rb_revision);
456 					break;
457 				case MDDB_F_MIRROR:
458 					skip_tail->skip_next =
459 					    mirror_skip_ts(rbp->rb_revision);
460 					break;
461 				case MDDB_F_HOTSPARE:
462 					skip_tail->skip_next =
463 					    hotspare_skip_ts(rbp->rb_revision);
464 					break;
465 				default:
466 					break;
467 			}
468 		}
469 	}
470 
471 	if (check) {
472 		ret = crcchk(rbp, &rbp->rb_checksum, dep->de_recsize, skip);
473 	} else {
474 		crcgen(rbp, &rbp->rb_checksum, dep->de_recsize, skip);
475 		ret = rbp->rb_checksum;
476 	}
477 	while (skip) {
478 		crc_skip_t	*skip_save = skip;
479 
480 		skip = skip->skip_next;
481 		kmem_free(skip_save, sizeof (crc_skip_t));
482 	}
483 	return (ret);
484 }
485 
486 static mddb_bf_t *
487 allocbuffer(
488 	mddb_set_t	*s,
489 	int		sleepflag
490 )
491 {
492 	mddb_bf_t	*bfp;
493 
494 	while ((bfp = s->s_freebufhead) == NULL) {
495 		if (sleepflag == MDDB_NOSLEEP)
496 			return ((mddb_bf_t *)NULL);
497 		++s->s_bufmisses;
498 #ifdef	DEBUG
499 		if (s->s_bufmisses == 1)
500 			cmn_err(CE_NOTE,
501 			    "md: mddb: set %u sleeping for buffer", s->s_setno);
502 #endif
503 		s->s_bufwakeup = 1;
504 		cv_wait(&s->s_buf_cv, SETMUTEX(s->s_setno));
505 	}
506 	s->s_freebufhead = bfp->bf_next;
507 	bzero((caddr_t)bfp, sizeof (*bfp));
508 	bfp->bf_buf.b_back = bfp->bf_buf.b_forw = &bfp->bf_buf;
509 	bfp->bf_buf.b_flags = B_BUSY;	/* initialize flags */
510 	return (bfp);
511 }
512 
513 static void
514 freebuffer(
515 	mddb_set_t		*s,
516 	mddb_bf_t	*bfp
517 )
518 {
519 	bfp->bf_next = s->s_freebufhead;
520 	s->s_freebufhead = bfp;
521 	if (s->s_bufwakeup) {
522 		cv_broadcast(&s->s_buf_cv);
523 		s->s_bufwakeup = 0;
524 	}
525 }
526 
527 
528 static void
529 blkbusy(
530 	mddb_set_t	*s,
531 	mddb_block_t	blk
532 )
533 {
534 	int		bit, byte;
535 
536 	s->s_freeblkcnt--;
537 	byte = blk / 8;
538 	bit = 1 << (blk & 7);
539 	ASSERT(! (s->s_freebitmap[byte] & bit));
540 	s->s_freebitmap[byte] |= bit;
541 }
542 
543 static void
544 blkfree(
545 	mddb_set_t	*s,
546 	mddb_block_t	blk
547 )
548 {
549 	int		bit, byte;
550 
551 	s->s_freeblkcnt++;
552 	byte = blk / 8;
553 	bit = 1 << (blk & 7);
554 	ASSERT(s->s_freebitmap[byte] & bit);
555 	s->s_freebitmap[byte] &= ~bit;
556 }
557 
558 static int
559 blkcheck(
560 	mddb_set_t	*s,
561 	mddb_block_t	blk
562 )
563 {
564 	int		bit, byte;
565 
566 	byte = blk / 8;
567 	bit = 1 << (blk & 7);
568 	return (s->s_freebitmap[byte] & bit);
569 }
570 
571 /*
572  * not fast but simple
573  */
574 static mddb_block_t
575 getfreeblks(
576 	mddb_set_t	*s,
577 	size_t		count
578 )
579 {
580 	int		i;
581 	size_t		contig;
582 
583 	contig = 0;
584 	for (i = 0; i < s->s_totalblkcnt; i++) {
585 		if (blkcheck(s, i)) {
586 			contig = 0;
587 		} else {
588 			contig++;
589 			if (contig == count) {
590 				contig = i - count + 1;
591 				for (i = (int)contig; i < contig + count; i++)
592 					blkbusy(s, i);
593 				return ((mddb_block_t)contig);
594 			}
595 		}
596 	}
597 	return (0);
598 }
599 
600 static void
601 computefreeblks(
602 	mddb_set_t	*s
603 )
604 {
605 	mddb_db_t	*dbp;
606 	mddb_de_ic_t	*dep;
607 	int		i;
608 	int		minblks;
609 	int		freeblks;
610 	mddb_mb_ic_t	*mbip;
611 	mddb_lb_t	*lbp;
612 	mddb_block_t	maxblk;
613 	mddb_did_db_t	*did_dbp;
614 	int		nblks;
615 
616 	minblks = 0;
617 	lbp = s->s_lbp;
618 	maxblk = 0;
619 
620 	/*
621 	 * Determine the max number of blocks.
622 	 */
623 	nblks = (lbp->lb_flags & MDDB_MNSET) ? MDDB_MN_MAXBLKS : MDDB_MAXBLKS;
624 	/*
625 	 * go through and find highest logical block
626 	 */
627 	for (dbp = s->s_dbp; dbp != 0;	dbp = dbp->db_next) {
628 		if (dbp->db_blknum > maxblk)
629 			maxblk = dbp->db_blknum;
630 		for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
631 			for (i = 0; i < dep->de_blkcount; i++)
632 				if (dep->de_blks[i] > maxblk)
633 					maxblk = dep->de_blks[i];
634 	}
635 
636 	for (i = 0; i < lbp->lb_loccnt; i++) {
637 		mddb_locator_t	*lp = &lbp->lb_locators[i];
638 
639 		if ((lp->l_flags & MDDB_F_DELETED) ||
640 		    (lp->l_flags & MDDB_F_EMASTER))
641 			continue;
642 
643 		freeblks = 0;
644 		for (mbip = s->s_mbiarray[i]; mbip != NULL;
645 		    mbip = mbip->mbi_next) {
646 			freeblks += mbip->mbi_mddb_mb.mb_blkcnt;
647 		}
648 		if (freeblks == 0)	/* this happen when there is no */
649 			continue;	/*	master blk		*/
650 
651 		if (freeblks <= maxblk) {
652 			lp->l_flags |= MDDB_F_TOOSMALL;
653 			lp->l_flags &= ~MDDB_F_ACTIVE;
654 		}
655 
656 		if (freeblks < minblks || minblks == 0)
657 			minblks = freeblks;
658 	}
659 	/*
660 	 * set up reasonable freespace if no
661 	 * data bases exist
662 	 */
663 	if (minblks == 0)
664 		minblks = 100;
665 	if (minblks > nblks)
666 		minblks = nblks;
667 	s->s_freeblkcnt = minblks;
668 	s->s_totalblkcnt = minblks;
669 	if (! s->s_freebitmapsize) {
670 		s->s_freebitmapsize = nblks / 8;
671 		s->s_freebitmap = (uchar_t *)kmem_zalloc(s->s_freebitmapsize,
672 		    KM_SLEEP);
673 	}
674 	bzero((caddr_t)s->s_freebitmap, s->s_freebitmapsize);
675 
676 	/* locator block sectors */
677 	for (i = 0; i < s->s_lbp->lb_blkcnt; i++)
678 		blkbusy(s, i);
679 
680 	/* locator name sectors */
681 	for (i = 0; i < s->s_lbp->lb_lnblkcnt; i++)
682 		blkbusy(s, (s->s_lbp->lb_lnfirstblk + i));
683 
684 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
685 		/* locator block device id information */
686 		for (i = 0; i < s->s_lbp->lb_didblkcnt; i++)
687 			blkbusy(s, (s->s_lbp->lb_didfirstblk + i));
688 
689 		/* disk blocks containing actual device ids */
690 		did_dbp = s->s_did_icp->did_ic_dbp;
691 		while (did_dbp) {
692 			for (i = 0; i < did_dbp->db_blkcnt; i++) {
693 				blkbusy(s, did_dbp->db_firstblk + i);
694 			}
695 			did_dbp = did_dbp->db_next;
696 		}
697 	}
698 
699 	/* Only use data tags if not a MN set */
700 	if (!(lbp->lb_flags & MDDB_MNSET)) {
701 		/* Found a bad tag, do NOT mark the data tag blks busy here */
702 		if (! (md_get_setstatus(s->s_setno) & MD_SET_BADTAG)) {
703 			for (i = 0; i < s->s_lbp->lb_dtblkcnt; i++)
704 				blkbusy(s, (s->s_lbp->lb_dtfirstblk + i));
705 		}
706 	}
707 
708 	/* directory block/entry sectors */
709 	for (dbp = s->s_dbp; dbp != 0;	dbp = dbp->db_next) {
710 		blkbusy(s, dbp->db_blknum);
711 		for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
712 			for (i = 0; i < dep->de_blkcount; i++)
713 				blkbusy(s, dep->de_blks[i]);
714 	}
715 }
716 
717 /*
718  * Add free space to the device id incore free list.
719  * Called:
720  *    - During startup when all devid blocks are temporarily placed on the
721  *       free list
722  *    - After a devid has been deleted via the metadb command.
723  *    - When mddb_devid_free_get adds unused space from a disk block
724  *       to free list
725  */
726 static int
727 mddb_devid_free_add(
728 	mddb_set_t *s,
729 	uint_t firstblk,
730 	uint_t offset,
731 	uint_t length
732 )
733 {
734 	mddb_did_free_t	*did_freep;
735 
736 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
737 		return (0);
738 	}
739 
740 	did_freep = (mddb_did_free_t *)kmem_zalloc(sizeof (mddb_did_free_t),
741 	    KM_SLEEP);
742 	did_freep->free_blk = firstblk;
743 	did_freep->free_offset = offset;
744 	did_freep->free_length = length;
745 	did_freep->free_next = s->s_did_icp->did_ic_freep;
746 	s->s_did_icp->did_ic_freep = did_freep;
747 
748 	return (0);
749 }
750 
751 /*
752  * Remove specific free space from the device id incore free list.
753  * Called at startup (after all devid blocks have been placed on
754  * free list) in order to remove the free space from the list that
755  * contains actual devids.
756  * Returns 0 if area successfully removed.
757  * Returns 1 if no matching area is found - so nothing removed.
758  */
759 static int
760 mddb_devid_free_delete(
761 	mddb_set_t *s,
762 	uint_t firstblk,
763 	uint_t offset,
764 	uint_t length
765 )
766 {
767 	int		block_found = 0;
768 	mddb_did_free_t	*did_freep1;		/* next free block */
769 	mddb_did_free_t	*did_freep2 = 0;	/* previous free block */
770 	mddb_did_free_t *did_freep_before;	/* area before offset, len */
771 	mddb_did_free_t	*did_freep_after;	/* area after offset, len */
772 	uint_t		old_length;
773 
774 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
775 		return (1);
776 	}
777 
778 	/* find free block for this devid */
779 	did_freep1 = s->s_did_icp->did_ic_freep;
780 	while (did_freep1) {
781 		/*
782 		 * Look through free list of <block, offset, length> to
783 		 * find our entry in the free list.  Our entry should
784 		 * exist since the entire devid block was placed into
785 		 * this free list at startup.  This code is just removing
786 		 * the non-free (in-use) portions of the devid block so
787 		 * that the remaining linked list does indeed just
788 		 * contain a free list.
789 		 *
790 		 * Our entry has been found if
791 		 *   - the blocks match,
792 		 *   - the offset (starting address) in the free list is
793 		 *	less than the offset of our entry and
794 		 *   - the length+offset (ending address) in the free list is
795 		 *	greater than the length+offset of our entry.
796 		 */
797 		if ((did_freep1->free_blk == firstblk) &&
798 		    (did_freep1->free_offset <= offset) &&
799 		    ((did_freep1->free_length + did_freep1->free_offset) >=
800 		    (length + offset))) {
801 			/* Have found our entry - remove from list */
802 			block_found = 1;
803 			did_freep_before = did_freep1;
804 			old_length = did_freep1->free_length;
805 			/* did_freep1 - pts to next free block */
806 			did_freep1 = did_freep1->free_next;
807 			if (did_freep2) {
808 				did_freep2->free_next = did_freep1;
809 			} else {
810 				s->s_did_icp->did_ic_freep = did_freep1;
811 			}
812 
813 			/*
814 			 * did_freep_before points to area in block before
815 			 * offset, length.
816 			 */
817 			did_freep_before->free_length = offset -
818 			    did_freep_before->free_offset;
819 			/*
820 			 * did_freep_after points to area in block after
821 			 * offset, length.
822 			 */
823 			did_freep_after = (mddb_did_free_t *)kmem_zalloc
824 			    (sizeof (mddb_did_free_t), KM_SLEEP);
825 			did_freep_after->free_blk = did_freep_before->free_blk;
826 			did_freep_after->free_offset = offset + length;
827 			did_freep_after->free_length = old_length - length -
828 			    did_freep_before->free_length;
829 			/*
830 			 * Add before and after areas to free list
831 			 * If area before or after offset, length has length
832 			 * of 0, that entry is not added.
833 			 */
834 			if (did_freep_after->free_length) {
835 				did_freep_after->free_next = did_freep1;
836 				if (did_freep2) {
837 					did_freep2->free_next =
838 					    did_freep_after;
839 				} else {
840 					s->s_did_icp->did_ic_freep =
841 					    did_freep_after;
842 				}
843 				did_freep1 = did_freep_after;
844 			} else {
845 				kmem_free(did_freep_after,
846 				    sizeof (mddb_did_free_t));
847 			}
848 
849 			if (did_freep_before->free_length) {
850 				did_freep_before->free_next = did_freep1;
851 				if (did_freep2) {
852 					did_freep2->free_next =
853 					    did_freep_before;
854 				} else {
855 					s->s_did_icp->did_ic_freep =
856 					    did_freep_before;
857 				}
858 			} else {
859 				kmem_free(did_freep_before,
860 				    sizeof (mddb_did_free_t));
861 			}
862 			break;
863 		} else {
864 			did_freep2 = did_freep1;
865 			did_freep1 = did_freep1->free_next;
866 		}
867 	}
868 	if (block_found == 0) {
869 		return (1);
870 	} else {
871 		return (0);
872 	}
873 }
874 
875 /*
876  * Find free space of devid length and remove free space from list.
877  * Return a pointer to the previously free area.
878  *
879  * If there's not enough free space on the free list, get an empty
880  * disk block, put the empty disk block on the did_ic_dbp linked list,
881  * and add the disk block space not used for devid to the free list.
882  *
883  * Return pointer to address (inside disk block) of free area for devid.
884  * Return 0 if error.
885  */
886 static caddr_t
887 mddb_devid_free_get(
888 	mddb_set_t *s,
889 	uint_t len,
890 	uint_t *blk,
891 	uint_t *cnt,
892 	uint_t *offset
893 )
894 {
895 	mddb_did_free_t	*freep, *freep2;
896 	mddb_did_db_t	*dbp;
897 	uint_t		blk_cnt, blk_num;
898 	ddi_devid_t	devid_ptr = NULL;
899 
900 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
901 		return (0);
902 	}
903 
904 	freep = s->s_did_icp->did_ic_freep;
905 	freep2 = (mddb_did_free_t *)NULL;
906 	while (freep) {
907 		/* found a free area - remove from free list */
908 		if (len <= freep->free_length) {
909 			*blk = freep->free_blk;
910 			*offset = freep->free_offset;
911 			/* find disk block pointer that contains free area */
912 			dbp = s->s_did_icp->did_ic_dbp;
913 			while (dbp) {
914 				if (dbp->db_firstblk == *blk)
915 					break;
916 				else
917 					dbp = dbp->db_next;
918 			}
919 			/*
920 			 * If a disk block pointer can't be found - something
921 			 * is wrong, so don't use this free space.
922 			 */
923 			if (dbp == NULL) {
924 				freep2 = freep;
925 				freep = freep->free_next;
926 				continue;
927 			}
928 
929 			devid_ptr = (ddi_devid_t)(dbp->db_ptr + *offset);
930 			*cnt = dbp->db_blkcnt;
931 
932 			/* Update free list information */
933 			freep->free_offset += len;
934 			freep->free_length -= len;
935 			if (freep->free_length == 0) {
936 				if (freep2) {
937 					freep2->free_next =
938 					    freep->free_next;
939 				} else {
940 					s->s_did_icp->did_ic_freep =
941 					    freep->free_next;
942 				}
943 				kmem_free(freep, sizeof (mddb_did_free_t));
944 			}
945 			break;
946 		}
947 		freep2 = freep;
948 		freep = freep->free_next;
949 	}
950 
951 	/* Didn't find a free spot */
952 	if (freep == NULL) {
953 		/* get free logical disk blk in replica */
954 		blk_cnt = btodb(len + (MDDB_BSIZE - 1));
955 		blk_num = getfreeblks(s, blk_cnt);
956 		if (blk_num == 0)
957 			return (0);
958 
959 		/* Add disk block to disk block linked list */
960 		dbp = kmem_zalloc(sizeof (mddb_did_db_t), KM_SLEEP);
961 		dbp->db_firstblk = blk_num;
962 		dbp->db_blkcnt = blk_cnt;
963 		dbp->db_ptr = (caddr_t)kmem_zalloc(dbtob(blk_cnt), KM_SLEEP);
964 		dbp->db_next = s->s_did_icp->did_ic_dbp;
965 		s->s_did_icp->did_ic_dbp = dbp;
966 		devid_ptr = (ddi_devid_t)dbp->db_ptr;
967 
968 		/* Update return values */
969 		*blk = blk_num;
970 		*offset = 0;
971 		*cnt = blk_cnt;
972 
973 		/* Add unused part of block to free list */
974 		(void) mddb_devid_free_add(s, blk_num,
975 		    len, (dbtob(blk_cnt) - len));
976 	}
977 
978 	return ((caddr_t)devid_ptr);
979 }
980 
981 /*
982  * Add device id information for locator index to device id area in set.
983  * Get free area to store device id from free list.   Update checksum
984  * for mddb_did_blk.
985  *
986  * This routine does not write any data out to disk.
987  * After this routine has been called, the routine, writelocall, should
988  * be called to write both the locator block and device id area out
989  * to disk.
990  */
991 static int
992 mddb_devid_add(
993 	mddb_set_t	*s,
994 	uint_t		index,
995 	ddi_devid_t	devid,
996 	char		*minor_name
997 )
998 {
999 	uint_t		devid_len;
1000 	uint_t		blk, offset;
1001 	ddi_devid_t	devid_ptr;
1002 	mddb_did_info_t	*did_info;
1003 	uint_t		blkcnt, i;
1004 	mddb_did_blk_t	*did_blk;
1005 
1006 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
1007 		return (1);
1008 	}
1009 	if (strlen(minor_name) > (MDDB_MINOR_NAME_MAX - 1))
1010 		return (1);
1011 
1012 	/* Check if device id has already been added */
1013 	did_blk = s->s_did_icp->did_ic_blkp;
1014 	did_info = &(did_blk->blk_info[index]);
1015 	if (did_info->info_flags & MDDB_DID_EXISTS)
1016 		return (0);
1017 
1018 	devid_len = ddi_devid_sizeof(devid);
1019 	devid_ptr = (ddi_devid_t)mddb_devid_free_get(s,
1020 	    devid_len, &blk, &blkcnt, &offset);
1021 
1022 	if (devid_ptr == NULL) {
1023 		return (1);
1024 	}
1025 
1026 	/* Copy devid into devid free area */
1027 	for (i = 0; i < devid_len; i++)
1028 		((char *)devid_ptr)[i] = ((char *)devid)[i];
1029 
1030 	/* Update mddb_did_info area for new device id */
1031 	did_info->info_flags = MDDB_DID_EXISTS | MDDB_DID_VALID;
1032 
1033 	/*
1034 	 * Only set UPDATED flag for non-replicated import cases.
1035 	 * This allows the side locator driver name index to get
1036 	 * updated in load_old_replicas.
1037 	 */
1038 	if (!(md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT))
1039 		did_info->info_flags |= MDDB_DID_UPDATED;
1040 
1041 	did_info->info_firstblk = blk;
1042 	did_info->info_blkcnt = blkcnt;
1043 	did_info->info_offset = offset;
1044 	did_info->info_length = devid_len;
1045 	(void) strcpy(did_info->info_minor_name, minor_name);
1046 	crcgen(devid_ptr, &did_info->info_checksum, devid_len, NULL);
1047 
1048 	/* Add device id pointer to did_ic_devid array */
1049 	s->s_did_icp->did_ic_devid[index] = devid_ptr;
1050 
1051 	return (0);
1052 }
1053 
1054 
1055 /*
1056  * Delete device id information for locator index from device id area in set.
1057  * Add device id space to free area.
1058  *
1059  * This routine does not write any data out to disk.
1060  * After this routine has been called, the routine, writelocall, should
1061  * be called to write both the locator block and device id area out
1062  * to disk.
1063  */
1064 static int
1065 mddb_devid_delete(mddb_set_t *s, uint_t index)
1066 {
1067 	mddb_did_info_t	*did_info;
1068 	mddb_did_blk_t	*did_blk;
1069 
1070 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
1071 		return (1);
1072 	}
1073 
1074 	/* Get device id information from mddb_did_blk */
1075 	did_blk = s->s_did_icp->did_ic_blkp;
1076 	did_info = &(did_blk->blk_info[index]);
1077 
1078 	/*
1079 	 * Ensure that the underlying device supports device ids
1080 	 * before arbitrarily removing them.
1081 	 */
1082 	if (!(did_info->info_flags & MDDB_DID_EXISTS)) {
1083 		return (1);
1084 	}
1085 
1086 	/* Remove device id information from mddb_did_blk */
1087 	did_info->info_flags = 0;
1088 
1089 	/* Remove device id from incore area */
1090 	s->s_did_icp->did_ic_devid[index] = (ddi_devid_t)NULL;
1091 
1092 	/* Add new free space in disk block to free list */
1093 	(void) mddb_devid_free_add(s, did_info->info_firstblk,
1094 	    did_info->info_offset, did_info->info_length);
1095 
1096 	return (0);
1097 }
1098 
1099 /*
1100  * Check if there is a device id for a locator index.
1101  *
1102  * Caller of this routine should not free devid or minor_name since
1103  * these will point to internal data structures that should not
1104  * be freed.
1105  */
1106 static int
1107 mddb_devid_get(
1108 	mddb_set_t *s,
1109 	uint_t index,
1110 	ddi_devid_t *devid,
1111 	char **minor_name
1112 )
1113 {
1114 	mddb_did_info_t	*did_info;
1115 
1116 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
1117 		return (0);
1118 	}
1119 	did_info = &(s->s_did_icp->did_ic_blkp->blk_info[index]);
1120 
1121 	if (did_info->info_flags & MDDB_DID_EXISTS) {
1122 		*devid = s->s_did_icp->did_ic_devid[index];
1123 		*minor_name =
1124 		    s->s_did_icp->did_ic_blkp->blk_info[index].info_minor_name;
1125 		return (1);
1126 	} else
1127 		return (0);
1128 
1129 
1130 }
1131 
1132 /*
1133  * Check if device id is valid on current system.
1134  * Needs devid, previously known dev_t and current minor_name.
1135  *
1136  * Success:
1137  * 	Returns 0 if valid device id is found and updates
1138  * 	dev_t if the dev_t associated with the device id is
1139  *	different than dev_t.
1140  * Failure:
1141  * 	Returns 1 if device id not valid on current system.
1142  */
1143 static int
1144 mddb_devid_validate(ddi_devid_t devid, md_dev64_t *dev, char *minor_name)
1145 {
1146 	int		retndevs;
1147 	dev_t		*ddi_devs;
1148 	int		devid_flag = 0;
1149 	int 		cnt;
1150 
1151 	if (dev == 0)
1152 		return (1);
1153 	/*
1154 	 * See if devid is valid in the current system.
1155 	 * If so, set dev to match the devid.
1156 	 */
1157 	if (ddi_lyr_devid_to_devlist(devid, minor_name,
1158 	    &retndevs, &ddi_devs) == DDI_SUCCESS) {
1159 		if (retndevs > 0) {
1160 			/* devid is valid to use */
1161 			devid_flag = 1;
1162 			/* does dev_t in list match dev */
1163 			cnt = 0;
1164 			while (cnt < retndevs) {
1165 				if (*dev == md_expldev(ddi_devs[cnt]))
1166 					break;
1167 				cnt++;
1168 			}
1169 			/*
1170 			 * If a different dev_t, then setup
1171 			 * new dev and new major name
1172 			 */
1173 			if (cnt == retndevs) {
1174 				*dev = md_expldev(ddi_devs[0]);
1175 			}
1176 			ddi_lyr_free_devlist(ddi_devs, retndevs);
1177 		}
1178 	}
1179 	if (devid_flag)
1180 		return (0);
1181 	else
1182 		return (1);
1183 }
1184 
1185 
1186 /*
1187  * Free the devid incore data areas
1188  */
1189 static void
1190 mddb_devid_icp_free(mddb_did_ic_t **did_icp, mddb_lb_t *lbp)
1191 {
1192 	mddb_did_free_t	*did_freep1, *did_freep2;
1193 	mddb_did_db_t	*did_dbp1, *did_dbp2;
1194 	mddb_did_ic_t	*icp = *did_icp;
1195 
1196 	if (icp) {
1197 		if (icp->did_ic_blkp) {
1198 			kmem_free((caddr_t)icp->did_ic_blkp,
1199 			    dbtob(lbp->lb_didblkcnt));
1200 			icp->did_ic_blkp = (mddb_did_blk_t *)NULL;
1201 		}
1202 
1203 		if (icp->did_ic_dbp) {
1204 			did_dbp1 = icp->did_ic_dbp;
1205 			while (did_dbp1) {
1206 				did_dbp2 = did_dbp1->db_next;
1207 				kmem_free((caddr_t)did_dbp1->db_ptr,
1208 				    dbtob(did_dbp1->db_blkcnt));
1209 				kmem_free((caddr_t)did_dbp1,
1210 				    sizeof (mddb_did_db_t));
1211 				did_dbp1 = did_dbp2;
1212 			}
1213 		}
1214 
1215 		if (icp->did_ic_freep) {
1216 			did_freep1 = icp->did_ic_freep;
1217 			while (did_freep1) {
1218 				did_freep2 = did_freep1->free_next;
1219 				kmem_free((caddr_t)did_freep1,
1220 				    sizeof (mddb_did_free_t));
1221 				did_freep1 = did_freep2;
1222 			}
1223 		}
1224 
1225 		kmem_free((caddr_t)icp, sizeof (mddb_did_ic_t));
1226 		*did_icp = (mddb_did_ic_t *)NULL;
1227 	}
1228 
1229 }
1230 
1231 static daddr_t
1232 getphysblk(
1233 	mddb_block_t		blk,
1234 	mddb_mb_ic_t		*mbip
1235 )
1236 {
1237 	mddb_mb_t	*mbp = &(mbip->mbi_mddb_mb);
1238 
1239 	while (blk >= mbp->mb_blkcnt) {
1240 		if (! mbip->mbi_next)
1241 			return ((daddr_t)-1);	/* no such block */
1242 		blk -= mbp->mb_blkcnt;
1243 		mbip = mbip->mbi_next;
1244 		mbp = &(mbip->mbi_mddb_mb);
1245 	}
1246 
1247 	if (blk >= mbp->mb_blkmap.m_consecutive)
1248 		return ((daddr_t)-1);	/* no such block */
1249 
1250 	return ((daddr_t)(mbp->mb_blkmap.m_firstblk + blk));
1251 }
1252 
1253 /*
1254  * when a buf header is passed in the new buffer must be
1255  * put on the front of the chain. writerec counts on it
1256  */
1257 static int
1258 putblks(
1259 	mddb_set_t	*s,		/* incore db set structure */
1260 	caddr_t		buffer,		/* adr of buffer to be written */
1261 	daddr_t		blk,		/* block number for first block */
1262 	int		cnt,		/* number of blocks to be written */
1263 	md_dev64_t	device,		/* device to be written to */
1264 	mddb_bf_t	**bufhead	/* if non-zero then ASYNC I/O */
1265 					/*    and put buf address here */
1266 )
1267 {
1268 	buf_t		*bp;
1269 	mddb_bf_t	*bfp;
1270 	int		err = 0;
1271 
1272 	bfp = allocbuffer(s, MDDB_SLEEPOK);
1273 	bp = &bfp->bf_buf;
1274 	bp->b_bcount = MDDB_BSIZE * cnt;
1275 	bp->b_un.b_addr = buffer;
1276 	bp->b_blkno = blk;
1277 	bp->b_edev = md_dev64_to_dev(device);
1278 	/*
1279 	 * if a header for a buf chain is passed in this is async io.
1280 	 * currently only done for optimize  records
1281 	 */
1282 	if (bufhead) {
1283 		bfp->bf_next = *bufhead;
1284 		*bufhead = bfp;
1285 		(void) mddb_rwdata(s, B_WRITE|B_ASYNC, bp);
1286 		return (0);
1287 	}
1288 	err = mddb_rwdata(s, B_WRITE, bp);
1289 	freebuffer(s, bfp);
1290 	if (err) {
1291 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
1292 		    s->s_setno, device);
1293 		return (MDDB_F_EWRITE);
1294 	}
1295 	return (0);
1296 }
1297 
1298 /*
1299  * wrtblklst - takes an array of logical block numbers
1300  *		and writes the buffer to those blocks (scatter).
1301  * If called during upgrade, this routine expects a
1302  * non-translated (aka target) dev.
1303  */
1304 static int
1305 wrtblklst(
1306 	mddb_set_t	*s,		/* incore set structure */
1307 	caddr_t		buffer,		/* buffer to be written (record blk) */
1308 	mddb_block_t	blka[],		/* list of logical blks for record */
1309 	daddr_t		cnt,		/* number of logical blks */
1310 	const int	li,		/* locator index */
1311 	mddb_bf_t	**bufhead,	/* if non-zero then ASYNC I/O */
1312 					/*    and put buf address here */
1313 	int		master_only	/* allow only master node to write */
1314 )
1315 {
1316 	daddr_t		blk;
1317 	daddr_t		blk1;
1318 	int		err = 0;
1319 	int		cons;
1320 	mddb_lb_t	*lbp = s->s_lbp;
1321 	mddb_locator_t	*lp = &lbp->lb_locators[li];
1322 	md_dev64_t	dev;
1323 	mddb_mb_ic_t	*mbip = s->s_mbiarray[li];
1324 
1325 	/*
1326 	 * If a MN diskset and only the master can write,
1327 	 * then a non-master node will just return success.
1328 	 */
1329 	if (lbp->lb_flags & MDDB_MNSET) {
1330 		if (master_only == MDDB_WR_ONLY_MASTER) {
1331 			/* return successfully if we aren't the master */
1332 			if (!(md_set[s->s_setno].s_am_i_master)) {
1333 				return (0);
1334 			}
1335 		}
1336 		if (mbip == NULL)
1337 			return (MDDB_F_EWRITE);
1338 	}
1339 
1340 	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
1341 	if (dev == NODEV64) {
1342 		return (1);
1343 	}
1344 
1345 	blk = getphysblk(blka[0], mbip);
1346 	ASSERT(blk >= 0);
1347 
1348 	cons = 1;
1349 	while (cnt) {
1350 		if (cons != cnt) {
1351 			blk1 = getphysblk(blka[cons], mbip);
1352 			ASSERT(blk1 >= 0);
1353 			if ((blk + cons) == blk1) {
1354 				cons++;
1355 				continue;
1356 			}
1357 		}
1358 		if (err = putblks(s, buffer, blk, cons, dev, bufhead)) {
1359 			/*
1360 			 * If an MN diskset and any_node_can_write
1361 			 * then this request is coming from writeoptrecord
1362 			 * and l_flags field should not be updated.
1363 			 * l_flags will be updated as a result of sending
1364 			 * a class1 message to the master.  Setting l_flags
1365 			 * here will cause slave to be out of sync with
1366 			 * master.
1367 			 *
1368 			 * Otherwise, set the error in l_flags
1369 			 * (this occurs if this is not a MN diskset or
1370 			 * only_master_can_write is set).
1371 			 */
1372 			if ((!(lbp->lb_flags & MDDB_MNSET)) ||
1373 			    (master_only == MDDB_WR_ONLY_MASTER)) {
1374 				lp->l_flags |= MDDB_F_EWRITE;
1375 			}
1376 			return (err);
1377 		}
1378 		if (bufhead)
1379 			(*bufhead)->bf_locator = lp;
1380 
1381 		buffer += MDDB_BSIZE * cons;
1382 		cnt -= cons;
1383 		blka += cons;
1384 		if (cnt) {
1385 			blk = getphysblk(blka[0], mbip);
1386 			ASSERT(blk >= 0);
1387 		}
1388 		cons = 1;
1389 	}
1390 
1391 	return (0);
1392 }
1393 
1394 /*
1395  * writeblks - takes a logical block number/block count pair
1396  * 		and writes the buffer to those contiguous logical blocks.
1397  * If called during upgrade, this routine expects a non-translated
1398  * (aka target) dev.
1399  */
1400 static int
1401 writeblks(
1402 	mddb_set_t	*s,		/* incore set structure */
1403 	caddr_t		buffer,		/* buffer to be written */
1404 	mddb_block_t	blk,		/* starting logical block number */
1405 	int		cnt,		/* number of log blocks to be written */
1406 	const int	li,		/* locator index */
1407 	int		master_only	/* allow only master node to write */
1408 )
1409 {
1410 	daddr_t		physblk;
1411 	int		err = 0;
1412 	int		i;
1413 	mddb_lb_t	*lbp = s->s_lbp;
1414 	mddb_locator_t	*lp = &lbp->lb_locators[li];
1415 	md_dev64_t	dev;
1416 	mddb_block_t	*blkarray;
1417 	int		size;
1418 	int		ret;
1419 
1420 	/*
1421 	 * If a MN diskset and only the master can write,
1422 	 * then a non-master node will just return success.
1423 	 */
1424 	if ((lbp->lb_flags & MDDB_MNSET) &&
1425 	    (master_only == MDDB_WR_ONLY_MASTER)) {
1426 		/* return successfully if we aren't the master */
1427 		if (!(md_set[s->s_setno].s_am_i_master)) {
1428 			return (0);
1429 		}
1430 	}
1431 
1432 	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
1433 	if (dev == NODEV64) {
1434 		return (1);
1435 	}
1436 
1437 	if (cnt > 1) {
1438 		size = sizeof (mddb_block_t) * cnt;
1439 		blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
1440 		for (i = 0; i < cnt; i++)
1441 			blkarray[i] = blk + i;
1442 		ret = wrtblklst(s, buffer, blkarray, cnt,
1443 		    li, 0, MDDB_WR_ONLY_MASTER);
1444 		kmem_free(blkarray, size);
1445 		return (ret);
1446 	}
1447 	physblk = getphysblk(blk, s->s_mbiarray[li]);
1448 	ASSERT(physblk > 0);
1449 	if (err = putblks(s, buffer, physblk, 1, dev, (mddb_bf_t **)0)) {
1450 		lp->l_flags |= MDDB_F_EWRITE;
1451 		return (err);
1452 	}
1453 	return (0);
1454 }
1455 
1456 /*
1457  * writeall - will write the buffer to all ACTIVE/NON-ERRORED replicas.
1458  */
1459 static int
1460 writeall(
1461 	mddb_set_t	*s,		/* incore set structure */
1462 	caddr_t		buffer,		/* buffer to be written */
1463 	mddb_block_t	block,		/* starting logical block number */
1464 	int		cnt,		/* number of log blocks to be written */
1465 	int		master_only	/* allow only master node to write */
1466 )
1467 {
1468 	int		li;
1469 	int		err = 0;
1470 	mddb_lb_t	*lbp = s->s_lbp;
1471 
1472 	for (li = 0; li < lbp->lb_loccnt; li++) {
1473 		mddb_locator_t	*lp = &lbp->lb_locators[li];
1474 
1475 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
1476 		    (lp->l_flags & MDDB_F_EWRITE))
1477 			continue;
1478 
1479 		err |= writeblks(s, buffer, block, cnt, li, master_only);
1480 	}
1481 
1482 	return (err);
1483 }
1484 
1485 /*
1486  * writelocall - write the locator block and device id information (if
1487  * replica is in device id format) to all ACTIVE/NON-ERRORER replicas.
1488  *
1489  * Increments the locator block's commitcnt.  Updates the device id area's
1490  * commitcnt if the replica is in device id format.  Regenerates the
1491  * checksums after updating the commitcnt(s).
1492  */
1493 static int
1494 writelocall(
1495 	mddb_set_t	*s	/* incore set structure */
1496 )
1497 {
1498 	int		li;
1499 	int		err = 0;
1500 	mddb_lb_t	*lbp = s->s_lbp;
1501 	mddb_did_blk_t	*did_blk;
1502 	mddb_did_db_t	*did_dbp;
1503 
1504 	s->s_lbp->lb_commitcnt++;
1505 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
1506 		did_blk = s->s_did_icp->did_ic_blkp;
1507 		did_blk->blk_commitcnt = s->s_lbp->lb_commitcnt;
1508 		crcgen(did_blk, &did_blk->blk_checksum,
1509 		    dbtob(lbp->lb_didblkcnt), NULL);
1510 	}
1511 	crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
1512 
1513 	for (li = 0; li < lbp->lb_loccnt; li++) {
1514 		mddb_locator_t	*lp = &lbp->lb_locators[li];
1515 
1516 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
1517 		    (lp->l_flags & MDDB_F_EWRITE))
1518 			continue;
1519 
1520 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
1521 			/* write out blocks containing actual device ids */
1522 			did_dbp = s->s_did_icp->did_ic_dbp;
1523 			while (did_dbp) {
1524 				err |= writeblks(s, (caddr_t)did_dbp->db_ptr,
1525 				    did_dbp->db_firstblk,
1526 				    did_dbp->db_blkcnt, li,
1527 				    MDDB_WR_ONLY_MASTER);
1528 				did_dbp = did_dbp->db_next;
1529 			}
1530 
1531 			/* write out device id area block */
1532 			err |= writeblks(s, (caddr_t)did_blk,
1533 			    lbp->lb_didfirstblk, lbp->lb_didblkcnt, li,
1534 			    MDDB_WR_ONLY_MASTER);
1535 		}
1536 		/* write out locator block */
1537 		err |= writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
1538 		    MDDB_WR_ONLY_MASTER);
1539 	}
1540 
1541 	/*
1542 	 * If a MN diskset and this is the master, set the PARSE_LOCBLK flag
1543 	 * in the mddb_set structure to show that the locator block has
1544 	 * been changed.
1545 	 */
1546 
1547 	if ((lbp->lb_flags & MDDB_MNSET) &&
1548 	    (md_set[s->s_setno].s_am_i_master)) {
1549 		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
1550 	}
1551 	return (err);
1552 }
1553 
1554 /*
1555  * If called during upgrade, this routine expects a translated
1556  * (aka miniroot) dev.
1557  */
1558 static int
1559 getblks(
1560 	mddb_set_t	*s,	/* incore db set structure */
1561 	caddr_t		buffer,	/* buffer to read data into */
1562 	md_dev64_t	device,	/* device to read from */
1563 	daddr_t		blk,	/* physical block number to read */
1564 	int		cnt,	/* number of blocks to read */
1565 	int		flag	/* flags for I/O */
1566 )
1567 {
1568 	buf_t		*bp;
1569 	mddb_bf_t	*bfp;
1570 	int		err = 0;
1571 
1572 	bfp = allocbuffer(s, MDDB_SLEEPOK);	/* this will never sleep */
1573 	bp = &bfp->bf_buf;
1574 	bp->b_bcount = MDDB_BSIZE * cnt;
1575 	bp->b_un.b_addr = buffer;
1576 	bp->b_blkno = blk;
1577 	bp->b_edev = md_dev64_to_dev(device);
1578 	err = mddb_rwdata(s, (B_READ | flag), bp);
1579 	freebuffer(s, bfp);
1580 	if (err) {
1581 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
1582 		    s->s_setno, device);
1583 		return (MDDB_F_EREAD);
1584 	}
1585 	return (0);
1586 }
1587 
1588 /*
1589  * readblklst - takes an array of logical block numbers
1590  * 		and reads those blocks (gather) into the buffer.
1591  * If called during upgrade, this routine expects a non-translated
1592  * (aka target) dev.
1593  */
1594 static int
1595 readblklst(
1596 	mddb_set_t	*s,	/* incore set structure */
1597 	caddr_t		buffer,	/* buffer to be read (record block) */
1598 	mddb_block_t	blka[],	/* list of logical blocks to be read */
1599 	daddr_t		cnt,	/* number of logical blocks */
1600 	int		li,	/* locator index */
1601 	int		flag	/* flags for I/O */
1602 )
1603 {
1604 	daddr_t		blk;
1605 	daddr_t		blk1;
1606 	int		err = 0;
1607 	int		cons;
1608 	md_dev64_t	dev;
1609 	mddb_mb_ic_t	*mbip;
1610 
1611 	mbip = s->s_mbiarray[li];
1612 	dev = md_expldev(s->s_lbp->lb_locators[li].l_dev);
1613 	dev = md_xlate_targ_2_mini(dev);
1614 	if (dev == NODEV64) {
1615 		return (1);
1616 	}
1617 
1618 	blk = getphysblk(blka[0], mbip);
1619 	ASSERT(blk >= 0);
1620 
1621 	cons = 1;
1622 	while (cnt) {
1623 		if (cons != cnt) {
1624 			blk1 = getphysblk(blka[cons], mbip);
1625 			ASSERT(blk1 >= 0);
1626 			if ((blk + cons) == blk1) {
1627 				cons++;
1628 				continue;
1629 			}
1630 		}
1631 		if (err = getblks(s, buffer, dev, blk, cons, flag))
1632 			return (err);
1633 		buffer += MDDB_BSIZE * cons;
1634 		cnt -= cons;
1635 		blka += cons;
1636 		if (cnt) {
1637 			blk = getphysblk(blka[0], mbip);
1638 			ASSERT(blk >= 0);
1639 		}
1640 		cons = 1;
1641 	}
1642 	return (0);
1643 }
1644 
1645 /*
1646  * readblks - takes a logical block number/block count pair
1647  * 		and reads those contiguous logical blocks into the buffer.
1648  * If called during upgrade, this routine expects a non-translated
1649  * (aka target) dev.
1650  */
1651 static int
1652 readblks(
1653 	mddb_set_t	*s,	/* incore set structure */
1654 	caddr_t		buffer,	/* buffer to be read into */
1655 	mddb_block_t	blk,	/* logical block number to be read */
1656 	int		cnt,	/* number of logical blocks to be read */
1657 	int		li	/* locator index */
1658 )
1659 {
1660 	daddr_t		physblk;
1661 	md_dev64_t	device;
1662 	int		i;
1663 	mddb_block_t	*blkarray;
1664 	int		size;
1665 	int		ret;
1666 
1667 	if (cnt > 1) {
1668 		size = sizeof (mddb_block_t) * cnt;
1669 		blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
1670 		for (i = 0; i < cnt; i++)
1671 			blkarray[i] = blk + i;
1672 		ret = readblklst(s, buffer, blkarray, cnt, li, 0);
1673 		kmem_free(blkarray, size);
1674 		return (ret);
1675 	}
1676 	physblk = getphysblk(blk, s->s_mbiarray[li]);
1677 	ASSERT(physblk > 0);
1678 	device = md_expldev(s->s_lbp->lb_locators[li].l_dev);
1679 	device = md_xlate_targ_2_mini(device);
1680 	if (device == NODEV64) {
1681 		return (1);
1682 	}
1683 	return (getblks(s, buffer, device, physblk, 1, 0));
1684 }
1685 
1686 static void
1687 single_thread_start(
1688 	mddb_set_t	*s
1689 )
1690 {
1691 	while (s->s_singlelockgotten) {
1692 		s->s_singlelockwanted++;
1693 		cv_wait(&s->s_single_thread_cv, SETMUTEX(s->s_setno));
1694 	}
1695 	s->s_singlelockgotten++;
1696 }
1697 
1698 static void
1699 single_thread_end(
1700 	mddb_set_t	*s
1701 )
1702 {
1703 	ASSERT(s->s_singlelockgotten);
1704 	s->s_singlelockgotten = 0;
1705 	if (s->s_singlelockwanted) {
1706 		s->s_singlelockwanted = 0;
1707 		cv_broadcast(&s->s_single_thread_cv);
1708 	}
1709 }
1710 
1711 static size_t
1712 sizeofde(
1713 	mddb_de_ic_t	*dep
1714 )
1715 {
1716 	size_t		size;
1717 
1718 	size = sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) +
1719 	    sizeof (mddb_block_t) * dep->de_blkcount;
1720 	return (size);
1721 }
1722 
1723 static size_t
1724 sizeofde32(
1725 	mddb_de32_t	*dep
1726 )
1727 {
1728 	size_t		size;
1729 
1730 	size = sizeof (*dep) - sizeof (dep->de32_blks) +
1731 	    sizeof (mddb_block_t) * dep->de32_blkcount;
1732 	return (size);
1733 }
1734 
1735 static mddb_de32_t *
1736 nextentry(
1737 	mddb_de32_t	*dep
1738 )
1739 {
1740 	mddb_de32_t	*ret;
1741 
1742 	ret = (mddb_de32_t *)((void *)((caddr_t)dep + sizeofde32(dep)));
1743 	return (ret);
1744 }
1745 
1746 static void
1747 create_db32rec(
1748 	mddb_db32_t *db32p,
1749 	mddb_db_t *dbp
1750 )
1751 {
1752 	mddb_de_ic_t *dep;
1753 	mddb_de32_t *de32p;
1754 
1755 #if defined(_ILP32) && !defined(lint)
1756 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
1757 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
1758 #endif
1759 
1760 	dbtodb32(dbp, db32p);
1761 	if ((dbp->db_firstentry != NULL) && (db32p->db32_firstentry == 0))
1762 		db32p->db32_firstentry = 0x4;
1763 	de32p = (mddb_de32_t *)((void *) ((caddr_t)(&db32p->db32_firstentry)
1764 	    + sizeof (db32p->db32_firstentry)));
1765 	for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
1766 		detode32(dep, de32p);
1767 		if ((dep->de_next != NULL) && (de32p->de32_next == 0))
1768 			de32p->de32_next = 0x4;
1769 		de32p = nextentry(de32p);
1770 	}
1771 	ASSERT((uintptr_t)de32p <= (uintptr_t)de32p + MDDB_BSIZE);
1772 }
1773 
1774 /*
1775  * If called during upgrade, this routine expects a translated
1776  * (aka miniroot) dev.
1777  * If master blocks are found, set the mn_set parameter to 1 if the
1778  * the master block revision number is MDDB_REV_MNMB; otherwise,
1779  * set it to 0.
1780  * If master blocks are not found, do not change the mnset parameter.
1781  */
1782 static mddb_mb_ic_t *
1783 getmasters(
1784 	mddb_set_t	*s,
1785 	md_dev64_t	dev,
1786 	daddr_t		blkno,
1787 	uint_t		*flag,
1788 	int		*mn_set
1789 )
1790 {
1791 	mddb_mb_ic_t	*mbi = NULL;
1792 	mddb_mb_t	*mb;
1793 	int		error = 0;
1794 	ddi_devid_t	devid;
1795 
1796 
1797 	if (mddb_devopen(dev)) {
1798 		if (flag)
1799 			*flag |= MDDB_F_EMASTER;
1800 		return ((mddb_mb_ic_t *)NULL);
1801 	}
1802 
1803 
1804 	mbi = (mddb_mb_ic_t *)kmem_zalloc(MDDB_IC_BSIZE, KM_SLEEP);
1805 	mb = &(mbi->mbi_mddb_mb);
1806 	if (error = getblks(s, (caddr_t)mb, dev, blkno,
1807 	    btodb(MDDB_BSIZE), 0)) {
1808 		error |= MDDB_F_EMASTER;
1809 	}
1810 	if (mb->mb_magic != MDDB_MAGIC_MB) {
1811 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1812 	}
1813 	/* Check for MDDB_REV_MNMB and lower */
1814 	if (revchk(MDDB_REV_MNMB, mb->mb_revision)) {
1815 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1816 	}
1817 	if (crcchk(mb, &mb->mb_checksum, MDDB_BSIZE, NULL)) {
1818 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1819 	}
1820 
1821 	if (!(md_get_setstatus(s->s_setno) &
1822 	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
1823 	    (mb->mb_setno != s->s_setno)) {
1824 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1825 	}
1826 	if (mb->mb_blkno != blkno) {
1827 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1828 	}
1829 	mb->mb_next = NULL;
1830 	mbi->mbi_next = NULL;
1831 
1832 	if (error)
1833 		goto out;
1834 
1835 	/*
1836 	 * Check the md_devid_destroy and md_keep_repl_state flags
1837 	 * to see if we need to regen the devid or not.
1838 	 *
1839 	 * Don't care about devid in local set since it is not used
1840 	 * and this should not be part of set importing
1841 	 */
1842 	if ((s->s_setno != MD_LOCAL_SET) &&
1843 	    !(md_get_setstatus(s->s_setno) &
1844 	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT))) {
1845 		/*
1846 		 * Now check the destroy flag. We also need to handle
1847 		 * the case where the destroy flag is reset after the
1848 		 * destroy
1849 		 */
1850 		if (md_devid_destroy || (mb->mb_devid_len == 0)) {
1851 
1852 			if (md_devid_destroy) {
1853 				bzero(mb->mb_devid, mb->mb_devid_len);
1854 				mb->mb_devid_len = 0;
1855 			}
1856 
1857 			/*
1858 			 * Try to regenerate it if the 'keep' flag is not set
1859 			 */
1860 			if (!md_keep_repl_state) {
1861 				if (ddi_lyr_get_devid(md_dev64_to_dev(dev),
1862 				    &devid) == DDI_SUCCESS) {
1863 					mb->mb_devid_len =
1864 					    ddi_devid_sizeof(devid);
1865 					bcopy(devid, mb->mb_devid,
1866 					    mb->mb_devid_len);
1867 					ddi_devid_free(devid);
1868 				} else {
1869 					error = MDDB_F_EFMT | MDDB_F_EMASTER;
1870 				}
1871 			}
1872 
1873 			crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL);
1874 
1875 			/*
1876 			 * Push
1877 			 */
1878 			if (putblks(s, (caddr_t)mb, blkno, 1, dev, 0) != 0) {
1879 				error = MDDB_F_EFMT | MDDB_F_EMASTER;
1880 			}
1881 		}
1882 	}
1883 
1884 	if (! error) {
1885 		/* Set mn_set parameter to 1 if a MN set */
1886 		if (mb->mb_revision == MDDB_REV_MNMB)
1887 			*mn_set = 1;
1888 		else
1889 			*mn_set = 0;
1890 		return (mbi);
1891 	}
1892 
1893 out:
1894 	/* Error Out */
1895 	if (flag)
1896 		*flag |= error;
1897 
1898 	kmem_free((caddr_t)mbi, MDDB_IC_BSIZE);
1899 	mddb_devclose(dev);
1900 	return ((mddb_mb_ic_t *)NULL);
1901 }
1902 
1903 static int
1904 getrecord(
1905 	mddb_set_t	*s,
1906 	mddb_de_ic_t	*dep,
1907 	int		li
1908 )
1909 {
1910 	int		err = 0;
1911 	mddb_rb32_t	*rbp;
1912 
1913 #if defined(_ILP32) && !defined(lint)
1914 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
1915 #endif
1916 
1917 
1918 	dep->de_rb = (mddb_rb32_t *)kmem_zalloc(dep->de_recsize, KM_SLEEP);
1919 	rbp = dep->de_rb;
1920 
1921 	err = readblklst(s, (caddr_t)rbp, dep->de_blks,
1922 	    dep->de_blkcount, li, 0);
1923 	if (err) {
1924 		return (MDDB_F_EDATA | err);
1925 	}
1926 	if (rbp->rb_magic != MDDB_MAGIC_RB) {
1927 		return (MDDB_F_EFMT | MDDB_F_EDATA);
1928 	}
1929 	if ((revchk(MDDB_REV_RB, rbp->rb_revision) != 0) &&
1930 	    (revchk(MDDB_REV_RB64, rbp->rb_revision) != 0) &&
1931 	    (revchk(MDDB_REV_RBFN, rbp->rb_revision) != 0) &&
1932 	    (revchk(MDDB_REV_RB64FN, rbp->rb_revision) != 0)) {
1933 		return (MDDB_F_EFMT | MDDB_F_EDATA);
1934 	}
1935 	/* Check crc for this record */
1936 	if (rec_crcchk(s, dep, rbp)) {
1937 		return (MDDB_F_EFMT | MDDB_F_EDATA);
1938 	}
1939 	return (0);
1940 }
1941 
1942 /*
1943  * Code to read in the locator name information
1944  */
1945 static int
1946 readlocnames(
1947 	mddb_set_t	*s,
1948 	int		li
1949 )
1950 {
1951 	mddb_ln_t	*lnp;
1952 	int		err = 0;
1953 	mddb_block_t	ln_blkcnt, ln_blkno;
1954 
1955 	/*
1956 	 * read in the locator name blocks
1957 	 */
1958 	s->s_lnp = NULL;
1959 
1960 	ln_blkno = s->s_lbp->lb_lnfirstblk;
1961 	ln_blkcnt = s->s_lbp->lb_lnblkcnt;
1962 	lnp = (mddb_ln_t *)kmem_zalloc(dbtob(ln_blkcnt), KM_SLEEP);
1963 
1964 	err = readblks(s, (caddr_t)lnp, ln_blkno, ln_blkcnt, li);
1965 	if (err) {
1966 		err |= MDDB_F_EDATA;
1967 		goto out;
1968 	}
1969 	if (lnp->ln_magic != MDDB_MAGIC_LN) {
1970 		err = MDDB_F_EDATA | MDDB_F_EFMT;
1971 		goto out;
1972 	}
1973 	if (s->s_lbp->lb_flags & MDDB_MNSET) {
1974 		if (revchk(MDDB_REV_MNLN, lnp->ln_revision)) {
1975 			err = MDDB_F_EDATA | MDDB_F_EFMT;
1976 			goto out;
1977 		}
1978 	} else {
1979 		if (revchk(MDDB_REV_LN, lnp->ln_revision)) {
1980 			err = MDDB_F_EDATA | MDDB_F_EFMT;
1981 			goto out;
1982 		}
1983 	}
1984 	if (crcchk(lnp, &lnp->ln_checksum, dbtob(ln_blkcnt), NULL)) {
1985 		err = MDDB_F_EDATA | MDDB_F_EFMT;
1986 		goto out;
1987 	}
1988 out:
1989 	/*
1990 	 *	if error occurred in locator name blocks free them
1991 	 *	and return
1992 	 */
1993 	if (err) {
1994 		kmem_free((caddr_t)lnp, dbtob(ln_blkcnt));
1995 		return (err);
1996 	}
1997 	s->s_lnp = lnp;
1998 	return (0);
1999 }
2000 
2001 /*
2002  * code to read in a copy of the database.
2003  */
2004 
2005 static int
2006 readcopy(
2007 	mddb_set_t	*s,
2008 	int		li
2009 )
2010 {
2011 	uint_t		blk;
2012 	mddb_db_t	*dbp, *dbp1, *dbhp;
2013 	mddb_db32_t	*db32p;
2014 	mddb_de_ic_t	*dep, *dep2;
2015 	mddb_de32_t	*de32p, *de32p2;
2016 	int		err = 0;
2017 	uint_t		checksum;
2018 
2019 
2020 #if defined(_ILP32) && !defined(lint)
2021 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
2022 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2023 #endif
2024 
2025 	dbp = NULL;
2026 	dbhp = NULL;
2027 	/*
2028 	 *	read in all the directory blocks
2029 	 */
2030 	blk = s->s_lbp->lb_dbfirstblk;
2031 	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
2032 
2033 	for (; blk != 0; blk = dbp->db_nextblk) {
2034 		dbp1 = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
2035 		if (! dbhp) {
2036 			dbhp = dbp1;
2037 		} else {
2038 			dbp->db_next = dbp1;
2039 		}
2040 		dbp = dbp1;
2041 
2042 		err = readblks(s, (caddr_t)db32p, blk, 1, li);
2043 		if (err) {
2044 			err |= MDDB_F_EDATA;
2045 			break;
2046 		}
2047 		db32todb(db32p, dbp);
2048 		if (db32p->db32_magic != MDDB_MAGIC_DB) {
2049 			err = MDDB_F_EDATA | MDDB_F_EFMT;
2050 			break;
2051 		}
2052 		if (revchk(MDDB_REV_DB, db32p->db32_revision)) {
2053 			err = MDDB_F_EDATA | MDDB_F_EFMT;
2054 			break;
2055 		}
2056 		if (crcchk(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL)) {
2057 			err = MDDB_F_EDATA | MDDB_F_EFMT;
2058 			break;
2059 		}
2060 		/*
2061 		 * first go through and fix up all de_next pointers
2062 		 */
2063 		if (dbp->db_firstentry) {
2064 
2065 			de32p = (mddb_de32_t *)
2066 			    ((void *) ((caddr_t)(&db32p->db32_firstentry)
2067 			    + sizeof (db32p->db32_firstentry)));
2068 
2069 			dep = (mddb_de_ic_t *)
2070 			    kmem_zalloc(sizeof (mddb_de_ic_t) -
2071 			    sizeof (mddb_block_t) +
2072 			    sizeof (mddb_block_t) * de32p->de32_blkcount,
2073 			    KM_SLEEP);
2074 			de32tode(de32p, dep);
2075 
2076 			dbp->db_firstentry = dep;
2077 			while (de32p && de32p->de32_next) {
2078 
2079 				de32p2 = nextentry(de32p);
2080 
2081 				dep2 = (mddb_de_ic_t *)kmem_zalloc(
2082 				    sizeof (mddb_de_ic_t) -
2083 				    sizeof (mddb_block_t) +
2084 				    sizeof (mddb_block_t) *
2085 				    de32p2->de32_blkcount, KM_SLEEP);
2086 
2087 				de32tode(de32p2, dep2);
2088 
2089 				dep->de_next = dep2;
2090 				dep = dep2;
2091 				de32p = de32p2;
2092 			}
2093 		}
2094 		/*
2095 		 * go through and make all of the pointer to record blocks
2096 		 * are null;
2097 		 */
2098 		for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
2099 			dep->de_rb = NULL;
2100 	}
2101 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
2102 	dbp->db_next = NULL;
2103 	/*
2104 	 *	if error occurred in directory blocks free them
2105 	 *	and return
2106 	 */
2107 	if (err) {
2108 		dbp = dbhp;
2109 		while (dbp) {
2110 			dep = dbp->db_firstentry;
2111 			while (dep) {
2112 				/* No mddb_rb32_t structures yet */
2113 				dep2 = dep->de_next;
2114 				kmem_free((caddr_t)dep, sizeofde(dep));
2115 				dep = dep2;
2116 			}
2117 			dbp1 = dbp->db_next;
2118 			kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
2119 			dbp = dbp1;
2120 		}
2121 		s->s_dbp = NULL;
2122 		return (err);
2123 
2124 	}
2125 	/*
2126 	 */
2127 	err = 0;
2128 	checksum = MDDB_GLOBAL_XOR;
2129 	for (dbp = dbhp; dbp != NULL; dbp = dbp->db_next) {
2130 		checksum ^= dbp->db_recsum;
2131 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2132 			if (dep->de_flags & MDDB_F_OPT)
2133 				continue;
2134 			err = getrecord(s, dep, li);
2135 			if (err)
2136 				break;
2137 			/* Don't include CHANGELOG in big XOR */
2138 			if (dep->de_flags & MDDB_F_CHANGELOG)
2139 				continue;
2140 			checksum ^= dep->de_rb->rb_checksum;
2141 			checksum ^= dep->de_rb->rb_checksum_fiddle;
2142 		}
2143 		if (err)
2144 			break;
2145 	}
2146 	if (checksum) {
2147 		if (! err)
2148 			err = MDDB_F_EDATA | MDDB_F_EFMT;
2149 	}
2150 	if (err) {
2151 		dbp = dbhp;
2152 		dbhp = NULL;
2153 		while (dbp) {
2154 			dep = dbp->db_firstentry;
2155 			while (dep) {
2156 				if (dep->de_rb)
2157 					kmem_free((caddr_t)dep->de_rb,
2158 					    dep->de_recsize);
2159 				dep2 = dep->de_next;
2160 				kmem_free((caddr_t)dep, sizeofde(dep));
2161 				dep = dep2;
2162 			}
2163 			dbp1 = dbp->db_next;
2164 			kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
2165 			dbp = dbp1;
2166 		}
2167 	}
2168 	s->s_dbp = dbhp;
2169 	return (err);
2170 }
2171 
2172 static int
2173 getoptcnt(
2174 	mddb_set_t	*s,
2175 	int		li)
2176 {
2177 	int		result;
2178 	mddb_de_ic_t	*dep;
2179 	mddb_db_t	*dbp;
2180 
2181 #if defined(_ILP32) && !defined(lint)
2182 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
2183 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2184 #endif
2185 
2186 	result = 0;
2187 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2188 		dep = dbp->db_firstentry;
2189 		for (; dep != NULL; dep = dep->de_next) {
2190 			if (! (dep->de_flags & MDDB_F_OPT))
2191 				continue;
2192 			if (((dep->de_optinfo[0].o_flags & MDDB_F_ACTIVE) &&
2193 			    (li == dep->de_optinfo[0].o_li)) ||
2194 			    ((dep->de_optinfo[1].o_flags & MDDB_F_ACTIVE) &&
2195 			    (li == dep->de_optinfo[1].o_li)))
2196 			result++;
2197 		}
2198 	}
2199 	return (result);
2200 }
2201 
2202 static void
2203 getoptdev(
2204 	mddb_set_t	*s,
2205 	mddb_de_ic_t	*rdep,
2206 	int		opti
2207 )
2208 {
2209 	mddb_lb_t	*lbp;
2210 	mddb_locator_t	*lp;
2211 	mddb_optinfo_t	*otherop;
2212 	mddb_optinfo_t	*resultop;
2213 	int		li;
2214 	dev_t		otherdev;
2215 	int		blkonly = 0;
2216 	int		mincnt;
2217 	int		thiscnt;
2218 
2219 	lbp = s->s_lbp;
2220 
2221 	resultop = &rdep->de_optinfo[opti];
2222 	otherop = &rdep->de_optinfo[1-opti];
2223 
2224 	resultop->o_flags = 0;
2225 
2226 	/*
2227 	 * scan through and see if data bases have to vary by only device
2228 	 */
2229 
2230 	if (otherop->o_flags & MDDB_F_ACTIVE) {
2231 		blkonly = 1;
2232 		otherdev = expldev(lbp->lb_locators[otherop->o_li].l_dev);
2233 		for (li = 0; li < lbp->lb_loccnt; li++) {
2234 			lp = &lbp->lb_locators[li];
2235 			if (! (lp->l_flags & MDDB_F_ACTIVE))
2236 				continue;
2237 			if (expldev(lp->l_dev) != otherdev) {
2238 				blkonly = 0;
2239 				break;
2240 			}
2241 		}
2242 	}
2243 
2244 	mincnt = 999999;
2245 	for (li = 0; li < lbp->lb_loccnt; li++) {
2246 		dev_info_t	*devi;
2247 		int		removable = 0;
2248 
2249 		lp = &lbp->lb_locators[li];
2250 		if (! (lp->l_flags & MDDB_F_ACTIVE))
2251 			continue;
2252 		if (otherop->o_flags & MDDB_F_ACTIVE) {
2253 			if (blkonly) {
2254 				if (otherop->o_li == li)
2255 					continue;
2256 			} else {
2257 				if (otherdev == expldev(lp->l_dev))
2258 					continue;
2259 			}
2260 		}
2261 
2262 		/*
2263 		 * Check if this is a removable device.  If it is we
2264 		 * assume it is something like a USB flash disk, a zip disk
2265 		 * or even a floppy that is being used to help maintain
2266 		 * mddb quorum.  We don't want to put any optimized resync
2267 		 * records on these kinds of disks since they are usually
2268 		 * slower or don't have the same read/write lifetimes as
2269 		 * a regular fixed disk.
2270 		 */
2271 		if ((devi = e_ddi_hold_devi_by_dev(lp->l_dev, 0)) != NULL) {
2272 			int		error;
2273 			struct cb_ops	*cb;
2274 			ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
2275 			int		propvalue = 0;
2276 			int		proplength = sizeof (int);
2277 
2278 			if ((cb = devopsp[getmajor(lp->l_dev)]->devo_cb_ops)
2279 			    != NULL) {
2280 				error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi,
2281 				    prop_op, DDI_PROP_NOTPROM |
2282 				    DDI_PROP_DONTPASS, "removable-media",
2283 				    (caddr_t)&propvalue, &proplength);
2284 
2285 				if (error == DDI_PROP_SUCCESS)
2286 					removable = 1;
2287 			}
2288 
2289 			ddi_release_devi(devi);
2290 		}
2291 
2292 		if (removable)
2293 			continue;
2294 
2295 		thiscnt = getoptcnt(s, li);
2296 		if (thiscnt < mincnt) {
2297 			resultop->o_li  = li;
2298 			mincnt = thiscnt;
2299 			resultop->o_flags = MDDB_F_ACTIVE;
2300 		}
2301 	}
2302 }
2303 
2304 static void
2305 allocuserdata(
2306 	mddb_de_ic_t	*dep
2307 )
2308 {
2309 	mddb_rb32_t	*rbp;
2310 
2311 #if defined(_ILP32) && !defined(lint)
2312 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2313 #endif
2314 
2315 	rbp = dep->de_rb;
2316 	rbp->rb_private = 0;
2317 	dep->de_rb_userdata = kmem_zalloc(dep->de_reqsize, KM_SLEEP);
2318 	rbp->rb_userdata = 0x4;	/* Make sure this is non-zero */
2319 	bcopy((caddr_t)rbp->rb_data, dep->de_rb_userdata, dep->de_reqsize);
2320 }
2321 
2322 
2323 static void
2324 getuserdata(
2325 	set_t		setno,
2326 	mddb_de_ic_t	*dep
2327 )
2328 {
2329 	mddb_rb32_t	 *rbp;
2330 
2331 
2332 	mddb_type_t	type = dep->de_type1;
2333 	caddr_t		data, udata;
2334 
2335 #if defined(_ILP32) && !defined(lint)
2336 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2337 #endif
2338 	rbp = dep->de_rb;
2339 	data = (caddr_t)rbp->rb_data;
2340 	udata = (caddr_t)dep->de_rb_userdata;
2341 
2342 	/*
2343 	 * If it's a driver record, and an old style record, and not a DRL
2344 	 * record, we must convert it because it was incore as a 64 bit
2345 	 * structure but its on disk layout has only 32 bit for block sizes
2346 	 */
2347 	if (!(md_get_setstatus(setno) &
2348 	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
2349 	    (type >= MDDB_FIRST_MODID) &&
2350 	    ((rbp->rb_revision == MDDB_REV_RB) ||
2351 	    (rbp->rb_revision == MDDB_REV_RBFN))) {
2352 
2353 		switch (dep->de_flags) {
2354 
2355 			case MDDB_F_STRIPE:
2356 				stripe_convert(data, udata, BIG_2_SMALL);
2357 				break;
2358 
2359 			case MDDB_F_MIRROR:
2360 				mirror_convert(data, udata, BIG_2_SMALL);
2361 				break;
2362 
2363 			case MDDB_F_RAID:
2364 				raid_convert(data, udata, BIG_2_SMALL);
2365 				break;
2366 
2367 			case MDDB_F_SOFTPART:
2368 				softpart_convert(data, udata, BIG_2_SMALL);
2369 				break;
2370 
2371 			case MDDB_F_TRANS_MASTER:
2372 				trans_master_convert(data, udata, BIG_2_SMALL);
2373 				break;
2374 
2375 			case MDDB_F_TRANS_LOG:
2376 				trans_log_convert(data, udata, BIG_2_SMALL);
2377 				break;
2378 
2379 			case MDDB_F_HOTSPARE:
2380 				hs_convert(data, udata, BIG_2_SMALL);
2381 				break;
2382 
2383 			case MDDB_F_OPT:
2384 			default:
2385 				bcopy(udata, data, dep->de_reqsize);
2386 		}
2387 	} else {
2388 		bcopy(udata, data, dep->de_reqsize);
2389 	}
2390 }
2391 
2392 static void
2393 getoptrecord(
2394 	mddb_set_t	*s,
2395 	mddb_de_ic_t	*dep
2396 )
2397 {
2398 	mddb_lb_t	*lbp;
2399 	mddb_locator_t	*lp;
2400 	mddb_rb32_t	*rbp, *crbp;
2401 	int		li;
2402 	int		i;
2403 	int		err = 0;
2404 	size_t		recsize;
2405 
2406 #if defined(_ILP32) && !defined(lint)
2407 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2408 #endif
2409 
2410 	lbp = s->s_lbp;
2411 
2412 	recsize = dep->de_recsize;
2413 	dep->de_rb = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
2414 	rbp = dep->de_rb;
2415 	crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
2416 
2417 	dep->de_optinfo[0].o_flags |= MDDB_F_EDATA;
2418 	dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
2419 
2420 	for (i = 0; i < 2; i++) {
2421 		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
2422 			continue;
2423 		li = dep->de_optinfo[i].o_li;
2424 		lp = &lbp->lb_locators[li];
2425 
2426 		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
2427 		    (lp->l_flags & MDDB_F_EMASTER))
2428 			continue;
2429 
2430 		err = readblklst(s, (caddr_t)rbp, dep->de_blks,
2431 		    dep->de_blkcount, li, 0);
2432 
2433 		if (err)
2434 			continue;
2435 
2436 		if (rbp->rb_magic != MDDB_MAGIC_RB)
2437 			continue;
2438 
2439 		if (revchk(MDDB_REV_RB, rbp->rb_revision))
2440 			continue;
2441 
2442 		/* Check the crc for this record */
2443 		if (rec_crcchk(s, dep, rbp)) {
2444 			continue;
2445 		}
2446 
2447 		dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE;
2448 
2449 		if (rbp == crbp) {
2450 			if (rbp->rb_checksum != crbp->rb_checksum)
2451 				dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
2452 			break;
2453 		}
2454 		rbp = crbp;
2455 	}
2456 
2457 	if (rbp == crbp) {
2458 		rbp->rb_private = 0;
2459 		kmem_free((caddr_t)crbp, recsize);
2460 		return;
2461 	}
2462 	bzero((caddr_t)rbp, recsize);
2463 	rbp->rb_magic = MDDB_MAGIC_RB;
2464 	rbp->rb_revision = MDDB_REV_RB;
2465 	uniqtime32(&rbp->rb_timestamp);
2466 	/* Generate the crc for this record */
2467 	rec_crcgen(s, dep, rbp);
2468 	kmem_free((caddr_t)crbp, recsize);
2469 }
2470 
2471 /*
2472  * writeoptrecord writes out an optimized record.
2473  */
2474 static int
2475 writeoptrecord(
2476 	mddb_set_t	*s,
2477 	mddb_de_ic_t	*dep
2478 )
2479 {
2480 	mddb_rb32_t	*rbp;
2481 	int		li;
2482 	int		err = 0, wrt_err = 0;
2483 	mddb_bf_t	*bufhead, *bfp;
2484 	mddb_lb_t	*lbp = s->s_lbp;
2485 	mddb_locator_t	*lp;
2486 	int		i;
2487 
2488 #if defined(_ILP32) && !defined(lint)
2489 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2490 #endif
2491 
2492 	bufhead = NULL;
2493 	err = 0;
2494 
2495 	while (s->s_opthavequeuinglck) {
2496 		s->s_optwantqueuinglck++;
2497 		cv_wait(&s->s_optqueuing_cv, SETMUTEX(s->s_setno));
2498 	}
2499 	s->s_opthavequeuinglck++;
2500 	rbp = dep->de_rb;
2501 	for (i = 0; i < 2; i++) {
2502 		/*
2503 		 * only possible error is xlate. This can
2504 		 * occur if a replica was off line and came
2505 		 * back. During the mean time the database grew
2506 		 * large than the now on line replica can store
2507 		 */
2508 		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
2509 			continue;
2510 		li = dep->de_optinfo[i].o_li;
2511 		/*
2512 		 * In a MN diskset, any node can write optimized record(s).
2513 		 */
2514 		wrt_err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
2515 		    dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE);
2516 		/*
2517 		 * For MN diskset, set error in optinfo structure so
2518 		 * that mddb_commitrec knows which replica failed.
2519 		 */
2520 		if ((MD_MNSET_SETNO(s->s_setno)) &&
2521 		    (wrt_err & MDDB_F_EWRITE)) {
2522 			dep->de_optinfo[i].o_flags |= MDDB_F_EWRITE;
2523 		}
2524 		err |= wrt_err;
2525 	}
2526 	s->s_opthavequeuinglck = 0;
2527 	if (s->s_optwantqueuinglck) {
2528 		s->s_optwantqueuinglck = 0;
2529 		cv_broadcast(&s->s_optqueuing_cv);
2530 	}
2531 	for (bfp = bufhead; bfp; bfp = bufhead) {
2532 		mutex_exit(SETMUTEX(s->s_setno));
2533 		(void) biowait(&bfp->bf_buf);
2534 		mutex_enter(SETMUTEX(s->s_setno));
2535 		if (bfp->bf_buf.b_flags & B_ERROR) {
2536 			/*
2537 			 * If an MN diskset, don't set replica
2538 			 * in error since this hasn't been set in master.
2539 			 * Setting replica in error before master could
2540 			 * leave the nodes with different views of the
2541 			 * world since a class 1 configuration change
2542 			 * could occur in mddb_commitrec as soon as
2543 			 * all locks are dropped.  Must keep this
2544 			 * node the same as master and can't afford a
2545 			 * failure from the class 1 config change
2546 			 * if master succeeded.
2547 			 */
2548 			if (!(MD_MNSET_SETNO(s->s_setno))) {
2549 				bfp->bf_locator->l_flags |= MDDB_F_EWRITE;
2550 			} else {
2551 				/*
2552 				 * Find which de_optinfo (which replica)
2553 				 * had a failure and set the failure in
2554 				 * the o_flags field.
2555 				 */
2556 				lp = &lbp->lb_locators[dep->de_optinfo[0].o_li];
2557 				if (lp == bfp->bf_locator) {
2558 					dep->de_optinfo[0].o_flags |=
2559 					    MDDB_F_EWRITE;
2560 				} else {
2561 					dep->de_optinfo[1].o_flags |=
2562 					    MDDB_F_EWRITE;
2563 				}
2564 			}
2565 			err |= MDDB_F_EWRITE;
2566 		}
2567 		bufhead = bfp->bf_next;
2568 		freebuffer(s, bfp);
2569 	}
2570 	return (err);
2571 }
2572 
2573 /*
2574  * Fix up the optimized resync record.  Used in the traditional and local
2575  * disksets to move an optimized record from a failed or deleted mddb
2576  * to an active one.
2577  *
2578  * In a MN diskset, the fixing of the optimized record is split between
2579  * the master and slave nodes.  If the master node moves the optimized
2580  * resync record, then the master node will send a MDDB_PARSE_OPTRECS
2581  * message to the slave nodes causing the slave nodes to reget the
2582  * directory entry containing the location of the optimized resync record.
2583  * After the record is reread from disk, then writeoptrecord is called
2584  * if the location of the optimized resync record or flags have changed.
2585  * When writeoptrecord is called, the node that is the owner of this record
2586  * will write the optimized record to the location specified in the directory
2587  * entry.  Since the master node uses the highest class message (PARSE)
2588  * the record owner node is guaranteed to already have an updated
2589  * directory entry incore.
2590  *
2591  * The other difference between the traditional/local set and MN diskset
2592  * is that the directory entry can be written to disk before the optimized
2593  * record in a MN diskset if the record is owned by a slave node.  So,
2594  * the users of an optimized record must handle the failure case when no
2595  * data is available from an optimized record since the master node could
2596  * have failed during the relocation of the optimized record to another mddb.
2597  */
2598 static int
2599 fixoptrecord(
2600 	mddb_set_t	*s,
2601 	mddb_de_ic_t	*dep,
2602 	mddb_db_t	*dbp
2603 )
2604 {
2605 	int		changed;
2606 	int		writedata;
2607 	int		err = 0;
2608 	int		i;
2609 	mddb_lb_t	*lbp;
2610 	mddb_optinfo_t	*op;
2611 	mddb_db32_t	*db32p;
2612 	int		rec_owner;	/* Is node owner of record? */
2613 
2614 #if defined(_ILP32) && !defined(lint)
2615 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2616 #endif
2617 
2618 	lbp = s->s_lbp;
2619 	changed = 0;
2620 	writedata = 0;
2621 	for (i = 0; i < 2; i++) {
2622 		op = &dep->de_optinfo[i];
2623 
2624 		if (! (lbp->lb_locators[op->o_li].l_flags & MDDB_F_ACTIVE))
2625 			op->o_flags = 0;
2626 
2627 		/*
2628 		 * If optimized record has seen a replica failure,
2629 		 * assign new replica to record and re-write data
2630 		 * to new record.
2631 		 */
2632 		if (! (op->o_flags & MDDB_F_ACTIVE)) {
2633 			getoptdev(s, dep, i);
2634 			writedata++;
2635 			changed++;
2636 			/* Set flag for slaves to reread dep and write rec */
2637 			if (lbp->lb_flags & MDDB_MNSET) {
2638 				s->s_mn_parseflags |= MDDB_PARSE_OPTRECS;
2639 			}
2640 		}
2641 
2642 		/*
2643 		 * If just an error in the data was seen, set
2644 		 * the optimized record's replica flag to active (ok)
2645 		 * and try again.
2646 		 */
2647 		if (op->o_flags & MDDB_F_EDATA) {
2648 			dep->de_optinfo[0].o_flags = MDDB_F_ACTIVE;
2649 			writedata++;
2650 		}
2651 	}
2652 
2653 	rec_owner = 0;
2654 	if (lbp->lb_flags & MDDB_MNSET) {
2655 		/*
2656 		 * If a MN diskset then check the owner of optimized record.
2657 		 * If the master node owns the record or if there is
2658 		 * no owner of the record, then the master can write the
2659 		 * optimized record to disk.
2660 		 * Master node can write the optimized record now, but
2661 		 * slave nodes write their records during handling of
2662 		 * the MDDB_PARSE_OPTRECS message.
2663 		 */
2664 		if ((dep->de_owner_nodeid == MD_MN_INVALID_NID) ||
2665 		    (dep->de_owner_nodeid == md_set[s->s_setno].s_nodeid)) {
2666 			rec_owner = 1;
2667 		}
2668 	} else {
2669 		/*
2670 		 * In traditional diskset and local set, this node
2671 		 * is always the record owner and always the master.
2672 		 */
2673 		rec_owner = 1;
2674 	}
2675 
2676 	/*
2677 	 * If this node is the record owner, write out record.
2678 	 */
2679 	if ((writedata) && (rec_owner)) {
2680 		if (err = writeoptrecord(s, dep)) {
2681 			return (err);
2682 		}
2683 	}
2684 	if (! changed)
2685 		return (0);
2686 	uniqtime32(&dbp->db_timestamp);
2687 	dbp->db_revision = MDDB_REV_DB;
2688 	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
2689 	create_db32rec(db32p, dbp);
2690 	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
2691 	err = writeall(s, (caddr_t)db32p, db32p->db32_blknum,
2692 	    1, MDDB_WR_ONLY_MASTER);
2693 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
2694 	return (err);
2695 }
2696 
2697 static int
2698 fixoptrecords(
2699 	mddb_set_t		*s
2700 )
2701 {
2702 	mddb_de_ic_t	*dep;
2703 	mddb_db_t	*dbp;
2704 	int		err = 0;
2705 	set_t		setno;
2706 
2707 	/*
2708 	 * In a MN diskset, the master node is the only node that runs
2709 	 * fixoptrecords.  If the master node changes anything, then the
2710 	 * master node sends PARSE message to the slave nodes.  The slave
2711 	 * nodes will then re-read in the locator block or re-read in the
2712 	 * directory blocks and re-write the optimized resync records.
2713 	 */
2714 	setno = s->s_setno;
2715 	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
2716 	    (md_set[setno].s_am_i_master == 0)) {
2717 		return (0);
2718 	}
2719 
2720 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2721 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2722 			if (! (dep->de_flags & MDDB_F_OPT))
2723 				continue;
2724 			err = fixoptrecord(s, dep, dbp);
2725 			if (err != 0)
2726 				return (err);
2727 		}
2728 	}
2729 	return (0);
2730 }
2731 
2732 /*
2733  * Checks incore version of mddb data to mddb data ondisk.
2734  *
2735  * Returns:
2736  *	- 0 if the data was successfully read and is good.
2737  *	- MDDB_F_EREAD if a read error occurred.
2738  *	- 1 if the data read is bad (checksum failed, etc)
2739  */
2740 static int
2741 checkcopy
2742 (
2743 	mddb_set_t	*s,
2744 	int		li
2745 )
2746 {
2747 	mddb_db_t	*dbp;
2748 	mddb_db32_t	*cdb32p;
2749 	mddb_de_ic_t	*dep;
2750 	mddb_de32_t	*cde32p;
2751 	mddb_rb32_t	*rbp, *crbp;
2752 	size_t		size;
2753 	int		i;
2754 	int		retval = 1;
2755 
2756 #if defined(_ILP32) && !defined(lint)
2757 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
2758 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2759 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2760 #endif
2761 
2762 	if (s->s_databuffer_size == 0) {
2763 		size_t maxrecsize = MDDB_BSIZE;
2764 
2765 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
2766 			for (dep = dbp->db_firstentry; dep; dep = dep->de_next)
2767 				if (! (dep->de_flags & MDDB_F_OPT) &&
2768 				    dep->de_recsize > maxrecsize)
2769 					maxrecsize = dep->de_recsize;
2770 
2771 		s->s_databuffer = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
2772 		s->s_databuffer_size = maxrecsize;
2773 	}
2774 
2775 	cdb32p = (mddb_db32_t *)s->s_databuffer;
2776 
2777 	/*
2778 	 * first go through and make sure all directory stuff
2779 	 * is the same
2780 	 */
2781 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2782 		if (readblks(s, (caddr_t)cdb32p, dbp->db_blknum, 1, li)) {
2783 			retval = MDDB_F_EREAD;
2784 			goto err;
2785 		}
2786 		if (cdb32p->db32_magic != MDDB_MAGIC_DB)
2787 			goto err;
2788 		if (revchk(MDDB_REV_DB, cdb32p->db32_revision))
2789 			goto err;
2790 		if (crcchk(cdb32p, &cdb32p->db32_checksum, MDDB_BSIZE, NULL))
2791 			goto err;
2792 		if (cdb32p->db32_nextblk != dbp->db_nextblk)
2793 			goto err;
2794 		if (cdb32p->db32_recsum != dbp->db_recsum)
2795 			goto err;
2796 		if (cdb32p->db32_firstentry) {
2797 			cde32p = (mddb_de32_t *)
2798 			    ((void *)((caddr_t)(&cdb32p->db32_firstentry)
2799 			    + sizeof (cdb32p->db32_firstentry)));
2800 		} else
2801 			cde32p = NULL;
2802 
2803 		dep = dbp->db_firstentry;
2804 		/*
2805 		 * check if all directory entries are identical
2806 		 */
2807 		while (dep && cde32p) {
2808 			if (dep->de_recid != cde32p->de32_recid)
2809 				goto err;
2810 			if (dep->de_type1 != cde32p->de32_type1)
2811 				goto err;
2812 			if (dep->de_type2 != cde32p->de32_type2)
2813 				goto err;
2814 			if (dep->de_reqsize != cde32p->de32_reqsize)
2815 				goto err;
2816 			if (dep->de_flags != cde32p->de32_flags)
2817 				goto err;
2818 
2819 			for (i = 0; i < 2; i++) {
2820 				if (dep->de_optinfo[i].o_li !=
2821 				    cde32p->de32_optinfo[i].o_li)
2822 					break;
2823 			}
2824 			if (i != 2)
2825 				goto err;
2826 			size = sizeof (mddb_block_t) * dep->de_blkcount;
2827 			if (bcmp((caddr_t)dep->de_blks,
2828 			    (caddr_t)cde32p->de32_blks, size))
2829 				goto err;
2830 			dep = dep->de_next;
2831 			if (cde32p->de32_next)
2832 				cde32p = nextentry(cde32p);
2833 			else
2834 				cde32p = NULL;
2835 		}
2836 		if (dep || cde32p)
2837 			goto err;
2838 	}
2839 	/*
2840 	 * If here, all directories are functionally identical
2841 	 * check to make sure all records are identical
2842 	 * the reason the records are not just bcmped is that the
2843 	 * lock flag does not want to be compared.
2844 	 */
2845 	crbp = (mddb_rb32_t *)cdb32p;
2846 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2847 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2848 			if ((dep->de_flags & MDDB_F_OPT) ||
2849 			    (dep->de_flags & MDDB_F_CHANGELOG))
2850 				continue;
2851 			rbp = (mddb_rb32_t *)dep->de_rb;
2852 			if (readblklst(s, (caddr_t)crbp, dep->de_blks,
2853 			    dep->de_blkcount, li, 0)) {
2854 				retval = MDDB_F_EREAD;
2855 				goto err;
2856 			}
2857 			/* Check the crc for this record */
2858 			if (rec_crcchk(s, dep, crbp))
2859 				goto err;
2860 
2861 			if (rbp->rb_checksum != crbp->rb_checksum ||
2862 			    rbp->rb_checksum_fiddle != crbp->rb_checksum_fiddle)
2863 				goto err;
2864 		}
2865 	}
2866 	return (0);
2867 err:
2868 	return (retval);
2869 }
2870 
2871 /*
2872  * Determine if the location information for two mddbs is the same.
2873  * The device slice and block offset should match.  If both have devids then
2874  * use that for the comparison, otherwise we compare the dev_ts.
2875  * Comparing with the devid allows us to handle the case where a mddb was
2876  * relocated to a dead mddbs dev_t.  The live mddb will have the dev_t of
2877  * the dead mddb but the devid comparison will catch this and not match.
2878  *
2879  * Return 1 if the location of the two mddbs match, 0 if not.
2880  */
2881 static int
2882 match_mddb(mddb_ri_t *rip, ddi_devid_t devid, char *minor, md_dev64_t dev,
2883 	daddr32_t blkno)
2884 {
2885 	if (rip->ri_flags & MDDB_F_EMASTER) {
2886 		/*
2887 		 * If this element is errored then we don't try to match on it.
2888 		 * If we try to match we could erroneously match on the dev_t
2889 		 * of a relocated disk.
2890 		 */
2891 		return (0);
2892 	}
2893 
2894 	if (rip->ri_devid && devid && minor) {
2895 		/*
2896 		 * If old devid exists, then this is a replicated diskset
2897 		 * and both old and new devids must be checked.
2898 		 */
2899 		if (rip->ri_old_devid) {
2900 			if (((ddi_devid_compare(rip->ri_devid, devid) != 0) &&
2901 			    (ddi_devid_compare(rip->ri_old_devid,
2902 			    devid) != 0)) ||
2903 			    (strcmp(rip->ri_minor_name, minor) != 0))
2904 				return (0);
2905 		} else {
2906 			if (ddi_devid_compare(rip->ri_devid, devid) != 0 ||
2907 			    strcmp(rip->ri_minor_name, minor) != 0)
2908 				return (0);
2909 		}
2910 	} else {
2911 		if (rip->ri_dev != dev)
2912 			return (0);
2913 	}
2914 
2915 	if (rip->ri_blkno != blkno)
2916 		return (0);
2917 
2918 	return (1);
2919 }
2920 
2921 static int
2922 ridev(
2923 	mddb_ri_t	**rip,
2924 	mddb_cfg_loc_t	*clp,
2925 	dev32_t		*dev_2b_fixed,
2926 	int		flag)
2927 {
2928 	mddb_ri_t	*r, *r1;
2929 	md_dev64_t	ldev, ndev;
2930 	major_t		majordev;
2931 	int		sz;
2932 
2933 	if (MD_UPGRADE) {
2934 		ldev = md_makedevice(md_targ_name_to_major(clp->l_driver),
2935 		    clp->l_mnum);
2936 	} else {
2937 		if (ddi_name_to_major(clp->l_driver) == (major_t)-1)
2938 			return (EINVAL);
2939 
2940 		ldev = md_makedevice(ddi_name_to_major(clp->l_driver),
2941 		    clp->l_mnum);
2942 	}
2943 
2944 	if (clp->l_devid != 0) {
2945 		/*
2946 		 * Get dev associated with device id and minor name.
2947 		 * Setup correct driver name if dev is now different.
2948 		 * Don't change driver name if during upgrade.
2949 		 */
2950 		ndev = ldev;
2951 		if (!mddb_devid_validate((ddi_devid_t)(uintptr_t)clp->l_devid,
2952 		    &ndev, clp->l_minor_name)) {
2953 			if ((ndev != ldev) && (!(MD_UPGRADE))) {
2954 				majordev = md_getmajor(ndev);
2955 				(void) strcpy(clp->l_driver,
2956 				    ddi_major_to_name(majordev));
2957 				clp->l_mnum = md_getminor(ndev);
2958 				clp->l_devid_flags |= MDDB_DEVID_VALID;
2959 				ldev = ndev;
2960 			}
2961 		} else {
2962 			/* Mark as invalid */
2963 			clp->l_devid_flags &= ~MDDB_DEVID_VALID;
2964 		}
2965 	}
2966 
2967 	clp->l_dev = md_cmpldev(ldev);
2968 	if (dev_2b_fixed)
2969 		*dev_2b_fixed = clp->l_dev;
2970 	r = *rip;
2971 
2972 	while (r) {
2973 		if (match_mddb(r, (ddi_devid_t)(uintptr_t)clp->l_devid,
2974 		    clp->l_minor_name, ldev, clp->l_blkno)) {
2975 			if ((clp->l_devid != 0) &&
2976 			    !(clp->l_devid_flags & MDDB_DEVID_VALID)) {
2977 				r->ri_flags |= MDDB_F_EMASTER;
2978 			} else {
2979 				r->ri_flags |= flag;
2980 			}
2981 			return (0);	/* already entered return success */
2982 		}
2983 		r = r->ri_next;
2984 	}
2985 
2986 	/*
2987 	 * This replica not represented in the current rip list,
2988 	 * so add it to the list.
2989 	 */
2990 	r = (mddb_ri_t *)kmem_zalloc(sizeof (**rip), KM_SLEEP);
2991 	r->ri_dev = ldev;
2992 	r->ri_blkno = clp->l_blkno;
2993 	(void) strncpy(r->ri_driver, clp->l_driver, MD_MAXDRVNM);
2994 	if (strlen(clp->l_driver) >= MD_MAXDRVNM) {
2995 		r->ri_driver[(MD_MAXDRVNM -1)] = '\0';
2996 	}
2997 	if (clp->l_devname != NULL) {
2998 		(void) strcpy(r->ri_devname, clp->l_devname);
2999 	}
3000 	r->ri_flags |= flag;
3001 	if (clp->l_devid != 0) {
3002 		sz = clp->l_devid_sz;
3003 		r->ri_devid = (ddi_devid_t)kmem_zalloc(sz, KM_SLEEP);
3004 		bcopy((void *)(uintptr_t)clp->l_devid, (char *)r->ri_devid, sz);
3005 
3006 		if (clp->l_old_devid != NULL) {
3007 			sz = clp->l_old_devid_sz;
3008 			r->ri_old_devid = (ddi_devid_t)kmem_zalloc(sz,
3009 			    KM_SLEEP);
3010 			bcopy((char *)(uintptr_t)clp->l_old_devid,
3011 			    (char *)r->ri_old_devid, sz);
3012 		} else {
3013 			r->ri_old_devid = 0;
3014 		}
3015 		if (strlen(clp->l_minor_name) < MDDB_MINOR_NAME_MAX)
3016 			(void) strcpy(r->ri_minor_name, clp->l_minor_name);
3017 
3018 		if (!(clp->l_devid_flags & MDDB_DEVID_VALID)) {
3019 			/*
3020 			 * Devid is present, but not valid.  This could
3021 			 * happen if device has been powered off or if
3022 			 * the device has been removed.  Mark the device in
3023 			 * error.  Don't allow any writes to this device
3024 			 * based on the dev_t since another device could
3025 			 * have been placed in its spot and be responding to
3026 			 * the dev_t accesses.
3027 			 */
3028 			r->ri_flags |= MDDB_F_EMASTER;
3029 		}
3030 	} else {
3031 		r->ri_devid = 0;
3032 		r->ri_old_devid = 0;
3033 	}
3034 
3035 	/*
3036 	 * If the rip list is empty then this entry
3037 	 * is the list.
3038 	 */
3039 	if (*rip == NULL) {
3040 		*rip = r;
3041 		return (0);
3042 	}
3043 
3044 	/*
3045 	 * Add this entry to the end of the rip list
3046 	 */
3047 	r1 = *rip;
3048 	while (r1->ri_next)
3049 		r1 = r1->ri_next;
3050 	r1->ri_next = r;
3051 	return (0);
3052 }
3053 
3054 /*
3055  * writecopy writes the incore data blocks out to all of the replicas.
3056  * This is called from writestart
3057  *	- when a diskset is started or
3058  *	- when an error has been enountered during the write to a mddb.
3059  * and from newdev when a new mddb is being added.
3060  *
3061  * flag can be 2 values:
3062  *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
3063  *		always used for traditional and local disksets.
3064  *		For MN diskset:
3065  *			All nodes can call writecopy, but only the
3066  *			master node actually writes data to the disk
3067  *			except for optimized resync records.
3068  *			An optimized resync record can only be written to
3069  *			by the record owner.
3070  *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
3071  *		master has been chosen, the new master may need to
3072  * 		write its incore mddb to disk (this is the case where the
3073  *		old master had executed a message but hadn't relayed it
3074  *		to this slave yet).  New master should not write the
3075  *		change log records since new master would be overwriting
3076  *		valuable data.  Only used during a reconfig cycle.
3077  */
3078 static int
3079 writecopy(
3080 	mddb_set_t	*s,
3081 	int		li,
3082 	int		flag
3083 )
3084 {
3085 	mddb_db_t	*dbp;
3086 	mddb_db32_t	*db32p;
3087 	mddb_de_ic_t	*dep;
3088 	mddb_rb32_t	*rbp;
3089 	uint_t		checksum;
3090 	int		err = 0;
3091 
3092 #if defined(_ILP32) && !defined(lint)
3093 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
3094 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
3095 #endif
3096 
3097 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
3098 		db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
3099 		create_db32rec(db32p, dbp);
3100 		crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
3101 		err = writeblks(s, (caddr_t)db32p, dbp->db_blknum, 1, li,
3102 		    MDDB_WR_ONLY_MASTER);
3103 		kmem_free((caddr_t)db32p, MDDB_BSIZE);
3104 		if (err)
3105 			return (err);
3106 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
3107 			/*
3108 			 * In a multinode diskset, when a new master is
3109 			 * chosen the new master may need to write its
3110 			 * incore copy of the mddb to disk.  In this case,
3111 			 * don't want to overwrite the change log records
3112 			 * so new master sets flag to MDDB_WRITECOPY_SYNC.
3113 			 */
3114 			if (flag == MDDB_WRITECOPY_SYNC) {
3115 				if (dep->de_flags & MDDB_F_CHANGELOG)
3116 					continue;
3117 			}
3118 			/*
3119 			 * In a multinode diskset, don't write out optimized
3120 			 * resync resyncs since only the mirror owner node
3121 			 * will have the correct data.  If writecopy is
3122 			 * being called from writestart as a result of
3123 			 * an mddb failure, then writestart will handle
3124 			 * the optimized records when it calls fixoptrecords.
3125 			 */
3126 			if ((MD_MNSET_SETNO(s->s_setno)) &&
3127 			    (dep->de_flags & MDDB_F_OPT)) {
3128 				continue;
3129 			}
3130 
3131 			rbp = dep->de_rb;
3132 			checksum = rbp->rb_checksum_fiddle;
3133 			checksum ^= rbp->rb_checksum;
3134 			/* Generate the crc for this record */
3135 			rec_crcgen(s, dep, rbp);
3136 			checksum ^= rbp->rb_checksum;
3137 			rbp->rb_checksum_fiddle = checksum;
3138 			if (err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
3139 			    dep->de_blkcount, li, (mddb_bf_t **)0,
3140 			    MDDB_WR_ONLY_MASTER))
3141 				return (err);
3142 		}
3143 	}
3144 	return (0);
3145 }
3146 
3147 static int
3148 upd_med(
3149 	mddb_set_t	*s,
3150 	char		*tag
3151 )
3152 {
3153 	med_data_t	meddb;
3154 	int		medok;
3155 	mddb_lb_t	*lbp = s->s_lbp;
3156 	set_t		setno = s->s_setno;
3157 	int		li;
3158 	int		alc;
3159 	int		lc;
3160 
3161 
3162 	/* If no mediator hosts, nothing to do */
3163 	if (s->s_med.n_cnt == 0)
3164 		return (0);
3165 
3166 	/*
3167 	 * If this is a MN set and we are not the master, then don't
3168 	 * update mediator hosts or mark mediator as golden since
3169 	 * only master node should do that.
3170 	 */
3171 	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
3172 	    (md_set[setno].s_am_i_master == 0)) {
3173 		return (0);
3174 	}
3175 
3176 	bzero((char *)&meddb, sizeof (med_data_t));
3177 	meddb.med_dat_mag = MED_DATA_MAGIC;
3178 	meddb.med_dat_rev = MED_DATA_REV;
3179 	meddb.med_dat_fl = 0;
3180 	meddb.med_dat_sn = setno;
3181 	meddb.med_dat_cc = lbp->lb_commitcnt;
3182 	TIMEVAL32_TO_TIMEVAL(&meddb.med_dat_id, &lbp->lb_ident.createtime);
3183 	crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);
3184 
3185 	/* count accessible mediators */
3186 	medok = upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);
3187 
3188 	/* count accessible and existing replicas */
3189 	for (li = 0, alc = 0, lc = 0; li < lbp->lb_loccnt; li++) {
3190 		mddb_locator_t	*lp = &lbp->lb_locators[li];
3191 
3192 		if (lp->l_flags & MDDB_F_DELETED)
3193 			continue;
3194 
3195 		lc++;
3196 
3197 		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
3198 		    (lp->l_flags & MDDB_F_EMASTER) ||
3199 		    (lp->l_flags & MDDB_F_EWRITE))
3200 			continue;
3201 
3202 		alc++;
3203 	}
3204 
3205 	/*
3206 	 * Mediator update quorum is >= 50%: check for less than
3207 	 * "mediator update" quorum.
3208 	 */
3209 	if ((medok * 2) < s->s_med.n_cnt) {
3210 		/* panic if <= 50% of all replicas are accessible */
3211 		if ((lc > 0) && ((alc * 2) <= lc)) {
3212 			cmn_err(CE_PANIC,
3213 			    "md: Update of 50%% of the mediator hosts failed");
3214 			/* NOTREACHED */
3215 		}
3216 
3217 		cmn_err(CE_WARN,
3218 		    "md: Update of 50%% of the mediator hosts failed");
3219 	}
3220 
3221 	/*
3222 	 * If we have mediator update quorum and exactly 50% of the replicas
3223 	 * are accessible then mark the mediator as golden.
3224 	 */
3225 	if (((medok * 2) >= (s->s_med.n_cnt + 1)) && (lc > 0) &&
3226 	    ((alc * 2) == lc)) {
3227 		meddb.med_dat_fl = MED_DFL_GOLDEN;
3228 		crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);
3229 		(void) upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);
3230 	}
3231 
3232 	return (0);
3233 }
3234 
3235 static int
3236 push_lb(mddb_set_t *s)
3237 {
3238 	mddb_lb_t	*lbp = s->s_lbp;
3239 
3240 	/* push the change to all the replicas */
3241 	uniqtime32(&lbp->lb_timestamp);
3242 	if (MD_MNSET_SETNO(s->s_setno)) {
3243 		lbp->lb_revision = MDDB_REV_MNLB;
3244 	} else {
3245 		lbp->lb_revision = MDDB_REV_LB;
3246 	}
3247 	/*
3248 	 * The updates to the mediator hosts are done
3249 	 * by the callers of this function.
3250 	 */
3251 	return (writelocall(s));
3252 }
3253 
3254 /* Should not call for MN diskset since data tags are not supported */
3255 static int
3256 dtl_cmp(const mddb_dtag_t *odtp, const mddb_dtag_t *ndtp)
3257 {
3258 	int 		diff = 0;
3259 
3260 	diff = (int)(odtp->dt_setno - ndtp->dt_setno);
3261 	if (diff)
3262 		return (diff);
3263 
3264 	diff = strncmp(odtp->dt_sn, ndtp->dt_sn, MDDB_SN_LEN);
3265 	if (diff)
3266 		return (diff);
3267 
3268 	diff = strncmp(odtp->dt_hn, ndtp->dt_hn, MD_MAX_NODENAME_PLUS_1);
3269 	if (diff)
3270 		return (diff);
3271 
3272 	/*CSTYLED*/
3273 	return (timercmp(&odtp->dt_tv, &ndtp->dt_tv, !=));
3274 }
3275 
3276 /* Should not call for MN diskset since data tags are not supported */
3277 static int
3278 dtl_addl(mddb_set_t *s, const mddb_dtag_t *ndtp)
3279 {
3280 	int		nextid = 0;
3281 	mddb_dtag_lst_t **dtlpp = &s->s_dtlp;
3282 
3283 	/* Run to the end of the list */
3284 	for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) {
3285 		if (dtl_cmp(&(*dtlpp)->dtl_dt, ndtp) == 0)
3286 			return (0);
3287 		nextid++;
3288 	}
3289 
3290 	/* Add the new member */
3291 	*dtlpp = kmem_zalloc(sizeof (**dtlpp), KM_SLEEP);
3292 
3293 	/* Update the dtag portion of the list */
3294 	bcopy((caddr_t)ndtp, (caddr_t)&((*dtlpp)->dtl_dt),
3295 	    sizeof (mddb_dtag_t));
3296 
3297 	/* Fix up the id value */
3298 	(*dtlpp)->dtl_dt.dt_id = ++nextid;
3299 
3300 	return (0);
3301 }
3302 
3303 /*
3304  * Even though data tags are not supported in MN disksets, dt_cntl may
3305  * be called for a MN diskset since this routine is called even before
3306  * it is known the kind of diskset being read in from disk.
3307  * For a MNdiskset, s_dtlp is 0 so a count of 0 is returned.
3308  */
3309 static int
3310 dtl_cntl(mddb_set_t *s)
3311 {
3312 	mddb_dtag_lst_t	*dtlp = s->s_dtlp;
3313 	int		ndt = 0;
3314 
3315 	while (dtlp != NULL) {
3316 		ndt++;
3317 		dtlp = dtlp->dtl_nx;
3318 	}
3319 
3320 	return (ndt);
3321 }
3322 
3323 /*
3324  * Even though data tags are not supported in MN disksets, dt_cntl may
3325  * be called for a MN diskset since this routine is called even before
3326  * it is known the kind of diskset being read in from disk.
3327  * For a MNdiskset, s_dtlp is 0 so a 0 is returned.
3328  */
3329 static mddb_dtag_t *
3330 dtl_findl(mddb_set_t *s, int id)
3331 {
3332 	mddb_dtag_lst_t	*dtlp = s->s_dtlp;
3333 
3334 	while (dtlp != NULL) {
3335 		if (dtlp->dtl_dt.dt_id == id)
3336 			return (&dtlp->dtl_dt);
3337 		dtlp = dtlp->dtl_nx;
3338 	}
3339 	return ((mddb_dtag_t *)NULL);
3340 }
3341 
3342 /* Should not call for MN diskset since data tags are not supported */
3343 static void
3344 dtl_freel(mddb_dtag_lst_t **dtlpp)
3345 {
3346 	mddb_dtag_lst_t	*dtlp;
3347 	mddb_dtag_lst_t	*tdtlp;
3348 
3349 
3350 	for (tdtlp = *dtlpp; tdtlp != NULL; tdtlp = dtlp) {
3351 		dtlp = tdtlp->dtl_nx;
3352 		kmem_free(tdtlp, sizeof (mddb_dtag_lst_t));
3353 	}
3354 	*dtlpp = (mddb_dtag_lst_t *)NULL;
3355 }
3356 
3357 /*
3358  * Even though data tags are not supported in MN disksets, dt_setup will
3359  * be called for a MN diskset since this routine is called even before
3360  * it is known the kind of diskset being read in from disk.
3361  * Once this set is known as a MN diskset, the dtp area will be freed.
3362  */
3363 static void
3364 dt_setup(mddb_set_t *s, const mddb_dtag_t *dtagp)
3365 {
3366 	mddb_dt_t	*dtp;
3367 	set_t		setno = s->s_setno;
3368 
3369 
3370 	if (md_set[setno].s_dtp == (mddb_dt_t *)NULL)
3371 		md_set[setno].s_dtp = kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);
3372 	else if (dtagp == (mddb_dtag_t *)NULL)
3373 		bzero((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
3374 
3375 	/* shorthand */
3376 	dtp = (mddb_dt_t *)md_set[setno].s_dtp;
3377 
3378 	dtp->dt_mag = MDDB_MAGIC_DT;
3379 	dtp->dt_rev = MDDB_REV_DT;
3380 
3381 	if (dtagp != NULL)
3382 		dtp->dt_dtag = *dtagp;		/* structure assignment */
3383 
3384 	/* Initialize the setno */
3385 	dtp->dt_dtag.dt_setno = setno;
3386 
3387 	/* Clear the id and flags, this is only used in user land */
3388 	dtp->dt_dtag.dt_id = 0;
3389 
3390 	/* Checksum it */
3391 	crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);
3392 }
3393 
3394 /* Should not call for MN diskset since data tags are not supported */
3395 static int
3396 set_dtag(mddb_set_t *s, md_error_t *ep)
3397 {
3398 	mddb_lb_t	*lbp = s->s_lbp;
3399 	mddb_dtag_t	tag;
3400 
3401 	if (lbp->lb_dtblkcnt == 0) {
3402 		/* Data tags not used in a MN set - so no failure returned */
3403 		if (lbp->lb_flags & MDDB_MNSET)
3404 			return (0);
3405 
3406 		cmn_err(CE_WARN,
3407 		    "No tag record allocated, unable to tag data");
3408 		(void) mdmddberror(ep, MDE_DB_NOTAGREC, NODEV32, s->s_setno);
3409 		return (1);
3410 	}
3411 
3412 	/* Clear the stack variable */
3413 	bzero((caddr_t)&tag, sizeof (mddb_dtag_t));
3414 
3415 	/* Get the HW serial number for this host */
3416 	(void) snprintf(tag.dt_sn, MDDB_SN_LEN, "%u", zone_get_hostid(NULL));
3417 	tag.dt_sn[MDDB_SN_LEN - 1] = '\0';
3418 
3419 	/* Get the nodename that this host goes by */
3420 	(void) strncpy(tag.dt_hn, utsname.nodename, MD_MAX_NODENAME);
3421 	tag.dt_hn[MD_MAX_NODENAME] = '\0';
3422 
3423 	/* Get a time stamp for NOW */
3424 	uniqtime32(&tag.dt_tv);
3425 
3426 	/* Setup the data tag record */
3427 	dt_setup(s, &tag);
3428 
3429 	/* Free any list of tags if they exist */
3430 	dtl_freel(&s->s_dtlp);
3431 
3432 	/* Put the new tag onto the tag list */
3433 	(void) dtl_addl(s, &tag);
3434 
3435 	return (0);
3436 }
3437 
3438 /*
3439  * If called during upgrade, this routine expects a non-translated
3440  * (aka target) dev.
3441  * Should not call for MN diskset since data tags are not supported.
3442  */
3443 static int
3444 dt_read(mddb_set_t *s, mddb_lb_t *lbp, mddb_ri_t *rip)
3445 {
3446 	int		err = 0;
3447 	md_dev64_t	dev;
3448 	caddr_t		tbuf;
3449 	daddr_t		physblk;
3450 	mddb_block_t	blk;
3451 	mddb_dt_t	*dtp;
3452 	mddb_dtag_t	*dtagp;
3453 	set_t		setno = s->s_setno;
3454 
3455 	/* If have not allocated a data tag record, there is nothing to do */
3456 	if (lbp->lb_dtblkcnt == 0)
3457 		return (1);
3458 
3459 	dtp = rip->ri_dtp = (mddb_dt_t *)kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);
3460 
3461 	if (dtp == (mddb_dt_t *)NULL)
3462 		return (1);
3463 
3464 	/* shorthand */
3465 	dev = md_xlate_targ_2_mini(rip->ri_dev);
3466 	if (dev == NODEV64) {
3467 		return (1);
3468 	}
3469 
3470 	tbuf = (caddr_t)rip->ri_dtp;
3471 
3472 	for (blk = 0; blk < lbp->lb_dtblkcnt; blk++) {
3473 		physblk = getphysblk((blk + lbp->lb_dtfirstblk), rip->ri_mbip);
3474 		err = getblks(s, tbuf, dev, physblk, btodb(MDDB_BSIZE), 0);
3475 		/* error reading the tag */
3476 		if (err) {
3477 			err = 1;
3478 			goto out;
3479 		}
3480 		tbuf += MDDB_BSIZE;
3481 	}
3482 
3483 	/* magic is valid? */
3484 	if (dtp->dt_mag != MDDB_MAGIC_DT) {
3485 		err = 1;
3486 		goto out;
3487 	}
3488 
3489 	/* revision is valid? */
3490 	if (revchk(MDDB_REV_DT, dtp->dt_rev)) {
3491 		err = 1;
3492 		goto out;
3493 	}
3494 
3495 	/* crc is valid? */
3496 	if (crcchk(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL)) {
3497 		err = 1;
3498 		goto out;
3499 	}
3500 
3501 	/* shorthand */
3502 	dtagp = &dtp->dt_dtag;
3503 
3504 	/* set number match? */
3505 	if (dtagp->dt_setno != setno) {
3506 		err = 1;
3507 		goto out;
3508 	}
3509 
3510 	/* tag is not empty? */
3511 	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
3512 	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
3513 	    dtagp->dt_id == 0) {
3514 		err = 2;
3515 		goto out;
3516 	}
3517 
3518 	/* Mark the locator as having tagged data */
3519 	rip->ri_flags |= MDDB_F_TAGDATA;
3520 
3521 out:
3522 	if (err) {
3523 		if (err == 1) {
3524 			md_set_setstatus(setno, MD_SET_BADTAG);
3525 			rip->ri_flags |= MDDB_F_BADTAG;
3526 		}
3527 		if (dtp != NULL) {
3528 			kmem_free(dtp, MDDB_DT_BYTES);
3529 			rip->ri_dtp = (mddb_dt_t *)NULL;
3530 		}
3531 	}
3532 
3533 	return (err);
3534 }
3535 
3536 /* Should not call for MN diskset since data tags are not supported */
3537 static int
3538 dt_write(mddb_set_t *s)
3539 {
3540 	int		li;
3541 	int		err = 0;
3542 	int		werr;
3543 	int		empty_tag = 0;
3544 	mddb_dtag_t	*dtagp;
3545 	mddb_dt_t	*dtp;
3546 	mddb_lb_t	*lbp = s->s_lbp;
3547 	set_t		setno = s->s_setno;
3548 	uint_t		set_status = md_get_setstatus(setno);
3549 
3550 
3551 	ASSERT(md_set[setno].s_dtp != NULL);
3552 
3553 	/* Nowhere to write to */
3554 	if (lbp->lb_dtblkcnt == 0)
3555 		return (err);
3556 
3557 	if (set_status & MD_SET_BADTAG)
3558 		return (err);
3559 
3560 	/* shorthand */
3561 	dtp = (mddb_dt_t *)md_set[setno].s_dtp;
3562 	dtagp = &dtp->dt_dtag;
3563 
3564 	/* See if the tag is empty. */
3565 	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
3566 	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
3567 	    dtagp->dt_id == 0)
3568 		empty_tag = 1;
3569 
3570 	/* Write the tag to the locators and reset appropriate flags. */
3571 	for (li = 0; li < lbp->lb_loccnt; li++) {
3572 		mddb_locator_t	*lp = &lbp->lb_locators[li];
3573 
3574 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3575 		    (lp->l_flags & MDDB_F_DELETED) ||
3576 		    (lp->l_flags & MDDB_F_EWRITE))
3577 			continue;
3578 
3579 		werr = writeblks(s, (caddr_t)dtp, lbp->lb_dtfirstblk,
3580 		    MDDB_DT_BLOCKS, li, MDDB_WR_ONLY_MASTER);
3581 
3582 		if (werr) {
3583 			err |= werr;
3584 			continue;
3585 		}
3586 
3587 		if (empty_tag)
3588 			lp->l_flags &= ~(MDDB_F_BADTAG | MDDB_F_TAGDATA);
3589 		else {
3590 			lp->l_flags |= MDDB_F_TAGDATA;
3591 			lp->l_flags &= ~MDDB_F_BADTAG;
3592 		}
3593 	}
3594 
3595 	if (err)
3596 		return (err);
3597 
3598 
3599 	/* If the tags were written, check to see if any tags remain. */
3600 	for (li = 0; li < lbp->lb_loccnt; li++) {
3601 		mddb_locator_t	*lp = &lbp->lb_locators[li];
3602 
3603 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3604 		    (lp->l_flags & MDDB_F_DELETED) ||
3605 		    (lp->l_flags & MDDB_F_EWRITE))
3606 			continue;
3607 
3608 		if (lp->l_flags & MDDB_F_TAGDATA)
3609 			break;
3610 	}
3611 
3612 	/* If there are no tags, then clear CLRTAG and TAGDATA */
3613 	if (li == lbp->lb_loccnt) {
3614 		md_clr_setstatus(setno, MD_SET_CLRTAG);
3615 		md_clr_setstatus(setno, MD_SET_TAGDATA);
3616 	}
3617 
3618 	return (err);
3619 }
3620 
3621 /* Should not call for MN diskset since data tags are not supported */
3622 static int
3623 dt_alloc_if_needed(mddb_set_t *s)
3624 {
3625 	int		i;
3626 	int		li;
3627 	int		moveit = 0;
3628 	mddb_lb_t	*lbp = s->s_lbp;
3629 	mddb_block_t	blkcnt = lbp->lb_dtblkcnt;
3630 	set_t		setno = s->s_setno;
3631 	uint_t		set_status = md_get_setstatus(setno);
3632 
3633 	/*
3634 	 * If the data tag record is allocated (blkcnt != 0) and a bad tag was
3635 	 * not detected, there is nothing to do.
3636 	 */
3637 	if (blkcnt != 0 && ! (set_status & MD_SET_BADTAG))
3638 		return (0);
3639 
3640 	/* Bitmap not setup, checks can't be done */
3641 	if (s->s_totalblkcnt == 0)
3642 		return (0);
3643 
3644 	/* While reading the tag(s) an invalid tag data record was seen */
3645 	if (set_status & MD_SET_BADTAG)
3646 		/* See if the invalid tag needs to be moved */
3647 		for (i = 0; i < MDDB_DT_BLOCKS; i++)
3648 			if (blkcheck(s, (i + lbp->lb_dtfirstblk))) {
3649 				moveit = 1;
3650 				break;
3651 			}
3652 
3653 	/* Need to move or allocate the tag data record */
3654 	if (moveit || blkcnt == 0) {
3655 		lbp->lb_dtfirstblk = getfreeblks(s, MDDB_DT_BLOCKS);
3656 		if (lbp->lb_dtfirstblk == 0) {
3657 			cmn_err(CE_WARN,
3658 			    "Unable to allocate data tag record");
3659 			return (0);
3660 		}
3661 		lbp->lb_dtblkcnt = MDDB_DT_BLOCKS;
3662 
3663 		/* Mark the locators so that they get written to disk. */
3664 		for (li = 0; li < lbp->lb_loccnt; li++) {
3665 			mddb_locator_t	*lp = &lbp->lb_locators[li];
3666 
3667 			if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3668 			    (lp->l_flags & MDDB_F_DELETED) ||
3669 			    (lp->l_flags & MDDB_F_EWRITE))
3670 				continue;
3671 
3672 			lp->l_flags |= MDDB_F_BADTAG;
3673 		}
3674 		return (1);
3675 	}
3676 
3677 	/*
3678 	 * Make sure the blocks are owned, since the calculation in
3679 	 * computefreeblks() is bypassed when MD_SET_BADTAG is set.
3680 	 */
3681 	for (i = 0; i < MDDB_DT_BLOCKS; i++)
3682 		blkbusy(s, (i + lbp->lb_dtfirstblk));
3683 
3684 	return (1);
3685 }
3686 
3687 /*
3688  * Writestart writes the incore mddb out to all of the replicas.
3689  * This is called when a diskset is started and when an error has
3690  * been enountered during the write to a mddb.
3691  *
3692  * flag can be 2 values:
3693  *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
3694  *		always used for traditional and local disksets.
3695  *		This is the normal path for MN disksets since the slave
3696  *		nodes aren't actually allowed to write to disk.
3697  *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
3698  *		master has been chosen, the new master may need to
3699  * 		write its incore mddb to disk (this is the case where the
3700  *		old master had executed a message but hadn't relayed it
3701  *		to this slave yet).  New master should not write the
3702  *		change log records since new master would be overwriting
3703  *		valuable data.  Only used during a reconfig cycle.
3704  */
3705 static int
3706 writestart(
3707 	mddb_set_t	*s,
3708 	int		flag
3709 )
3710 {
3711 	int		li;
3712 	mddb_locator_t	*lp;
3713 	mddb_lb_t	*lbp;
3714 	mddb_ln_t	*lnp;
3715 	int		err = 0;
3716 	uint_t		set_status;
3717 
3718 	lbp = s->s_lbp;
3719 
3720 	for (li = 0; li < lbp->lb_loccnt; li++) {
3721 		lp = &lbp->lb_locators[li];
3722 		if (! (lp->l_flags & MDDB_F_ACTIVE))
3723 			continue;
3724 		if (! (lp->l_flags & MDDB_F_SUSPECT))
3725 			continue;
3726 		if (writecopy(s, li, flag))
3727 			return (1);
3728 		lp->l_flags |= MDDB_F_UP2DATE;
3729 	}
3730 
3731 	for (li = 0; li < lbp->lb_loccnt; li++) {
3732 		lp = &lbp->lb_locators[li];
3733 		if (! (lp->l_flags & MDDB_F_ACTIVE))
3734 			continue;
3735 		if ((lp->l_flags & MDDB_F_UP2DATE))
3736 			continue;
3737 		if (checkcopy(s, li))
3738 			if (err = writecopy(s, li, flag))
3739 				return (1);
3740 		lp->l_flags |= MDDB_F_UP2DATE;
3741 	}
3742 
3743 	/*
3744 	 * Call fixoptrecord even during a reconfig cycle since a replica
3745 	 * failure may force the master to re-assign the optimized
3746 	 * resync record to another replica.
3747 	 */
3748 	if (fixoptrecords(s))
3749 		return (1);
3750 
3751 	set_status = md_get_setstatus(s->s_setno);
3752 
3753 	/* See if any (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT) */
3754 	for (li = 0; li < lbp->lb_loccnt; li++) {
3755 		lp = &lbp->lb_locators[li];
3756 
3757 		if (lp->l_flags & MDDB_F_DELETED)
3758 			continue;
3759 
3760 		if (((lp->l_flags & MDDB_F_ACTIVE) != 0 &&
3761 		    (lp->l_flags & MDDB_F_OLDACT) == 0) ||
3762 		    ((lp->l_flags & MDDB_F_ACTIVE) == 0 &&
3763 		    (lp->l_flags & MDDB_F_OLDACT) != 0))
3764 			break;
3765 
3766 		if ((set_status & MD_SET_TAGDATA) ||
3767 		    (set_status & MD_SET_CLRTAG))
3768 			if ((lp->l_flags & MDDB_F_TAGDATA) ||
3769 			    (lp->l_flags & MDDB_F_BADTAG))
3770 				break;
3771 	}
3772 
3773 	/*
3774 	 * If we found (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT)
3775 	 * the lbp identifier and the set identifier doesn't match.
3776 	 */
3777 	if (li != lbp->lb_loccnt || cmpidentifier(s, &lbp->lb_ident)) {
3778 
3779 		/* Only call for traditional and local sets */
3780 		if (!(lbp->lb_flags & MDDB_MNSET))
3781 			(void) dt_write(s);
3782 
3783 		setidentifier(s, &lbp->lb_ident);
3784 
3785 		if (err = push_lb(s)) {
3786 			(void) upd_med(s, "writestart(0)");
3787 			return (err);
3788 		}
3789 
3790 		(void) upd_med(s, "writestart(0)");
3791 
3792 		if (err = push_lb(s)) {
3793 			(void) upd_med(s, "writestart(1)");
3794 			return (err);
3795 		}
3796 
3797 		(void) upd_med(s, "writestart(1)");
3798 
3799 		lnp = s->s_lnp;
3800 		uniqtime32(&lnp->ln_timestamp);
3801 		if (lbp->lb_flags & MDDB_MNSET)
3802 			lnp->ln_revision = MDDB_REV_MNLN;
3803 		else
3804 			lnp->ln_revision = MDDB_REV_LN;
3805 		crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
3806 		err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
3807 		    lbp->lb_lnblkcnt, 0);
3808 		/*
3809 		 * If a MN diskset and this is the master, set the PARSE_LOCNM
3810 		 * flag in the mddb_set structure to show that the locator
3811 		 * names have changed.
3812 		 * Don't set parseflags as a result of a new master sync
3813 		 * during reconfig cycle since slaves nodes are already
3814 		 * in-sync with the new master.
3815 		 */
3816 
3817 		if ((lbp->lb_flags & MDDB_MNSET) &&
3818 		    (md_set[s->s_setno].s_am_i_master) &&
3819 		    (flag != MDDB_WRITECOPY_SYNC)) {
3820 			s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
3821 		}
3822 
3823 		if (err)
3824 			return (err);
3825 	}
3826 
3827 	for (li = 0; li < lbp->lb_loccnt; li++) {
3828 		lp = &lbp->lb_locators[li];
3829 		if (lp->l_flags & MDDB_F_DELETED)
3830 			continue;
3831 		if (lp->l_flags & MDDB_F_ACTIVE) {
3832 			lp->l_flags |= MDDB_F_OLDACT;
3833 		} else {
3834 			lp->l_flags &= ~MDDB_F_OLDACT;
3835 		}
3836 	}
3837 
3838 	md_clr_setstatus(s->s_setno, MD_SET_STALE);
3839 
3840 	return (0);
3841 }
3842 
3843 /*
3844  * selectreplicas selects the working replicas and may write the incore
3845  * version of the mddb out to the replicas ondisk.
3846  *
3847  * flag can be 3 values:
3848  *	MDDB_RETRYSCAN - quick scan to see if there is an error.
3849  *			If no new error, returns without writing mddb
3850  *			to disks.  If a new error is seen, writes out
3851  *			mddb to disks.
3852  *	MDDB_SCANALL  - lengthy scan to check out mddbs and always writes
3853  *			out mddb to the replica ondisk.  Calls writecopy
3854  *			with MDDB_WRITECOPY_ALL flag which writes out
3855  *			all records to the replicas ondisk.
3856  *	MDDB_SCANALLSYNC - called during reconfig cycle to sync up incore
3857  *			and ondisk mddbs by writing incore values to disk.
3858  *			Calls writecopy with MDDB_WRITECOPY_SYNC flag so
3859  *			that change log records are not written out.
3860  *			Only used by MN disksets.
3861  *
3862  * Returns:
3863  *	0 - Successful
3864  *	1 - Unable to write incore mddb data to disk since < 50% replicas.
3865  */
3866 int
3867 selectreplicas(
3868 	mddb_set_t	*s,
3869 	int		flag
3870 )
3871 {
3872 	int		li;
3873 	int		alc;
3874 	int		lc;
3875 	mddb_locator_t	*lp;
3876 	mddb_lb_t	*lbp = s->s_lbp;
3877 	set_t		setno = s->s_setno;
3878 	int		wc_flag;
3879 
3880 	/*
3881 	 * can never transition from stale to not stale
3882 	 */
3883 	if (md_get_setstatus(setno) & MD_SET_STALE) {
3884 		for (li = 0; li < lbp->lb_loccnt; li++) {
3885 			lp = &lbp->lb_locators[li];
3886 			if (lp->l_flags & MDDB_F_DELETED)
3887 				continue;
3888 			if (! (lp->l_flags & MDDB_F_EMASTER)) {
3889 				lp->l_flags |= MDDB_F_ACTIVE;
3890 			} else {
3891 				lp->l_flags &= ~MDDB_F_ACTIVE;
3892 			}
3893 		}
3894 		return (1);
3895 	}
3896 
3897 	if ((flag == MDDB_SCANALL) || (flag == MDDB_SCANALLSYNC)) {
3898 		for (li = 0; li < lbp->lb_loccnt; li++) {
3899 			lp = &lbp->lb_locators[li];
3900 			if (lp->l_flags & MDDB_F_DELETED)
3901 				continue;
3902 			if (lp->l_flags & MDDB_F_ACTIVE) {
3903 				lp->l_flags |= MDDB_F_OLDACT;
3904 				lp->l_flags &= ~MDDB_F_SUSPECT;
3905 			} else {
3906 				lp->l_flags |= MDDB_F_SUSPECT;
3907 				lp->l_flags &= ~MDDB_F_OLDACT;
3908 			}
3909 
3910 			if (! (lp->l_flags & MDDB_F_EMASTER)) {
3911 				lp->l_flags |= MDDB_F_ACTIVE;
3912 				lp->l_flags &= ~MDDB_F_EWRITE;
3913 				lp->l_flags &= ~MDDB_F_TOOSMALL;
3914 			} else {
3915 				lp->l_flags &= ~MDDB_F_ACTIVE;
3916 			}
3917 		}
3918 		computefreeblks(s); /* set up free block bits */
3919 	} else {
3920 		for (li = 0; li < lbp->lb_loccnt; li++) {
3921 			lp = &lbp->lb_locators[li];
3922 			if (! (lp->l_flags & MDDB_F_ACTIVE))
3923 				continue;
3924 			if (lp->l_flags & MDDB_F_EWRITE)
3925 				break;
3926 		}
3927 
3928 		/*
3929 		 * if there are no errors this is error has already
3930 		 * been processed return current state
3931 		 */
3932 		if (li == lbp->lb_loccnt)
3933 			return (md_get_setstatus(setno) & MD_SET_TOOFEW);
3934 
3935 		lp->l_flags &= ~MDDB_F_ACTIVE;
3936 		do {
3937 			lp = &lbp->lb_locators[li];
3938 			lp->l_flags &= ~MDDB_F_UP2DATE;
3939 		} while (++li < lbp->lb_loccnt);
3940 	}
3941 
3942 	alc = 0;
3943 	lc = 0;
3944 	for (li = 0; li < lbp->lb_loccnt; li++) {
3945 		lp = &lbp->lb_locators[li];
3946 		if (lp->l_flags & MDDB_F_DELETED)
3947 			continue;
3948 		lc++;
3949 		if (! (lp->l_flags & MDDB_F_ACTIVE))
3950 			continue;
3951 		alc++;
3952 	}
3953 
3954 	if (alc < ((lc + 1) / 2)) {
3955 		md_set_setstatus(setno, MD_SET_TOOFEW);
3956 		return (1);
3957 	}
3958 
3959 	/* Set wc_flag based on flag passed in. */
3960 	if (flag == MDDB_SCANALLSYNC)
3961 		wc_flag = MDDB_WRITECOPY_SYNC;
3962 	else
3963 		wc_flag = MDDB_WRITECOPY_ALL;
3964 
3965 	do {
3966 		if (! writestart(s, wc_flag)) {
3967 			md_clr_setstatus(setno, MD_SET_TOOFEW);
3968 			return (0);
3969 		}
3970 		alc  = 0;
3971 		for (li = 0; li < lbp->lb_loccnt; li++) {
3972 			lp = &lbp->lb_locators[li];
3973 			if ((lp->l_flags & MDDB_F_DELETED) ||
3974 			    (lp->l_flags & MDDB_F_EMASTER))
3975 				continue;
3976 
3977 			if (lp->l_flags & MDDB_F_EWRITE) {
3978 				lp->l_flags &= ~MDDB_F_ACTIVE;
3979 				lp->l_flags &= ~MDDB_F_UP2DATE;
3980 				continue;
3981 			}
3982 			alc++;
3983 		}
3984 	} while (alc >= ((lc + 1) / 2));
3985 	md_set_setstatus(setno, MD_SET_TOOFEW);
3986 	return (1);
3987 }
3988 
3989 static int
3990 checkstate(
3991 	mddb_set_t	*s,
3992 	int		probe
3993 )
3994 {
3995 	int		error;
3996 	uint_t		set_status = md_get_setstatus(s->s_setno);
3997 
3998 	ASSERT(s != NULL);
3999 
4000 	if (! (set_status & MD_SET_STALE) && ! (set_status & MD_SET_TOOFEW))
4001 		return (0);
4002 
4003 	if (probe == MDDB_NOPROBE)
4004 		return (1);
4005 
4006 	single_thread_start(s);
4007 	error = selectreplicas(s, MDDB_SCANALL);
4008 	single_thread_end(s);
4009 
4010 	if (error == 0 && s->s_zombie != 0) {
4011 		mutex_exit(SETMUTEX(s->s_setno));
4012 		error = mddb_deleterec(s->s_zombie);
4013 		mutex_enter(SETMUTEX(s->s_setno));
4014 		if (error == 0)
4015 			s->s_zombie = 0;
4016 	}
4017 	return (error);
4018 }
4019 
4020 static int
4021 writeretry(
4022 	mddb_set_t	*s
4023 )
4024 {
4025 	if (selectreplicas(s, MDDB_RETRYSCAN))
4026 		if (selectreplicas(s, MDDB_SCANALL))
4027 			return (1);
4028 	return (0);
4029 }
4030 
4031 static void
4032 free_mbipp(mddb_mb_ic_t **mbipp)
4033 {
4034 	mddb_mb_ic_t	*mbip1, *mbip2;
4035 
4036 	for (mbip1 = *mbipp; mbip1 != NULL; mbip1 = mbip2) {
4037 		mbip2 = mbip1->mbi_next;
4038 		kmem_free((caddr_t)mbip1, MDDB_IC_BSIZE);
4039 	}
4040 	*mbipp = (mddb_mb_ic_t *)NULL;
4041 }
4042 
4043 static mddb_ri_t *
4044 save_rip(mddb_set_t *s)
4045 {
4046 	mddb_ri_t	*trip = s->s_rip;
4047 	mddb_ri_t	*nrip = NULL;
4048 	mddb_ri_t	**nripp = &nrip;
4049 	mddb_ri_t	*rip;
4050 
4051 	while (trip) {
4052 		/* Run to the end of the list */
4053 		for (/* void */; (*nripp != NULL); nripp = &(*nripp)->ri_next)
4054 			/* void */;
4055 
4056 		/* Add the new member */
4057 		*nripp = kmem_zalloc(sizeof (**nripp), KM_SLEEP);
4058 
4059 		ASSERT(*nripp != NULL);
4060 
4061 		/* shorthand */
4062 		rip = *nripp;
4063 
4064 		*rip = *trip;			/* structure assignment */
4065 
4066 		/* Clear the stuff that is not needed for hints */
4067 		rip->ri_flags = 0;
4068 		rip->ri_commitcnt = 0;
4069 		rip->ri_transplant = 0;
4070 		rip->ri_mbip = (mddb_mb_ic_t *)NULL;
4071 		rip->ri_dtp = (mddb_dt_t *)NULL;
4072 		rip->ri_lbp = (mddb_lb_t *)NULL;
4073 		rip->ri_did_icp = (mddb_did_ic_t *)NULL;
4074 		rip->ri_devid = (ddi_devid_t)NULL;
4075 		rip->ri_old_devid = (ddi_devid_t)NULL;
4076 		rip->ri_next = (mddb_ri_t *)NULL;
4077 
4078 		trip = trip->ri_next;
4079 	}
4080 	return (nrip);
4081 }
4082 
4083 static void
4084 free_rip(mddb_ri_t **ripp)
4085 {
4086 	mddb_ri_t	*rip;
4087 	mddb_ri_t	*arip;
4088 
4089 	for (rip = *ripp; rip != (mddb_ri_t *)NULL; rip = arip) {
4090 		arip = rip->ri_next;
4091 		if (rip->ri_devid != (ddi_devid_t)NULL) {
4092 			ddi_devid_free(rip->ri_devid);
4093 			rip->ri_devid = (ddi_devid_t)NULL;
4094 		}
4095 		if (rip->ri_old_devid != (ddi_devid_t)NULL) {
4096 			ddi_devid_free(rip->ri_old_devid);
4097 			rip->ri_old_devid = (ddi_devid_t)NULL;
4098 		}
4099 		kmem_free((caddr_t)rip, sizeof (*rip));
4100 	}
4101 	*ripp = (mddb_ri_t *)NULL;
4102 }
4103 
4104 /*
4105  * this routine selects the correct replica to use
4106  * the rules are as follows
4107  *	1.	if all replica has same init time select highest commit count
4108  *	2.	if some but not all replicas are from another hostid discard
4109  *		them.
4110  *	3.	find which init time is present is most replicas
4111  *	4.	discard all replicas which do not match most init times
4112  *	5.	select replica with highest commit count
4113  */
4114 
4115 static mddb_lb_t *
4116 selectlocator(
4117 	mddb_set_t	*s
4118 )
4119 {
4120 	mddb_ri_t	*rip = s->s_rip;
4121 	mddb_ri_t	*r, *r1;
4122 	mddb_lb_t	*lbp;
4123 	struct timeval32 *tp = (struct timeval32 *)NULL;
4124 	int		different;
4125 	int		same;
4126 	int		count;
4127 	int		maxcount;
4128 	set_t		setno = s->s_setno;
4129 	size_t		sz;
4130 	int		mn_set = 0;
4131 
4132 	/* Clear the ri_transplant flag on all the rip entries. */
4133 	/* Set ri_commitcnt to locator's commitcnt - if available */
4134 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4135 		r->ri_transplant = 0;
4136 		if (r->ri_lbp != (mddb_lb_t *)NULL) {
4137 			r->ri_commitcnt = r->ri_lbp->lb_commitcnt;
4138 			/* If any locators have MN bit set, set flag */
4139 			if (r->ri_lbp->lb_flags & MDDB_MNSET)
4140 				mn_set = 1;
4141 		}
4142 	}
4143 
4144 	/*
4145 	 * A data tag is being used, so use it to limit the selection first.
4146 	 * Data tags not used in MN diskset.
4147 	 */
4148 	if ((mn_set == 0) && (md_get_setstatus(setno) & MD_SET_USETAG)) {
4149 		mddb_dt_t	*dtp = (mddb_dt_t *)md_set[setno].s_dtp;
4150 
4151 		/*
4152 		 * now toss any locators that have a different data tag
4153 		 */
4154 		for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4155 			if (r->ri_lbp == (mddb_lb_t *)NULL)
4156 				continue;
4157 
4158 			if (r->ri_dtp != (mddb_dt_t *)NULL) {
4159 				/* If same tag, keep it */
4160 				if (dtl_cmp(&dtp->dt_dtag,
4161 				    &r->ri_dtp->dt_dtag) == 0)
4162 					continue;
4163 			}
4164 
4165 			if (r->ri_dtp != (mddb_dt_t *)NULL) {
4166 				kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4167 				r->ri_dtp = (mddb_dt_t *)NULL;
4168 			}
4169 
4170 			mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4171 			if (!(md_get_setstatus(setno) &
4172 			    MD_SET_REPLICATED_IMPORT)) {
4173 				if (r->ri_old_devid != (ddi_devid_t)NULL) {
4174 					sz = ddi_devid_sizeof(r->ri_old_devid);
4175 					kmem_free((caddr_t)r->ri_old_devid, sz);
4176 					r->ri_old_devid = (ddi_devid_t)NULL;
4177 				}
4178 			}
4179 
4180 			kmem_free((caddr_t)r->ri_lbp,
4181 			    dbtob(r->ri_lbp->lb_blkcnt));
4182 			r->ri_lbp = (mddb_lb_t *)NULL;
4183 
4184 			r->ri_transplant = 1;
4185 		}
4186 
4187 		/* Tag used, clear the bit */
4188 		md_clr_setstatus(s->s_setno, MD_SET_USETAG);
4189 
4190 		if (md_get_setstatus(s->s_setno) & MD_SET_TAGDATA) {
4191 			/*
4192 			 * Get rid of the list of tags.
4193 			 */
4194 			dtl_freel(&s->s_dtlp);
4195 
4196 			/*
4197 			 * Re-create the list with the tag used.
4198 			 */
4199 			(void) dtl_addl(s, &dtp->dt_dtag);
4200 		}
4201 	}
4202 
4203 	/*
4204 	 * scan to see if all replicas have same time
4205 	 */
4206 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4207 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4208 			continue;
4209 		if (tp == NULL) {
4210 			tp = &r->ri_lbp->lb_inittime;
4211 			continue;
4212 		}
4213 		/* CSTYLED */
4214 		if (timercmp(tp, &r->ri_lbp->lb_inittime, !=))
4215 			break;
4216 	}
4217 
4218 	/*
4219 	 * if r == NULL then they were all them same. Choose highest
4220 	 * commit count
4221 	 */
4222 	if (r == (mddb_ri_t *)NULL)
4223 		goto out;
4224 
4225 	/*
4226 	 * If here, a bogus replica is present and at least 1 lb_inittime
4227 	 * did not match.
4228 	 */
4229 
4230 	/*
4231 	 * look and see if any but not all are from different id
4232 	 */
4233 
4234 	different = 0;
4235 	same = 0;
4236 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4237 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4238 			continue;
4239 		if (cmpidentifier(s, &r->ri_lbp->lb_ident))
4240 			different = 1;
4241 		else
4242 			same = 1;
4243 	}
4244 
4245 	/*
4246 	 * now go through and throw out different if there are some
4247 	 * that are the same
4248 	 */
4249 	if (different != 0 && same != 0) {
4250 		for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4251 			if (r->ri_lbp == (mddb_lb_t *)NULL)
4252 				continue;
4253 
4254 			if (!cmpidentifier(s, &r->ri_lbp->lb_ident))
4255 				continue;
4256 
4257 			if (r->ri_dtp != (mddb_dt_t *)NULL) {
4258 				kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4259 				r->ri_dtp = (mddb_dt_t *)NULL;
4260 			}
4261 
4262 			mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4263 			if (!(md_get_setstatus(setno) &
4264 			    MD_SET_REPLICATED_IMPORT)) {
4265 				if (r->ri_old_devid != (ddi_devid_t)NULL) {
4266 					sz = ddi_devid_sizeof(r->ri_old_devid);
4267 					kmem_free((caddr_t)r->ri_old_devid, sz);
4268 					r->ri_old_devid = (ddi_devid_t)NULL;
4269 				}
4270 			}
4271 
4272 			kmem_free((caddr_t)r->ri_lbp,
4273 			    dbtob(r->ri_lbp->lb_blkcnt));
4274 			r->ri_lbp = (mddb_lb_t *)NULL;
4275 
4276 			r->ri_transplant = 1;
4277 		}
4278 	}
4279 
4280 	/*
4281 	 * go through and pick highest. Use n square because it is
4282 	 * simple and 40 some is max possible
4283 	 */
4284 	maxcount = 0;
4285 	lbp = (mddb_lb_t *)NULL;
4286 	for (r1 = rip; r1 != (mddb_ri_t *)NULL; r1 = r1->ri_next) {
4287 		if (r1->ri_lbp == (mddb_lb_t *)NULL)
4288 			continue;
4289 		count = 0;
4290 		for (r = r1; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4291 			if (r->ri_lbp == (mddb_lb_t *)NULL)
4292 				continue;
4293 			if (timercmp(&r1->ri_lbp->lb_inittime, /* CSTYLED */
4294 			    &r->ri_lbp->lb_inittime, ==))
4295 				count++;
4296 		}
4297 		if (count > maxcount) {
4298 			maxcount = count;
4299 			lbp = r1->ri_lbp;
4300 		}
4301 	}
4302 
4303 	/*
4304 	 * now go though and toss any that are of a different time stamp
4305 	 */
4306 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4307 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4308 			continue;
4309 		if (timercmp(&lbp->lb_inittime, /* CSTYLED */
4310 		    &r->ri_lbp->lb_inittime, ==))
4311 			continue;
4312 
4313 		if (r->ri_dtp != (mddb_dt_t *)NULL) {
4314 			kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4315 			r->ri_dtp = (mddb_dt_t *)NULL;
4316 		}
4317 
4318 		mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4319 		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
4320 			if (r->ri_old_devid != (ddi_devid_t)NULL) {
4321 				sz = ddi_devid_sizeof(r->ri_old_devid);
4322 				kmem_free((caddr_t)r->ri_old_devid, sz);
4323 				r->ri_old_devid = (ddi_devid_t)NULL;
4324 			}
4325 		}
4326 
4327 		kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
4328 		r->ri_lbp = (mddb_lb_t *)NULL;
4329 
4330 		r->ri_transplant = 1;
4331 	}
4332 
4333 out:
4334 	/*
4335 	 * Find the locator with the highest commit count, and make it the
4336 	 * "chosen" one.
4337 	 */
4338 	lbp = (mddb_lb_t *)NULL;
4339 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4340 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4341 			continue;
4342 
4343 		if (lbp == NULL) {
4344 			lbp = r->ri_lbp;
4345 			continue;
4346 		}
4347 
4348 		if (r->ri_lbp->lb_commitcnt > lbp->lb_commitcnt)
4349 			lbp = r->ri_lbp;
4350 	}
4351 
4352 	/* Toss all locator blocks, except the "chosen" one. */
4353 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4354 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4355 			continue;
4356 
4357 		/* Get rid of all dtp's */
4358 		if (r->ri_dtp != (mddb_dt_t *)NULL) {
4359 			kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4360 			r->ri_dtp = (mddb_dt_t *)NULL;
4361 		}
4362 
4363 		if (r->ri_lbp == lbp)
4364 			continue;
4365 
4366 		/* Get rid of extra locator devid block info */
4367 		mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4368 		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
4369 			if (r->ri_old_devid != (ddi_devid_t)NULL) {
4370 				sz = ddi_devid_sizeof(r->ri_old_devid);
4371 				kmem_free((caddr_t)r->ri_old_devid, sz);
4372 				r->ri_old_devid = (ddi_devid_t)NULL;
4373 			}
4374 		}
4375 
4376 		/* Get rid of extra locators */
4377 		kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
4378 		r->ri_lbp = (mddb_lb_t *)NULL;
4379 	}
4380 	return (lbp);
4381 }
4382 
4383 static void
4384 locator2cfgloc(
4385 	mddb_lb_t		*lbp,
4386 	mddb_cfg_loc_t		*clp,
4387 	int			li,
4388 	side_t			sideno,
4389 	mddb_did_ic_t		*did_icp
4390 )
4391 {
4392 	mddb_drvnm_t		*dn;
4393 	mddb_locator_t		*lp = &lbp->lb_locators[li];
4394 	mddb_sidelocator_t	*slp;
4395 	mddb_mnsidelocator_t	*mnslp;
4396 	mddb_did_info_t		*did_info;
4397 	int 			i, sz, szalloc;
4398 	int			mn_set = 0;
4399 	mddb_mnlb_t		*mnlbp;
4400 
4401 	if (lbp->lb_flags & MDDB_MNSET) {
4402 		mn_set = 1;
4403 		mnlbp = (mddb_mnlb_t *)lbp;
4404 		for (i = 0; i < MD_MNMAXSIDES; i++) {
4405 			mnslp = &mnlbp->lb_mnsidelocators[i][li];
4406 			if (mnslp->mnl_sideno == sideno)
4407 				break;
4408 		}
4409 		if (i == MD_MNMAXSIDES)
4410 			return;
4411 	} else {
4412 		slp = &lbp->lb_sidelocators[sideno][li];
4413 	}
4414 
4415 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4416 		did_info = &(did_icp->did_ic_blkp->blk_info[li]);
4417 		if (did_info->info_flags & MDDB_DID_EXISTS) {
4418 			sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]);
4419 			if (clp->l_devid_flags & MDDB_DEVID_SPACE) {
4420 				/*
4421 				 * copy device id from mddb to
4422 				 * cfg_loc structure
4423 				 */
4424 				szalloc = clp->l_devid_sz;
4425 				if (sz <= szalloc) {
4426 					for (i = 0; i < sz; i++) {
4427 						((char *)(uintptr_t)
4428 						    clp->l_devid)[i] =
4429 						    ((char *)did_icp->
4430 						    did_ic_devid[li])[i];
4431 					}
4432 					clp->l_devid_flags |= MDDB_DEVID_VALID;
4433 					(void) strcpy(clp->l_minor_name,
4434 					    did_info->info_minor_name);
4435 				} else {
4436 					clp->l_devid_flags |=
4437 					    MDDB_DEVID_NOSPACE;
4438 				}
4439 			} else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) {
4440 				clp->l_devid_flags = MDDB_DEVID_SZ;
4441 				clp->l_devid_sz = sz;
4442 			}
4443 		}
4444 	}
4445 
4446 	/*
4447 	 * Even if a devid exists, use the dev, drvnm and mnum in the locators
4448 	 * and sidelocators.  During startup, the dev, drvnm and mnum in
4449 	 * these structures may not match the devid (the locators and
4450 	 * sidelocators will be updated to match the devid by the routine
4451 	 * load_old_replicas).  Using out-of-sync values won't cause any
4452 	 * problems since ridev will re-derive these from the devid and mnum.
4453 	 * After startup, the dev, drvnm and mnum in these structures have
4454 	 * been updated and can be used.
4455 	 */
4456 
4457 	clp->l_blkno = lp->l_blkno;
4458 	clp->l_flags = lp->l_flags;
4459 	clp->l_dev = lp->l_dev;
4460 
4461 	if (mn_set) {
4462 		dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
4463 		clp->l_mnum = mnslp->mnl_mnum;
4464 	} else {
4465 		dn = &lbp->lb_drvnm[slp->l_drvnm_index];
4466 		clp->l_mnum = slp->l_mnum;
4467 	}
4468 	(void) strncpy(clp->l_driver, dn->dn_data, MD_MAXDRVNM);
4469 }
4470 
4471 /*
4472  * Find the index into the mnsidelocator where entry will go.
4473  * Then index can be fed into both splitname2locatorblocks and
4474  * cfgloc2locator so that those entries can be kept in sync.
4475  *
4476  * Returns:
4477  *	-1 if failed to find unused slot or if a traditional diskset
4478  *	index, if successful  (0 <= index <= MD_MNMAXSIDES)
4479  */
4480 static int
4481 checklocator(
4482 	mddb_lb_t		*lbp,
4483 	int			li,
4484 	side_t			sideno
4485 )
4486 {
4487 	uchar_t			i;
4488 	mddb_mnsidelocator_t	*mnslp;
4489 	mddb_mnlb_t		*mnlbp;
4490 	int			index = -1;
4491 
4492 	if (lbp->lb_flags & MDDB_MNSET) {
4493 		/*
4494 		 * Checking side locator structure.  First, check if
4495 		 * there is already an entry for this side.  If so,
4496 		 * then use that entry.  Otherwise, find an entry
4497 		 * that has a sideno of 0.
4498 		 */
4499 		mnlbp = (mddb_mnlb_t *)lbp;
4500 		for (i = 0; i < MD_MNMAXSIDES; i++) {
4501 			mnslp = &mnlbp->lb_mnsidelocators[i][li];
4502 			if (mnslp->mnl_sideno == sideno) {
4503 				/* Found a match - stop looking */
4504 				index = i;
4505 				break;
4506 			} else if ((mnslp->mnl_sideno == 0) && (index == -1)) {
4507 				/* Set first empty slot, but keep looking */
4508 				index = i;
4509 			}
4510 		}
4511 		/* Didn't find empty slot or previously used slot */
4512 		if ((i == MD_MNMAXSIDES) && (index == -1)) {
4513 			return (-1);
4514 		}
4515 		return (index);
4516 	} else
4517 		return (0);
4518 }
4519 
4520 /*
4521  * Takes locator information (driver name, minor number, sideno) and
4522  * stores it in the locator block.
4523  * For traditional diskset, the sideno is the index into the sidelocator
4524  * array in the locator block.
4525  * For the MN diskset, the sideno is the nodeid which can be any number,
4526  * so the index passed in is the index into the mnsidelocator array
4527  * in the locator block.
4528  */
4529 static int
4530 cfgloc2locator(
4531 	mddb_lb_t		*lbp,
4532 	mddb_cfg_loc_t		*clp,
4533 	int			li,
4534 	side_t			sideno,
4535 	int			index	/* Only useful in MNsets when > 1 */
4536 )
4537 {
4538 	uchar_t			i;
4539 	mddb_sidelocator_t	*slp;
4540 	mddb_mnsidelocator_t	*mnslp;
4541 	mddb_set_t		*s;
4542 	int			mn_set = 0;
4543 	mddb_mnlb_t		*mnlbp;
4544 
4545 	if (lbp->lb_flags & MDDB_MNSET) {
4546 		mnlbp = (mddb_mnlb_t *)lbp;
4547 		mn_set = 1;
4548 		/*
4549 		 * Index will be the slot that has the given sideno or
4550 		 * the first empty slot if no match is found.
4551 		 * This was pre-checked out in check locator.
4552 		 */
4553 		mnslp = &mnlbp->lb_mnsidelocators[index][li];
4554 	} else {
4555 		slp = &lbp->lb_sidelocators[sideno][li];
4556 	}
4557 
4558 	/*
4559 	 * Look for the driver name
4560 	 */
4561 	for (i = 0; i < MDDB_DRVNMCNT; i++) {
4562 		if (lbp->lb_drvnm[i].dn_len == 0)
4563 			continue;
4564 		if (strncmp(lbp->lb_drvnm[i].dn_data, clp->l_driver,
4565 		    MD_MAXDRVNM) == 0)
4566 			break;
4567 	}
4568 
4569 	/*
4570 	 * Didn't find one, add a new one
4571 	 */
4572 	if (i == MDDB_DRVNMCNT) {
4573 		for (i = 0; i < MDDB_DRVNMCNT; i++) {
4574 			if (lbp->lb_drvnm[i].dn_len == 0)
4575 				break;
4576 		}
4577 		if (i == MDDB_DRVNMCNT)
4578 			return (1);
4579 		(void) strncpy(lbp->lb_drvnm[i].dn_data, clp->l_driver,
4580 		    MD_MAXDRVNM);
4581 		lbp->lb_drvnm[i].dn_len = (uchar_t)strlen(clp->l_driver);
4582 	}
4583 
4584 	/* Fill in the drvnm index */
4585 	if (mn_set) {
4586 		mnslp->mnl_drvnm_index = i;
4587 		mnslp->mnl_mnum = clp->l_mnum;
4588 		mnslp->mnl_sideno = sideno;
4589 	} else {
4590 		slp->l_drvnm_index = i;
4591 		slp->l_mnum = clp->l_mnum;
4592 	}
4593 
4594 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4595 		/*
4596 		 * This device id could already be associated with this index
4597 		 * if this is not the first side added to the set.
4598 		 * If device id is 0, there is no device id for this device.
4599 		 */
4600 		if ((ddi_devid_t)(uintptr_t)clp->l_devid == 0)
4601 			return (0);
4602 		s = (mddb_set_t *)md_set[lbp->lb_setno].s_db;
4603 		if (mddb_devid_add(s, li, (ddi_devid_t)(uintptr_t)clp->l_devid,
4604 		    clp->l_minor_name)) {
4605 			return (1);
4606 		}
4607 	}
4608 
4609 	return (0);
4610 }
4611 
4612 /*
4613  * See if there are mediator hosts and try to use the data.
4614  */
4615 static int
4616 mediate(
4617 	mddb_set_t	*s
4618 )
4619 {
4620 	mddb_lb_t	*lbp = s->s_lbp;
4621 	med_data_lst_t	*meddlp = NULL;
4622 	med_data_lst_t	*tmeddlp = NULL;
4623 	med_data_t	*meddp;
4624 	int		medok = 0;
4625 	int		medacc = 0;
4626 	uint_t		maxcc;
4627 	int		golden = 0;
4628 	int		err = 1;
4629 	set_t		setno = s->s_setno;
4630 
4631 	/* Do not have a mediator, then the state is stale */
4632 	if (s->s_med.n_cnt == 0)
4633 		return (err);
4634 
4635 	/* Contact the mediator hosts for the data */
4636 	meddlp = get_med_host_data(&s->s_med, s->s_setname, setno);
4637 
4638 	/* No mediator data, stale */
4639 	if (meddlp == NULL)
4640 		return (err);
4641 
4642 	/* Mark all the mediator data that is not for this set as errored */
4643 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4644 		struct timeval32 tmptime;
4645 		meddp = tmeddlp->mdl_med;
4646 
4647 		/* Count the number of mediators contacted */
4648 		medacc++;
4649 
4650 		/* Paranoid check */
4651 		if (meddp->med_dat_sn != setno)
4652 			meddp->med_dat_fl |= MED_DFL_ERROR;
4653 
4654 		TIMEVAL_TO_TIMEVAL32(&tmptime, &meddp->med_dat_id);
4655 
4656 		/*CSTYLED*/
4657 		if (timercmp(&tmptime, &lbp->lb_ident.createtime, !=))
4658 			meddp->med_dat_fl |= MED_DFL_ERROR;
4659 	}
4660 
4661 	/* Get the max commitcount */
4662 	maxcc = 0;
4663 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4664 		meddp = tmeddlp->mdl_med;
4665 		if (meddp->med_dat_fl & MED_DFL_ERROR)
4666 			continue;
4667 		if (meddp->med_dat_cc > maxcc)
4668 			maxcc = meddp->med_dat_cc;
4669 	}
4670 
4671 	/* Now mark the records that don't have the highest cc as errored */
4672 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4673 		meddp = tmeddlp->mdl_med;
4674 		if (meddp->med_dat_fl & MED_DFL_ERROR)
4675 			continue;
4676 		if (meddp->med_dat_cc != maxcc)
4677 			meddp->med_dat_fl |= MED_DFL_ERROR;
4678 	}
4679 
4680 	/* Now mark the records that don't match the lb commitcnt as errored */
4681 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4682 		meddp = tmeddlp->mdl_med;
4683 		if (meddp->med_dat_fl & MED_DFL_ERROR)
4684 			continue;
4685 		if (meddp->med_dat_cc != lbp->lb_commitcnt)
4686 			meddp->med_dat_fl |= MED_DFL_ERROR;
4687 	}
4688 
4689 	/* Is there a "golden" copy and how many valid mediators */
4690 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4691 		meddp = tmeddlp->mdl_med;
4692 		if (meddp->med_dat_fl & MED_DFL_ERROR)
4693 			continue;
4694 
4695 		if (meddp->med_dat_fl & MED_DFL_GOLDEN)
4696 			golden++;
4697 
4698 		medok++;
4699 	}
4700 
4701 	/* No survivors, stale */
4702 	if (medok == 0)
4703 		goto out;
4704 
4705 	/* No mediator quorum and no golden copies, stale */
4706 	if (medacc < ((s->s_med.n_cnt / 2) + 1) && ! golden) {
4707 		/* Skip odd numbers, no exact 50% */
4708 		if (s->s_med.n_cnt & 1)
4709 			goto out;
4710 		/* Have 50%, allow an accept */
4711 		if (medacc == (s->s_med.n_cnt / 2))
4712 			md_set_setstatus(setno, MD_SET_ACCOK);
4713 		goto out;
4714 	}
4715 
4716 	/* We either have a quorum or a golden copy, or both */
4717 	err = 0;
4718 
4719 out:
4720 	if (meddlp) {
4721 		for (/* void */; meddlp != NULL; meddlp = tmeddlp) {
4722 			tmeddlp = meddlp->mdl_nx;
4723 			kmem_free(meddlp->mdl_med, sizeof (med_data_t));
4724 			kmem_free(meddlp, sizeof (med_data_lst_t));
4725 		}
4726 	}
4727 
4728 	return (err);
4729 }
4730 
4731 /*
4732  *	1. read masterblks and locator blocks for all known database locations
4733  *		a. keep track of which have good master blks
4734  *		b. keep track of which have good locators
4735  *
4736  */
4737 static int
4738 get_mbs_n_lbs(
4739 	mddb_set_t	*s,
4740 	int		*write_lb
4741 )
4742 {
4743 	mddb_lb_t	*lbp = NULL;		/* pointer to locator block */
4744 						/* May be cast to mddb_mnlb_t */
4745 						/* if accessing sidenames in */
4746 						/* MN set */
4747 	mddb_did_ic_t	*did_icp = NULL;	/* ptr to Device ID incore */
4748 	mddb_did_blk_t	*did_blkp = 0;
4749 	int		did_blkp_sz = 0;
4750 	mddb_did_db_t	*did_dbp;
4751 	mddb_did_info_t	*did_info;
4752 	caddr_t		did_block;
4753 	mddb_ri_t	*rip;
4754 	mddb_dtag_lst_t	*dtlp;
4755 	mddb_locator_t	*lp;
4756 	daddr_t		physblk;
4757 	int		li;
4758 	uint_t		blk;
4759 	md_dev64_t	dev;
4760 	caddr_t		buffer;
4761 	uint_t		lb_blkcnt;
4762 	int		retval = 0;
4763 	int		err = 0;
4764 	int		lb_ok = 0;
4765 	int		lb_total = 0;
4766 	int		lb_tagged = 0;
4767 	int		lb_tags;
4768 	set_t		setno = s->s_setno;
4769 	int		cont_flag, i;
4770 	mddb_did_db_t	*did_dbp1, *did_dbp2;
4771 	int		mn_set = 0;
4772 	mddb_cfg_loc_t	*cl;
4773 
4774 	/*
4775 	 * read in master blocks and locator block for all known locators.
4776 	 * lb_blkcnt will be set correctly for MN set later once getmasters
4777 	 * has determined that the set is a MN set.
4778 	 */
4779 	lb_blkcnt = ((setno == MD_LOCAL_SET) ? MDDB_LOCAL_LBCNT : MDDB_LBCNT);
4780 
4781 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
4782 		rip->ri_flags &= (MDDB_F_PTCHED | MDDB_F_IOCTL |
4783 		    MDDB_F_EMASTER);
4784 		rip->ri_lbp = (mddb_lb_t *)NULL;
4785 		rip->ri_did_icp = (mddb_did_ic_t *)NULL;
4786 
4787 		/*
4788 		 * Translated dev is only used in calls to getmasters and
4789 		 * getblks which expect a translated (aka miniroot) dev.
4790 		 */
4791 		dev = md_xlate_targ_2_mini(rip->ri_dev);
4792 		if (dev == NODEV64) {
4793 			/* Set error flag that getmasters would have set */
4794 			/* if getmasters had been allowed to fail */
4795 			rip->ri_flags |= MDDB_F_EMASTER;
4796 		}
4797 
4798 		/*
4799 		 * Invalid device id on system (due to failed or
4800 		 * removed device) or invalid devt during upgrade
4801 		 * (due to powered off device) will cause this
4802 		 * replica to be marked in error and not used.
4803 		 */
4804 		if (rip->ri_flags & MDDB_F_EMASTER)
4805 			continue;
4806 
4807 		/* get all master blocks, does mddb_devopen() */
4808 		rip->ri_mbip = getmasters(s, dev, rip->ri_blkno,
4809 		    &rip->ri_flags, &mn_set);
4810 
4811 		/* if invalid master block - try next replica */
4812 		if (! rip->ri_mbip)
4813 			continue;
4814 
4815 		/*
4816 		 * If lbp alloc'd to wrong size - reset it.
4817 		 * If MN set, lb_blkcnt must be MDDB_MNLBCNT.
4818 		 * If a traditional set, lb_blkcnt must NOT be MDDB_MNLBCNT.
4819 		 */
4820 		if (lbp) {
4821 			if (((mn_set) && (lb_blkcnt != MDDB_MNLBCNT)) ||
4822 			    ((!mn_set) && (lb_blkcnt == MDDB_MNLBCNT))) {
4823 				kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
4824 				lbp = (mddb_lb_t *)NULL;
4825 			}
4826 		}
4827 
4828 		if (lbp == (mddb_lb_t *)NULL) {
4829 			/* If a MN set, set lb_blkcnt for MN loc blk size */
4830 			if (mn_set)
4831 				lb_blkcnt = MDDB_MNLBCNT;
4832 			lbp = (mddb_lb_t *)kmem_zalloc(dbtob(lb_blkcnt),
4833 			    KM_SLEEP);
4834 		}
4835 
4836 		/*
4837 		 * Read in all the sectors for the locator block
4838 		 * NOTE: Need to use getblks, rather than readblklst.
4839 		 *	because it is too early and things are
4840 		 *	NOT set up yet for read*()'s
4841 		 */
4842 		buffer = (caddr_t)lbp;
4843 		for (blk = 0; blk < lb_blkcnt; blk++) {
4844 			physblk = getphysblk(blk, rip->ri_mbip);
4845 			err = getblks(s, buffer, dev, physblk,
4846 			    btodb(MDDB_BSIZE), 0);
4847 			if (err) {
4848 				rip->ri_flags |= err;
4849 				break;
4850 			}
4851 			buffer += MDDB_BSIZE;
4852 		}
4853 
4854 		if (err)
4855 			continue;
4856 
4857 		/* Verify the locator block */
4858 		if (blk != lb_blkcnt)
4859 			continue;
4860 		if (lbp->lb_magic != MDDB_MAGIC_LB)
4861 			continue;
4862 		if (lbp->lb_blkcnt != lb_blkcnt)
4863 			continue;
4864 		if (mn_set) {
4865 			/* If a MN set, check for MNLB revision in lb. */
4866 			if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
4867 				continue;
4868 		} else {
4869 			/* If not a MN set, check for LB revision in lb. */
4870 			if (revchk(MDDB_REV_LB, lbp->lb_revision))
4871 				continue;
4872 		}
4873 		if (crcchk(lbp, &lbp->lb_checksum, dbtob(lb_blkcnt), NULL))
4874 			continue;
4875 
4876 		/*
4877 		 * With the addition of MultiNode Disksets, we must make sure
4878 		 * to verify that this is the correct set.  A node could
4879 		 * have been out of the config for awhile and this disk could
4880 		 * have been moved to a different diskset and we don't want
4881 		 * to accidentally start the wrong set.
4882 		 *
4883 		 * We don't do this check if we're in the middle of
4884 		 * importing a set.
4885 		 */
4886 		if (!(md_get_setstatus(s->s_setno) &
4887 		    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
4888 		    (lbp->lb_setno != s->s_setno))
4889 			continue;
4890 
4891 		rip->ri_flags |= MDDB_F_LOCACC;
4892 
4893 		/*
4894 		 * a commit count of zero means this locator has been deleted
4895 		 */
4896 		if (lbp->lb_commitcnt == 0)
4897 			continue;
4898 
4899 		/*
4900 		 * If replica is in the device ID style and md_devid_destroy
4901 		 * flag is set, turn off device id style.  This is only to be
4902 		 * used in a catastrophic failure case.  Examples would be
4903 		 * where the device id of all drives in the system
4904 		 * (especially the mirror'd root drives) had been changed
4905 		 * by firmware upgrade or by a patch to an existing disk
4906 		 * driver.  Another example would be in the case of non-unique
4907 		 * device ids due to a bug.  The device id would be valid on
4908 		 * the system, but would return the wrong dev_t.
4909 		 */
4910 		if ((lbp->lb_flags & MDDB_DEVID_STYLE) && md_devid_destroy) {
4911 			lbp->lb_flags &= ~MDDB_DEVID_STYLE;
4912 			lbp->lb_didfirstblk = 0;
4913 			lbp->lb_didblkcnt = 0;
4914 			*write_lb = 1;
4915 		}
4916 
4917 
4918 		/*
4919 		 * If replica is in device ID style, read in device ID
4920 		 * block and verify device ID block information.
4921 		 */
4922 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4923 
4924 			/* Read in device ID block */
4925 			if (did_icp == NULL) {
4926 				did_icp = (mddb_did_ic_t *)
4927 				    kmem_zalloc(sizeof (mddb_did_ic_t),
4928 				    KM_SLEEP);
4929 			} else {
4930 				/* Reuse did_icp, but clear out data */
4931 				if (did_icp->did_ic_blkp !=
4932 				    (mddb_did_blk_t *)NULL) {
4933 					kmem_free((caddr_t)did_icp->did_ic_blkp,
4934 					    did_blkp_sz);
4935 					did_blkp = (mddb_did_blk_t *)NULL;
4936 					did_icp->did_ic_blkp =
4937 					    (mddb_did_blk_t *)NULL;
4938 				}
4939 				if (did_icp->did_ic_dbp !=
4940 				    (mddb_did_db_t *)NULL) {
4941 					did_dbp1 = did_icp->did_ic_dbp;
4942 					while (did_dbp1) {
4943 						did_dbp2 = did_dbp1->db_next;
4944 						kmem_free((caddr_t)
4945 						    did_dbp1->db_ptr,
4946 						    dbtob(did_dbp1->db_blkcnt));
4947 						kmem_free((caddr_t)did_dbp1,
4948 						    sizeof (mddb_did_db_t));
4949 						did_dbp1 = did_dbp2;
4950 					}
4951 					did_icp->did_ic_dbp =
4952 					    (mddb_did_db_t *)NULL;
4953 				}
4954 				for (i = 0; i < MDDB_NLB; i++) {
4955 					did_icp->did_ic_devid[i] =
4956 					    (ddi_devid_t)NULL;
4957 				}
4958 			}
4959 
4960 			/* Can't reuse blkp since size could be different */
4961 			if (did_blkp != (mddb_did_blk_t *)NULL) {
4962 				kmem_free(did_blkp, did_blkp_sz);
4963 			}
4964 			did_blkp_sz = (int)dbtob(lbp->lb_didblkcnt);
4965 			did_blkp = (mddb_did_blk_t *)kmem_zalloc(did_blkp_sz,
4966 			    KM_SLEEP);
4967 			did_icp->did_ic_blkp = did_blkp;
4968 			buffer = (caddr_t)did_blkp;
4969 			for (blk = lbp->lb_didfirstblk;
4970 			    blk < (lbp->lb_didblkcnt + lbp->lb_didfirstblk);
4971 			    blk++) {
4972 				physblk = getphysblk(blk, rip->ri_mbip);
4973 				err = getblks(s, buffer, dev, physblk,
4974 				    btodb(MDDB_BSIZE), 0);
4975 				if (err) {
4976 					rip->ri_flags |= err;
4977 					break;
4978 				}
4979 				buffer += MDDB_BSIZE;
4980 			}
4981 			if (err)
4982 				continue;
4983 
4984 			/* Verify the Device ID block */
4985 			if (blk != (lbp->lb_didblkcnt + lbp->lb_didfirstblk))
4986 				continue;
4987 			if (did_blkp->blk_magic != MDDB_MAGIC_DI)
4988 				continue;
4989 			if (lbp->lb_didblkcnt != MDDB_DID_BLOCKS)
4990 				continue;
4991 			if (revchk(MDDB_REV_DI, did_blkp->blk_revision))
4992 				continue;
4993 			if (crcchk(did_blkp, &did_blkp->blk_checksum,
4994 			    dbtob(lbp->lb_didblkcnt), NULL))
4995 				continue;
4996 
4997 			/*
4998 			 * Check if device ID block is out of sync with the
4999 			 * Locator Block by checking if the locator block
5000 			 * commitcnt does not match the device id block
5001 			 * commitcnt.  If an 'out of sync' condition
5002 			 * exists, discard this replica since it has
5003 			 * inconsistent data and can't be used in
5004 			 * determining the best replica.
5005 			 *
5006 			 * An 'out of sync' condition could happen if old
5007 			 * SDS code was running with new devid style replicas
5008 			 * or if a failure occurred between the writing of
5009 			 * the locator block's commitcnt and the device
5010 			 * id block's commitcnt.
5011 			 *
5012 			 * If old SDS code had been running, the upgrade
5013 			 * process should detect this situation and
5014 			 * have removed all of the device id information
5015 			 * via the md_devid_destroy flag in md.conf.
5016 			 */
5017 			if (did_blkp->blk_commitcnt !=
5018 			    lbp->lb_commitcnt) {
5019 				continue;
5020 			}
5021 		}
5022 
5023 
5024 		/*
5025 		 * If replica is still in device ID style, read in all
5026 		 * of the device IDs, verify the checksum of the device IDs.
5027 		 */
5028 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5029 			/*
5030 			 * Reset valid bit in device id info block flags. This
5031 			 * flag is stored on disk, but the valid bit is reset
5032 			 * when reading in the replica.  If the corresponding
5033 			 * device id is valid (aka meaning that the system
5034 			 * knows about this device id), the valid bit will
5035 			 * be set at a later time.  The valid bit for this
5036 			 * replica's device ID will be set in this routine.
5037 			 * The valid bits for the rest of the device id's
5038 			 * will be set after the 'best' replica has
5039 			 * been selected in routine load_old_replicas.
5040 			 * Reset updated bit in device id info block flags.
5041 			 * This flag is also stored on disk, reset when read
5042 			 * in and set when the locators and side locators
5043 			 * have been updated to match this valid device
5044 			 * id information.
5045 			 */
5046 			for (li = 0; li < lbp->lb_loccnt; li++) {
5047 				did_info = &did_blkp->blk_info[li];
5048 				if (did_info->info_flags & MDDB_DID_EXISTS)
5049 					did_info->info_flags &=
5050 					    ~(MDDB_DID_VALID |
5051 					    MDDB_DID_UPDATED);
5052 			}
5053 
5054 			cont_flag = 0;
5055 			for (li = 0; li < lbp->lb_loccnt; li++) {
5056 				did_info = &did_blkp->blk_info[li];
5057 				did_block = (caddr_t)NULL;
5058 				if (did_info->info_flags & MDDB_DID_EXISTS) {
5059 					/*
5060 					 * Check if block has
5061 					 * already been read in
5062 					 */
5063 					did_dbp = did_icp->did_ic_dbp;
5064 					while (did_dbp != 0) {
5065 						if (did_dbp->db_firstblk ==
5066 						    did_info->info_firstblk)
5067 							break;
5068 						else
5069 							did_dbp =
5070 							    did_dbp->db_next;
5071 					}
5072 					/* if block not found, read it in */
5073 					if (did_dbp == NULL) {
5074 						did_block = (caddr_t)
5075 						    (kmem_zalloc(dbtob(
5076 						    did_info->info_blkcnt),
5077 						    KM_SLEEP));
5078 						buffer = (caddr_t)did_block;
5079 						for (blk =
5080 						    did_info->info_firstblk;
5081 						    blk < (did_info->
5082 						    info_firstblk +
5083 						    did_info->info_blkcnt);
5084 						    blk++) {
5085 							physblk =
5086 							    getphysblk(blk,
5087 							    rip->ri_mbip);
5088 							err = getblks(s,
5089 							    buffer, dev,
5090 							    physblk, btodb(
5091 							    MDDB_BSIZE), 0);
5092 							if (err) {
5093 								rip->ri_flags |=
5094 								    err;
5095 								break;
5096 							}
5097 							buffer += MDDB_BSIZE;
5098 						}
5099 						if (err) {
5100 							kmem_free(did_block,
5101 							    dbtob(did_info->
5102 							    info_blkcnt));
5103 							did_block =
5104 							    (caddr_t)NULL;
5105 							cont_flag = 1;
5106 							break;
5107 						}
5108 
5109 						/*
5110 						 * Block read in -
5111 						 * alloc Disk Block area
5112 						 */
5113 						did_dbp = (mddb_did_db_t *)
5114 						    kmem_zalloc(
5115 						    sizeof (mddb_did_db_t),
5116 						    KM_SLEEP);
5117 						did_dbp->db_ptr = did_block;
5118 						did_dbp->db_firstblk =
5119 						    did_info->info_firstblk;
5120 						did_dbp->db_blkcnt =
5121 						    did_info->info_blkcnt;
5122 
5123 						/* Add to front of dbp list */
5124 						did_dbp->db_next =
5125 						    did_icp->did_ic_dbp;
5126 						did_icp->did_ic_dbp = did_dbp;
5127 					}
5128 					/* Check validity of devid in block */
5129 					if (crcchk(((char *)did_dbp->db_ptr +
5130 					    did_info->info_offset),
5131 					    &did_info->info_checksum,
5132 					    did_info->info_length, NULL)) {
5133 						cont_flag = 1;
5134 						break;
5135 					}
5136 
5137 					/* Block now pointed to by did_dbp */
5138 					did_icp->did_ic_devid[li] =
5139 					    (ddi_devid_t)((char *)
5140 					    did_dbp->db_ptr +
5141 					    did_info->info_offset);
5142 				}
5143 			}
5144 			if (cont_flag)
5145 				continue;
5146 		}
5147 
5148 		/*
5149 		 * All blocks containing devids are now in core.
5150 		 */
5151 
5152 		/*
5153 		 * If we're doing a replicated import (also known as
5154 		 * remote copy import), the device id in the locator
5155 		 * block is incorrect and we need to fix it up here
5156 		 * alongwith the l_dev otherwise we run into lots of
5157 		 * trouble later on.
5158 		 */
5159 		if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
5160 			mddb_ri_t	*trip;
5161 			for (li = 0; li < lbp->lb_loccnt; li++) {
5162 				did_info = &did_blkp->blk_info[li];
5163 				lp = &lbp->lb_locators[li];
5164 
5165 				if (lp->l_flags & MDDB_F_DELETED)
5166 					continue;
5167 
5168 				if (!(did_info->info_flags & MDDB_DID_EXISTS))
5169 					continue;
5170 
5171 				if (did_icp->did_ic_devid[li] == NULL)
5172 					continue;
5173 
5174 				for (trip = s->s_rip; trip != NULL;
5175 				    trip = trip->ri_next) {
5176 					if (trip->ri_old_devid == NULL)
5177 						continue;
5178 					if (ddi_devid_compare(
5179 					    trip->ri_old_devid,
5180 					    did_icp->did_ic_devid[li]) != 0) {
5181 						continue;
5182 					}
5183 
5184 					/* update l_dev and side mnum */
5185 					lp->l_dev = md_cmpldev(trip->ri_dev);
5186 					lbp->lb_sidelocators[0][li].l_mnum =
5187 					    md_getminor(trip->ri_dev);
5188 				}
5189 			}
5190 		}
5191 
5192 		/*
5193 		 * If there is a valid devid, verify that this locator
5194 		 * block has information about itself by checking the
5195 		 * device ID, minor_name and block
5196 		 * number from this replica's incore data structure
5197 		 * against the locator block information that has just
5198 		 * been read in from disk.
5199 		 *
5200 		 * If not a valid devid, verify that this locator block
5201 		 * has information about itself by checking the minor
5202 		 * number, block number and driver name from this
5203 		 * replica's incore data structure against the locator
5204 		 * block information that has just been read in from disk.
5205 		 */
5206 		if ((rip->ri_devid != NULL) &&
5207 		    (lbp->lb_flags & MDDB_DEVID_STYLE)) {
5208 			/*
5209 			 * This locator block MUST have locator (replica)
5210 			 * information about itself.  Check against devid,
5211 			 * slice part of minor number, and block number.
5212 			 */
5213 			for (li = 0; li < lbp->lb_loccnt; li++) {
5214 				did_info = &did_blkp->blk_info[li];
5215 				lp = &lbp->lb_locators[li];
5216 				if (lp->l_flags & MDDB_F_DELETED)
5217 					continue;
5218 
5219 				if (!(did_info->info_flags & MDDB_DID_EXISTS))
5220 					continue;
5221 
5222 				if (((md_get_setstatus(setno) &
5223 				    MD_SET_REPLICATED_IMPORT)) &&
5224 				    (rip->ri_old_devid != (ddi_devid_t)NULL)) {
5225 					if (ddi_devid_compare(rip->ri_old_devid,
5226 					    did_icp->did_ic_devid[li]) != 0)
5227 						continue;
5228 				} else {
5229 					if (ddi_devid_compare(rip->ri_devid,
5230 					    did_icp->did_ic_devid[li]) != 0)
5231 						continue;
5232 				}
5233 
5234 				if (strcmp(rip->ri_minor_name,
5235 				    did_info->info_minor_name) != 0)
5236 					continue;
5237 
5238 				if (lp->l_blkno == rip->ri_blkno)
5239 					break;
5240 			}
5241 		} else {
5242 			/*
5243 			 * This locator block MUST have locator (replica)
5244 			 * information about itself.
5245 			 */
5246 			if (!mn_set) {
5247 				for (li = 0; li < lbp->lb_loccnt; li++) {
5248 					mddb_drvnm_t		*dn;
5249 					mddb_sidelocator_t	*slp;
5250 
5251 					lp = &lbp->lb_locators[li];
5252 					slp = &lbp->
5253 					    lb_sidelocators[s->s_sideno][li];
5254 					if (lp->l_flags & MDDB_F_DELETED)
5255 						continue;
5256 					if (slp->l_mnum != md_getminor(
5257 					    rip->ri_dev))
5258 						continue;
5259 					if (lp->l_blkno != rip->ri_blkno)
5260 						continue;
5261 					dn = &lbp->lb_drvnm[slp->l_drvnm_index];
5262 					if (strncmp(dn->dn_data,
5263 					    rip->ri_driver, MD_MAXDRVNM) == 0)
5264 						break;
5265 				}
5266 			} else {
5267 				for (li = 0; li < lbp->lb_loccnt; li++) {
5268 					mddb_drvnm_t		*dn;
5269 					mddb_mnsidelocator_t	*mnslp;
5270 					mddb_mnlb_t		*mnlbp;
5271 					int			i;
5272 
5273 					/*
5274 					 * Check all possible locators locking
5275 					 * for match to the currently read-in
5276 					 * locator, must match on:
5277 					 *	- blkno
5278 					 *	- side locator for this
5279 					 *	  node's side
5280 					 *	- side locator minor number
5281 					 *	- side locator driver name
5282 					 */
5283 
5284 					/*
5285 					 * Looking at sidelocs:
5286 					 * cast lbp -> mnlbp
5287 					 */
5288 					mnlbp = (mddb_mnlb_t *)lbp;
5289 					lp = &mnlbp->lb_locators[li];
5290 					if (lp->l_flags & MDDB_F_DELETED)
5291 						continue;
5292 					if (lp->l_blkno != rip->ri_blkno)
5293 						continue;
5294 
5295 					for (i = 0; i < MD_MNMAXSIDES; i++) {
5296 						mnslp = &mnlbp->
5297 						    lb_mnsidelocators[i][li];
5298 						if (mnslp->mnl_sideno ==
5299 						    s->s_sideno) {
5300 							break;
5301 						}
5302 					}
5303 					/* No matching side found */
5304 					if (i == MD_MNMAXSIDES)
5305 						continue;
5306 					if (mnslp->mnl_mnum !=
5307 					    md_getminor(rip->ri_dev))
5308 						continue;
5309 					dn = &lbp->
5310 					    lb_drvnm[mnslp->mnl_drvnm_index];
5311 					if (strncmp(dn->dn_data,
5312 					    rip->ri_driver, MD_MAXDRVNM) == 0)
5313 						break;
5314 				}
5315 			}
5316 		}
5317 
5318 		/*
5319 		 * Didn't find ourself in this locator block it means
5320 		 * the locator block is a stale transplant. Probably from
5321 		 * a user doing a dd.
5322 		 */
5323 		if (li == lbp->lb_loccnt)
5324 			continue;
5325 
5326 		/*
5327 		 * Keep track of the number of accessed and valid
5328 		 * locator blocks.
5329 		 */
5330 		lb_ok++;
5331 
5332 		/*
5333 		 * Read the tag in, skips invalid or blank tags.
5334 		 * Only valid tags allocate storage
5335 		 * Data tags are not used in MN disksets.
5336 		 */
5337 		if ((!mn_set) && (! dt_read(s, lbp, rip))) {
5338 			/*
5339 			 * Keep track of the number of tagged
5340 			 * locator blocks.
5341 			 */
5342 			lb_tagged++;
5343 
5344 			/* Keep a list of unique tags. */
5345 			(void) dtl_addl(s, &rip->ri_dtp->dt_dtag);
5346 		}
5347 
5348 		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
5349 			/*
5350 			 * go through locator block and add any other
5351 			 * locations of the data base.
5352 			 * For the replicated import case, this was done earlier
5353 			 * and we really don't need or want to do so again
5354 			 */
5355 			cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
5356 			for (li = 0; li < lbp->lb_loccnt; li++) {
5357 				lp = &lbp->lb_locators[li];
5358 				if (lp->l_flags & MDDB_F_DELETED)
5359 					continue;
5360 
5361 				cl->l_devid_flags = MDDB_DEVID_GETSZ;
5362 				cl->l_devid = (uint64_t)0;
5363 				cl->l_devid_sz = 0;
5364 				cl->l_old_devid = (uint64_t)0;
5365 				cl->l_old_devid_sz = 0;
5366 				cl->l_minor_name[0] = '\0';
5367 				locator2cfgloc(lbp, cl, li, s->s_sideno,
5368 				    did_icp);
5369 
5370 				if (cl->l_devid_flags & MDDB_DEVID_SZ) {
5371 					if ((cl->l_devid = (uintptr_t)kmem_alloc
5372 					    (cl->l_devid_sz, KM_SLEEP))
5373 					    == NULL) {
5374 						continue;
5375 					} else {
5376 						cl->l_devid_flags =
5377 						    MDDB_DEVID_SPACE;
5378 					}
5379 				}
5380 				locator2cfgloc(lbp, cl, li, s->s_sideno,
5381 				    did_icp);
5382 
5383 				(void) ridev(&s->s_rip, cl, &lp->l_dev, 0);
5384 
5385 				if (cl->l_devid_flags & MDDB_DEVID_SPACE)
5386 					kmem_free((caddr_t)(uintptr_t)
5387 					    cl->l_devid, cl->l_devid_sz);
5388 			}
5389 			kmem_free(cl, sizeof (mddb_cfg_loc_t));
5390 		}
5391 
5392 		/* Save LB for later */
5393 		rip->ri_lbp = lbp;
5394 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5395 			rip->ri_did_icp = did_icp;
5396 			did_icp = (mddb_did_ic_t *)NULL;
5397 			did_blkp = (mddb_did_blk_t *)NULL;
5398 		} else
5399 			rip->ri_did_icp = NULL;
5400 		lbp = (mddb_lb_t *)NULL;
5401 	}
5402 
5403 	if (lbp != (mddb_lb_t *)NULL)
5404 		kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
5405 
5406 	if (did_icp != (mddb_did_ic_t *)NULL) {
5407 		if (did_icp->did_ic_blkp != (mddb_did_blk_t *)NULL) {
5408 			kmem_free((caddr_t)did_icp->did_ic_blkp, did_blkp_sz);
5409 			did_blkp = (mddb_did_blk_t *)NULL;
5410 		}
5411 		if (did_icp->did_ic_dbp != (mddb_did_db_t *)NULL) {
5412 			mddb_did_db_t	*did_dbp1, *did_dbp2;
5413 
5414 			did_dbp1 = did_icp->did_ic_dbp;
5415 			while (did_dbp1) {
5416 				did_dbp2 = did_dbp1->db_next;
5417 				kmem_free((caddr_t)did_dbp1->db_ptr,
5418 				    dbtob(did_dbp1->db_blkcnt));
5419 				kmem_free((caddr_t)did_dbp1,
5420 				    sizeof (mddb_did_db_t));
5421 				did_dbp1 = did_dbp2;
5422 			}
5423 		}
5424 		kmem_free((caddr_t)did_icp, sizeof (mddb_did_ic_t));
5425 	}
5426 
5427 	if (did_blkp != (mddb_did_blk_t *)NULL) {
5428 		kmem_free((caddr_t)did_blkp, did_blkp_sz);
5429 	}
5430 
5431 	/* No locator blocks were ok */
5432 	if (lb_ok == 0)
5433 		goto out;
5434 
5435 	/* No tagged data was found - will be 0 for MN diskset */
5436 	if (lb_tagged == 0)
5437 		goto out;
5438 
5439 	/* Find the highest non-deleted replica count */
5440 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5441 		int		lb_tot = 0;
5442 
5443 		if (rip->ri_mbip == (mddb_mb_ic_t *)NULL)
5444 			continue;
5445 
5446 		if (rip->ri_lbp == (mddb_lb_t *)NULL)
5447 			continue;
5448 
5449 		for (li = 0; li < rip->ri_lbp->lb_loccnt; li++) {
5450 			lp = &rip->ri_lbp->lb_locators[li];
5451 			if (lp->l_flags & MDDB_F_DELETED)
5452 				continue;
5453 			lb_tot++;
5454 		}
5455 
5456 		if (lb_tot > lb_total)
5457 			lb_total = lb_tot;
5458 	}
5459 
5460 	/* Count the number of unique tags */
5461 	for (lb_tags = 0, dtlp = s->s_dtlp; dtlp != NULL; dtlp = dtlp->dtl_nx)
5462 		lb_tags++;
5463 
5464 	/* Should have at least one tag at this point */
5465 	ASSERT(lb_tags > 0);
5466 
5467 
5468 	/*
5469 	 * If the number of tagged locators is not the same as the number of
5470 	 * OK locators OR more than one tag exists, then make sure the
5471 	 * selected tag will be written out later.
5472 	 */
5473 	if ((lb_tagged - lb_ok) != 0 || lb_tags > 1)
5474 		md_set_setstatus(setno, MD_SET_TAGDATA);
5475 
5476 	/* Only a single tag, take the tagged data */
5477 	if (lb_tags == 1) {
5478 		dt_setup(s, &s->s_dtlp->dtl_dt);
5479 		md_set_setstatus(setno, MD_SET_USETAG);
5480 		goto out;
5481 	}
5482 
5483 	/* Multiple tags, not selecting a tag, tag mode is on */
5484 	if (! (md_get_setstatus(setno) & MD_SET_USETAG))
5485 		retval = MDDB_E_TAGDATA;
5486 
5487 out:
5488 
5489 	return (retval);
5490 }
5491 
5492 /*
5493  *	1. Select a locator.
5494  *	2. check if enough locators now have current copies
5495  *	3. read in database from one of latest
5496  *	4. if known to have latest make all database the same
5497  *	5. if configuration has changed rewrite locators
5498  *
5499  * Parameters:
5500  * 	s - pointer to mddb_set structure
5501  *	flag - used in MN disksets to tell if this node is being joined to
5502  *		a diskset that is in the STALE state.  If the flag is
5503  *		MDDB_MN_STALE, then this node should be marked in the STALE
5504  *		state even if > 50% mddbs are available.  (The diskset can
5505  *		only change from STALE->OK if all nodes withdraw from the
5506  *		MN diskset and then rejoin).
 *
 * Returns:
 *	0, the non-zero value returned by get_mbs_n_lbs(), or one of
 *	MDDB_E_NOLOCBLK, MDDB_E_NODEVID, MDDB_E_NOSPACE, MDDB_E_NOLOCNMS,
 *	MDDB_E_NODIRBLK.  The bail-outs taken when selectreplicas(),
 *	update_locatorblock() or push_lb() return non-zero jump to errout
 *	without assigning retval, so 0 is returned on those paths.
 *
 * The errout label is reached on the success path as well as on every
 * failure path; the code below it releases the replica info (rip)
 * components that are not needed any more.
5507  */
5508 static int
5509 load_old_replicas(
5510 	mddb_set_t	*s,
5511 	int		flag
5512 )
5513 {
5514 	mddb_lb_t	*lbp = NULL;
5515 	mddb_mnlb_t	*mnlbp = NULL;
5516 	mddb_ri_t	*rip;
5517 	mddb_locator_t	*lp;
5518 	mddb_db_t	*dbp;
5519 	mddb_de_ic_t	*dep;
5520 	int		li;
5521 	int		alc;
5522 	int		lc;
5523 	int		tlc;
5524 	int		retval = 0;
5525 	caddr_t		p;
5526 	size_t		maxrecsize;
5527 	set_t		setno = s->s_setno;
5528 	mddb_did_db_t	*did_dbp1;
5529 	mddb_did_info_t	*did_info;
5530 	mddb_did_ic_t	*did_icp = NULL;
5531 	md_dev64_t	*newdev;
5532 	mddb_sidelocator_t	*slp = 0;
5533 	mddb_mnsidelocator_t	*mnslp = 0;
5534 	uchar_t		i;
5535 	char		*name;
5536 	ddi_devid_t	ret_devid;
5537 	md_dev64_t	dev;
5538 	uint_t		len, sz;
5539 	char		*minor_name;
5540 	int		write_lb = 0;
5541 	int		rval;
5542 	int		stale_rtn = 0;
5543 
5544 	/* The only error path out of get_mbs_n_lbs() is MDDB_E_TAGDATA */
	/* lbp is still NULL here, so the errout cleanup takes its !lbp path */
5545 	if (retval = get_mbs_n_lbs(s, &write_lb))
5546 		goto errout;
5547 
	/* The selected locator block is kept in both lbp and s->s_lbp */
5548 	if ((lbp = s->s_lbp = selectlocator(s)) == NULL) {
5549 		retval = MDDB_E_NOLOCBLK;
5550 		goto errout;
5551 	}
5552 
5553 	/* If a multi-node set, then set md_set.s_status flag */
5554 	if (lbp->lb_flags & MDDB_MNSET) {
5555 		md_set_setstatus(setno, MD_SET_MNSET);
5556 		/*
5557 		 * If data tag area had been allocated before set type was
5558 		 * known - free it now.
5559 		 */
5560 		if (md_set[setno].s_dtp) {
5561 			kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
5562 			md_set[setno].s_dtp = NULL;
5563 		}
5564 	}
5565 
5566 	/*
5567 	 * If the replica is in devid format, setup the devid incore ptr.
5568 	 */
5569 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
		/* Find the rip that owns the selected locator block */
5570 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5571 			if (rip->ri_lbp == s->s_lbp) {
5572 				did_icp = s->s_did_icp = rip->ri_did_icp;
5573 				break;
5574 			}
5575 		}
5576 		/*
5577 		 * If no devid incore info found - something has gone
5578 		 * wrong so errout.
5579 		 */
5580 		if (rip == NULL) {
5581 			retval = MDDB_E_NODEVID;
5582 			goto errout;
5583 		}
5584 
5585 		/*
5586 		 * Add all blocks containing devids to free list.
5587 		 * Then remove addresses that actually contain devids.
5588 		 */
5589 		did_dbp1 = did_icp->did_ic_dbp;
5590 		while (did_dbp1) {
5591 			if (mddb_devid_free_add(s, did_dbp1->db_firstblk,
5592 			    0, dbtob(did_dbp1->db_blkcnt))) {
5593 				retval = MDDB_E_NOSPACE;
5594 				goto errout;
5595 			}
5596 
5597 			did_dbp1 = did_dbp1->db_next;
5598 		}
5599 		for (li = 0; li < lbp->lb_loccnt; li++) {
5600 			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5601 			if (!(did_info->info_flags & MDDB_DID_EXISTS))
5602 				continue;
5603 
5604 			if (mddb_devid_free_delete(s, did_info->info_firstblk,
5605 			    did_info->info_offset, did_info->info_length)) {
5606 				/* unable to find disk block */
5607 				retval = MDDB_E_NODEVID;
5608 				goto errout;
5609 			}
5610 		}
5611 	}
5612 
5613 	/*
5614 	 * create mddb_mbaray, count all locators and active locators.
	 * lc counts the non-deleted locators, alc the ones that end up with
	 * MDDB_F_LOCACC set.
5615 	 */
5616 	alc = 0;
5617 	lc = 0;
5618 	for (li = 0; li < lbp->lb_loccnt; li++) {
5619 		ddi_devid_t	li_devid;
5620 
5621 		lp = &lbp->lb_locators[li];
5622 
5623 		if (lp->l_flags & MDDB_F_DELETED)
5624 			continue;
5625 
5626 		/* Count non-deleted replicas */
5627 		lc++;
5628 
5629 		/*
5630 		 * Use the devid of this locator to compare with the rip
5631 		 * list.  The scenario to watch out for here is that this
5632 		 * locator could be on a disk that is dead and there could
5633 		 * be a valid entry in the rip list for a different disk
5634 		 * that has been moved to the dead disks dev_t.  We don't
5635 		 * want to match with the moved disk.
5636 		 */
5637 		li_devid = NULL;
5638 		(void) mddb_devid_get(s, li, &li_devid, &minor_name);
5639 
5640 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5641 			if (match_mddb(rip, li_devid, minor_name,
5642 			    md_expldev(lp->l_dev), lp->l_blkno)) {
5643 				break;
5644 			}
5645 		}
5646 		if (rip == NULL) {
5647 			/*
5648 			 * If rip not found, then mark error in master block
5649 			 * so that no writes are later attempted to this
5650 			 * replica.  rip may not be setup if ridev
5651 			 * failed due to un-found driver name.
5652 			 */
5653 			lp->l_flags |= MDDB_F_EMASTER;
5654 			continue;
5655 		}
5656 
5657 		s->s_mbiarray[li] = rip->ri_mbip;
5658 
		/*
		 * Drop every locator flag except MDDB_F_ACTIVE, then merge
		 * in the flags recorded in the matching rip.
		 */
5659 		lp->l_flags &= MDDB_F_ACTIVE;
5660 		lp->l_flags |= (int)rip->ri_flags;
5661 
5662 		if (rip->ri_transplant)
5663 			lp->l_flags &= ~MDDB_F_ACTIVE;
5664 
5665 		if (lp->l_flags & MDDB_F_LOCACC)
5666 			alc++;
5667 	}
5668 
5669 	/* Save on a divide - calculate 50% + 1 up front */
	/* As written, tlc is lc / 2 rounded up; alc is compared against it */
5670 	tlc = ((lc + 1) / 2);
5671 
5672 	if (alc > tlc) {		/* alc > tlc		- OK */
5673 		md_clr_setstatus(setno, MD_SET_STALE);
5674 	} else if (alc < tlc) {		/* alc < tlc		- stale */
5675 		md_set_setstatus(setno, MD_SET_STALE);
5676 	} else if (lc & 1) {		/* alc == tlc && odd	- OK */
5677 		md_clr_setstatus(setno, MD_SET_STALE);
5678 	} else {			/* alc == tlc && even	- ? */
5679 		/* Can do an accept, and are */
5680 		if (md_get_setstatus(setno) & (MD_SET_ACCOK | MD_SET_ACCEPT)) {
5681 			md_clr_setstatus(setno, MD_SET_STALE);
5682 		} else {		/* possibly has a mediator */
			/* A non-zero return from mediate() leaves the set STALE */
5683 			if (mediate(s)) {
5684 				md_set_setstatus(setno, MD_SET_STALE);
5685 			} else {
5686 				md_clr_setstatus(setno, MD_SET_STALE);
5687 			}
5688 		}
5689 
5690 		/*
5691 		 * The mirrored_root_flag allows the sysadmin to decide to
5692 		 * start the local set in a read/write (non-stale) mode
5693 		 * when there are only 50% available mddbs on the system and
5694 		 * when the root file system is on a mirror.  This is useful
5695 		 * in a 2 disk system where 1 disk failure would cause an mddb
5696 		 * quorum failure and subsequent boot failures since the root
5697 		 * filesystem would be in a read-only state.
5698 		 */
5699 		if (mirrored_root_flag == 1 && setno == 0 &&
5700 		    svm_bootpath[0] != 0) {
5701 			md_clr_setstatus(setno, MD_SET_STALE);
5702 		} else {
5703 			if (md_get_setstatus(setno) & MD_SET_STALE) {
5704 				/* Allow half mode - CAREFUL! */
5705 				if (mddb_allow_half)
5706 					md_clr_setstatus(setno, MD_SET_STALE);
5707 			}
5708 		}
5709 
5710 		/*
5711 		 * In a MN diskset,
5712 		 *	- if 50% mddbs are unavailable and this
5713 		 *		has been marked STALE above
5714 		 * 	- master node isn't in the STALE state
5715 		 *	- this node isn't the master node (this node
5716 		 *		isn't the first node to join the set)
5717 		 * then clear the STALE state and set TOOFEW.
5718 		 *
5719 		 * If this node is the master node and set was marked STALE,
5720 		 * then the set stays STALE.
5721 		 *
5722 		 * If this node is not the master and this node's state is
5723 		 * STALE and the master node is not marked STALE,
5724 		 * then master node must be in the TOOFEW state or the
5725 		 * master is panic'ing.  A MN diskset can only be placed into
5726 		 * the STALE state by having the first node join the set
5727 		 * with <= 50% mddbs.  There's no way for a MN diskset to
5728 		 * transition between STALE and not-STALE states unless all
5729 		 * nodes are withdrawn from the diskset or all nodes in the
5730 		 * diskset are rebooted at the same time.
5731 		 *
5732 		 * So, mark this node's state as TOOFEW instead of STALE.
5733 		 */
5734 		if (((md_get_setstatus(setno) & (MD_SET_MNSET | MD_SET_STALE))
5735 		    == (MD_SET_MNSET | MD_SET_STALE)) &&
5736 		    ((flag & MDDB_MN_STALE) == 0) &&
5737 		    (!(md_set[setno].s_am_i_master))) {
5738 			md_clr_setstatus(setno, MD_SET_STALE);
5739 			md_set_setstatus(setno, MD_SET_TOOFEW);
5740 		}
5741 	}
5742 
5743 	/*
5744 	 * If a MN set is marked STALE on the other nodes,
5745 	 * mark it stale here.  Override all other considerations
5746 	 * such as a mediator or > 50% mddbs available.
5747 	 */
5748 	if (md_get_setstatus(setno) & MD_SET_MNSET) {
5749 		if (flag & MDDB_MN_STALE)
5750 			md_set_setstatus(setno, MD_SET_STALE);
5751 	}
5752 
5753 	/*
5754 	 * read a good copy of the locator names
5755 	 * if an error occurs reading what is suppose
5756 	 * to be a good copy continue looking for another
5757 	 * good copy
5758 	 */
5759 	s->s_lnp = NULL;
5760 	for (li = 0; li < lbp->lb_loccnt; li++) {
5761 		lp = &lbp->lb_locators[li];
5762 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
5763 		    (lp->l_flags & MDDB_F_EMASTER))
5764 			continue;
5765 
5766 		/* Find rip entry for this locator if one exists */
5767 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5768 			if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
5769 			    lp->l_blkno))
5770 				break;
5771 		}
5772 
5773 		if (rip == NULL) {
5774 			continue;
5775 		}
5776 
5777 		/*
5778 		 * Use the rip commitcnt since the commitcnt in lbp could
5779 		 * been cleared by selectlocator.  Looking for a replica with
5780 		 * the same commitcnt as the 'golden' copy in order to
5781 		 * get the same data.
5782 		 */
5783 		if (rip->ri_commitcnt != lbp->lb_commitcnt) {
5784 			continue;
5785 		}
5786 
5787 		/*
5788 		 * Now have a copy of the database that is equivalent
5789 		 * to the chosen locator block with respect to
5790 		 * inittime, identifier and commitcnt.   Trying the
5791 		 * equivalent databases in the order that they were
5792 		 * written will provide the most up to date data.
5793 		 */
		/* The value returned by readlocnames() is OR-ed into l_flags */
5794 		lp->l_flags |= readlocnames(s, li);
5795 		if (s->s_lnp)
5796 			break;
5797 	}
5798 
5799 	if (s->s_lnp == NULL) {
5800 		retval = MDDB_E_NOLOCNMS;
5801 		goto errout;
5802 	}
5803 
5804 	/*
5805 	 * read a good copy of the data base
5806 	 * if an error occurs reading what is suppose
5807 	 * to be a good copy continue looking for another
5808 	 * good copy
5809 	 */
5810 
5811 	s->s_dbp = NULL;
5812 	for (li = 0; li < lbp->lb_loccnt; li++) {
5813 		lp = &lbp->lb_locators[li];
5814 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
5815 		    (lp->l_flags & MDDB_F_EMASTER))
5816 			continue;
5817 
5818 		/* Find rip entry for this locator if one exists */
5819 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5820 			if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
5821 			    lp->l_blkno))
5822 				break;
5823 		}
5824 
5825 		if (rip == NULL) {
5826 			continue;
5827 		}
5828 
5829 		/*
5830 		 * Use the rip commitcnt since the commitcnt in lbp could
5831 		 * been cleared by selectlocator.  Looking for a replica with
5832 		 * the same commitcnt as the 'golden' copy in order to
5833 		 * get the same data.
5834 		 */
5835 		if (rip->ri_commitcnt != lbp->lb_commitcnt) {
5836 			continue;
5837 		}
5838 
5839 		/*
5840 		 * Now have a copy of the database that is equivalent
5841 		 * to the chosen locator block with respect to
5842 		 * inittime, identifier and commitcnt.   Trying the
5843 		 * equivalent databases in the order that they were
5844 		 * written will provide the most up to date data.
5845 		 */
5846 		lp->l_flags |= readcopy(s, li);
5847 
5848 		if (s->s_dbp)
5849 			break;
5850 	}
5851 
5852 	if (s->s_dbp == NULL) {
5853 		retval = MDDB_E_NODIRBLK;
5854 		goto errout;
5855 	}
5856 
	/*
	 * s_dbp is only non-NULL when the loop above left through its
	 * break, so lp still refers to the locator the copy was read from.
	 */
5857 	lp->l_flags |= MDDB_F_MASTER;
5858 	lp->l_flags |= MDDB_F_UP2DATE;
5859 
5860 	/*
5861 	 * go through and find largest record;
5862 	 * Also fixup the user data area's
5863 	 */
5864 	maxrecsize = MAX(MDDB_BSIZE, s->s_databuffer_size);
5865 
5866 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
5867 		for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
5868 			if (dep->de_flags & MDDB_F_OPT)
5869 				getoptrecord(s, dep);
5870 			else {
5871 				allocuserdata(dep);
5872 				maxrecsize = MAX(dep->de_recsize, maxrecsize);
5873 			}
5874 
	/* Grow (never shrink) the set's data buffer to fit the largest record */
5875 	if (maxrecsize > s->s_databuffer_size) {
5876 		p = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
5877 		if (s->s_databuffer_size)
5878 			kmem_free(s->s_databuffer, s->s_databuffer_size);
5879 		s->s_databuffer = p;
5880 		s->s_databuffer_size = maxrecsize;
5881 	}
5882 
5883 	/* If we can clear the tag data record, do it now. */
5884 	/* Data tags not supported on MN sets */
5885 	if ((md_get_setstatus(setno) & MD_SET_CLRTAG) &&
5886 	    (!(md_get_setstatus(setno) & MD_SET_MNSET)))
5887 		dt_setup(s, NULL);
5888 
5889 	/* This will return non-zero if STALE or TOOFEW */
5890 	/* This will write out chosen replica image to all replicas */
5891 	stale_rtn = selectreplicas(s, MDDB_SCANALL);
5892 
5893 	if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
5894 		ddi_devid_t	devidptr;
5895 
5896 		/*
5897 		 * ignore the return value from selectreplicas because we
5898 		 * may have a STALE or TOOFEW set in the case of a partial
5899 		 * replicated diskset. We will fix that up later.
5900 		 */
5901 
		/*
		 * NOTE(review): did_icp is only assigned above when the
		 * locator block has MDDB_DEVID_STYLE set, yet it is
		 * dereferenced unconditionally below.  Presumably a
		 * replicated import is always devid style - confirm.
		 */
5902 		lbp = s->s_lbp;
5903 		for (li = 0; li < lbp->lb_loccnt; li++) {
5904 			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5905 
5906 			if (did_info->info_flags & MDDB_DID_EXISTS) {
5907 				devidptr = s->s_did_icp->did_ic_devid[li];
5908 				lp = &lbp->lb_locators[li];
				/*
				 * For every rip whose old devid equals the
				 * devid stored for this locator, call
				 * update_locatorblock() with the rip's
				 * current and old devids.
				 */
5909 				for (rip = s->s_rip; rip != NULL;
5910 				    rip = rip->ri_next) {
5911 					if (rip->ri_old_devid == 0)
5912 						continue;
5913 					if (ddi_devid_compare(rip->ri_old_devid,
5914 					    devidptr) != 0) {
5915 						continue;
5916 					}
					/* Failure exits with retval still 0 */
5917 					if (update_locatorblock(s,
5918 					    md_expldev(lp->l_dev),
5919 					    rip->ri_devid, rip->ri_old_devid)) {
5920 						goto errout;
5921 					}
5922 				}
5923 			}
5924 		}
5925 	} else {
		/* STALE or TOOFEW: stop here; retval is left at 0 */
5926 		if (stale_rtn)
5927 			goto errout;
5928 	}
5929 
5930 	/*
5931 	 * If the replica is in device id style - validate the device id's,
5932 	 * if present, in the locator block devid area.
5933 	 */
	/*
	 * newdev holds one md_dev64_t per possible locator (MDDB_NLB).  It
	 * is freed right after the devid-style block below; there is no
	 * goto between the allocation and that kmem_free().
	 */
5934 	newdev = kmem_zalloc(sizeof (md_dev64_t) * MDDB_NLB, KM_SLEEP);
5935 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5936 		for (li = 0; li < lbp->lb_loccnt; li++) {
5937 			newdev[li] = 0;
5938 			lp = &lbp->lb_locators[li];
5939 			if (lp->l_flags & MDDB_F_DELETED)
5940 				continue;
5941 			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5942 			dev = md_expldev(lp->l_dev);
5943 			if (did_info->info_flags & MDDB_DID_EXISTS) {
5944 				/* Validate device id on current system */
				/*
				 * newdev[li] is seeded with the locator's
				 * dev and passed by address; presumably
				 * mddb_devid_validate() stores the current
				 * dev_t for the devid there - confirm.
				 */
5945 				newdev[li] = dev;
5946 				if (mddb_devid_validate(
5947 				    did_icp->did_ic_devid[li],
5948 				    &(newdev[li]),
5949 				    did_info->info_minor_name) == 0) {
5950 					/* Set valid flag */
5951 					did_info->info_flags |= MDDB_DID_VALID;
5952 				} else {
5953 					lp->l_flags |= MDDB_F_EMASTER;
5954 				}
5955 			} else if (!(MD_UPGRADE)) {
5956 				/*
5957 				 * If a device doesn't have a device id,
5958 				 * check if there is now a device ID
5959 				 * associated with device.  If one exists,
5960 				 * add it to the locator block devid area.
5961 				 * If there's not enough space to add it,
5962 				 * print a warning.
5963 				 * Don't do this during upgrade.
5964 				 */
5965 				dev_t ddi_dev = md_dev64_to_dev(dev);
5966 				if (ddi_lyr_get_devid(ddi_dev, &ret_devid) ==
5967 				    DDI_SUCCESS) {
5968 					if (ddi_lyr_get_minor_name(ddi_dev,
5969 					    S_IFBLK, &minor_name)
5970 					    == DDI_SUCCESS) {
5971 						if (mddb_devid_add(s, li,
5972 						    ret_devid, minor_name)) {
5973 							cmn_err(CE_WARN,
5974 							    "Not enough space"
5975 							    " in metadevice"
5976 							    " state"
5977 							    " database\n");
5978 							cmn_err(CE_WARN,
5979 							    "to add relocation"
5980 							    " information for"
5981 							    " device:\n");
5982 							cmn_err(CE_WARN,
5983 							    " major = %d, "
5984 							    " minor = %d\n",
5985 							    getmajor(ddi_dev),
5986 							    getminor(ddi_dev));
5987 						} else {
5988 							write_lb = 1;
5989 						}
5990 						kmem_free(minor_name,
5991 						    strlen(minor_name) + 1);
5992 					}
					/* Free ret_devid whether or not it was added */
5993 					ddi_devid_free(ret_devid);
5994 				}
5995 			}
5996 		}
5997 
5998 		/*
5999 		 * If a device has a valid device id and if the dev_t
6000 		 * associated with the device id has changed, update the
6001 		 * driver name, minor num and dev_t in the local and side
6002 		 * locators to match the dev_t that the system currently
6003 		 * associates with the device id.
6004 		 *
6005 		 * Don't do this during upgrade.
6006 		 */
6007 		if (!(MD_UPGRADE)) {
6008 			for (li = 0; li < lbp->lb_loccnt; li++) {
6009 				lp = &lbp->lb_locators[li];
6010 				if (lp->l_flags & MDDB_F_DELETED)
6011 					continue;
6012 				did_info = &(did_icp->did_ic_blkp->blk_info
6013 				    [li]);
6014 				if ((did_info->info_flags & MDDB_DID_VALID) &&
6015 				    !(did_info->info_flags &
6016 				    MDDB_DID_UPDATED)) {
6017 					if (lbp->lb_flags & MDDB_MNSET) {
6018 						int j;
6019 						int index = -1;
6020 						mnlbp = (mddb_mnlb_t *)lbp;
						/*
						 * Look for this side's slot;
						 * index remembers the last
						 * slot seen with sideno 0.
						 */
6021 						for (j = 0; j < MD_MNMAXSIDES;
6022 						    j++) {
6023 							mnslp = &mnlbp->
6024 							    lb_mnsidelocators[j]
6025 							    [li];
6026 							if (mnslp->mnl_sideno ==
6027 							    s->s_sideno)
6028 								break;
6029 							if (mnslp->mnl_sideno ==
6030 							    0)
6031 								index = j;
6032 						}
						/*
						 * NOTE(review): when neither
						 * a matching nor a sideno 0
						 * slot exists, index is still
						 * -1 when used as a subscript
						 * below - confirm this cannot
						 * happen.
						 */
6033 						if (j == MD_MNMAXSIDES) {
6034 							/*
6035 							 * No match found; take
6036 							 * empty
6037 							 */
6038 							mnslp = &mnlbp->
6039 							    lb_mnsidelocators
6040 							    [index][li];
6041 							write_lb = 1;
6042 							mnslp->mnl_mnum =
6043 							    md_getminor(newdev
6044 							    [li]);
6045 						} else if (mnslp->mnl_mnum !=
6046 						    md_getminor(newdev[li])) {
6047 							write_lb = 1;
6048 							mnslp->mnl_mnum =
6049 							    md_getminor(newdev
6050 							    [li]);
6051 						}
6052 					} else {
6053 						slp = &lbp->
6054 						    lb_sidelocators[s->s_sideno]
6055 						    [li];
6056 						if (slp->l_mnum !=
6057 						    md_getminor(newdev[li])) {
6058 							write_lb = 1;
6059 							slp->l_mnum =
6060 							    md_getminor(newdev
6061 							    [li]);
6062 						}
6063 					}
					/*
					 * Compare the driver name recorded
					 * for this side locator with the name
					 * of newdev[li]'s major number.
					 */
6064 					name = ddi_major_to_name(md_getmajor(
6065 					    newdev[li]));
6066 					if (lbp->lb_flags & MDDB_MNSET)
6067 						i = mnslp->mnl_drvnm_index;
6068 					else
6069 						i = slp->l_drvnm_index;
6070 					if (strncmp(lbp->lb_drvnm[i].dn_data,
6071 					    name, lbp->lb_drvnm[i].dn_len) !=
6072 					    0) {
6073 						/* Driver name has changed */
6074 						len = strlen(name);
6075 						/* Look for the driver name */
6076 						for (i = 0; i < MDDB_DRVNMCNT;
6077 						    i++) {
6078 							if (lbp->lb_drvnm[i].
6079 							    dn_len != len)
6080 								continue;
6081 							if (strncmp(lbp->
6082 							    lb_drvnm[i].dn_data,
6083 							    name, len) == 0)
6084 								break;
6085 						}
6086 						/* Didn't find one, add it */
6087 						if (i == MDDB_DRVNMCNT) {
							/* First slot with dn_len 0 */
6088 							for (i = 0; i <
6089 							    MDDB_DRVNMCNT;
6090 							    i++) {
6091 								if (lbp->
6092 								    lb_drvnm[i].
6093 								    dn_len == 0)
6094 									break;
6095 							}
6096 							if (i ==
6097 							    MDDB_DRVNMCNT) {
6098 								cmn_err(CE_WARN,
6099 								    "Unable to "
6100 								    " update "
6101 								    "driver "
6102 								    " name for "
6103 								    "dev:  "
6104 								    "major = %d"
6105 								    ", minor = "
6106 								    "%d\n",
6107 								    md_getmajor(
6108 								    newdev[li]),
6109 								    md_getminor(
6110 								    newdev
6111 								    [li]));
								/*
								 * Skips setting
								 * MDDB_DID_UPDATED
								 * for this locator.
								 */
6112 								continue;
6113 							}
6114 							(void) strncpy(lbp->
6115 							    lb_drvnm[i].dn_data,
6116 							    name, MD_MAXDRVNM);
6117 							lbp->lb_drvnm[i].
6118 							    dn_len = (uchar_t)
6119 							    strlen(name);
6120 						}
6121 						/* Fill in the drvnm index */
6122 						if (lbp->lb_flags &
6123 						    MDDB_MNSET)
6124 							mnslp->mnl_drvnm_index =
6125 							    i;
6126 						else
6127 							slp->l_drvnm_index = i;
6128 						write_lb = 1;
6129 					}
6130 					did_info->info_flags |=
6131 					    MDDB_DID_UPDATED;
6132 				}
6133 			}
6134 		}
6135 	}
6136 	kmem_free(newdev, sizeof (md_dev64_t) * MDDB_NLB);
6137 
6138 	/*
6139 	 * If locator block has been changed by get_mbs_n_lbs,
6140 	 * by addition of new device id, by updated minor name or
6141 	 * by updated driver name - write out locator block.
6142 	 */
6143 	if (write_lb) {
		/*
		 * upd_med() runs whether or not push_lb() succeeded; a
		 * push_lb() failure then exits with retval left at 0.
		 */
6144 		rval = push_lb(s);
6145 		(void) upd_med(s, "load_old_replicas(0)");
6146 		if (rval)
6147 			goto errout;
6148 	}
6149 
6150 	/*
6151 	 * If the tag was moved, allocated, or a BADTAG was seen for some other
6152 	 * reason, then make sure tags are written to all the replicas.
6153 	 * Data tags not supported on MN sets.
6154 	 */
6155 	if (!(md_get_setstatus(setno) & MD_SET_MNSET)) {
		/* lc is reused here as a "tags must be rewritten" flag */
6156 		if (! (lc = dt_alloc_if_needed(s))) {
6157 			for (li = 0; li < lbp->lb_loccnt; li++) {
6158 				lp = &lbp->lb_locators[li];
6159 
6160 				if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
6161 				    (lp->l_flags & MDDB_F_EMASTER))
6162 					continue;
6163 
6164 				if (lp->l_flags & MDDB_F_BADTAG) {
6165 					lc = 1;
6166 					break;
6167 				}
6168 			}
6169 		}
6170 
6171 		if (lc) {
6172 			md_set_setstatus(setno, MD_SET_TAGDATA);
6173 			md_clr_setstatus(setno, MD_SET_BADTAG);
6174 			(void) selectreplicas(s, MDDB_SCANALL);
6175 		}
6176 	}
6177 
6178 errout:
6179 
6180 	/* Free extraneous rip components. */
6181 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
6182 		/* Get rid of lbp's and dtp's */
6183 
		/* The rip that owns the selected lbp keeps these components */
6184 		if (rip->ri_lbp != lbp) {
6185 			if (rip->ri_dtp != (mddb_dt_t *)NULL) {
6186 				kmem_free((caddr_t)rip->ri_dtp, MDDB_DT_BYTES);
6187 				rip->ri_dtp = (mddb_dt_t *)NULL;
6188 			}
6189 
6190 			if (rip->ri_devid != (ddi_devid_t)NULL) {
6191 				sz = (int)ddi_devid_sizeof(rip->ri_devid);
6192 				kmem_free((caddr_t)rip->ri_devid, sz);
6193 				rip->ri_devid = (ddi_devid_t)NULL;
6194 			}
6195 			if (rip->ri_old_devid != (ddi_devid_t)NULL) {
6196 				sz = (int)ddi_devid_sizeof(rip->ri_old_devid);
6197 				kmem_free((caddr_t)rip->ri_old_devid, sz);
6198 				rip->ri_old_devid = (ddi_devid_t)NULL;
6199 			}
6200 
6201 			if (rip->ri_lbp != (mddb_lb_t *)NULL) {
6202 				mddb_devid_icp_free(&rip->ri_did_icp,
6203 				    rip->ri_lbp);
6204 
6205 				kmem_free((caddr_t)rip->ri_lbp,
6206 				    dbtob(rip->ri_lbp->lb_blkcnt));
6207 				rip->ri_lbp = (mddb_lb_t *)NULL;
6208 			}
6209 		}
6210 
		/*
		 * A rip whose dev and blkno match a non-deleted locator in
		 * lbp keeps its master block; skip the cleanup below.
		 */
6211 		if (lbp != NULL) {
6212 			for (li = 0; li < lbp->lb_loccnt; li++) {
6213 				lp = &lbp->lb_locators[li];
6214 				if (lp->l_flags & MDDB_F_DELETED)
6215 					continue;
6216 				if (rip->ri_dev == md_expldev(lp->l_dev) &&
6217 				    rip->ri_blkno == lp->l_blkno)
6218 					break;
6219 			}
6220 			if (li < lbp->lb_loccnt)
6221 				continue;
6222 		}
6223 
6224 		/*
6225 		 * Get rid of mbp's:
6226 		 *	if lbp, those out of lb_loccnt bounds
6227 		 *	if !lbp,  all of them.
6228 		 */
6229 		if (rip->ri_mbip) {
6230 			md_dev64_t dev64 = md_xlate_targ_2_mini(rip->ri_dev);
6231 			if (dev64 != NODEV64)
6232 				mddb_devclose(dev64);
6233 
6234 			free_mbipp(&rip->ri_mbip);
6235 		}
6236 		/*
6237 		 * Turn off MDDB_F_EMASTER flag in a diskset since diskset
6238 		 * code always ends up calling ridev for all replicas
6239 		 * before calling load_old_replicas.  ridev will reset
6240 		 * MDDB_F_EMASTER flag if flag was due to unresolved devid.
6241 		 */
6242 		if (setno != MD_LOCAL_SET)
6243 			rip->ri_flags &= ~MDDB_F_EMASTER;
6244 	}
6245 	return (retval);
6246 }
6247 
6248 /*
6249  * Given the devt from the md.conf info, get the devid for the device.
6250  */
6251 static void
6252 lookup_db_devid(mddb_cfg_loc_t *cl)
6253 {
6254 	dev_t		ldev;
6255 	ddi_devid_t	devid;
6256 	char		*minor;
6257 
6258 	if (ddi_name_to_major(cl->l_driver) == (major_t)-1) {
6259 		cmn_err(CE_NOTE, "mddb: unknown major name '%s'", cl->l_driver);
6260 		return;
6261 	}
6262 
6263 	ldev = makedevice(ddi_name_to_major(cl->l_driver), cl->l_mnum);
6264 	if (ddi_lyr_get_devid(ldev, &devid) != DDI_SUCCESS) {
6265 		cmn_err(CE_NOTE, "mddb: unable to get devid for '%s', 0x%x",
6266 		    cl->l_driver, cl->l_mnum);
6267 		return;
6268 	}
6269 
6270 	if (ddi_lyr_get_minor_name(ldev, S_IFBLK, &minor) != DDI_SUCCESS) {
6271 		cmn_err(CE_NOTE, "mddb: unable to get minor name 0x%x",
6272 		    cl->l_mnum);
6273 		return;
6274 	}
6275 
6276 	cl->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | MDDB_DEVID_SZ;
6277 	cl->l_devid_sz = (int)ddi_devid_sizeof(devid);
6278 	cl->l_devid = (uint64_t)(uintptr_t)devid;
6279 	(void) strlcpy(cl->l_minor_name, minor, MDDB_MINOR_NAME_MAX);
6280 
6281 	kmem_free(minor, strlen(minor) + 1);
6282 }
6283 
6284 /*
6285  * grab driver name, minor, block and devid out of
6286  * strings like "driver:minor:block:devid"
6287  */
6288 static int
6289 parse_db_loc(
6290 	char		*str,
6291 	mddb_cfg_loc_t	*clp
6292 )
6293 {
6294 	char		*p, *e;
6295 	char		*minor_name;
6296 	ddi_devid_t	ret_devid;
6297 
6298 	clp->l_dev = 0;
6299 	p = clp->l_driver;
6300 	e = p + sizeof (clp->l_driver) - 1;
6301 	while ((*str != ':') && (*str != '\0') && (p < e))
6302 		*p++ = *str++;
6303 	*p = '\0';
6304 	if (*str++ != ':')
6305 		return (-1);
6306 	clp->l_mnum = 0;
6307 	while (ISNUM(*str)) {
6308 		clp->l_mnum *= 10;
6309 		clp->l_mnum += *str++ - '0';
6310 	}
6311 	if (*str++ != ':')
6312 		return (-1);
6313 	clp->l_blkno = 0;
6314 	while (ISNUM(*str)) {
6315 		clp->l_blkno *= 10;
6316 		clp->l_blkno += *str++ - '0';
6317 	}
6318 	if (*str++ != ':')
6319 		return (-1);
6320 
6321 	/*
6322 	 * If the md_devid_destroy flag is set, ignore the device ids.
6323 	 * This is only to used in a catastrophic failure case.  Examples
6324 	 * would be where the device id of all drives in the system
6325 	 * (especially the mirror'd root drives) had been changed
6326 	 * by firmware upgrade or by a patch to an existing disk
6327 	 * driver.  Another example would be in the case of non-unique
6328 	 * device ids due to a bug.  The device id would be valid on
6329 	 * the system, but would return the wrong dev_t.
6330 	 */
6331 	if (md_devid_destroy) {
6332 		clp->l_devid_flags = 0;
6333 		clp->l_devid = (uint64_t)NULL;
6334 		clp->l_devid_sz = 0;
6335 		clp->l_old_devid = (uint64_t)NULL;
6336 		clp->l_old_devid_sz = 0;
6337 		clp->l_minor_name[0] = '\0';
6338 		return (0);
6339 	}
6340 
6341 	if (ddi_devid_str_decode(str,
6342 	    (ddi_devid_t *)&ret_devid, &minor_name) == DDI_FAILURE)
6343 		return (-1);
6344 
6345 	clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
6346 	clp->l_devid_flags = 0;
6347 	clp->l_old_devid = (uint64_t)NULL;
6348 	clp->l_old_devid_sz = 0;
6349 
6350 	/* If no device id associated with device, just return */
6351 	if ((ddi_devid_t)(uintptr_t)clp->l_devid == (ddi_devid_t)NULL) {
6352 		clp->l_devid_sz = 0;
6353 		clp->l_minor_name[0] = '\0';
6354 		if (strcmp(str, "id0") == 0 && md_devid_destroy == 0 &&
6355 		    md_keep_repl_state == 0) {
6356 			/*
6357 			 * No devid in md.conf; we're in recovery mode so
6358 			 * lookup the devid for the device as specified by
6359 			 * the devt in md.conf.
6360 			 */
6361 			lookup_db_devid(clp);
6362 		}
6363 		return (0);
6364 	}
6365 
6366 	clp->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID |
6367 	    MDDB_DEVID_SZ;
6368 	clp->l_devid_sz = (int)ddi_devid_sizeof(
6369 	    (ddi_devid_t)(uintptr_t)clp->l_devid);
6370 	(void) strcpy(clp->l_minor_name, minor_name);
6371 	kmem_free(minor_name, strlen(minor_name) + 1);
6372 
6373 	return (0);
6374 }
6375 
6376 /*
6377  * grab driver name, minor, and block out of
6378  * strings like "driver:minor:block:devid driver:minor:block:devid ..."
6379  */
6380 static void
6381 parse_db_string(
6382 	char		*str
6383 )
6384 {
6385 	char		*p, *e;
6386 	mddb_cfg_loc_t	*cl;
6387 	char		restore_space;
6388 
6389 	/* CSTYLED */
6390 	cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
6391 	for (p = str; (*p != '\0'); ) {
6392 		for (; ((*p != '\0') && (ISWHITE(*p))); ++p)
6393 			;
6394 		if (*p == '\0')
6395 			break;
6396 		for (e = p; ((*e != '\0') && (! ISWHITE(*e))); ++e)
6397 			;
6398 		/*
6399 		 * Only give parse_db_loc 1 entry, so stuff a null into
6400 		 * the string if we're not at the end.  We need to save this
6401 		 * char and restore it after call.
6402 		 */
6403 		restore_space = '\0';
6404 		if (*e != '\0') {
6405 			restore_space = *e;
6406 			*e = '\0';
6407 		}
6408 		if (parse_db_loc(p, cl) != 0) {
6409 			cmn_err(CE_NOTE, "mddb: parsing error on '%s'", p);
6410 		} else {
6411 			(void) ridev(
6412 			    &((mddb_set_t *)md_set[MD_LOCAL_SET].s_db)->s_rip,
6413 			    cl, NULL, MDDB_F_PTCHED);
6414 			if (cl->l_devid_flags & MDDB_DEVID_SPACE) {
6415 				kmem_free((caddr_t)(uintptr_t)cl->l_devid,
6416 				    cl->l_devid_sz);
6417 			}
6418 		}
6419 		if (restore_space != '\0') {
6420 			*e = restore_space;
6421 		}
6422 		p = e;
6423 	}
6424 	kmem_free(cl, sizeof (mddb_cfg_loc_t));
6425 }
6426 
6427 /*
6428  * grab database locations supplied by md.conf as properties
6429  */
6430 static void
6431 parse_db_strings(void)
6432 {
6433 	int		bootlist_id;
6434 	int		proplen;
6435 	/*
6436 	 * size of _bootlist_name should match uses of line and entry in
6437 	 * libmeta meta_systemfile_append_mddb routine (meta_systemfile.c)
6438 	 */
6439 	char 		_bootlist_name[MDDB_BOOTLIST_MAX_LEN];
6440 	char		*bootlist_name;
6441 	caddr_t		prop;
6442 
6443 /*
6444  * Step through the bootlist properties one at a time by forming the
6445  * correct name, fetching the property, parsing the property and
6446  * then freeing the memory.  If a property does not exist or returns
6447  * some form of error just ignore it.  There is no guarantee that
6448  * the properties will always exist in sequence, for example
6449  * mddb_bootlist1 may exist and mddb_bootlist2 may not exist with
6450  * mddb_bootlist3 existing.
6451  */
6452 	bootlist_name = &_bootlist_name[0];
6453 	for (bootlist_id = 0; bootlist_id < md_maxbootlist; bootlist_id++) {
6454 
6455 		proplen = 0;
6456 		(void) sprintf(bootlist_name, "mddb_bootlist%d", bootlist_id);
6457 
6458 		if (ddi_getlongprop(DDI_DEV_T_ANY, md_devinfo,
6459 		    DDI_PROP_CANSLEEP, bootlist_name, (caddr_t)&prop,
6460 		    &proplen) != DDI_PROP_SUCCESS)
6461 			continue;
6462 
6463 		if (proplen <= 0)
6464 			continue;
6465 
6466 		if (md_init_debug)
6467 			cmn_err(CE_NOTE, "%s is %s", bootlist_name, prop);
6468 
6469 		parse_db_string(prop);
6470 		kmem_free(prop, proplen);
6471 	}
6472 }
6473 
6474 static int
6475 initit(
6476 	set_t		setno,
6477 	int		flag
6478 )
6479 {
6480 	int		i;
6481 	mddb_set_t	*s;
6482 	mddb_lb_t	*lbp;		/* pointer to locator block */
6483 	mddb_ln_t	*lnp;		/* pointer to locator names */
6484 	mddb_db_t	*dbp;		/* pointer to directory block */
6485 	mddb_did_blk_t	*did_blkp;	/* pointer to Device ID block */
6486 	mddb_did_ic_t	*did_icp;	/* pointer to Device ID incore area */
6487 	mddb_bf_t	*bfp;
6488 	side_t		sideno;
6489 	side_t		maxsides;
6490 	mddb_block_t	lb_blkcnt;
6491 	int		retval = 0;
6492 	md_dev64_t	dev;
6493 	mddb_mnlb_t	*mnlbp;
6494 	int		devid_flag;
6495 
6496 	/* single thread's all loads/unloads of set's */
6497 	mutex_enter(&mddb_lock);
6498 	mutex_enter(SETMUTEX(setno));
6499 
6500 	if (((mddb_set_t *)md_set[setno].s_db) == NULL) {
6501 		mutex_exit(SETMUTEX(setno));
6502 		mutex_exit(&mddb_lock);
6503 		return (MDDB_E_NOTNOW);
6504 	}
6505 
6506 	s = (mddb_set_t *)md_set[setno].s_db;
6507 
6508 	single_thread_start(s);
6509 
6510 	/*
6511 	 * init is already underway, block. Return success.
6512 	 */
6513 	if (s->s_lbp) {
6514 		single_thread_end(s);
6515 		mutex_exit(SETMUTEX(setno));
6516 		mutex_exit(&mddb_lock);
6517 		return (0);
6518 	}
6519 
6520 	uniqtime32(&s->s_inittime);
6521 
6522 	/* grab database locations patched by /etc/system */
6523 	if (setno == MD_LOCAL_SET)
6524 		parse_db_strings();
6525 
6526 	s->s_mbiarray = (mddb_mb_ic_t **)kmem_zalloc(
6527 	    sizeof (mddb_mb_ic_t *) * mddb_maxcopies, KM_SLEEP);
6528 
6529 	s->s_zombie = 0;
6530 	s->s_staledeletes = 0;
6531 	s->s_optcmtcnt = 0;
6532 	s->s_opthavelck = 0;
6533 	s->s_optwantlck = 0;
6534 	s->s_optwaiterr = 0;
6535 	s->s_opthungerr = 0;
6536 
6537 	/*
6538 	 * KEEPTAG can never be set for a MN diskset since no tags are
6539 	 * allowed to be stored in a MN diskset.  No way to check
6540 	 * if this is a MN diskset or not at this point since the mddb
6541 	 * hasn't been read in from disk yet.  (flag will only have
6542 	 * MUTLINODE bit set if a new set is being created.)
6543 	 */
6544 	if (! (md_get_setstatus(s->s_setno) & MD_SET_KEEPTAG))
6545 		dt_setup(s, NULL);
6546 
6547 	md_clr_setstatus(s->s_setno, MD_SET_TOOFEW);
6548 
6549 	for (i = 0; i <	mddb_maxbufheaders; i++) {
6550 		bfp = (mddb_bf_t *)kmem_zalloc(sizeof (*bfp), KM_SLEEP);
6551 		sema_init(&bfp->bf_buf.b_io, 0, NULL,
6552 		    SEMA_DEFAULT, NULL);
6553 		sema_init(&bfp->bf_buf.b_sem, 0, NULL,
6554 		    SEMA_DEFAULT, NULL);
6555 		bfp->bf_buf.b_offset = -1;
6556 		freebuffer(s, bfp);
6557 	}
6558 
6559 	retval = load_old_replicas(s, flag);
6560 	/* If 0 return value - success */
6561 	if (! retval) {
6562 		single_thread_end(s);
6563 		mutex_exit(SETMUTEX(setno));
6564 		mutex_exit(&mddb_lock);
6565 		return (0);
6566 	}
6567 
6568 	/*
6569 	 * If here, then the load_old_replicas() failed
6570 	 */
6571 
6572 
6573 	/* If the database was supposed to exist. */
6574 	if (flag & MDDB_MUSTEXIST) {
6575 		if (s->s_mbiarray != (mddb_mb_ic_t **)NULL) {
6576 			for (i = 0; i < mddb_maxcopies;	 i++) {
6577 				if (! s->s_mbiarray[i])
6578 					continue;
6579 				dev = md_expldev(
6580 				    s->s_lbp->lb_locators[i].l_dev);
6581 				dev = md_xlate_targ_2_mini(dev);
6582 				if (dev != NODEV64)
6583 					mddb_devclose(dev);
6584 
6585 				free_mbipp(&s->s_mbiarray[i]);
6586 			}
6587 
6588 			kmem_free((caddr_t)s->s_mbiarray,
6589 			    sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
6590 			s->s_mbiarray = NULL;
6591 		}
6592 
6593 		if (s->s_lnp != (mddb_ln_t *)NULL) {
6594 			kmem_free((caddr_t)s->s_lnp,
6595 			    dbtob(s->s_lbp->lb_lnblkcnt));
6596 			s->s_lnp = (mddb_ln_t *)NULL;
6597 		}
6598 
6599 		mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);
6600 
6601 		if (s->s_lbp != (mddb_lb_t *)NULL) {
6602 			kmem_free((caddr_t)s->s_lbp,
6603 			    dbtob(s->s_lbp->lb_blkcnt));
6604 			s->s_lbp = (mddb_lb_t *)NULL;
6605 		}
6606 
6607 		while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
6608 			kmem_free((caddr_t)bfp, sizeof (*bfp));
6609 
6610 		single_thread_end(s);
6611 		mutex_exit(SETMUTEX(setno));
6612 		mutex_exit(&mddb_lock);
6613 
6614 		if (retval == MDDB_E_TAGDATA)
6615 			return (retval);
6616 
6617 		/* Want a bit more detailed error messages */
6618 		if (mddb_db_err_detail)
6619 			return (retval);
6620 
6621 		return (MDDB_E_NODB);
6622 	}
6623 
6624 
6625 	/*
6626 	 * MDDB_NOOLDOK set - Creating a new database, so do
6627 	 * more initialization.
6628 	 */
6629 
6630 	lb_blkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
6631 	    MDDB_LOCAL_LBCNT : MDDB_LBCNT);
6632 	if (flag & MDDB_MULTINODE) {
6633 		lb_blkcnt = MDDB_MNLBCNT;
6634 	}
6635 
6636 	if (s->s_lbp == NULL)
6637 		s->s_lbp = (mddb_lb_t *)kmem_alloc(dbtob(lb_blkcnt), KM_SLEEP);
6638 	lbp = s->s_lbp;
6639 
6640 	bzero((caddr_t)lbp, dbtob(lb_blkcnt));
6641 	lbp->lb_setno = setno;
6642 	lbp->lb_magic = MDDB_MAGIC_LB;
6643 	if (flag & MDDB_MULTINODE) {
6644 		lbp->lb_revision = MDDB_REV_MNLB;
6645 	} else {
6646 		lbp->lb_revision = MDDB_REV_LB;
6647 	}
6648 	lbp->lb_inittime = s->s_inittime;
6649 	if (flag & MDDB_MULTINODE) {
6650 		mnlbp = (mddb_mnlb_t *)lbp;
6651 		for (i = 0; i < MDDB_NLB; i++) {
6652 			for (sideno = 0; sideno < MD_MNMAXSIDES; sideno++) {
6653 				mddb_mnsidelocator_t	*mnslp;
6654 				mnslp = &mnlbp->lb_mnsidelocators[sideno][i];
6655 				mnslp->mnl_mnum = NODEV32;
6656 				mnslp->mnl_sideno = 0;
6657 				mnslp->mnl_drvnm_index = 0;
6658 			}
6659 		}
6660 	} else {
6661 		maxsides = ((setno == MD_LOCAL_SET) ? 1 : MD_MAXSIDES);
6662 		for (i = 0; i < MDDB_NLB; i++) {
6663 			for (sideno = 0; sideno < maxsides; sideno++) {
6664 				mddb_sidelocator_t	*slp;
6665 				slp = &lbp->lb_sidelocators[sideno][i];
6666 				slp->l_mnum = NODEV32;
6667 			}
6668 		}
6669 	}
6670 	lbp->lb_blkcnt = lb_blkcnt;
6671 
6672 	/* lb starts on block 0 */
6673 	/* locator names starts after locator block */
6674 	lbp->lb_lnfirstblk = lb_blkcnt;
6675 	if (flag & MDDB_MULTINODE) {
6676 		lbp->lb_lnblkcnt = (mddb_block_t)MDDB_MNLNCNT;
6677 	} else {
6678 		lbp->lb_lnblkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
6679 		    MDDB_LOCAL_LNCNT : MDDB_LNCNT);
6680 	}
6681 
6682 	if (flag & MDDB_MULTINODE) {
6683 		/* Creating a multinode diskset */
6684 		md_set_setstatus(setno, MD_SET_MNSET);
6685 		lbp->lb_flags |= MDDB_MNSET;
6686 	}
6687 
6688 	/* Data portion of mddb located after locator names */
6689 	lbp->lb_dbfirstblk = lbp->lb_lnfirstblk + lbp->lb_lnblkcnt;
6690 
6691 	/* the btodb that follows is converting the directory block size */
6692 	/* Data tag part of mddb located after first block of mddb data */
6693 	lbp->lb_dtfirstblk = (mddb_block_t)(lbp->lb_dbfirstblk +
6694 	    btodb(MDDB_BSIZE));
6695 	/* Data tags are not used in MN diskset - so set count to 0 */
6696 	if (flag & MDDB_MULTINODE)
6697 		lbp->lb_dtblkcnt = (mddb_block_t)0;
6698 	else
6699 		lbp->lb_dtblkcnt = (mddb_block_t)MDDB_DT_BLOCKS;
6700 
6701 
6702 	lnp = (mddb_ln_t *)kmem_zalloc(dbtob(lbp->lb_lnblkcnt), KM_SLEEP);
6703 	lnp->ln_magic = MDDB_MAGIC_LN;
6704 	if (flag & MDDB_MULTINODE) {
6705 		lnp->ln_revision = MDDB_REV_MNLN;
6706 	} else {
6707 		lnp->ln_revision = MDDB_REV_LN;
6708 	}
6709 	s->s_lnp = lnp;
6710 
6711 	/*
6712 	 * Set up Device ID portion of Locator Block.
6713 	 * Do not set locator to device id style if
6714 	 * md_devid_destroy is 1 and md_keep_repl_state is 1
6715 	 * (destroy all device id data and keep replica in
6716 	 * non device id mode).
6717 	 *
6718 	 * This is logically equivalent to set locator to
6719 	 * device id style if md_devid_destroy is 0 or
6720 	 * md_keep_repl_state is 0.
6721 	 *
6722 	 * In SunCluster environment, device id mode is disabled
6723 	 * which means diskset will be run in non-devid mode.  For
6724 	 * localset, the behavior will remain intact and run in
6725 	 * device id mode.
6726 	 *
6727 	 * In multinode diskset devids are turned off.
6728 	 */
6729 	devid_flag = 1;
6730 	if (cluster_bootflags & CLUSTER_CONFIGURED)
6731 		if (setno != MD_LOCAL_SET)
6732 			devid_flag = 0;
6733 	if (flag & MDDB_MULTINODE)
6734 		devid_flag = 0;
6735 	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
6736 		devid_flag = 0;
6737 	/*
6738 	 * if we weren't devid style before and md_keep_repl_state=1
6739 	 * we need to stay non-devid
6740 	 */
6741 	if (((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) &&
6742 	    (md_keep_repl_state == 1))
6743 		devid_flag = 0;
6744 	if (devid_flag) {
6745 		lbp->lb_didfirstblk = lbp->lb_dtfirstblk +
6746 		    lbp->lb_dtblkcnt;
6747 		lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
6748 		lbp->lb_flags |= MDDB_DEVID_STYLE;
6749 
6750 		did_icp = (mddb_did_ic_t *)kmem_zalloc
6751 		    (sizeof (mddb_did_ic_t), KM_SLEEP);
6752 		did_blkp = (mddb_did_blk_t *)
6753 		    kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP);
6754 		did_blkp->blk_magic = MDDB_MAGIC_DI;
6755 		did_blkp->blk_revision = MDDB_REV_DI;
6756 		did_icp->did_ic_blkp = did_blkp;
6757 		s->s_did_icp = did_icp;
6758 	}
6759 
6760 	setidentifier(s, &lbp->lb_ident);
6761 	uniqtime32(&lbp->lb_timestamp);
6762 	dbp = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
6763 	dbp->db_magic = MDDB_MAGIC_DB;
6764 	dbp->db_revision = MDDB_REV_DB;
6765 	uniqtime32(&dbp->db_timestamp);
6766 	dbp->db_nextblk = 0;
6767 	dbp->db_firstentry = NULL;
6768 	dbp->db_blknum = lbp->lb_dbfirstblk;
6769 	dbp->db_recsum = MDDB_GLOBAL_XOR;
6770 	s->s_dbp = dbp;
6771 	single_thread_end(s);
6772 	mutex_exit(SETMUTEX(setno));
6773 	mutex_exit(&mddb_lock);
6774 	return (0);
6775 }
6776 
6777 mddb_set_t *
6778 mddb_setenter(
6779 	set_t		setno,
6780 	int		flag,
6781 	int		*errorcodep
6782 )
6783 {
6784 	mddb_set_t	*s;
6785 	int		err = 0;
6786 	size_t		sz = sizeof (void *) * MD_MAXUNITS;
6787 
6788 	mutex_enter(SETMUTEX(setno));
6789 	if (! md_set[setno].s_db) {
6790 		mutex_exit(SETMUTEX(setno));
6791 		if (errorcodep != NULL)
6792 			*errorcodep = MDDB_E_NOTOWNER;
6793 		return (NULL);
6794 	}
6795 
6796 	/* Allocate s_un and s_ui arrays if not already present. */
6797 	if (md_set[setno].s_un == NULL) {
6798 		md_set[setno].s_un = kmem_zalloc(sz, KM_NOSLEEP);
6799 		if (md_set[setno].s_un == NULL) {
6800 			mutex_exit(SETMUTEX(setno));
6801 			if (errorcodep != NULL)
6802 				*errorcodep = MDDB_E_NOTOWNER;
6803 			return (NULL);
6804 		}
6805 	}
6806 	if (md_set[setno].s_ui == NULL) {
6807 		md_set[setno].s_ui = kmem_zalloc(sz, KM_NOSLEEP);
6808 		if (md_set[setno].s_ui == NULL) {
6809 			mutex_exit(&md_set[setno].s_dbmx);
6810 			kmem_free(md_set[setno].s_un, sz);
6811 			md_set[setno].s_un = NULL;
6812 			if (errorcodep != NULL)
6813 				*errorcodep = MDDB_E_NOTOWNER;
6814 			return (NULL);
6815 		}
6816 	}
6817 	s = (mddb_set_t *)md_set[setno].s_db;
6818 	if (s->s_lbp)
6819 		return (s);
6820 
6821 	if (flag & MDDB_NOINIT)
6822 		return (s);
6823 
6824 	/*
6825 	 * Release the set mutex - it will be acquired and released in
6826 	 * initit after acquiring the mddb_lock.  This is done to assure
6827 	 * that mutexes are always acquired in the same order to prevent
6828 	 * possible deadlock
6829 	 */
6830 	mutex_exit(SETMUTEX(setno));
6831 
6832 	if ((err = initit(setno, flag)) != 0) {
6833 		if (errorcodep != NULL)
6834 			*errorcodep = err;
6835 		return (NULL);
6836 	}
6837 
6838 	mutex_enter(SETMUTEX(setno));
6839 	return ((mddb_set_t *)md_set[setno].s_db);
6840 }
6841 
6842 /*
6843  * Release the set lock for a given set.
6844  *
6845  * In a MN diskset, this routine may send messages to the rpc.mdcommd
6846  * in order to have the slave nodes re-parse parts of the mddb.
6847  * Messages are only sent if the global ioctl lock is not held.
6848  *
6849  * With the introduction of multi-threaded ioctls, there is no way
6850  * to determine which thread(s) are holding the ioctl lock.  So, if
6851  * the ioctl lock is held (by process X) process X will send the
6852  * messages to the slave nodes when process X releases the ioctl lock.
6853  */
6854 void
6855 mddb_setexit(
6856 	mddb_set_t	*s
6857 )
6858 {
6859 	md_mn_msg_mddb_parse_t		*mddb_parse_msg;
6860 	md_mn_kresult_t			*kresult;
6861 	mddb_lb_t			*lbp = s->s_lbp;
6862 	int				i;
6863 	int				rval = 1;
6864 
6865 	/*
6866 	 * If not a MN diskset OR
6867 	 * a MN diskset but this node isn't master,
6868 	 * then release the mutex.
6869 	 */
6870 	if (!(MD_MNSET_SETNO(s->s_setno)) ||
6871 	    ((MD_MNSET_SETNO(s->s_setno)) &&
6872 	    (!md_set[s->s_setno].s_am_i_master))) {
6873 		mutex_exit(SETMUTEX(s->s_setno));
6874 		return;
6875 	}
6876 
6877 	/*
6878 	 * If global ioctl lock is held, then send no messages,
6879 	 * just release mutex and return.
6880 	 *
6881 	 */
6882 	if (md_status & MD_GBL_IOCTL_LOCK) {
6883 		mutex_exit(SETMUTEX(s->s_setno));
6884 		return;
6885 	}
6886 
6887 	/*
6888 	 * This thread is not holding the ioctl lock, so drop the set
6889 	 * lock, send messages to slave nodes to reparse portions
6890 	 * of the mddb and return.
6891 	 *
6892 	 * If the block parse flag is set, do not send parse messages.
6893 	 * This flag is set when master is adding a new mddb that would
6894 	 * cause parse messages to be sent to the slaves, but the slaves
6895 	 * don't have knowledge of the new mddb yet since the mddb add
6896 	 * operation hasn't been run on the slave nodes yet.  When the
6897 	 * master unblocks the parse flag, the parse messages will be
6898 	 * generated.
6899 	 *
6900 	 * If s_mn_parseflags_sending is non-zero, then another thread
6901 	 * is already currently sending a parse message, so just release
6902 	 * the mutex and return.  If an mddb change occurred that results
6903 	 * in a parse message to be generated, the thread that is currently
6904 	 * sending a parse message would generate the additional parse message.
6905 	 *
6906 	 * If s_mn_parseflags_sending is zero and parsing is not blocked,
6907 	 * then loop until s_mn_parseflags is 0 (until there are no more
6908 	 * messages to send).
6909 	 * While s_mn_parseflags is non-zero,
6910 	 * 	put snapshot of parse_flags in s_mn_parseflags_sending
6911 	 * 	set s_mn_parseflags to zero
6912 	 *	release mutex
6913 	 *	send message
6914 	 *	re-grab mutex
6915 	 *	set s_mn_parseflags_sending to zero
6916 	 */
6917 	mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), KM_SLEEP);
6918 	while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
6919 	    (s->s_mn_parseflags & MDDB_PARSE_MASK) &&
6920 	    (!(md_get_setstatus(s->s_setno) & MD_SET_MNPARSE_BLK))) {
6921 		/* Grab snapshot of parse flags */
6922 		s->s_mn_parseflags_sending = s->s_mn_parseflags;
6923 		s->s_mn_parseflags = 0;
6924 
6925 		mutex_exit(SETMUTEX(s->s_setno));
6926 
6927 		/*
6928 		 * Send the message to the slaves to re-parse
6929 		 * the indicated portions of the mddb. Send the status
6930 		 * of the 50 mddbs in this set so that slaves know which
6931 		 * mddbs that the master node thinks are 'good'.
6932 		 * Otherwise, slave may reparse, but from wrong replica.
6933 		 */
6934 		mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending;
6935 		for (i = 0; i < MDDB_NLB; i++) {
6936 			mddb_parse_msg->msg_lb_flags[i] =
6937 			    lbp->lb_locators[i].l_flags;
6938 		}
6939 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
6940 		while (rval != 0) {
6941 			rval = mdmn_ksend_message(s->s_setno,
6942 			    MD_MN_MSG_MDDB_PARSE, 0, 0,
6943 			    (char *)mddb_parse_msg,
6944 			    sizeof (md_mn_msg_mddb_parse_t), kresult);
6945 			if (rval != 0)
6946 				cmn_err(CE_WARN, "mddb_setexit: Unable to send "
6947 				    "mddb update message to other nodes in "
6948 				    "diskset %s\n", s->s_setname);
6949 		}
6950 		kmem_free(kresult, sizeof (md_mn_kresult_t));
6951 
6952 		/*
6953 		 * Re-grab mutex to clear sending field and to
6954 		 * see if another parse message needs to be generated.
6955 		 */
6956 		mutex_enter(SETMUTEX(s->s_setno));
6957 		s->s_mn_parseflags_sending = 0;
6958 	}
6959 	kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
6960 	mutex_exit(SETMUTEX(s->s_setno));
6961 }
6962 
6963 static void
6964 mddb_setexit_no_parse(
6965 	mddb_set_t	*s
6966 )
6967 {
6968 	mutex_exit(SETMUTEX(s->s_setno));
6969 }
6970 
6971 uint_t
6972 mddb_lb_did_convert(mddb_set_t *s, uint_t doit, uint_t *blk_cnt)
6973 {
6974 	uint_t			li;
6975 	mddb_lb_t		*lbp = s->s_lbp;
6976 	mddb_locator_t		*lp;
6977 	ddi_devid_t		ret_devid;
6978 	uint_t			devid_len;
6979 	dev_t			ddi_dev;
6980 	mddb_did_ic_t		*did_icp;
6981 	mddb_did_blk_t		*did_blkp;
6982 	char			*minor_name;
6983 	size_t			sz;
6984 	int			retval;
6985 	int			err;
6986 	md_dev64_t		dev64; /* tmp var to make code look better */
6987 
6988 
6989 	/* Need disk block(s) to hold mddb_did_blk_t */
6990 	*blk_cnt = MDDB_DID_BLOCKS;
6991 
6992 	if (doit) {
6993 		/*
6994 		 * Alloc mddb_did_blk_t disk block and fill in header area.
6995 		 * Don't fill in did magic number until end of routine so
6996 		 * if machine panics in the middle of conversion, the
6997 		 * device id information will be thrown away at the
6998 		 * next snarfing of this set.
6999 		 * Need to set DEVID_STYLE so that mddb_devid_add will
7000 		 * function properly.
7001 		 */
7002 		/* grab the mutex */
7003 		if ((mddb_setenter(s->s_setno, MDDB_NOINIT, &err)) == NULL) {
7004 			return (1);
7005 		}
7006 		single_thread_start(s);
7007 		lbp->lb_didfirstblk = getfreeblks(s, MDDB_DID_BLOCKS);
7008 		if (lbp->lb_didfirstblk == 0) {
7009 			single_thread_end(s);
7010 			mddb_setexit(s);
7011 			return (1);
7012 		}
7013 		lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
7014 		did_icp = (mddb_did_ic_t *)kmem_zalloc(sizeof (mddb_did_ic_t),
7015 		    KM_SLEEP);
7016 		did_blkp = (mddb_did_blk_t *)kmem_zalloc(MDDB_DID_BYTES,
7017 		    KM_SLEEP);
7018 
7019 		did_blkp->blk_revision = MDDB_REV_DI;
7020 		did_icp->did_ic_blkp = did_blkp;
7021 		s->s_did_icp = did_icp;
7022 		lbp->lb_flags |= MDDB_DEVID_STYLE;
7023 	}
7024 
7025 	/* Fill in information in mddb_did_info_t array */
7026 	for (li = 0; li < lbp->lb_loccnt; li++) {
7027 		lp = &lbp->lb_locators[li];
7028 		if (lp->l_flags & MDDB_F_DELETED)
7029 			continue;
7030 
7031 		dev64 = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
7032 		ddi_dev = md_dev64_to_dev(dev64);
7033 		if (ddi_dev == NODEV) {
7034 			/*
7035 			 * No translation available for replica.
7036 			 * Could fail conversion to device id replica,
7037 			 * but instead will just continue with next
7038 			 * replica in list.
7039 			 */
7040 			continue;
7041 		}
7042 		if (ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) {
7043 			/*
7044 			 * Just count each devid as at least 1 block.  This
7045 			 * is conservative since several device id's may fit
7046 			 * into 1 disk block, but it's better to overestimate
7047 			 * the number of blocks needed than to underestimate.
7048 			 */
7049 			devid_len = (int)ddi_devid_sizeof(ret_devid);
7050 			*blk_cnt += btodb(devid_len + (MDDB_BSIZE - 1));
7051 			if (doit) {
7052 				if (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK,
7053 				    &minor_name) == DDI_SUCCESS) {
7054 					if (mddb_devid_add(s, li, ret_devid,
7055 					    minor_name)) {
7056 						cmn_err(CE_WARN,
7057 						    "Not enough space in metadb"
7058 						    " to add device id for"
7059 						    "  dev: major = %d, "
7060 						    "minor = %d\n",
7061 						    getmajor(ddi_dev),
7062 						    getminor(ddi_dev));
7063 					}
7064 					sz = strlen(minor_name) + 1;
7065 					kmem_free(minor_name, sz);
7066 				}
7067 			}
7068 			ddi_devid_free(ret_devid);
7069 		}
7070 	}
7071 
7072 	if (doit) {
7073 		did_blkp->blk_magic = MDDB_MAGIC_DI;
7074 		retval = push_lb(s);
7075 		(void) upd_med(s, "mddb_lb_did_convert(0)");
7076 		single_thread_end(s);
7077 		mddb_setexit(s);
7078 		if (retval != 0)
7079 			return (1);
7080 	}
7081 
7082 	return (0);
7083 }
7084 
7085 static mddb_set_t *
7086 init_set(
7087 	mddb_config_t	*cp,
7088 	int		flag,
7089 	int		*errp
7090 )
7091 {
7092 	mddb_set_t	*s;
7093 	char		*setname = NULL;
7094 	set_t		setno = MD_LOCAL_SET;
7095 	side_t		sideno = 0;
7096 	struct timeval32 *created = NULL;
7097 
7098 	if (cp != NULL) {
7099 		setname = cp->c_setname;
7100 		setno = cp->c_setno;
7101 		sideno = cp->c_sideno;
7102 		created = &cp->c_timestamp;
7103 	}
7104 
7105 	if (setno >= MD_MAXSETS)
7106 		return ((mddb_set_t *)NULL);
7107 
7108 	if (md_set[setno].s_db)
7109 		return (mddb_setenter(setno, flag, errp));
7110 
7111 	s = (mddb_set_t *)kmem_zalloc(sizeof (*s), KM_SLEEP);
7112 
7113 	cv_init(&s->s_buf_cv, NULL, CV_DEFAULT, NULL);
7114 	cv_init(&s->s_single_thread_cv, NULL, CV_DEFAULT, NULL);
7115 	cv_init(&s->s_optqueuing_cv, NULL, CV_DEFAULT, NULL);
7116 	cv_init(&s->s_opthungerr_cv, NULL, CV_DEFAULT, NULL);
7117 	cv_init(&s->s_optwantlck_cv, NULL, CV_DEFAULT, NULL);
7118 
7119 	s->s_setno = setno;
7120 	s->s_sideno = sideno;
7121 	if (setno == MD_LOCAL_SET) {
7122 		(void) snprintf(s->s_ident.serial, sizeof (s->s_ident.serial),
7123 		    "%u", zone_get_hostid(NULL));
7124 	} else {
7125 		s->s_ident.createtime = *created;
7126 		s->s_setname = (char *)kmem_alloc(strlen(setname) + 1,
7127 		    KM_SLEEP);
7128 		(void) strcpy(s->s_setname, setname);
7129 	}
7130 
7131 	/* have a config struct,  copy mediator information */
7132 	if (cp != NULL)
7133 		s->s_med = cp->c_med;		/* structure assignment */
7134 
7135 	md_set[setno].s_db = (void *) s;
7136 
7137 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_TAKEOVER, SVM_TAG_SET, setno, NODEV64);
7138 
7139 	return (mddb_setenter(setno, flag, errp));
7140 }
7141 
7142 void
7143 mddb_unload_set(
7144 	set_t		setno
7145 )
7146 {
7147 
7148 	mddb_set_t	*s;
7149 	mddb_db_t	*dbp, *adbp = NULL;
7150 	mddb_de_ic_t	*dep, *dep2;
7151 	mddb_bf_t	*bfp;
7152 	int		i;
7153 	md_dev64_t	dev;
7154 
7155 	if ((s = mddb_setenter(setno, MDDB_NOINIT, NULL)) == NULL)
7156 		return;
7157 
7158 	single_thread_start(s);
7159 
7160 	s->s_opthavequeuinglck = 0;
7161 	s->s_optwantqueuinglck = 0;
7162 
7163 	for (dbp = s->s_dbp; dbp != 0; dbp = adbp) {
7164 		for (dep = dbp->db_firstentry; dep != NULL; dep = dep2) {
7165 			if (dep->de_rb_userdata != NULL) {
7166 				if (dep->de_icreqsize)
7167 					kmem_free(dep->de_rb_userdata_ic,
7168 					    dep->de_icreqsize);
7169 				else
7170 					kmem_free(dep->de_rb_userdata,
7171 					    dep->de_reqsize);
7172 			}
7173 			kmem_free((caddr_t)dep->de_rb, dep->de_recsize);
7174 			dep2 = dep->de_next;
7175 			kmem_free((caddr_t)dep, sizeofde(dep));
7176 		}
7177 		adbp = dbp->db_next;
7178 		kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
7179 	}
7180 	s->s_dbp = (mddb_db_t *)NULL;
7181 
7182 	free_rip(&s->s_rip);
7183 
7184 	for (i = 0; i < mddb_maxcopies;	 i++) {
7185 		if (! s->s_mbiarray)
7186 			break;
7187 
7188 		if (! s->s_mbiarray[i])
7189 			continue;
7190 
7191 		dev = md_expldev(s->s_lbp->lb_locators[i].l_dev);
7192 		dev = md_xlate_targ_2_mini(dev);
7193 		if (dev != NODEV64)
7194 			mddb_devclose(dev);
7195 
7196 		free_mbipp(&s->s_mbiarray[i]);
7197 	}
7198 
7199 	if (s->s_mbiarray) {
7200 		kmem_free((caddr_t)s->s_mbiarray,
7201 		    sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
7202 		s->s_mbiarray = (mddb_mb_ic_t **)NULL;
7203 	}
7204 
7205 	if (s->s_lnp) {
7206 		kmem_free((caddr_t)s->s_lnp, dbtob(s->s_lbp->lb_lnblkcnt));
7207 		s->s_lnp = (mddb_ln_t *)NULL;
7208 	}
7209 
7210 	if (s->s_lbp) {
7211 		mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);
7212 		kmem_free((caddr_t)s->s_lbp, dbtob(s->s_lbp->lb_blkcnt));
7213 		s->s_lbp = (mddb_lb_t *)NULL;
7214 	}
7215 
7216 	if (s->s_freebitmap) {
7217 		kmem_free((caddr_t)s->s_freebitmap, s->s_freebitmapsize);
7218 		s->s_freebitmap = NULL;
7219 		s->s_freebitmapsize = 0;
7220 	}
7221 
7222 	while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
7223 		kmem_free((caddr_t)bfp, sizeof (*bfp));
7224 
7225 	if (s->s_databuffer_size) {
7226 		kmem_free(s->s_databuffer, s->s_databuffer_size);
7227 		s->s_databuffer_size = 0;
7228 	}
7229 
7230 	if (s->s_setname != NULL)
7231 		kmem_free((caddr_t)s->s_setname, strlen(s->s_setname)+1);
7232 
7233 	/* Data tags not supported on MN sets. */
7234 	if (!(md_get_setstatus(setno) & MD_SET_MNSET))
7235 		dtl_freel(&s->s_dtlp);
7236 
7237 	md_set[setno].s_db = NULL;
7238 	ASSERT(s->s_singlelockwanted == 0);
7239 	kmem_free(s, sizeof (mddb_set_t));
7240 
7241 	/* Take care of things setup in the md_set array */
7242 	if (! (md_get_setstatus(setno) & MD_SET_KEEPTAG)) {
7243 		if (md_set[setno].s_dtp) {
7244 			kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
7245 			md_set[setno].s_dtp = NULL;
7246 		}
7247 	}
7248 
7249 	md_clr_setstatus(setno, MD_SET_ACCOK | MD_SET_ACCEPT |
7250 	    MD_SET_TAGDATA | MD_SET_USETAG | MD_SET_TOOFEW | MD_SET_STALE |
7251 	    MD_SET_OWNERSHIP | MD_SET_BADTAG | MD_SET_CLRTAG | MD_SET_MNSET |
7252 	    MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK | MD_SET_MN_MIR_STATE_RC |
7253 	    MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);
7254 
7255 	mutex_exit(SETMUTEX(setno));
7256 }
7257 
7258 /*
7259  * returns 0 if name can be put into locator block
7260  * returns 1 if locator block prefixes are all used
7261  *
7262  * Takes splitname (suffix, prefix, sideno) and
7263  * stores it in the locator name structure.
7264  * For traditional diskset, the sideno is the index into the suffixes
7265  * array in the locator name structure.
7266  * For the MN diskset, the sideno is the nodeid which can be any number,
7267  * so the index passed in is the index into the mnsuffixes array
7268  * in the locator structure.  This index was computed by the
7269  * routine checklocator which basically checked the locator block
7270  * mnside locator structure.
7271  */
7272 static int
7273 splitname2locatorblock(
7274 	md_splitname	*spn,
7275 	mddb_ln_t	*lnp,
7276 	int		li,
7277 	side_t		sideno,
7278 	int		index
7279 )
7280 {
7281 	uchar_t			i;
7282 	md_name_suffix		*sn;
7283 	md_mnname_suffix_t	*mnsn;
7284 	mddb_mnln_t		*mnlnp;
7285 
7286 	for (i = 0; i < MDDB_PREFIXCNT; i++) {
7287 		if (lnp->ln_prefixes[i].pre_len != SPN_PREFIX(spn).pre_len)
7288 			continue;
7289 		if (bcmp(lnp->ln_prefixes[i].pre_data, SPN_PREFIX(spn).pre_data,
7290 		    SPN_PREFIX(spn).pre_len) == 0)
7291 			break;
7292 	}
7293 	if (i == MDDB_PREFIXCNT) {
7294 		for (i = 0; i < MDDB_PREFIXCNT; i++) {
7295 			if (lnp->ln_prefixes[i].pre_len == 0)
7296 				break;
7297 		}
7298 		if (i == MDDB_PREFIXCNT)
7299 			return (1);
7300 		bcopy(SPN_PREFIX(spn).pre_data, lnp->ln_prefixes[i].pre_data,
7301 		    SPN_PREFIX(spn).pre_len);
7302 		lnp->ln_prefixes[i].pre_len = SPN_PREFIX(spn).pre_len;
7303 	}
7304 
7305 	if (lnp->ln_revision == MDDB_REV_MNLN) {
7306 		/* If a MN diskset, use index */
7307 		mnlnp = (mddb_mnln_t *)lnp;
7308 		mnsn = &mnlnp->ln_mnsuffixes[index][li];
7309 		mnsn->mn_ln_sideno = sideno;
7310 		mnsn->mn_ln_suffix.suf_len = SPN_SUFFIX(spn).suf_len;
7311 		mnsn->mn_ln_suffix.suf_prefix = i;
7312 		bcopy(SPN_SUFFIX(spn).suf_data,
7313 		    mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_len);
7314 	} else {
7315 		sn = &lnp->ln_suffixes[sideno][li];
7316 		sn->suf_len = SPN_SUFFIX(spn).suf_len;
7317 		sn->suf_prefix = i;
7318 		bcopy(SPN_SUFFIX(spn).suf_data, sn->suf_data,
7319 		    SPN_SUFFIX(spn).suf_len);
7320 	}
7321 	return (0);
7322 }
7323 
7324 /*
7325  * Find the locator name for the given sideno and convert the locator name
7326  * information into a splitname structure.
7327  */
7328 void
7329 mddb_locatorblock2splitname(
7330 	mddb_ln_t	*lnp,
7331 	int		li,
7332 	side_t		sideno,
7333 	md_splitname	*spn
7334 )
7335 {
7336 	int			iprefix;
7337 	md_name_suffix		*sn;
7338 	md_mnname_suffix_t	*mnsn;
7339 	int			i;
7340 	mddb_mnln_t		*mnlnp;
7341 
7342 	if (lnp->ln_revision == MDDB_REV_MNLN) {
7343 		mnlnp = (mddb_mnln_t *)lnp;
7344 		for (i = 0; i < MD_MNMAXSIDES; i++) {
7345 			mnsn = &mnlnp->ln_mnsuffixes[i][li];
7346 			if (mnsn->mn_ln_sideno == sideno)
7347 				break;
7348 		}
7349 		if (i == MD_MNMAXSIDES)
7350 			return;
7351 
7352 		SPN_SUFFIX(spn).suf_len = mnsn->mn_ln_suffix.suf_len;
7353 		bcopy(mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_data,
7354 		    SPN_SUFFIX(spn).suf_len);
7355 		iprefix = mnsn->mn_ln_suffix.suf_prefix;
7356 	} else {
7357 		sn = &lnp->ln_suffixes[sideno][li];
7358 		SPN_SUFFIX(spn).suf_len = sn->suf_len;
7359 		bcopy(sn->suf_data, SPN_SUFFIX(spn).suf_data,
7360 		    SPN_SUFFIX(spn).suf_len);
7361 		iprefix = sn->suf_prefix;
7362 	}
7363 	SPN_PREFIX(spn).pre_len = lnp->ln_prefixes[iprefix].pre_len;
7364 	bcopy(lnp->ln_prefixes[iprefix].pre_data, SPN_PREFIX(spn).pre_data,
7365 	    SPN_PREFIX(spn).pre_len);
7366 }
7367 
7368 static int
7369 getdeldev(
7370 	mddb_config_t	*cp,
7371 	int		command,
7372 	md_error_t	*ep
7373 )
7374 {
7375 	mddb_set_t	*s;
7376 	mddb_lb_t	*lbp;
7377 	mddb_locator_t	*locators;
7378 	uint_t		loccnt;
7379 	mddb_mb_ic_t	*mbip;
7380 	mddb_block_t	blk;
7381 	int		err = 0;
7382 	int		i, j;
7383 	int		li;
7384 	uint_t		commitcnt;
7385 	set_t		setno = cp->c_setno;
7386 	uint_t		set_status;
7387 	md_dev64_t	dev;
7388 	int		flags = MDDB_MUSTEXIST;
7389 
7390 	cp->c_dbmax = MDDB_NLB;
7391 
7392 	/*
7393 	 * Data checking
7394 	 */
7395 	if (setno >= md_nsets || cp->c_id < 0 ||
7396 	    cp->c_id > cp->c_dbmax) {
7397 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
7398 	}
7399 
7400 	if (cp->c_flags & MDDB_C_STALE)
7401 		flags |= MDDB_MN_STALE;
7402 
7403 	if ((s = mddb_setenter(setno, flags, &err)) == NULL)
7404 		return (mddbstatus2error(ep, err, NODEV32, setno));
7405 
7406 	cp->c_flags = 0;
7407 
7408 	lbp = s->s_lbp;
7409 	loccnt = lbp->lb_loccnt;
7410 	locators = lbp->lb_locators;
7411 
7412 	/* shorthand */
7413 	set_status = md_get_setstatus(setno);
7414 
7415 	if (set_status & MD_SET_STALE)
7416 		cp->c_flags |= MDDB_C_STALE;
7417 
7418 	if (set_status & MD_SET_TOOFEW)
7419 		cp->c_flags |= MDDB_C_TOOFEW;
7420 
7421 	cp->c_sideno = s->s_sideno;
7422 
7423 	cp->c_dbcnt = 0;
7424 	/*
7425 	 * go through and count active entries
7426 	 */
7427 	for (i = 0; i < loccnt;	 i++) {
7428 		if (locators[i].l_flags & MDDB_F_DELETED)
7429 			continue;
7430 		cp->c_dbcnt++;
7431 	}
7432 
7433 	/*
7434 	 * add the ability to accept a locator block index
7435 	 * which is not relative to previously deleted replicas.  This
7436 	 * is for support of MD_DEBUG=STAT in metastat since it asks for
7437 	 * replica information specifically for each of the mirror resync
7438 	 * records.  MDDB_CONFIG_SUBCMD uses one of the pad spares in
7439 	 * the mddb_config_t type.
7440 	 */
7441 	if (cp->c_subcmd == MDDB_CONFIG_ABS) {
7442 		if (cp->c_id < 0 || cp->c_id > cp->c_dbmax) {
7443 			mddb_setexit(s);
7444 			return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
7445 			    setno));
7446 		}
7447 		li = cp->c_id;
7448 	} else {
7449 		if (cp->c_id >= cp->c_dbcnt) {
7450 			mddb_setexit(s);
7451 			return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
7452 			    setno));
7453 		}
7454 
7455 		/* CSTYLED */
7456 		for (li = 0, j = 0; /* void */; li++) {
7457 			if (locators[li].l_flags & MDDB_F_DELETED)
7458 				continue;
7459 			j++;
7460 			if (j > cp->c_id)
7461 				break;
7462 		}
7463 	}
7464 
7465 	if (command == MDDB_ENDDEV) {
7466 		daddr_t ib = 0, jb;
7467 
7468 		blk = 0;
7469 		if ((s != NULL) && s->s_mbiarray[li]) {
7470 			mbip = s->s_mbiarray[li];
7471 			while ((jb = getphysblk(blk++, mbip)) > 0) {
7472 				if (jb > ib)
7473 					ib = jb;
7474 			}
7475 			cp->c_dbend = (int)ib;
7476 		} else {
7477 			cp->c_dbend = 0;
7478 		}
7479 	}
7480 
7481 	locator2cfgloc(lbp, &cp->c_locator, li, s->s_sideno, s->s_did_icp);
7482 	mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno, &cp->c_devname);
7483 
7484 	if (command != MDDB_DELDEV) {
7485 		mddb_setexit(s);
7486 		return (0);
7487 	}
7488 
7489 	/* Currently don't allow addition/deletion of sides during upgrade */
7490 	if (MD_UPGRADE) {
7491 		cmn_err(CE_WARN,
7492 		    "Deletion of replica not allowed during upgrade.\n");
7493 		mddb_setexit(s);
7494 		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
7495 	}
7496 
7497 	/*
7498 	 * If here, replica delete in progress.
7499 	 */
7500 	single_thread_start(s);
7501 
7502 	if ((! (locators[li].l_flags & MDDB_F_EMASTER)) &&
7503 	    (locators[li].l_flags & MDDB_F_ACTIVE)) {
7504 		commitcnt = lbp->lb_commitcnt;
7505 		lbp->lb_commitcnt = 0;
7506 		setidentifier(s, &lbp->lb_ident);
7507 		crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
7508 		/*
7509 		 * Don't need to write out device id area, since locator
7510 		 * block on this replica is being deleted by setting the
7511 		 * commitcnt to 0.
7512 		 */
7513 		(void) writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
7514 		    MDDB_WR_ONLY_MASTER);
7515 		lbp->lb_commitcnt = commitcnt;
7516 	}
7517 
7518 	if (s->s_mbiarray[li])
7519 		free_mbipp(&s->s_mbiarray[li]);
7520 
7521 	if (! (locators[li].l_flags & MDDB_F_EMASTER)) {
7522 		dev = md_expldev(locators[li].l_dev);
7523 		dev = md_xlate_targ_2_mini(dev);
7524 		if (dev != NODEV64)
7525 			mddb_devclose(dev);
7526 	}
7527 
7528 	s->s_mbiarray[li] = 0;
7529 	lbp->lb_locators[li].l_flags = MDDB_F_DELETED;
7530 
7531 	/* Only support data tags for traditional and local sets */
7532 	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
7533 	    (!(lbp->lb_flags & MDDB_MNSET)) &&
7534 	    setno != MD_LOCAL_SET)
7535 		if (set_dtag(s, ep))
7536 			mdclrerror(ep);
7537 
7538 	/* Write data tags to all accessible devices */
7539 	/* Only support data tags for traditional and local sets */
7540 	if (!(lbp->lb_flags & MDDB_MNSET)) {
7541 		(void) dt_write(s);
7542 	}
7543 
7544 	/* Delete device id of deleted replica */
7545 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
7546 		(void) mddb_devid_delete(s, li);
7547 	}
7548 	/* write new locator to all devices */
7549 	err = writelocall(s);
7550 
7551 	(void) upd_med(s, "getdeldev(0)");
7552 
7553 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_REPLICA, setno,
7554 	    md_expldev(locators[li].l_dev));
7555 
7556 	computefreeblks(s); /* recompute always it may be larger */
7557 	cp->c_dbcnt--;
7558 	err |= fixoptrecords(s);
7559 	if (err) {
7560 		if (writeretry(s)) {
7561 			single_thread_end(s);
7562 			mddb_setexit(s);
7563 			return (mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno));
7564 		}
7565 	}
7566 
7567 	single_thread_end(s);
7568 	mddb_setexit(s);
7569 	return (0);
7570 }
7571 
7572 static int
7573 getdriver(
7574 	mddb_cfg_loc_t	*clp
7575 )
7576 {
7577 	major_t		majordev;
7578 
7579 	/*
7580 	 * Data checking
7581 	 */
7582 	if (clp->l_dev <= 0)
7583 		return (EINVAL);
7584 
7585 	majordev = getmajor(expldev(clp->l_dev));
7586 
7587 	if (ddi_major_to_name(majordev) == (char *)NULL)
7588 		return (EINVAL);
7589 
7590 	if (MD_UPGRADE)
7591 		(void) strcpy(clp->l_driver, md_targ_major_to_name(majordev));
7592 	else
7593 		(void) strcpy(clp->l_driver, ddi_major_to_name(majordev));
7594 	return (0);
7595 }
7596 
7597 /*
7598  * update_valid_replica - updates the locator block namespace (prefix
7599  * 	and/or suffix) with new pathname and devname.
7600  *	RETURN
7601  *		1	Error
7602  *		0	Success
7603  */
7604 static int
7605 update_valid_replica(
7606 	side_t		side,
7607 	mddb_locator_t	*lp,
7608 	mddb_set_t	*s,
7609 	int		li,
7610 	char		*devname,
7611 	char		*pathname,
7612 	md_dev64_t	devt
7613 )
7614 {
7615 	uchar_t		pre_len, suf_len;
7616 	md_name_suffix	*sn;
7617 	mddb_ln_t	*lnp;
7618 	uchar_t		pre_index;
7619 	uchar_t		i;
7620 
7621 	if (md_expldev(lp->l_dev) != devt) {
7622 		return (0);
7623 	}
7624 
7625 	if (pathname[strlen(pathname) - 1] == '/')
7626 		pathname[strlen(pathname) - 1] = '\0';
7627 
7628 	pre_len = (uchar_t)strlen(pathname);
7629 	suf_len = (uchar_t)strlen(devname);
7630 
7631 	if ((pre_len > MD_MAXPREFIX) || (suf_len > MD_MAXSUFFIX))
7632 		return (1);
7633 
7634 	lnp = s->s_lnp;
7635 
7636 	/*
7637 	 * Future note:  Need to do something here for the MN diskset case
7638 	 * when device ids are supported in disksets.
7639 	 * Can't add until merging devids_in_diskset code into code base
7640 	 * Currently only called with side of 0.
7641 	 */
7642 
7643 	sn = &lnp->ln_suffixes[side][li];
7644 
7645 	/*
7646 	 * Check if prefix (Ex: /dev/dsk) needs to be changed.
7647 	 * If new prefix is the same as the previous prefix - no change.
7648 	 *
7649 	 * If new prefix is not the same, check if new prefix
7650 	 * matches an existing one.  If so, use that one.
7651 	 *
7652 	 * If new prefix doesn't exist, add a new prefix.  If not enough
7653 	 * space, return failure.
7654 	 */
7655 	pre_index = sn->suf_prefix;
7656 	/* Check if new prefix is the same as the old prefix. */
7657 	if ((lnp->ln_prefixes[pre_index].pre_len != pre_len) ||
7658 	    (bcmp(lnp->ln_prefixes[pre_index].pre_data, pathname,
7659 	    pre_len) != 0)) {
7660 		/* Check if new prefix is an already known prefix. */
7661 		for (i = 0; i < MDDB_PREFIXCNT; i++) {
7662 			if (lnp->ln_prefixes[i].pre_len != pre_len) {
7663 				continue;
7664 			}
7665 			if (bcmp(lnp->ln_prefixes[i].pre_data, pathname,
7666 			    pre_len) == 0) {
7667 				break;
7668 			}
7669 		}
7670 		/* If no match found for new prefix - add the new prefix */
7671 		if (i == MDDB_PREFIXCNT) {
7672 			for (i = 0; i < MDDB_PREFIXCNT; i++) {
7673 				if (lnp->ln_prefixes[i].pre_len == 0)
7674 					break;
7675 			}
7676 			/* No space to add new prefix - return failure */
7677 			if (i == MDDB_PREFIXCNT) {
7678 				return (1);
7679 			}
7680 			bcopy(pathname, lnp->ln_prefixes[i].pre_data, pre_len);
7681 			lnp->ln_prefixes[i].pre_len = pre_len;
7682 		}
7683 		sn->suf_prefix = i;
7684 	}
7685 
7686 	/* Now, update the suffix (Ex: c0t0d0s0) if needed */
7687 	if ((sn->suf_len != suf_len) ||
7688 	    (bcmp(sn->suf_data, devname, suf_len) != 0)) {
7689 		bcopy(devname, sn->suf_data, suf_len);
7690 		sn->suf_len = suf_len;
7691 	}
7692 	return (0);
7693 }
7694 
7695 
7696 /*
7697  * md_update_locator_namespace - If in devid style and active and the devid's
7698  *		exist and are valid update the locator namespace pathname
7699  *		and devname.
7700  *	RETURN
7701  *		1	Error
7702  *		0	Success
7703  */
7704 int
7705 md_update_locator_namespace(
7706 	set_t		setno,		/* which set to get name from */
7707 	side_t		side,
7708 	char		*dname,
7709 	char		*pname,
7710 	md_dev64_t	devt
7711 )
7712 {
7713 	mddb_set_t	*s;
7714 	mddb_lb_t	*lbp;
7715 	int		li;
7716 	uint_t		flg;
7717 	int		err = 0;
7718 	mddb_ln_t	*lnp;
7719 
7720 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
7721 		return (1);
7722 	single_thread_start(s);
7723 	lbp = s->s_lbp;
7724 	/* must be DEVID_STYLE */
7725 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
7726 		for (li = 0; li < lbp->lb_loccnt; li++) {
7727 			mddb_locator_t *lp = &lbp->lb_locators[li];
7728 
7729 			if (lp->l_flags & MDDB_F_DELETED) {
7730 				continue;
7731 			}
7732 
7733 			/* replica also must be active */
7734 			if (lp->l_flags & MDDB_F_ACTIVE) {
7735 				flg = s->s_did_icp->did_ic_blkp->
7736 				    blk_info[li].info_flags;
7737 				/* only update if did exists and is valid */
7738 				if ((flg & MDDB_DID_EXISTS) &&
7739 				    (flg & MDDB_DID_VALID)) {
7740 					if (update_valid_replica(side, lp, s,
7741 					    li, dname, pname, devt)) {
7742 						err = 1;
7743 						goto out;
7744 					}
7745 				}
7746 			}
7747 		}
7748 	}
7749 	lnp = s->s_lnp;
7750 	uniqtime32(&lnp->ln_timestamp);
7751 	if (lbp->lb_flags & MDDB_MNSET)
7752 		lnp->ln_revision = MDDB_REV_MNLN;
7753 	else
7754 		lnp->ln_revision = MDDB_REV_LN;
7755 	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
7756 	err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
7757 	    lbp->lb_lnblkcnt, 0);
7758 	/*
7759 	 * If a MN diskset and this is the master, set the PARSE_LOCNM
7760 	 * flag in the mddb_set structure to show that the locator
7761 	 * names have changed.
7762 	 */
7763 
7764 	if ((lbp->lb_flags & MDDB_MNSET) &&
7765 	    (md_set[s->s_setno].s_am_i_master)) {
7766 		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
7767 	}
7768 out:
7769 	single_thread_end(s);
7770 	mddb_setexit(s);
7771 	if (err)
7772 		return (1);
7773 	return (0);
7774 }
7775 
7776 /*
7777  * update_locatorblock - for active entries in the locator block, check
7778  *		the devt to see if it matches the given devt. If so, and
7779  *		there is an associated device id which is not the same
7780  *		as the passed in devid, delete old devid and add a new one.
7781  *
7782  *		During import of replicated disksets, old_didptr contains
7783  *		the original disk's device id.  Use this device id in
7784  *		addition to the devt to determine if an entry is a match
7785  *		and should be updated with the new device id of the
7786  *		replicated disk.  Specifically, this is the case being handled:
7787  *
7788  *		Original_disk	Replicated_disk	Disk_Available_During_Import
7789  *		c1t1d0		c1t3d0		no - so old name c1t1d0 shown
7790  *		c1t2d0		c1t1d0		yes - name is c1t1d0
7791  *		c1t3d0		c1t2d0		yes - name is c1t2d0
7792  *
7793  *		Can't just match on devt since devt for the first and third
7794  *		disks will be the same, but the original disk's device id
7795  *		is known and can be used to distinguish which disk's
7796  *		replicated device id should be updated.
7797  *	RETURN
7798  *		MDDB_E_NODEVID
7799  *		MDDB_E_NOLOCBLK
7800  *		1	Error
7801  *		0	Success
7802  */
7803 static int
7804 update_locatorblock(
7805 	mddb_set_t	*s,
7806 	md_dev64_t	dev,
7807 	ddi_devid_t	didptr,
7808 	ddi_devid_t	old_didptr
7809 )
7810 {
7811 	mddb_lb_t	*lbp = NULL;
7812 	mddb_locator_t	*lp;
7813 	int		li;
7814 	uint_t		flg;
7815 	ddi_devid_t	devid_ptr;
7816 	int		retval = 0;
7817 	char		*minor_name;
7818 	int		repl_import_flag;
7819 
7820 	/* Set replicated flag if this is a replicated import */
7821 	repl_import_flag = md_get_setstatus(s->s_setno) &
7822 	    MD_SET_REPLICATED_IMPORT;
7823 
7824 	lbp = s->s_lbp;
7825 	/* find replicas that haven't been deleted */
7826 	for (li = 0; li < lbp->lb_loccnt; li++) {
7827 		lp = &lbp->lb_locators[li];
7828 
7829 		if ((lp->l_flags & MDDB_F_DELETED)) {
7830 			continue;
7831 		}
7832 		/*
7833 		 * check to see if locator devt matches given dev
7834 		 * and if there is a device ID associated with it
7835 		 */
7836 		flg = s->s_did_icp->did_ic_blkp-> blk_info[li].info_flags;
7837 		if ((md_expldev(lp->l_dev) == dev) &&
7838 		    (flg & MDDB_DID_EXISTS)) {
7839 			if (flg & MDDB_DID_VALID) {
7840 				continue; /* cont to nxt active entry */
7841 			}
7842 			devid_ptr = s->s_did_icp->did_ic_devid[li];
7843 			if (devid_ptr == NULL) {
7844 				return (MDDB_E_NODEVID);
7845 			}
7846 
7847 			/*
7848 			 * During a replicated import the old_didptr
7849 			 * must match the current devid before the
7850 			 * devid can be updated.
7851 			 */
7852 			if (repl_import_flag) {
7853 				if (ddi_devid_compare(devid_ptr,
7854 				    old_didptr) != 0)
7855 					continue;
7856 			}
7857 
7858 			if (ddi_devid_compare(devid_ptr, didptr) != 0) {
7859 				/*
7860 				 * devid's not equal so
7861 				 * delete and add
7862 				 */
7863 				if (ddi_lyr_get_minor_name(
7864 				    md_dev64_to_dev(dev),
7865 				    S_IFBLK, &minor_name) == DDI_SUCCESS) {
7866 					(void) mddb_devid_delete(s, li);
7867 					(void) mddb_devid_add(s, li, didptr,
7868 					    minor_name);
7869 					kmem_free(minor_name,
7870 					    strlen(minor_name)+1);
7871 					break;
7872 				} else {
7873 					retval = 1;
7874 					goto err_out;
7875 				}
7876 			}
7877 		}
7878 	} /* end for */
7879 	retval = push_lb(s);
7880 	(void) upd_med(s, "update_locatorblock(0)");
7881 err_out:
7882 	return (retval);
7883 }
7884 
7885 static int
7886 update_mb_devid(
7887 	mddb_set_t	*s,
7888 	mddb_ri_t	*rip,
7889 	ddi_devid_t	devidptr
7890 )
7891 {
7892 	mddb_mb_ic_t	*mbip;
7893 	mddb_mb_t	*mb = NULL;
7894 	daddr_t		blkno;
7895 	md_dev64_t	device;
7896 	uint_t		sz;
7897 	int		mb2free = 0;
7898 	int		err = 0;
7899 
7900 
7901 	/*
7902 	 * There is case where a disk may not have mddb,
7903 	 * and only has dummy mddb which contains
7904 	 * a valid devid we like to update and in this
7905 	 * case, the rip_lbp will be NULL but we still
7906 	 * like to update the devid embedded in the
7907 	 * dummy mb block.
7908 	 *
7909 	 */
7910 	if (rip->ri_mbip != (mddb_mb_ic_t *)NULL) {
7911 		mbip = rip->ri_mbip;
7912 		mb = &mbip->mbi_mddb_mb;
7913 	} else {
7914 		/*
7915 		 * Done if it is non-replicated set
7916 		 */
7917 		if (devidptr != (ddi_devid_t)NULL) {
7918 			mb = (mddb_mb_t *)kmem_zalloc(MDDB_BSIZE,
7919 			    KM_SLEEP);
7920 			mb->mb_magic = MDDB_MAGIC_DU;
7921 			mb->mb_revision = MDDB_REV_MB;
7922 			mb2free = 1;
7923 		} else {
7924 			goto out;
7925 		}
7926 	}
7927 
7928 	blkno = rip->ri_blkno;
7929 	device = rip->ri_dev;
7930 	/*
7931 	 * Replace the mb_devid with the new/valid one
7932 	 */
7933 	if (devidptr != (ddi_devid_t)NULL) {
7934 		/*
7935 		 * Zero out what we have previously
7936 		 */
7937 		if (mb->mb_devid_len)
7938 			bzero(mb->mb_devid, mb->mb_devid_len);
7939 		sz = ddi_devid_sizeof(devidptr);
7940 		bcopy((char *)devidptr, (char *)mb->mb_devid, sz);
7941 		mb->mb_devid_len = sz;
7942 	}
7943 
7944 	mb->mb_setno = s->s_setno;
7945 	uniqtime32(&mb->mb_timestamp);
7946 	crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL);
7947 	/*
7948 	 * putblks will
7949 	 *
7950 	 *	- drop the s_dbmx lock
7951 	 *	- biowait
7952 	 *	- regain the s_dbmx lock
7953 	 *
7954 	 * Need to update this if we wants to handle
7955 	 * mb_next != NULL which it is unlikely will happen
7956 	 */
7957 	err = putblks(s, (caddr_t)mb, blkno, 1, device, 0);
7958 
7959 	if (mb2free) {
7960 		kmem_free(mb, MDDB_BSIZE);
7961 	}
7962 out:
7963 	return (err);
7964 }
7965 
7966 static int
7967 setdid(
7968 	mddb_config_t		*cp
7969 )
7970 {
7971 	ddi_devid_t		devidp;
7972 	dev_t			ddi_dev;
7973 	mddb_set_t		*s;
7974 	int			err = 0;
7975 	mddb_ri_t		*rip;
7976 
7977 	/*
7978 	 * Data integrity check
7979 	 */
7980 	if (cp->c_setno >= md_nsets || cp->c_devt <= 0)
7981 		return (EINVAL);
7982 
7983 	if ((md_get_setstatus(cp->c_setno) & MD_SET_STALE))
7984 		return (0);
7985 
7986 	ddi_dev = md_dev64_to_dev(cp->c_devt);
7987 	if (ddi_lyr_get_devid(ddi_dev, &devidp) != DDI_SUCCESS) {
7988 		return (-1);
7989 	}
7990 	if (devidp == NULL) {
7991 		return (-1);
7992 	}
7993 
7994 	if ((s = mddb_setenter(cp->c_setno, MDDB_MUSTEXIST, &err)) == NULL)
7995 		return (-1);
7996 	single_thread_start(s);
7997 
7998 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
7999 		if (rip->ri_lbp == (mddb_lb_t *)NULL)
8000 			continue;
8001 		/*
8002 		 * We only update what is asked
8003 		 */
8004 		if (rip->ri_dev == cp->c_devt) {
8005 			if (update_mb_devid(s, rip, devidp) != 0) {
8006 				err = -1;
8007 				goto out;
8008 			}
8009 		}
8010 	}
8011 
8012 	if (update_locatorblock(s, cp->c_devt, devidp, NULL)) {
8013 		err = -1;
8014 		goto out;
8015 	}
8016 
8017 out:
8018 	single_thread_end(s);
8019 	mddb_setexit(s);
8020 	ddi_devid_free(devidp);
8021 	return (err);
8022 }
8023 
8024 static int
8025 delnewside(
8026 	mddb_config_t		*cp,
8027 	int			command,
8028 	md_error_t		*ep
8029 )
8030 {
8031 	mddb_set_t		*s;
8032 	int			li;
8033 	mddb_lb_t		*lbp;		/* pointer to locator block */
8034 	mddb_ln_t		*lnp;		/* pointer to locator names */
8035 	mddb_mnln_t		*mnlnp;		/* pointer to locator names */
8036 	mddb_locator_t		*lp;
8037 	mddb_sidelocator_t	*slp;
8038 	mddb_cfg_loc_t		*clp;
8039 	int			err = 0;
8040 	set_t			setno = cp->c_setno;
8041 	ddi_devid_t		devid;
8042 	ddi_devid_t		ret_devid = NULL;
8043 	char			*minor_name;
8044 	uint_t			use_devid = 0;
8045 	dev_t			ddi_dev;
8046 	md_mnname_suffix_t	*mnsn;
8047 	mddb_mnlb_t		*mnlbp;
8048 	mddb_mnsidelocator_t	*mnslp;
8049 
8050 	/* Currently don't allow addition/deletion of sides during upgrade */
8051 	if (MD_UPGRADE) {
8052 		cmn_err(CE_WARN,
8053 		    "Addition and deletion of sides not allowed"
8054 		    " during upgrade. \n");
8055 		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8056 	}
8057 
8058 	/*
8059 	 * Data integrity check
8060 	 */
8061 	if (setno >= md_nsets || cp->c_locator.l_dev <= 0)
8062 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
8063 
8064 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
8065 		return (mddbstatus2error(ep, err, NODEV32, setno));
8066 
8067 	single_thread_start(s);
8068 	clp = &cp->c_locator;
8069 
8070 	lbp = s->s_lbp;
8071 
8072 	if (lbp->lb_setno != setno) {
8073 		single_thread_end(s);
8074 		mddb_setexit(s);
8075 		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
8076 	}
8077 
8078 	/*
8079 	 * Find this device/blkno pair
8080 	 */
8081 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
8082 		ddi_dev = md_dev64_to_dev(clp->l_dev);
8083 		if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) &&
8084 		    (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, &minor_name)
8085 		    == DDI_SUCCESS)) {
8086 			if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) {
8087 				clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
8088 				use_devid = 1;
8089 				(void) strcpy(clp->l_minor_name, minor_name);
8090 			}
8091 			kmem_free(minor_name, strlen(minor_name)+1);
8092 		}
8093 		if (use_devid != 1 && ret_devid != NULL)
8094 			ddi_devid_free(ret_devid);
8095 	}
8096 	for (li = 0; li < lbp->lb_loccnt; li++) {
8097 		lp = &lbp->lb_locators[li];
8098 		if (lp->l_flags & MDDB_F_DELETED)
8099 			continue;
8100 		if (use_devid) {
8101 			if ((mddb_devid_get(s, li, &devid, &minor_name)) == 0)
8102 				continue;
8103 			if ((ddi_devid_compare(devid,
8104 			    (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) &&
8105 			    (strcmp(clp->l_minor_name, minor_name) == 0) &&
8106 			    ((daddr_t)lp->l_blkno == clp->l_blkno)) {
8107 				break;
8108 			}
8109 		} else {
8110 			if (lp->l_dev == clp->l_dev &&
8111 			    (daddr_t)lp->l_blkno == clp->l_blkno) {
8112 				break;
8113 			}
8114 		}
8115 	}
8116 
8117 	if (li == lbp->lb_loccnt) {
8118 		if (use_devid)
8119 			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8120 		single_thread_end(s);
8121 		mddb_setexit(s);
8122 		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
8123 	}
8124 
8125 	lnp = s->s_lnp;
8126 	if (command == MDDB_NEWSIDE) {
8127 		int 	index = 0;
8128 		/*
8129 		 * If a MN diskset, need to find the index where the new
8130 		 * locator information is to be stored in the mnsidelocator
8131 		 * field of the locator block so that the locator name can
8132 		 * be stored at the same array index in the mnsuffixes
8133 		 * field of the locator names structure.
8134 		 */
8135 		if (lbp->lb_flags & MDDB_MNSET) {
8136 			if ((index = checklocator(lbp, li,
8137 			    cp->c_sideno)) == -1) {
8138 				if (use_devid) {
8139 					ddi_devid_free((ddi_devid_t)
8140 					    (uintptr_t)clp->l_devid);
8141 				}
8142 				single_thread_end(s);
8143 				mddb_setexit(s);
8144 				return (mdmddberror(ep, MDE_DB_TOOSMALL,
8145 				    NODEV32, setno));
8146 			}
8147 		}
8148 
8149 		/*
8150 		 * Store the locator name before the sidelocator information
8151 		 * in case a panic occurs between these 2 steps.  Must have
8152 		 * the locator name information in order to print reasonable
8153 		 * error information.
8154 		 */
8155 		if (splitname2locatorblock(&cp->c_devname, lnp, li,
8156 		    cp->c_sideno, index)) {
8157 			if (use_devid)
8158 				ddi_devid_free(
8159 				    (ddi_devid_t)(uintptr_t)clp->l_devid);
8160 			single_thread_end(s);
8161 			mddb_setexit(s);
8162 			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
8163 			    setno));
8164 		}
8165 
8166 		if (cfgloc2locator(lbp, clp, li, cp->c_sideno, index)) {
8167 			if (use_devid)
8168 				ddi_devid_free(
8169 				    (ddi_devid_t)(uintptr_t)clp->l_devid);
8170 			single_thread_end(s);
8171 			mddb_setexit(s);
8172 			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
8173 			    setno));
8174 		}
8175 	}
8176 
8177 	if (use_devid)
8178 		ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8179 
8180 	if (command == MDDB_DELSIDE) {
8181 		int i;
8182 		for (i = 0; i < lbp->lb_loccnt; i++) {
8183 			if (lbp->lb_flags & MDDB_MNSET) {
8184 				int	j;
8185 				mnlbp = (mddb_mnlb_t *)lbp;
8186 				for (j = 0; j < MD_MNMAXSIDES; j++) {
8187 					mnslp = &mnlbp->lb_mnsidelocators[j][i];
8188 					if (mnslp->mnl_sideno == cp->c_sideno)
8189 						break;
8190 				}
8191 				if (j < MD_MNMAXSIDES) {
8192 					mnslp->mnl_mnum = NODEV32;
8193 					mnslp->mnl_sideno = 0;
8194 					mnlnp = (mddb_mnln_t *)lnp;
8195 					mnsn = &(mnlnp->ln_mnsuffixes[j][i]);
8196 					bzero((caddr_t)mnsn,
8197 					    sizeof (md_mnname_suffix_t));
8198 				}
8199 			} else {
8200 				slp = &lbp->lb_sidelocators[cp->c_sideno][i];
8201 				bzero((caddr_t)&lnp->ln_suffixes
8202 				    [cp->c_sideno][i], sizeof (md_name_suffix));
8203 				slp->l_mnum = NODEV32;
8204 			}
8205 		}
8206 	}
8207 
8208 	/* write new locator names to all devices */
8209 	uniqtime32(&lnp->ln_timestamp);
8210 	if (lbp->lb_flags & MDDB_MNSET)
8211 		lnp->ln_revision = MDDB_REV_MNLN;
8212 	else
8213 		lnp->ln_revision = MDDB_REV_LN;
8214 	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
8215 	err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
8216 	    lbp->lb_lnblkcnt, 0);
8217 	/*
8218 	 * If a MN diskset and this is the master, set the PARSE_LOCNM
8219 	 * flag in the mddb_set structure to show that the locator
8220 	 * names have changed.
8221 	 */
8222 
8223 	if ((lbp->lb_flags & MDDB_MNSET) &&
8224 	    (md_set[s->s_setno].s_am_i_master)) {
8225 		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
8226 	}
8227 	if (err) {
8228 		if (writeretry(s)) {
8229 			single_thread_end(s);
8230 			mddb_setexit(s);
8231 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8232 		}
8233 	}
8234 
8235 	uniqtime32(&lbp->lb_timestamp);
8236 	/* write new locator to all devices */
8237 	err = writelocall(s);
8238 
8239 	(void) upd_med(s, "delnewside(0)");
8240 
8241 	computefreeblks(s); /* recompute always it may be larger */
8242 	if (err) {
8243 		if (writeretry(s)) {
8244 			single_thread_end(s);
8245 			mddb_setexit(s);
8246 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8247 		}
8248 	}
8249 
8250 	single_thread_end(s);
8251 	mddb_setexit(s);
8252 
8253 	return (0);
8254 }
8255 
8256 static int
8257 newdev(
8258 	mddb_config_t	*cp,
8259 	int		command,
8260 	md_error_t	*ep
8261 )
8262 {
8263 	mddb_set_t	*s;
8264 	mddb_mb_ic_t	*mbip, *mbip1;
8265 	int		i, j;
8266 	int		li;
8267 	mddb_lb_t	*lbp;		/* pointer to locator block */
8268 	mddb_ln_t	*lnp;		/* pointer to locator names */
8269 	mddb_locator_t	*lp;
8270 	mddb_cfg_loc_t	*clp;
8271 	int		err = 0;
8272 	set_t		setno = cp->c_setno;
8273 	ddi_devid_t	devid2;
8274 	ddi_devid_t	ret_devid = NULL;
8275 	char		*minor_name;
8276 	uint_t		use_devid = 0;
8277 	dev_t		ddi_dev;
8278 	int		old_flags;
8279 	int		flags;
8280 	int		mn_set = 0;
8281 	int		index;
8282 
8283 
8284 	/* Currently don't allow addition of new replica during upgrade */
8285 	if (MD_UPGRADE) {
8286 		cmn_err(CE_WARN,
8287 		    "Addition of new replica not allowed during upgrade.\n");
8288 		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8289 	}
8290 
8291 	/*
8292 	 * Data integrity check
8293 	 */
8294 	if (setno >= md_nsets || cp->c_locator.l_dev <= 0)
8295 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
8296 
8297 	/* Determine the flag settings for multinode sets */
8298 	flags = MDDB_NOOLDOK;
8299 	if (cp->c_multi_node)
8300 		flags |= MDDB_MULTINODE;
8301 
8302 	if ((s = mddb_setenter(setno, flags, &err)) == NULL) {
8303 		if (err != MDDB_E_NOTOWNER)
8304 			return (mddbstatus2error(ep, err, NODEV32, setno));
8305 		s = init_set(cp, flags, &err);
8306 		if (s == NULL)
8307 			return (mddbstatus2error(ep, err, NODEV32, setno));
8308 	}
8309 
8310 	single_thread_start(s);
8311 
8312 	/* shorthand */
8313 	clp = &cp->c_locator;
8314 
8315 	/* shorthand */
8316 	lbp = s->s_lbp;
8317 
8318 	if (lbp->lb_setno != setno) {
8319 		single_thread_end(s);
8320 		mddb_setexit(s);
8321 		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
8322 	}
8323 
8324 	/*
8325 	 * See if this device/blkno pair is already a replica
8326 	 */
8327 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
8328 		ddi_dev = expldev(clp->l_dev);
8329 		if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) &&
8330 		    (ddi_lyr_get_minor_name(ddi_dev,
8331 		    S_IFBLK, &minor_name) == DDI_SUCCESS)) {
8332 			if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) {
8333 				clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
8334 				use_devid = 1;
8335 				(void) strcpy(clp->l_minor_name, minor_name);
8336 			}
8337 			kmem_free(minor_name, strlen(minor_name)+1);
8338 		}
8339 		if (use_devid != 1 && ret_devid != NULL)
8340 			ddi_devid_free(ret_devid);
8341 	}
8342 
8343 	for (i = 0; i < lbp->lb_loccnt;	 i++) {
8344 		lp = &lbp->lb_locators[i];
8345 		if (lp->l_flags & MDDB_F_DELETED)
8346 			continue;
8347 		if (use_devid) {
8348 			if ((mddb_devid_get(s, i, &devid2, &minor_name)) == 0)
8349 				continue;
8350 			if ((ddi_devid_compare(devid2,
8351 			    (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) &&
8352 			    (strcmp(clp->l_minor_name, minor_name) == 0) &&
8353 			    ((daddr_t)lp->l_blkno == clp->l_blkno)) {
8354 				if (command == MDDB_NEWDEV) {
8355 					ddi_devid_free((ddi_devid_t)(uintptr_t)
8356 					    clp->l_devid);
8357 					single_thread_end(s);
8358 					mddb_setexit(s);
8359 					return (mdmddberror(ep,
8360 					    MDE_DB_EXISTS, NODEV32, setno));
8361 				}
8362 			}
8363 		} else {
8364 			if (lp->l_dev == clp->l_dev &&
8365 			    (daddr_t)lp->l_blkno == clp->l_blkno) {
8366 				if (command == MDDB_NEWDEV) {
8367 					single_thread_end(s);
8368 					mddb_setexit(s);
8369 					return (mdmddberror(ep,
8370 					    MDE_DB_EXISTS, NODEV32, setno));
8371 				}
8372 			}
8373 		}
8374 	}
8375 
8376 	/*
8377 	 * Really is a new replica, go get the master blocks
8378 	 */
8379 	mbip = getmasters(s, md_expldev(clp->l_dev), clp->l_blkno,
8380 	    (uint_t *)0, &mn_set);
8381 	if (! mbip) {
8382 		if (use_devid)
8383 			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8384 		single_thread_end(s);
8385 		mddb_setexit(s);
8386 		return (mdmddberror(ep, MDE_DB_MASTER, NODEV32, setno));
8387 	}
8388 
8389 	/*
8390 	 * Compute free blocks in replica.
8391 	 */
8392 	computefreeblks(s);
8393 
8394 	/*
8395 	 * Check if this is large enough
8396 	 */
8397 	for (mbip1 = mbip, i = 0; mbip1 != NULL; mbip1 = mbip1->mbi_next)
8398 		i += mbip1->mbi_mddb_mb.mb_blkcnt;
8399 	for (j = i; j < s->s_totalblkcnt; j++) {
8400 		if (blkcheck(s, j)) {
8401 			while (mbip) {
8402 				mbip1 = mbip->mbi_next;
8403 				kmem_free((caddr_t)mbip, MDDB_IC_BSIZE);
8404 				mbip = mbip1;
8405 			}
8406 			if (use_devid)
8407 				ddi_devid_free(
8408 				    (ddi_devid_t)(uintptr_t)clp->l_devid);
8409 			mddb_devclose(md_expldev(clp->l_dev));
8410 			single_thread_end(s);
8411 			mddb_setexit(s);
8412 			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
8413 			    setno));
8414 		}
8415 	}
8416 
8417 	/* Look for a deleted slot */
8418 	for (li = 0; li < lbp->lb_loccnt; li++) {
8419 		lp = &lbp->lb_locators[li];
8420 		if (lp->l_flags & MDDB_F_DELETED)
8421 			break;
8422 	}
8423 
8424 	/* If no deleted slots, add a new one */
8425 	if (li == lbp->lb_loccnt) {
8426 		/* Already have the max replicas, bail */
8427 		if (lbp->lb_loccnt == MDDB_NLB) {
8428 			if (use_devid)
8429 				ddi_devid_free((ddi_devid_t)(uintptr_t)
8430 				    clp->l_devid);
8431 			mddb_devclose(md_expldev(clp->l_dev));
8432 			single_thread_end(s);
8433 			mddb_setexit(s);
8434 			return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32,
8435 			    setno));
8436 		}
8437 		lbp->lb_loccnt++;
8438 		lp = &lbp->lb_locators[li];
8439 	}
8440 
8441 	/* Initialize the new or deleted slot */
8442 	old_flags = lp->l_flags;
8443 	lp->l_dev = clp->l_dev;
8444 	lp->l_blkno = (daddr32_t)clp->l_blkno;
8445 	lp->l_flags = clp->l_flags;
8446 
8447 	/* shorthand */
8448 	lnp = s->s_lnp;
8449 
8450 	index = 0;
8451 	if ((lbp->lb_flags & MDDB_MNSET) || (flags & MDDB_MULTINODE)) {
8452 		/*
8453 		 * If a MN diskset, need to find the index where the new
8454 		 * locator information is to be stored in the mnsidelocator
8455 		 * field of the locator block so that the locator name can
8456 		 * be stored at the same array index in the mnsuffixes
8457 		 * field of the locator names structure.
8458 		 */
8459 		lbp->lb_flags |= MDDB_MNSET;
8460 		if ((index = checklocator(lbp, li, s->s_sideno)) == -1) {
8461 			if (use_devid)
8462 				ddi_devid_free((ddi_devid_t)(uintptr_t)clp->
8463 				    l_devid);
8464 			lp->l_flags = old_flags;
8465 			lbp->lb_loccnt--;
8466 			mddb_devclose(md_expldev(clp->l_dev));
8467 			single_thread_end(s);
8468 			mddb_setexit(s);
8469 			return (mdmddberror(ep, MDE_DB_TOOSMALL,
8470 			    NODEV32, setno));
8471 		}
8472 	}
8473 	/*
8474 	 * Store the locator name before the sidelocator information
8475 	 * in case a panic occurs between these 2 steps.  Must have
8476 	 * the locator name information in order to print reasonable
8477 	 * error information.
8478 	 */
8479 	if (splitname2locatorblock(&cp->c_devname, lnp, li,
8480 	    s->s_sideno, index)) {
8481 		if (use_devid)
8482 			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8483 		lp->l_flags = old_flags;
8484 		lbp->lb_loccnt--;
8485 		mddb_devclose(md_expldev(clp->l_dev));
8486 		single_thread_end(s);
8487 		mddb_setexit(s);
8488 		return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno));
8489 	}
8490 
8491 	/*
8492 	 * Compute free blocks in replica before calling cfgloc2locator
8493 	 * since cfgloc2locator may attempt to alloc an unused block
8494 	 * to store the device id.
8495 	 * mbiarray needs to be setup before calling computefreeblks.
8496 	 */
8497 	s->s_mbiarray[li] = mbip;
8498 	computefreeblks(s);
8499 
8500 	if (cfgloc2locator(lbp, clp, li, s->s_sideno, index)) {
8501 		if (use_devid)
8502 			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8503 		lp->l_flags = old_flags;
8504 		lbp->lb_loccnt--;
8505 		s->s_mbiarray[li] = 0;
8506 		mddb_devclose(md_expldev(clp->l_dev));
8507 		single_thread_end(s);
8508 		mddb_setexit(s);
8509 		return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno));
8510 	}
8511 
8512 	if (use_devid)
8513 		ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8514 
8515 	uniqtime32(&lbp->lb_timestamp);
8516 	lp->l_flags = MDDB_F_ACTIVE;
8517 
8518 	/* write db copy to new device */
8519 	err = writecopy(s, li, MDDB_WRITECOPY_ALL);
8520 	lp->l_flags |= MDDB_F_UP2DATE;
8521 
8522 	/* write new locator names to all devices */
8523 	uniqtime32(&lnp->ln_timestamp);
8524 	if (lbp->lb_flags & MDDB_MNSET)
8525 		lnp->ln_revision = MDDB_REV_MNLN;
8526 	else
8527 		lnp->ln_revision = MDDB_REV_LN;
8528 	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
8529 	err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
8530 	    lbp->lb_lnblkcnt, 0);
8531 	/*
8532 	 * If a MN diskset and this is the master, set the PARSE_LOCNM
8533 	 * flag in the mddb_set structure to show that the locator
8534 	 * names have changed.
8535 	 */
8536 
8537 	if ((lbp->lb_flags & MDDB_MNSET) &&
8538 	    (md_set[s->s_setno].s_am_i_master)) {
8539 		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
8540 	}
8541 	if (err) {
8542 		if (writeretry(s)) {
8543 			single_thread_end(s);
8544 			mddb_setexit(s);
8545 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8546 		}
8547 	}
8548 
8549 	/* Data tags not supported on MN sets */
8550 	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
8551 	    (!(lbp->lb_flags & MDDB_MNSET)) &&
8552 	    setno != MD_LOCAL_SET)
8553 		if (set_dtag(s, ep))
8554 			mdclrerror(ep);
8555 
8556 	/* Write data tags to all accessible devices */
8557 	/* Data tags not supported on MN sets */
8558 	if (!(lbp->lb_flags & MDDB_MNSET)) {
8559 		(void) dt_write(s);
8560 	}
8561 
8562 	/* write new locator to all devices */
8563 	err = writelocall(s);
8564 
8565 	(void) upd_med(s, "newdev(0)");
8566 
8567 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_REPLICA, setno,
8568 	    md_expldev(clp->l_dev));
8569 
8570 	computefreeblks(s); /* recompute always it may be smaller */
8571 	if (err) {
8572 		if (writeretry(s)) {
8573 			single_thread_end(s);
8574 			mddb_setexit(s);
8575 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8576 		}
8577 	}
8578 
8579 	single_thread_end(s);
8580 	mddb_setexit(s);
8581 
8582 	return (0);
8583 }
8584 
8585 #ifdef DEBUG
8586 static void
8587 mddb_check_set(
8588 	set_t	setno
8589 )
8590 {
8591 	mddb_set_t	*s;
8592 	mddb_db_t	*dbp;
8593 	mddb_de_ic_t	*dep;
8594 	mddb_rb32_t	*rbp;
8595 
8596 	if (! md_set[setno].s_db)
8597 		return;
8598 
8599 	s = (mddb_set_t *)md_set[setno].s_db;
8600 
8601 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8602 		for (dep = dbp->db_firstentry;
8603 		    dep != NULL; dep = dep->de_next) {
8604 			rbp = dep->de_rb;
8605 			ASSERT(rbp->rb_magic == MDDB_MAGIC_RB);
8606 			if (dep->de_rb_userdata)
8607 				ASSERT((uintptr_t)dep->de_rb_userdata > 2000);
8608 		}
8609 	}
8610 }
8611 #endif /* DEBUG */
8612 
8613 /*
8614  * Exported Entry Points
8615  */
8616 #ifdef DEBUG
8617 void
8618 mddb_check(void)
8619 {
8620 	int	i;
8621 
8622 	for (i = 0; i < md_nsets; i++) {
8623 		if (! md_set[i].s_db)
8624 			return;
8625 
8626 		mddb_check_set(i);
8627 	}
8628 
8629 }
8630 #endif /* DEBUG */
8631 
8632 int
8633 mddb_configure(
8634 	mddb_cfgcmd_t	command,
8635 	mddb_config_t	*cp
8636 )
8637 {
8638 	mddb_set_t	*s;
8639 	md_error_t	*ep = &cp->c_mde;
8640 	int		flag = 0;
8641 	int		err = 0;
8642 	set_t		setno = cp->c_setno;
8643 
8644 	mdclrerror(ep);
8645 
8646 	switch (command) {
8647 		case MDDB_NEWDEV:
8648 			err = newdev(cp, command, ep);
8649 			break;
8650 
8651 		case MDDB_NEWSIDE:
8652 		case MDDB_DELSIDE:
8653 			err = delnewside(cp, command, ep);
8654 			break;
8655 
8656 		case MDDB_GETDEV:
8657 		case MDDB_DELDEV:
8658 		case MDDB_ENDDEV:
8659 			err = getdeldev(cp, command, ep);
8660 			break;
8661 
8662 		case MDDB_GETDRVRNAME:
8663 			err = getdriver(&cp->c_locator);
8664 			break;
8665 
8666 		case MDDB_USEDEV:
8667 			/*
8668 			 * Note: must allow USEDEV ioctl during upgrade to
8669 			 * support auto-take disksets.
8670 			 *
8671 			 * Also during the set import if the md_devid_destroy
8672 			 * flag is set then error out
8673 			 */
8674 
8675 			if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy)
8676 				return (mdmderror(ep, MDE_INVAL_UNIT,
8677 				    MD_ADM_MINOR));
8678 
8679 			if (setno >= md_nsets)
8680 				return (mdmderror(ep, MDE_INVAL_UNIT,
8681 				    MD_ADM_MINOR));
8682 
8683 			if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) ==
8684 			    NULL) {
8685 				if ((s = init_set(cp, MDDB_NOINIT, &err)) ==
8686 				    NULL) {
8687 					err = mddbstatus2error(ep, err,
8688 					    NODEV32, setno);
8689 					break;
8690 				}
8691 			}
8692 			if (setno == MD_LOCAL_SET)
8693 				flag = MDDB_F_IOCTL;
8694 			if (cp->c_locator.l_old_devid) {
8695 				md_set_setstatus(setno,
8696 				    MD_SET_REPLICATED_IMPORT);
8697 			}
8698 			err = ridev(&s->s_rip, &cp->c_locator, NULL, flag);
8699 			mddb_setexit(s);
8700 			break;
8701 
8702 		case MDDB_RELEASESET:
8703 			mutex_enter(&mddb_lock);
8704 			mddb_unload_set(cp->c_setno);
8705 			mutex_exit(&mddb_lock);
8706 			break;
8707 
8708 		case MDDB_SETDID:
8709 			err = setdid(cp);
8710 			break;
8711 
8712 		default:
8713 			err = mdmddberror(ep, MDE_DB_INVALID, NODEV32,
8714 			    cp->c_setno);
8715 	}
8716 
8717 	return (err);
8718 }
8719 
8720 int
8721 mddb_getoptloc(
8722 	mddb_optloc_t		*ol
8723 )
8724 {
8725 	mddb_set_t		*s;
8726 	mddb_db_t		*dbp;
8727 	mddb_de_ic_t		*dep;
8728 	mddb_recid_t		id;
8729 	set_t			setno;
8730 
8731 	ol->li[0] = -1;
8732 	ol->li[1] = -1;
8733 
8734 	id = ol->recid;
8735 	setno = DBSET(id);
8736 	if (setno >= md_nsets)
8737 		return (EINVAL);
8738 
8739 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL)) == NULL)
8740 		return (0);
8741 
8742 	id = DBID(id);
8743 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8744 		for (dep = dbp->db_firstentry;
8745 		    dep != NULL; dep = dep->de_next) {
8746 			if (dep->de_recid != id)
8747 				continue;
8748 			ol->li[0] = dep->de_optinfo[0].o_li;
8749 			ol->li[1] = dep->de_optinfo[1].o_li;
8750 			mddb_setexit(s);
8751 			return (0);
8752 		}
8753 	}
8754 	mddb_setexit(s);
8755 	return (0);
8756 }
8757 
8758 void
8759 mddb_init(void)
8760 {
8761 	mddb_set_t	*s;
8762 
8763 	mutex_init(&mddb_lock, NULL, MUTEX_DEFAULT, NULL);
8764 	if ((s = init_set(NULL, MDDB_NOINIT, NULL)) != NULL)
8765 		mddb_setexit(s);
8766 }
8767 
8768 
8769 void
8770 mddb_unload(void)
8771 {
8772 	int	i;
8773 
8774 	mutex_enter(&mddb_lock);
8775 
8776 	for (i = 0; i < md_nsets; i++) {
8777 		md_clr_setstatus(i, MD_SET_KEEPTAG);
8778 		mddb_unload_set(i);
8779 	}
8780 
8781 	crcfreetab();
8782 
8783 	mutex_exit(&mddb_lock);
8784 }
8785 
8786 mddb_recid_t
8787 mddb_createrec(
8788 	size_t		usersize,	 /* size of db record */
8789 	mddb_type_t	type,		 /* type1 of db record */
8790 	uint_t		type2,		 /* type2 of db record */
8791 	md_create_rec_option_t	options, /* options for this creation  */
8792 	set_t		setno		 /* set number to create record in */
8793 )
8794 {
8795 	mddb_set_t	*s;
8796 	mddb_db_t	*dbp, *prevdbp, *newdbp;
8797 	mddb_db32_t	*db32p;
8798 	mddb_de_ic_t	*dep;
8799 	/* LINTED variable unused - used for sizeof calculations */
8800 	mddb_de32_t	*de32p;
8801 	mddb_rb32_t	*rbp;
8802 	size_t		recsize;
8803 	ulong_t		blkcnt;
8804 	ulong_t		maxblocks;
8805 	size_t		desize, desize_ic;
8806 	size_t		used;
8807 	mddb_recid_t	newid;
8808 	caddr_t		tmppnt;
8809 	int		i, err = 0;
8810 	void		*userdata;
8811 	uint_t		flag_type;
8812 
8813 #if defined(_ILP32) && !defined(lint)
8814 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
8815 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
8816 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
8817 #endif
8818 
8819 	/*
8820 	 * everyone is supposed to sepcify if it's a
8821 	 * 32 bit or a 64 bit record
8822 	 */
8823 	if ((options &(MD_CRO_32BIT|MD_CRO_64BIT)) == 0) {
8824 		return (MDDB_E_INVALID);
8825 	}
8826 
8827 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
8828 		return (err);
8829 
8830 	if (checkstate(s, MDDB_PROBE)) {
8831 		mddb_setexit(s);
8832 		return (MDDB_E_NOTNOW);
8833 	}
8834 
8835 	recsize = roundup((sizeof (*rbp) - sizeof (rbp->rb_data)) +
8836 	    usersize, MDDB_BSIZE);
8837 	blkcnt = btodb(recsize);
8838 
8839 	if (mddb_maxblocks)
8840 		maxblocks = mddb_maxblocks;
8841 	else
8842 		maxblocks = (MDDB_BSIZE - (sizeof (*db32p) + sizeof (*de32p) -
8843 		    sizeof (de32p->de32_blks))) / sizeof (mddb_block_t);
8844 
8845 	if (blkcnt > maxblocks) {
8846 		mddb_setexit(s);
8847 		return (MDDB_E_INVALID);
8848 	}
8849 	/*
8850 	 * allocate record block
8851 	 * and new directory block so to avoid sleeping
8852 	 * after starting single_thread
8853 	 */
8854 	rbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
8855 	if ((options & MD_CRO_OPTIMIZE) == 0)
8856 		userdata = kmem_zalloc(usersize, KM_SLEEP);
8857 	newdbp = (mddb_db_t *)kmem_zalloc(sizeof (*newdbp), KM_SLEEP);
8858 
8859 	/*
8860 	 * if this is the largest record allocate new buffer for
8861 	 * checkcopy();
8862 	 */
8863 	if (recsize > s->s_databuffer_size) {
8864 		tmppnt = (caddr_t)kmem_zalloc(recsize, KM_SLEEP);
8865 		/*
8866 		 * this test is incase when to sleep during kmem_alloc
8867 		 * and some other task bumped max record size
8868 		 */
8869 		if (recsize > s->s_databuffer_size) {
8870 			if (s->s_databuffer_size)
8871 				kmem_free(s->s_databuffer,
8872 				    s->s_databuffer_size);
8873 			s->s_databuffer = tmppnt;
8874 			s->s_databuffer_size = recsize;
8875 		} else {
8876 			kmem_free(tmppnt, recsize);
8877 		}
8878 	}
8879 
8880 	single_thread_start(s);
8881 
8882 	newid = 0;
8883 	do {
8884 		newid++;
8885 		if (DBID(newid) == 0) {
8886 			kmem_free((caddr_t)newdbp, sizeof (*newdbp));
8887 			kmem_free((caddr_t)rbp, ((size_t)recsize));
8888 			if ((options & MD_CRO_OPTIMIZE) == 0)
8889 				kmem_free(userdata, usersize);
8890 			single_thread_end(s);
8891 			mddb_setexit(s);
8892 			return (MDDB_E_NOTNOW);
8893 		}
8894 
8895 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8896 			for (dep = dbp->db_firstentry; dep;
8897 			    dep = dep->de_next) {
8898 				if (dep->de_recid == newid)
8899 					break;
8900 			}
8901 			if (dep != NULL)
8902 				break;
8903 		}
8904 	} while (dbp);
8905 
8906 	desize = (sizeof (*de32p) - sizeof (de32p->de32_blks)) +
8907 	    (sizeof (mddb_block_t) * blkcnt);
8908 
8909 	/*
8910 	 * see if a directory block exists which will hold this entry
8911 	 */
8912 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8913 		used = sizeof (*db32p);
8914 		for (dep = dbp->db_firstentry;
8915 		    dep != NULL; dep = dep->de_next) {
8916 			used += sizeof (*de32p) - sizeof (de32p->de32_blks);
8917 			used += sizeof (mddb_block_t) * dep->de_blkcount;
8918 		}
8919 		if ((used + desize) < MDDB_BSIZE)
8920 			break;
8921 	}
8922 	if (dbp) {
8923 		kmem_free((caddr_t)newdbp, sizeof (*newdbp));
8924 		if (blkcnt > s->s_freeblkcnt) {
8925 			kmem_free((caddr_t)rbp, ((size_t)recsize));
8926 			if ((options & MD_CRO_OPTIMIZE) == 0)
8927 				kmem_free(userdata, usersize);
8928 			single_thread_end(s);
8929 			mddb_setexit(s);
8930 			return (MDDB_E_NOSPACE);
8931 		}
8932 		prevdbp = NULL;
8933 	} else {
8934 		/*
8935 		 * need to add directory block
8936 		 */
8937 		if ((blkcnt + 1) > s->s_freeblkcnt) {
8938 			kmem_free((caddr_t)newdbp, sizeof (*newdbp));
8939 			kmem_free((caddr_t)rbp, ((size_t)recsize));
8940 			if ((options & MD_CRO_OPTIMIZE) == 0)
8941 				kmem_free(userdata, usersize);
8942 			single_thread_end(s);
8943 			mddb_setexit(s);
8944 			return (MDDB_E_NOSPACE);
8945 		}
8946 		for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next)
8947 			;
8948 		dbp->db_next = newdbp;
8949 		bzero((caddr_t)dbp->db_next, sizeof (*newdbp));
8950 		dbp->db_nextblk = getfreeblks(s, 1);
8951 		dbp->db_next->db_blknum = dbp->db_nextblk;
8952 		prevdbp = dbp;
8953 		dbp = dbp->db_next;
8954 		dbp->db_nextblk = 0;
8955 		dbp->db_firstentry = NULL;
8956 		dbp->db_recsum = 0;
8957 		dbp->db_magic = MDDB_MAGIC_DB;
8958 	}
8959 	/*
8960 	 * ready to add record
8961 	 */
8962 	desize_ic = (sizeof (*dep) - sizeof (dep->de_blks)) +
8963 	    (sizeof (mddb_block_t) * blkcnt);
8964 	if (dbp->db_firstentry) {
8965 		for (dep = dbp->db_firstentry; dep->de_next; dep = dep->de_next)
8966 			;
8967 		dep->de_next = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
8968 		dep = dep->de_next;
8969 	} else {
8970 		dep = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
8971 		dbp->db_firstentry = dep;
8972 	}
8973 	bzero((caddr_t)dep, desize_ic);
8974 	dep->de_recid = newid;
8975 	/*
8976 	 * Optimized records have an owner node associated with them in
8977 	 * a MN diskset.  The owner is only set on a node that is actively
8978 	 * writing to that record.  The other nodes will show that record
8979 	 * as having an invalid owner.  The owner for an optimized record
8980 	 * is used during fixoptrecord to determine which node should
8981 	 * write out the record when the replicas associated with that
8982 	 * optimized record have been changed.
8983 	 */
8984 	if (MD_MNSET_SETNO(s->s_setno)) {
8985 		dep->de_owner_nodeid = MD_MN_INVALID_NID;
8986 	}
8987 	dep->de_type1 =	type;
8988 	dep->de_type2 = type2;
8989 	dep->de_reqsize = usersize;
8990 	dep->de_recsize = recsize;
8991 	dep->de_blkcount = blkcnt;
8992 	flag_type = options &
8993 	    (MD_CRO_OPTIMIZE | MD_CRO_STRIPE | MD_CRO_MIRROR | MD_CRO_RAID |
8994 	    MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG |
8995 	    MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG);
8996 	switch (flag_type) {
8997 	case MD_CRO_OPTIMIZE:
8998 		dep->de_flags = MDDB_F_OPT;
8999 		getoptdev(s, dep, 0);
9000 		getoptdev(s, dep, 1);
9001 		break;
9002 	case MD_CRO_STRIPE:
9003 		dep->de_flags = MDDB_F_STRIPE;
9004 		break;
9005 	case MD_CRO_MIRROR:
9006 		dep->de_flags = MDDB_F_MIRROR;
9007 		break;
9008 	case MD_CRO_RAID:
9009 		dep->de_flags = MDDB_F_RAID;
9010 		break;
9011 	case MD_CRO_SOFTPART:
9012 		dep->de_flags = MDDB_F_SOFTPART;
9013 		break;
9014 	case MD_CRO_TRANS_MASTER:
9015 		dep->de_flags = MDDB_F_TRANS_MASTER;
9016 		break;
9017 	case MD_CRO_TRANS_LOG:
9018 		dep->de_flags = MDDB_F_TRANS_LOG;
9019 		break;
9020 	case MD_CRO_HOTSPARE:
9021 		dep->de_flags = MDDB_F_HOTSPARE;
9022 		break;
9023 	case MD_CRO_HOTSPARE_POOL:
9024 		dep->de_flags = MDDB_F_HOTSPARE_POOL;
9025 		break;
9026 	case MD_CRO_CHANGELOG:
9027 		dep->de_flags = MDDB_F_CHANGELOG;
9028 		break;
9029 	}
9030 	/*
9031 	 * try to get all blocks consecutive. If not possible
9032 	 * just get them one at a time
9033 	 */
9034 	dep->de_blks[0] = getfreeblks(s, blkcnt);
9035 	if (dep->de_blks[0]) {
9036 		for (i = 1; i < blkcnt; i++)
9037 			dep->de_blks[i] = dep->de_blks[0] + i;
9038 	} else {
9039 		for (i = 0; i < blkcnt;	 i++)
9040 			dep->de_blks[i] = getfreeblks(s, 1);
9041 	}
9042 	dep->de_rb = rbp;
9043 	bzero((caddr_t)rbp, recsize);
9044 	rbp->rb_magic = MDDB_MAGIC_RB;
9045 
9046 	/* Do we have to create an old style (32 bit) record?  */
9047 	if (options & MD_CRO_32BIT) {
9048 		if (options & MD_CRO_FN)
9049 			rbp->rb_revision = MDDB_REV_RBFN;
9050 		else
9051 			rbp->rb_revision = MDDB_REV_RB;
9052 	} else {
9053 		if (options & MD_CRO_FN)
9054 			rbp->rb_revision = MDDB_REV_RB64FN;
9055 		else
9056 			rbp->rb_revision = MDDB_REV_RB64;
9057 	}
9058 
9059 	/* set de_rb_userdata for non optimization records */
9060 	if ((options & MD_CRO_OPTIMIZE) == 0) {
9061 		dep->de_rb_userdata = userdata;
9062 	}
9063 
9064 	uniqtime32(&rbp->rb_timestamp);
9065 	/* Generate the crc for this record */
9066 	rec_crcgen(s, dep, rbp);
9067 	tmppnt = (caddr_t)rbp;
9068 	/*
9069 	 * the following code writes new records to all instances of
9070 	 * the data base. Writing one block at a time to each instance
9071 	 * is safe because they are not yet in a directory entry which
9072 	 * has been written to the data base
9073 	 */
9074 	err = 0;
9075 	if ((options & MD_CRO_OPTIMIZE) == 0) {
9076 		for (i = 0; i < blkcnt;	 i++) {
9077 			err |= writeall(s, (caddr_t)tmppnt,
9078 			    dep->de_blks[i], 1, 0);
9079 			tmppnt += MDDB_BSIZE;
9080 		}
9081 	} else {
9082 		if ((MD_MNSET_SETNO(s->s_setno)) &&
9083 		    md_set[s->s_setno].s_am_i_master) {
9084 		/*
9085 		 * If a MN diskset then only master writes out newly
9086 		 * created optimized record.
9087 		 */
9088 			err |= writeoptrecord(s, dep);
9089 		}
9090 	}
9091 	uniqtime32(&dbp->db_timestamp);
9092 	dbp->db_revision = MDDB_REV_DB;
9093 	/* Don't include opt resync and change log records in global XOR */
9094 	if (!(dep->de_flags & MDDB_F_OPT) &&
9095 	    !(dep->de_flags & MDDB_F_CHANGELOG))
9096 		dbp->db_recsum ^= rbp->rb_checksum;
9097 	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
9098 	create_db32rec(db32p, dbp);
9099 	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
9100 	err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0);
9101 	if (prevdbp) {
9102 		dbp = prevdbp;
9103 		uniqtime32(&dbp->db_timestamp);
9104 		dbp->db_revision = MDDB_REV_DB;
9105 		create_db32rec(db32p, dbp);
9106 		crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
9107 		err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0);
9108 	}
9109 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
9110 	if (err) {
9111 		if (writeretry(s)) {
9112 			s->s_zombie = newid;
9113 			single_thread_end(s);
9114 			mddb_setexit(s);
9115 			return (MDDB_E_NOTNOW);
9116 		}
9117 	}
9118 	single_thread_end(s);
9119 	mddb_setexit(s);
9120 
9121 	ASSERT((newid & MDDB_SETMASK) == 0);
9122 	return (MAKERECID(setno, newid));
9123 }
9124 
9125 int
9126 mddb_deleterec(
9127 	mddb_recid_t	id
9128 )
9129 {
9130 	mddb_set_t	*s;
9131 	mddb_db_t	*dbp;
9132 	mddb_db32_t	*db32p;
9133 	mddb_de_ic_t	*dep, *dep1;
9134 	int		i;
9135 
9136 #if defined(_ILP32) && !defined(lint)
9137 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
9138 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
9139 #endif
9140 
9141 	s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL);
9142 	ASSERT(s != NULL);
9143 
9144 	id = DBID(id);
9145 	if (checkstate(s, MDDB_PROBE)) {
9146 		mddb_setexit(s);
9147 		return (MDDB_E_NOTNOW);
9148 	}
9149 
9150 	ASSERT(s->s_lbp != NULL);
9151 	single_thread_start(s);
9152 
9153 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9154 		dep1 = NULL;
9155 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
9156 			if (dep->de_recid == id)
9157 				break;
9158 			dep1 = dep;
9159 		}
9160 		if (dep != NULL)
9161 			break;
9162 	}
9163 	/*
9164 	 * no such record
9165 	 */
9166 	if (dep == NULL) {
9167 		single_thread_end(s);
9168 		ASSERT(s->s_staledeletes != 0);
9169 		s->s_staledeletes--;
9170 		mddb_setexit(s);
9171 		return (0);
9172 	}
9173 
9174 	if (!(dep->de_flags & MDDB_F_OPT) &&
9175 	    !(dep->de_flags & MDDB_F_CHANGELOG)) {
9176 		dbp->db_recsum ^= dep->de_rb->rb_checksum;
9177 		dbp->db_recsum ^= dep->de_rb->rb_checksum_fiddle;
9178 	}
9179 
9180 	if (dep->de_rb_userdata != NULL) {
9181 		if (dep->de_icreqsize)
9182 			kmem_free(dep->de_rb_userdata_ic, dep->de_icreqsize);
9183 		else
9184 			kmem_free(dep->de_rb_userdata, dep->de_reqsize);
9185 	}
9186 
9187 	kmem_free((caddr_t)dep->de_rb, dep->de_recsize);
9188 
9189 	for (i = 0; i < dep->de_blkcount; i++)
9190 		blkfree(s, dep->de_blks[i]);
9191 	if (dep1)
9192 		dep1->de_next = dep->de_next;
9193 	else
9194 		dbp->db_firstentry = dep->de_next;
9195 
9196 	kmem_free(dep, sizeofde(dep));
9197 
9198 	uniqtime32(&dbp->db_timestamp);
9199 	dbp->db_revision = MDDB_REV_DB;
9200 	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
9201 	create_db32rec(db32p, dbp);
9202 	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
9203 	if (writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0)) {
9204 		if (writeretry(s)) {
9205 			/*
9206 			 * staledelete is used to mark deletes which failed.
9207 			 * its only use is to not panic when the user retries
9208 			 * the delete once the database is active again
9209 			 */
9210 			single_thread_end(s);
9211 			s->s_staledeletes++;
9212 			kmem_free((caddr_t)db32p, MDDB_BSIZE);
9213 			mddb_setexit(s);
9214 			return (MDDB_E_NOTNOW);
9215 		}
9216 	}
9217 	single_thread_end(s);
9218 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
9219 	mddb_setexit(s);
9220 	return (0);
9221 }
9222 
9223 mddb_recid_t
9224 mddb_getnextrec(
9225 	mddb_recid_t		id,
9226 	mddb_type_t		typ,
9227 	uint_t			type2
9228 )
9229 {
9230 	mddb_set_t		*s;
9231 	mddb_db_t		*dbp;
9232 	mddb_de_ic_t		*dep;
9233 	int			searching, err;
9234 	set_t			setno;
9235 
9236 	setno = DBSET(id);
9237 	id = DBID(id);
9238 	searching = id;
9239 
9240 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
9241 		return (err);
9242 
9243 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9244 		for (dep = dbp->db_firstentry;
9245 		    dep != NULL; dep = dep->de_next) {
9246 			if (searching) {
9247 				if (dep->de_recid == id)
9248 					searching = 0;
9249 			} else {
9250 				if ((typ == MDDB_ALL || dep->de_type1 == typ) &&
9251 				    (type2 == 0 || dep->de_type2 == type2)) {
9252 					id = dep->de_recid;
9253 					mddb_setexit(s);
9254 					ASSERT((id & MDDB_SETMASK) == 0);
9255 					return (MAKERECID(setno, id));
9256 				}
9257 			}
9258 		}
9259 	}
9260 
9261 	mddb_setexit(s);
9262 
9263 	if (searching)
9264 		return (MDDB_E_NORECORD);
9265 	return (0);
9266 }
9267 
9268 void *
9269 mddb_getrecaddr(
9270 	mddb_recid_t		id
9271 )
9272 {
9273 	mddb_set_t		*s;
9274 	mddb_db_t		*dbp;
9275 	mddb_de_ic_t		*dep;
9276 	void			*rval;
9277 
9278 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9279 		return (NULL);
9280 
9281 	id = DBID(id);
9282 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9283 		for (dep = dbp->db_firstentry;
9284 		    dep != NULL; dep = dep->de_next) {
9285 			if (dep->de_recid != id)
9286 				continue;
9287 			if (dep->de_rb_userdata)
9288 				rval = (void *)dep->de_rb_userdata;
9289 			else
9290 				rval = (void *)dep->de_rb->rb_data;
9291 			mddb_setexit(s);
9292 			return (rval);
9293 		}
9294 	}
9295 
9296 	mddb_setexit(s);
9297 	return (NULL);
9298 }
9299 
9300 
9301 mddb_de_ic_t *
9302 mddb_getrecdep(
9303 	mddb_recid_t		id
9304 )
9305 {
9306 	mddb_set_t		*s;
9307 	mddb_db_t		*dbp;
9308 	mddb_de_ic_t		*dep;
9309 
9310 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9311 		return (NULL);
9312 
9313 	id = DBID(id);
9314 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9315 		for (dep = dbp->db_firstentry;
9316 		    dep != NULL; dep = dep->de_next) {
9317 			if (dep->de_recid != id)
9318 				continue;
9319 			mddb_setexit(s);
9320 			return (dep);
9321 		}
9322 	}
9323 
9324 	mddb_setexit(s);
9325 	return (NULL);
9326 }
9327 
9328 void *
9329 mddb_getrecaddr_resize(
9330 	mddb_recid_t		id,
9331 	size_t			icsize,
9332 	off_t			off
9333 )
9334 {
9335 	mddb_set_t		*s;
9336 	mddb_db_t		*dbp;
9337 	mddb_de_ic_t		*dep;
9338 	void			*rval = NULL;
9339 
9340 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9341 		return (NULL);
9342 
9343 	id = DBID(id);
9344 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9345 		for (dep = dbp->db_firstentry;
9346 		    dep != NULL; dep = dep->de_next) {
9347 			if (dep->de_recid != id)
9348 				continue;
9349 			if (dep->de_rb_userdata)
9350 				rval = (void *)dep->de_rb_userdata;
9351 			else
9352 				rval = (void *)dep->de_rb->rb_data;
9353 			break;
9354 		}
9355 		if (rval != NULL)
9356 			break;
9357 	}
9358 
9359 	if (rval == NULL) {
9360 		mddb_setexit(s);
9361 		return (NULL);
9362 	}
9363 
9364 	if (dep->de_rb_userdata) {
9365 		caddr_t nud;
9366 
9367 		if (dep->de_icreqsize || (dep->de_reqsize >= icsize)) {
9368 			mddb_setexit(s);
9369 			return (rval);
9370 		}
9371 		ASSERT((dep->de_reqsize + off) <= icsize);
9372 		nud = kmem_zalloc(icsize, KM_SLEEP);
9373 		bcopy(dep->de_rb_userdata, nud + off, dep->de_reqsize);
9374 		kmem_free(dep->de_rb_userdata, dep->de_reqsize);
9375 		dep->de_rb_userdata = nud + off;
9376 		dep->de_rb_userdata_ic = nud;
9377 		dep->de_icreqsize = icsize;
9378 		rval = nud;
9379 	} else {
9380 		size_t recsize;
9381 		/* LINTED variable unused - used for sizeof calculations */
9382 		mddb_rb32_t *nrbp;
9383 
9384 		recsize = roundup((sizeof (*nrbp) - sizeof (nrbp->rb_data)) +
9385 		    icsize, MDDB_BSIZE);
9386 		if (dep->de_recsize < recsize)
9387 			cmn_err(CE_PANIC, "mddb_getrecaddr_resize: only "
9388 			    "nonoptimized records can be resized\n");
9389 	}
9390 
9391 	mddb_setexit(s);
9392 	return (rval);
9393 }
9394 
9395 int
9396 mddb_getrecprivate(
9397 	mddb_recid_t		id
9398 )
9399 {
9400 	mddb_set_t		*s;
9401 	mddb_db_t		*dbp;
9402 	mddb_de_ic_t		*dep;
9403 	int			err = 0;
9404 	int			private;
9405 
9406 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9407 		return (err);
9408 
9409 	id = DBID(id);
9410 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9411 		for (dep = dbp->db_firstentry;
9412 		    dep != NULL; dep = dep->de_next) {
9413 			if (dep->de_recid != id)
9414 				continue;
9415 			private = (int)dep->de_rb->rb_private;
9416 			mddb_setexit(s);
9417 			return (private);
9418 		}
9419 	}
9420 
9421 	mddb_setexit(s);
9422 	return (MDDB_E_NORECORD);
9423 }
9424 
9425 void
9426 mddb_setrecprivate(
9427 	mddb_recid_t		id,
9428 	uint_t			private
9429 )
9430 {
9431 	mddb_set_t		*s;
9432 	mddb_db_t		*dbp;
9433 	mddb_de_ic_t		*dep;
9434 
9435 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) {
9436 		ASSERT(0);
9437 		return;
9438 	}
9439 
9440 	id = DBID(id);
9441 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9442 		for (dep = dbp->db_firstentry;
9443 		    dep != NULL; dep = dep->de_next) {
9444 			if (dep->de_recid != id)
9445 				continue;
9446 			dep->de_rb->rb_private = private;
9447 			mddb_setexit(s);
9448 			return;
9449 		}
9450 	}
9451 
9452 	mddb_setexit(s);
9453 	ASSERT(0);
9454 }
9455 
9456 mddb_type_t
9457 mddb_getrectype1(
9458 	mddb_recid_t		id
9459 )
9460 {
9461 	mddb_set_t		*s;
9462 	mddb_db_t		*dbp;
9463 	mddb_de_ic_t		*dep;
9464 	int			err = 0;
9465 	mddb_type_t		rval;
9466 
9467 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9468 		return (err);
9469 
9470 	id = DBID(id);
9471 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9472 		for (dep = dbp->db_firstentry;
9473 		    dep != NULL; dep = dep->de_next) {
9474 			if (dep->de_recid != id)
9475 				continue;
9476 			rval = dep->de_type1;
9477 			mddb_setexit(s);
9478 			return (rval);
9479 		}
9480 	}
9481 
9482 	mddb_setexit(s);
9483 	return (MDDB_E_NORECORD);
9484 }
9485 
9486 int
9487 mddb_getrectype2(
9488 	mddb_recid_t		id
9489 )
9490 {
9491 	mddb_set_t		*s;
9492 	mddb_db_t		*dbp;
9493 	mddb_de_ic_t		*dep;
9494 	int			err = 0;
9495 	int			rval;
9496 
9497 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9498 		return (err);
9499 
9500 	id = DBID(id);
9501 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9502 		for (dep = dbp->db_firstentry;
9503 		    dep != NULL; dep = dep->de_next) {
9504 			if (dep->de_recid != id)
9505 				continue;
9506 			rval = (int)dep->de_type2;
9507 			mddb_setexit(s);
9508 			return (rval);
9509 		}
9510 	}
9511 
9512 	mddb_setexit(s);
9513 	return (MDDB_E_NORECORD);
9514 }
9515 
9516 int
9517 mddb_getrecsize(
9518 	mddb_recid_t		id
9519 )
9520 {
9521 	mddb_set_t		*s;
9522 	mddb_db_t		*dbp;
9523 	mddb_de_ic_t		*dep;
9524 	int			err = 0;
9525 	int			rval;
9526 
9527 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9528 		return (err);
9529 
9530 	id = DBID(id);
9531 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9532 		for (dep = dbp->db_firstentry;
9533 		    dep != NULL; dep = dep->de_next) {
9534 			if (dep->de_recid != id)
9535 				continue;
9536 			rval = (int)dep->de_reqsize;
9537 			mddb_setexit(s);
9538 			return (rval);
9539 		}
9540 	}
9541 
9542 	mddb_setexit(s);
9543 	return (MDDB_E_NORECORD);
9544 }
9545 
9546 
9547 mddb_recstatus_t
9548 mddb_getrecstatus(
9549 	mddb_recid_t		id
9550 )
9551 {
9552 	mddb_set_t		*s;
9553 	mddb_db_t		*dbp;
9554 	mddb_de_ic_t		*dep;
9555 	int			err = 0;
9556 	mddb_recstatus_t	e_err;
9557 
9558 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9559 		return ((mddb_recstatus_t)err);
9560 
9561 	id = DBID(id);
9562 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9563 		for (dep = dbp->db_firstentry;
9564 		    dep != NULL; dep = dep->de_next) {
9565 			if (dep->de_recid == id)
9566 				break;
9567 		}
9568 		if (dep)
9569 			break;
9570 	}
9571 
9572 	e_err = MDDB_OK;
9573 
9574 	if (! dep)
9575 		e_err = MDDB_NORECORD;
9576 	else if (! dep->de_rb->rb_commitcnt)
9577 		e_err = MDDB_NODATA;
9578 	else if (md_get_setstatus(s->s_setno) & MD_SET_STALE)
9579 		e_err = MDDB_STALE;
9580 
9581 	mddb_setexit(s);
9582 	return (e_err);
9583 }
9584 
/*
 * Value from which mddb_commitrec() initializes its local retry counter.
 */
static int	mddb_commitrec_retries = 5;
9586 
9587 /*
9588  * Commit given record to disk.
9589  * If committing an optimized record, do not call
9590  * with md ioctl lock held.
9591  */
9592 int
9593 mddb_commitrec(
9594 	mddb_recid_t	id
9595 )
9596 {
9597 	mddb_set_t			*s;
9598 	mddb_db_t			*dbp;
9599 	mddb_de_ic_t			*dep;
9600 	mddb_recid_t			ids[2];
9601 	mddb_rb32_t			*rbp;
9602 	static int			err = 0;
9603 	md_mn_msg_mddb_optrecerr_t	*msg_recerr;
9604 	md_mn_kresult_t			*kres;
9605 	mddb_lb_t			*lbp;
9606 	mddb_mnlb_t			*mnlbp;
9607 	mddb_locator_t			*lp;
9608 	mddb_mnsidelocator_t		*mnslp;
9609 	mddb_drvnm_t			*dn;
9610 	int				li;
9611 	md_replica_recerr_t		*recerr;
9612 	int				i, j;
9613 	int				rval;
9614 	int				hit_err = 0;
9615 	int				retry = mddb_commitrec_retries;
9616 	int				gave_up = 0;
9617 
9618 	s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL);
9619 	ASSERT(s != NULL);
9620 
9621 	if (checkstate(s, MDDB_PROBE)) {
9622 		mddb_setexit(s);
9623 		return (MDDB_E_NOTNOW);
9624 	}
9625 
9626 	if (DBID(id) == 0) {
9627 		mddb_setexit(s);
9628 		return (0);
9629 	}
9630 
9631 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9632 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
9633 			if (dep->de_recid == DBID(id))
9634 				break;
9635 		}
9636 		if (dep)
9637 			break;
9638 	}
9639 
9640 	if (dep == NULL) {
9641 		mddb_setexit(s);
9642 		return (MDDB_E_NORECORD);
9643 	}
9644 
9645 	if (! (dep->de_flags & MDDB_F_OPT)) {
9646 		ids[0] = id;
9647 		ids[1] = 0;
9648 		mddb_setexit(s);
9649 		return (mddb_commitrecs(ids));
9650 	}
9651 
9652 	/*
9653 	 * following code allows multiple processes to be doing
9654 	 * optimization commits in parallel.
9655 	 * NOTE: if lots of optimization commits then the lock
9656 	 * will not get released until it winds down
9657 	 */
9658 	if (s->s_optwaiterr) {
9659 		while (s->s_optwaiterr) {
9660 			s->s_opthungerr = 1;
9661 			cv_wait(&s->s_opthungerr_cv, SETMUTEX(s->s_setno));
9662 		}
9663 		if (checkstate(s, MDDB_PROBE)) {
9664 			mddb_setexit(s);
9665 			return (MDDB_E_NOTNOW);
9666 		}
9667 	}
9668 	if (s->s_optcmtcnt++ == 0) {
9669 		single_thread_start(s);
9670 		s->s_opthavelck = 1;
9671 		if (s->s_optwantlck) {
9672 			cv_broadcast(&s->s_optwantlck_cv);
9673 			s->s_optwantlck = 0;
9674 		}
9675 	} else {
9676 		while (! s->s_opthavelck) {
9677 			s->s_optwantlck = 1;
9678 			cv_wait(&s->s_optwantlck_cv, SETMUTEX(s->s_setno));
9679 		}
9680 	}
9681 
9682 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9683 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
9684 			if (dep->de_recid == DBID(id))
9685 				break;
9686 		}
9687 		if (dep)
9688 			break;
9689 	}
9690 
9691 	if (dep == NULL) {
9692 		if (! (--s->s_optcmtcnt)) {
9693 			single_thread_end(s);
9694 			s->s_opthavelck = 0;
9695 		}
9696 		mddb_setexit(s);
9697 		return (MDDB_E_NORECORD);
9698 	}
9699 
9700 	rbp = dep->de_rb;
9701 	rbp->rb_commitcnt++;
9702 	uniqtime32(&rbp->rb_timestamp);
9703 	/* Generate the crc for this record */
9704 	rec_crcgen(s, dep, rbp);
9705 
9706 	if (writeoptrecord(s, dep)) {
9707 		if (MD_MNSET_SETNO(s->s_setno)) {
9708 			hit_err = 1;
9709 		}
9710 		s->s_optwaiterr++;
9711 	}
9712 	if (MD_MNSET_SETNO(s->s_setno)) {
9713 		/* If last thread out, release single_thread_start */
9714 		if (! (--s->s_optcmtcnt)) {
9715 			single_thread_end(s);
9716 			s->s_opthavelck = 0;
9717 		}
9718 		/*
9719 		 * If this thread had a writeoptrecords failure, then
9720 		 * need to send message to master.
9721 		 * But, multiple threads could all be running on the
9722 		 * same single_thread_start, so serialize the threads
9723 		 * by making each thread grab single_thread_start.
9724 		 *
9725 		 * After return from sending message to master message,
9726 		 * replicas associated with optimized record will have
9727 		 * been changed (via a callback from the master to all
9728 		 * nodes), so retry call to writeoptrecord.
9729 		 * This code is replacing the call to writeretry that
9730 		 * occurs for the local and traditional disksets.
9731 		 */
9732 		if (hit_err) {
9733 			single_thread_start(s);
9734 			/*
9735 			 * If > 50% of replicas are alive then continue
9736 			 * to send message to master until writeoptrecord
9737 			 * succeeds.  For now, assume that minor name,
9738 			 * major number on this node is the same as on
9739 			 * the master node.  Once devids are turned on
9740 			 * for MN disksets, can send devid.
9741 			 */
9742 			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
9743 			msg_recerr = kmem_zalloc(
9744 			    sizeof (md_mn_msg_mddb_optrecerr_t), KM_SLEEP);
9745 			while (!(md_get_setstatus(s->s_setno) &
9746 			    MD_SET_TOOFEW)) {
9747 				bzero((caddr_t)msg_recerr,
9748 				    sizeof (md_mn_msg_mddb_optrecerr_t));
9749 				lbp = s->s_lbp;
9750 				mnlbp = (mddb_mnlb_t *)lbp;
9751 				for (i = 0; i < 2; i++) {
9752 					li = dep->de_optinfo[i].o_li;
9753 					lp = &lbp->lb_locators[li];
9754 					for (j = 0; j < MD_MNMAXSIDES; j++) {
9755 						mnslp =
9756 						    &mnlbp->
9757 						    lb_mnsidelocators[j][li];
9758 						if (mnslp->mnl_sideno ==
9759 						    s->s_sideno)
9760 							break;
9761 					}
9762 					if (j == MD_MNMAXSIDES)
9763 						continue;
9764 
9765 					dn = &lbp->
9766 					    lb_drvnm[mnslp->mnl_drvnm_index];
9767 					recerr = &msg_recerr->msg_recerr[i];
9768 					recerr->r_li = li;
9769 					recerr->r_flags =
9770 					    dep->de_optinfo[i].o_flags;
9771 					recerr->r_blkno = lp->l_blkno;
9772 					recerr->r_mnum = md_getminor(lp->l_dev);
9773 					(void) strncpy(recerr->r_driver_name,
9774 					    dn->dn_data, MD_MAXDRVNM);
9775 				}
9776 
9777 				/* Release locks */
9778 				single_thread_end(s);
9779 				mutex_exit(SETMUTEX(s->s_setno));
9780 
9781 				/*
9782 				 * Send message to master about optimized
9783 				 * record failure.  After return, master
9784 				 * should have marked failed replicas
9785 				 * and sent parse message to slaves causing
9786 				 * slaves to have fixed up the optimized
9787 				 * record.
9788 				 * On return from ksend_message, retry
9789 				 * the write since this node should have fixed
9790 				 * the optimized resync records it owns.
9791 				 */
9792 				rval = mdmn_ksend_message(s->s_setno,
9793 				    MD_MN_MSG_MDDB_OPTRECERR,
9794 				    MD_MSGF_NO_BCAST, 0,
9795 				    (char *)msg_recerr,
9796 				    sizeof (md_mn_msg_mddb_optrecerr_t),
9797 				    kres);
9798 				if (!MDMN_KSEND_MSG_OK(rval, kres)) {
9799 					cmn_err(CE_WARN, "mddb_commitrec: "
9800 					    "Unable to send optimized "
9801 					    "resync record failure "
9802 					    "message to other nodes in "
9803 					    "diskset %s\n", s->s_setname);
9804 					mdmn_ksend_show_error(rval, kres,
9805 					    "MD_MN_MSG_MDDB_OPTRECERR");
9806 				}
9807 
9808 				/* Regrab locks */
9809 				mutex_enter(SETMUTEX(s->s_setno));
9810 				single_thread_start(s);
9811 
9812 				/* Start over in case mddb changed */
9813 				for (dbp = s->s_dbp; dbp != NULL;
9814 				    dbp = dbp->db_next) {
9815 					for (dep = dbp->db_firstentry; dep;
9816 					    dep = dep->de_next) {
9817 						if (dep->de_recid == DBID(id))
9818 							break;
9819 					}
9820 					if (dep)
9821 						break;
9822 				}
9823 				if (dep) {
9824 					rbp = dep->de_rb;
9825 					rbp->rb_commitcnt++;
9826 					uniqtime32(&rbp->rb_timestamp);
9827 					/* Generate the crc for this record */
9828 					rec_crcgen(s, dep, rbp);
9829 
9830 					/*
9831 					 * If writeoptrecord succeeds, then
9832 					 * break out.
9833 					 */
9834 					if (!(writeoptrecord(s, dep)))
9835 						break;
9836 				}
9837 				if (--retry == 0) {
9838 					cmn_err(CE_WARN, "mddb_commitrec: "
9839 					    "giving up writing optimized "
9840 					    "resync record for "
9841 					    "diskset %s, device %s,%d "
9842 					    "blkno 0x%x, flags 0x%x\n",
9843 					    s->s_setname, recerr->r_driver_name,
9844 					    recerr->r_mnum, recerr->r_blkno,
9845 					    recerr->r_flags);
9846 					gave_up++;
9847 					break;
9848 				}
9849 			}
9850 			kmem_free(kres, sizeof (md_mn_kresult_t));
9851 			kmem_free(msg_recerr,
9852 			    sizeof (md_mn_msg_mddb_optrecerr_t));
9853 
9854 			/* Resync record should be fixed - if possible */
9855 			s->s_optwaiterr--;
9856 			if (s->s_optwaiterr == 0) {
9857 				/* All errors have been handled */
9858 				if (s->s_opthungerr) {
9859 					s->s_opthungerr = 0;
9860 					cv_broadcast(&s->s_opthungerr_cv);
9861 				}
9862 			}
9863 			single_thread_end(s);
9864 			mddb_setexit(s);
9865 			if (md_get_setstatus(s->s_setno) & MD_SET_TOOFEW) {
9866 				return (MDDB_E_NOTNOW);
9867 			} else if (gave_up) {
9868 				return (MDDB_E_STALE);
9869 			} else {
9870 				return (0);
9871 			}
9872 		}
9873 	} else {
9874 		/* If set is a traditional or local set */
9875 		if (! (--s->s_optcmtcnt)) {
9876 			err = 0;
9877 			if (s->s_optwaiterr) {
9878 				err = writeretry(s);
9879 				s->s_optwaiterr = 0;
9880 				if (s->s_opthungerr) {
9881 					s->s_opthungerr = 0;
9882 					cv_broadcast(&s->s_opthungerr_cv);
9883 				}
9884 			}
9885 			single_thread_end(s);
9886 			s->s_opthavelck = 0;
9887 			mddb_setexit(s);
9888 			if (err)
9889 				return (MDDB_E_NOTNOW);
9890 			return (0);
9891 		}
9892 		if (s->s_optwaiterr) {
9893 			while (s->s_optwaiterr) {
9894 				s->s_opthungerr = 1;
9895 				cv_wait(&s->s_opthungerr_cv,
9896 				    SETMUTEX(s->s_setno));
9897 			}
9898 			if (checkstate(s, MDDB_NOPROBE)) {
9899 				mddb_setexit(s);
9900 				return (MDDB_E_NOTNOW);
9901 			}
9902 		}
9903 	}
9904 
9905 	mddb_setexit(s);
9906 	return (0);
9907 }
9908 
9909 int
9910 mddb_commitrecs(
9911 	mddb_recid_t	ids[]
9912 )
9913 {
9914 	mddb_set_t	*s;
9915 	mddb_db_t	*dbp;
9916 	mddb_de_ic_t	*dep;
9917 	mddb_rb32_t	*rbp;
9918 	mddb_rb32_t	*saverbp;
9919 	mddb_lb_t	*lbp;
9920 	int		li;
9921 	uint_t		checksum;
9922 	mddb_recid_t	*idp;
9923 	int		err = 0;
9924 	set_t		setno;
9925 
9926 	if (panicstr)
9927 		cmn_err(CE_PANIC, "md: mddb: commit not allowed");
9928 
9929 	/*
9930 	 * scan through and make sure ids are from the same set
9931 	 */
9932 	setno = DBSET(ids[0]);
9933 	for (idp = ids; *idp != NULL; idp++)
9934 		ASSERT(DBSET(*idp) == setno);
9935 
9936 	s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL);
9937 
9938 	if (checkstate(s, MDDB_PROBE)) {
9939 		mddb_setexit(s);
9940 		return (MDDB_E_NOTNOW);
9941 	}
9942 
9943 	ASSERT(s->s_lbp != NULL);
9944 	err = 0;
9945 
9946 	if (! ids[0]) {
9947 		mddb_setexit(s);
9948 		return (0);
9949 	}
9950 
9951 	single_thread_start(s);
9952 	/*
9953 	 * scan through and make sure ids all exist
9954 	 */
9955 	for (idp = ids; *idp != NULL; idp++) {
9956 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9957 			for (dep = dbp->db_firstentry; dep;
9958 			    dep = dep->de_next) {
9959 				if (dep->de_recid == DBID(*idp))
9960 					break;
9961 			}
9962 			if (dep != NULL)
9963 				break;
9964 		}
9965 		if (dep == NULL) {
9966 			single_thread_end(s);
9967 			mddb_setexit(s);
9968 			return (MDDB_E_NORECORD);
9969 		}
9970 	}
9971 
9972 	/*
9973 	 * scan through records fix commit counts and
9974 	 * zero fiddles and update time stamp and rechecksum record
9975 	 */
9976 	checksum = 0;
9977 	idp = ids;
9978 	saverbp = NULL;
9979 	while (*idp) {
9980 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9981 			for (dep = dbp->db_firstentry; dep;
9982 			    dep = dep->de_next) {
9983 				if (dep->de_recid == DBID(*idp))
9984 					break;
9985 			}
9986 			if (dep != NULL)
9987 				break;
9988 		}
9989 		rbp = dep->de_rb;
9990 		ASSERT(! (dep->de_flags & MDDB_F_OPT));
9991 
9992 		getuserdata(setno, dep);
9993 		/* Don't do fiddles for CHANGE LOG records */
9994 		if (!(dep->de_flags & MDDB_F_CHANGELOG)) {
9995 			checksum ^= rbp->rb_checksum_fiddle;
9996 			rbp->rb_checksum_fiddle = 0;
9997 			checksum ^= rbp->rb_checksum;
9998 			saverbp = rbp;
9999 		}
10000 		rbp->rb_commitcnt++;
10001 		uniqtime32(&rbp->rb_timestamp);
10002 		/* Generate the crc for this record */
10003 		rec_crcgen(s, dep, rbp);
10004 
10005 		/* Don't do fiddles for CHANGE LOG records */
10006 		if (!(dep->de_flags & MDDB_F_CHANGELOG)) {
10007 			checksum ^= rbp->rb_checksum;
10008 		}
10009 		idp++;
10010 	}
10011 
10012 	if (saverbp)
10013 		saverbp->rb_checksum_fiddle = checksum;
10014 
10015 	/*
10016 	 * If this is a MN set but we are not the master, then we are not
10017 	 * supposed to update the mddb on disk. So we finish at this point.
10018 	 */
10019 	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
10020 	    (md_set[setno].s_am_i_master == 0)) {
10021 		single_thread_end(s);
10022 		mddb_setexit(s);
10023 		return (0);
10024 	}
10025 
10026 	lbp = s->s_lbp;
10027 	for (li = 0; li < lbp->lb_loccnt; li++) {
10028 		if (! (lbp->lb_locators[li].l_flags & MDDB_F_ACTIVE))
10029 			continue;
10030 
10031 		idp = ids;
10032 		while (*idp) {
10033 			for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
10034 				dep = dbp->db_firstentry;
10035 				while (dep && (dep->de_recid != DBID(*idp)))
10036 					dep = dep->de_next;
10037 				if (dep != NULL)
10038 					break;
10039 			}
10040 			rbp = dep->de_rb;
10041 			err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
10042 			    dep->de_blkcount, li, (mddb_bf_t **)0,
10043 			    MDDB_WR_ONLY_MASTER);
10044 			if (err)
10045 				break;
10046 			idp++;
10047 		}
10048 		if (err)
10049 			break;
10050 	}
10051 	if (err) {
10052 		if (writeretry(s)) {
10053 			single_thread_end(s);
10054 			mddb_setexit(s);
10055 			return (MDDB_E_NOTNOW);
10056 		}
10057 	}
10058 	single_thread_end(s);
10059 	mddb_setexit(s);
10060 	return (0);
10061 }
10062 
10063 mddb_recid_t
10064 mddb_makerecid(
10065 	set_t		setno,
10066 	mddb_recid_t	id
10067 )
10068 {
10069 	return (MAKERECID(setno, id));
10070 }
10071 
10072 set_t
10073 mddb_getsetnum(
10074 	mddb_recid_t	id
10075 )
10076 {
10077 	return (DBSET(id));
10078 }
10079 
10080 char *
10081 mddb_getsetname(
10082 	set_t	setno
10083 )
10084 {
10085 	return (((mddb_set_t *)md_set[setno].s_db)->s_setname);
10086 }
10087 
10088 side_t
10089 mddb_getsidenum(
10090 	set_t	setno
10091 )
10092 {
10093 	if (md_set[setno].s_db)
10094 		return (((mddb_set_t *)md_set[setno].s_db)->s_sideno);
10095 	return (0);
10096 }
10097 
10098 int
10099 mddb_ownset(
10100 	set_t	setno
10101 )
10102 {
10103 	if ((md_get_setstatus(setno) & MD_SET_TAGDATA) && md_set[setno].s_db)
10104 		return (1);
10105 
10106 	if (md_set[setno].s_db && ((mddb_set_t *)md_set[setno].s_db)->s_lbp)
10107 		return (1);
10108 
10109 	return (0);
10110 }
10111 
10112 /*ARGSUSED*/
10113 int
10114 getmed_ioctl(mddb_med_parm_t *medpp, int mode)
10115 {
10116 	mddb_set_t	*s;
10117 	int		err = 0;
10118 	set_t		setno = medpp->med_setno;
10119 	md_error_t	*ep = &medpp->med_mde;
10120 
10121 	mdclrerror(ep);
10122 
10123 	if (setno >= md_nsets)
10124 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10125 
10126 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10127 		return (0);
10128 
10129 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
10130 		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
10131 
10132 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10133 		return (mddbstatus2error(ep, err, NODEV32, setno));
10134 
10135 	medpp->med = s->s_med;			/* structure assignment */
10136 
10137 	mddb_setexit(s);
10138 
10139 	return (0);
10140 }
10141 
10142 int
10143 setmed_ioctl(mddb_med_parm_t *medpp, int mode)
10144 {
10145 
10146 	mddb_set_t	*s;
10147 	int		err = 0;
10148 	set_t		setno = medpp->med_setno;
10149 	md_error_t	*ep = &medpp->med_mde;
10150 
10151 	mdclrerror(ep);
10152 
10153 	if ((mode & FWRITE) == 0)
10154 		return (mdsyserror(ep, EACCES));
10155 
10156 	/*
10157 	 * This should be the only thing that prevents LOCAL sets from having
10158 	 * mediators, at least in the kernel, userland needs to have some code
10159 	 * written.
10160 	 */
10161 	if (setno == MD_LOCAL_SET)
10162 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10163 
10164 	if (setno >= md_nsets)
10165 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10166 
10167 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10168 		return (0);
10169 
10170 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
10171 		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
10172 
10173 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10174 		return (mddbstatus2error(ep, err, NODEV32, setno));
10175 
10176 	s->s_med = medpp->med;			/* structure assignment */
10177 
10178 	mddb_setexit(s);
10179 
10180 	return (0);
10181 }
10182 
10183 int
10184 updmed_ioctl(mddb_med_upd_parm_t *medpp, int mode)
10185 {
10186 
10187 	mddb_set_t	*s;
10188 	int		err = 0;
10189 	set_t		setno = medpp->med_setno;
10190 	md_error_t	*ep = &medpp->med_mde;
10191 
10192 	mdclrerror(ep);
10193 
10194 	if ((mode & FWRITE) == 0)
10195 		return (mdsyserror(ep, EACCES));
10196 
10197 	if (setno >= md_nsets)
10198 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10199 
10200 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10201 		return (0);
10202 
10203 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
10204 		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
10205 
10206 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10207 		return (mddbstatus2error(ep, err, NODEV32, setno));
10208 
10209 	single_thread_start(s);
10210 	(void) upd_med(s, "updmed_ioctl()");
10211 	single_thread_end(s);
10212 
10213 	mddb_setexit(s);
10214 
10215 	return (0);
10216 }
10217 
10218 int
10219 take_set(mddb_config_t *cp, int mode)
10220 {
10221 	int			err = 0;
10222 	mddb_med_upd_parm_t	medup;
10223 	set_t			setno = cp->c_setno;
10224 	md_error_t		*ep = &cp->c_mde;
10225 	int			snarf_ok = 0;
10226 
10227 	if (md_get_setstatus(setno) & MD_SET_SNARFED)
10228 		return (0);
10229 
10230 	err = mddb_configure(MDDB_GETDEV, cp);
10231 	if (! err && mdisok(ep)) {
10232 		if (md_snarf_db_set(setno, ep) != 0)
10233 			goto out;
10234 		snarf_ok = 1;
10235 	}
10236 
10237 	/*
10238 	 * Clear replicated import flag since this is
10239 	 * used during the take of a diskset with
10240 	 * previously unresolved replicated disks.
10241 	 */
10242 	if (md_get_setstatus(setno) &
10243 	    MD_SET_REPLICATED_IMPORT) {
10244 		md_clr_setstatus(setno, MD_SET_REPLICATED_IMPORT);
10245 	}
10246 
10247 	if (! err && mdisok(ep)) {
10248 		if (! cp->c_flags) {
10249 			medup.med_setno = setno;
10250 			mdclrerror(&medup.med_mde);
10251 
10252 			err = updmed_ioctl(&medup, mode);
10253 			if (! mdisok(&medup.med_mde))
10254 				(void) mdstealerror(ep, &medup.med_mde);
10255 		}
10256 	}
10257 
10258 out:
10259 	/*
10260 	 * In the case that the snarf failed, the diskset is
10261 	 * left with s_db set, but s_lbp not set.  The node is not
10262 	 * an owner of the set and won't be allowed to release the
10263 	 * diskset in order to cleanup.  With s_db set, any call to the
10264 	 * GETDEV or ENDDEV ioctl (done by libmeta routine metareplicalist)
10265 	 * will cause the diskset to be loaded.  So, cleanup the diskset so
10266 	 * that an inadvertent start of the diskset doesn't happen later.
10267 	 */
10268 	if ((snarf_ok == 0) && md_set[setno].s_db &&
10269 	    (((mddb_set_t *)md_set[setno].s_db)->s_lbp == 0)) {
10270 		mutex_enter(&mddb_lock);
10271 		mddb_unload_set(setno);
10272 		mutex_exit(&mddb_lock);
10273 	}
10274 	return (err);
10275 }
10276 
10277 /*ARGSUSED*/
10278 int
10279 release_set(mddb_config_t *cp, int mode)
10280 {
10281 	int			err = 0;
10282 	set_t			setno = cp->c_setno;
10283 	md_error_t		*ep = &cp->c_mde;
10284 
10285 	/*
10286 	 * Data integrity check
10287 	 */
10288 	if (setno >= md_nsets)
10289 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10290 
10291 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
10292 	md_haltsnarf_enter(setno);
10293 	/*
10294 	 * Attempt to mark set as HOLD. If it is marked as HOLD, this means
10295 	 * that the mirror code is currently searching all mirrors for a
10296 	 * errored component that needs a hotspare. While this search is in
10297 	 * progress, we cannot release the set and thgerefore we return EBUSY.
10298 	 * Once we have set HOLD, the mirror function (check_4_hotspares) will
10299 	 * block before the search until the set is released.
10300 	 */
10301 	if (md_holdset_testandenter(setno) != 0) {
10302 		md_haltsnarf_exit(setno);
10303 		rw_exit(&md_unit_array_rw.lock);
10304 		return (EBUSY);
10305 	}
10306 
10307 	if ((err = md_halt_set(setno, MD_HALT_ALL)) == 0)
10308 		err = mddb_configure(MDDB_RELEASESET, cp);
10309 
10310 	md_holdset_exit(setno);
10311 	md_haltsnarf_exit(setno);
10312 	rw_exit(&md_unit_array_rw.lock);
10313 
10314 	if (! err && mdisok(ep)) {
10315 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RELEASE, SVM_TAG_SET, setno,
10316 		    NODEV64);
10317 	}
10318 
10319 	return (err);
10320 }
10321 
10322 int
10323 gettag_ioctl(mddb_dtag_get_parm_t *dtgpp, int mode)
10324 {
10325 	mddb_set_t	*s;
10326 	int		err = 0;
10327 	mddb_dtag_lst_t	*dtlp;
10328 	set_t		setno = dtgpp->dtgp_setno;
10329 	md_error_t	*ep = &dtgpp->dtgp_mde;
10330 
10331 	mdclrerror(ep);
10332 
10333 	if ((mode & FREAD) == 0)
10334 		return (mdsyserror(ep, EACCES));
10335 
10336 	if (setno >= md_nsets)
10337 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10338 
10339 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10340 		return (0);
10341 
10342 	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL)
10343 		return (mddbstatus2error(ep, err, NODEV32, setno));
10344 
10345 	/*
10346 	 * Data tags not supported on MN sets so return invalid operation.
10347 	 * This ioctl could be called before the mddb has been read in so
10348 	 * the set status may not yet be set to MNSET, so code following
10349 	 * this check must handle a MN diskset properly.
10350 	 */
10351 	if (md_get_setstatus(setno) & MD_SET_MNSET) {
10352 		mddb_setexit(s);
10353 		return (mderror(ep, MDE_INVAL_MNOP));
10354 	}
10355 
10356 	/* s_dtlp is NULL for MN diskset */
10357 	dtlp = s->s_dtlp;
10358 	while (dtlp != NULL) {
10359 		if (dtgpp->dtgp_dt.dt_id == 0 ||
10360 		    dtgpp->dtgp_dt.dt_id == dtlp->dtl_dt.dt_id) {
10361 			bcopy((caddr_t)&dtlp->dtl_dt, (caddr_t)&dtgpp->dtgp_dt,
10362 			    sizeof (mddb_dtag_t));
10363 			break;
10364 		}
10365 		dtlp = dtlp->dtl_nx;
10366 	}
10367 
10368 	/* Walked the whole list and id not found, return error */
10369 	if (dtlp == (mddb_dtag_lst_t *)NULL) {
10370 		mddb_setexit(s);
10371 		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
10372 	}
10373 
10374 	mddb_setexit(s);
10375 
10376 	return (0);
10377 }
10378 
10379 int
10380 usetag_ioctl(mddb_dtag_use_parm_t *dtupp, int mode)
10381 {
10382 	mddb_set_t	*s;
10383 	int		err = 0;
10384 	mddb_config_t	*cp;
10385 	mddb_ri_t	*trip = NULL;
10386 	mddb_dtag_t	*dtagp = NULL;
10387 	set_t		setno = dtupp->dtup_setno;
10388 	md_error_t	*ep = &dtupp->dtup_mde;
10389 
10390 	mdclrerror(ep);
10391 
10392 	if ((mode & FWRITE) == 0)
10393 		return (mdsyserror(ep, EACCES));
10394 
10395 	if (setno >= md_nsets)
10396 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10397 
10398 	if (dtupp->dtup_id < 0)
10399 		return (mdsyserror(ep, EINVAL));
10400 	else if (dtupp->dtup_id == 0)
10401 		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
10402 
10403 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10404 		return (0);
10405 
10406 	if ((md_get_setstatus(setno) & MD_SET_TAGDATA) == 0)
10407 		return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));
10408 
10409 	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL)
10410 		return (mddbstatus2error(ep, err, NODEV32, setno));
10411 
10412 	/*
10413 	 * Data tags not supported on MN sets so return invalid operation.
10414 	 * This ioctl could be called before the mddb has been read in so
10415 	 * the set status may not yet be set to MNSET, so code following
10416 	 * this check must handle a MN diskset properly.
10417 	 */
10418 	if (md_get_setstatus(setno) & MD_SET_MNSET) {
10419 		mddb_setexit(s);
10420 		return (mderror(ep, MDE_INVAL_MNOP));
10421 	}
10422 
10423 	/* Validate and find the id requested - nothing found if MN diskset */
10424 	if ((dtagp = dtl_findl(s, dtupp->dtup_id)) == NULL) {
10425 		mddb_setexit(s);
10426 		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
10427 	}
10428 
10429 	/* Usetag is only valid when more than one tag exists */
10430 	if (dtl_cntl(s) < 2) {
10431 		mddb_setexit(s);
10432 		return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));
10433 	}
10434 
10435 	/* Put the selected tag in place */
10436 	dt_setup(s, dtagp);
10437 
10438 	cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP);
10439 
10440 	/* Save the hint information */
10441 	trip = save_rip(s);
10442 
10443 	cp->c_timestamp = s->s_ident.createtime;	/* struct assignment */
10444 	cp->c_setno = setno;
10445 	cp->c_sideno = s->s_sideno;
10446 	(void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME);
10447 	cp->c_setname[MD_MAX_SETNAME] = '\0';
10448 	cp->c_med = s->s_med;				/* struct assignment */
10449 
10450 	mddb_setexit(s);
10451 
10452 	s = NULL;
10453 
10454 	/* shorthand */
10455 	setno = cp->c_setno;
10456 
10457 	/* Let unload know not to free the tag */
10458 	md_set_setstatus(setno, MD_SET_KEEPTAG);
10459 
10460 	/* Release the set */
10461 	if (err = release_set(cp, mode))
10462 		goto out;
10463 
10464 	if (! mdisok(&cp->c_mde)) {
10465 		(void) mdstealerror(ep, &cp->c_mde);
10466 		err = 1;
10467 		goto out;
10468 	}
10469 
10470 	/* Re-init set using the saved mddb_config_t structure */
10471 	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
10472 		if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
10473 			err = mddbstatus2error(ep, err, NODEV32, setno);
10474 			goto out;
10475 		}
10476 	}
10477 
10478 	ASSERT(s->s_rip == (mddb_ri_t *)NULL);
10479 
10480 	/* use the saved rip structure */
10481 	s->s_rip = trip;
10482 	trip = (mddb_ri_t *)NULL;
10483 
10484 	/* Let the take code know a tag is being used */
10485 	md_set_setstatus(setno, MD_SET_USETAG);
10486 
10487 	mddb_setexit(s);
10488 
10489 	s = NULL;
10490 
10491 	/* Take the set */
10492 	if (err = take_set(cp, mode))
10493 		goto out;
10494 
10495 	if (! mdisok(&cp->c_mde))
10496 		(void) mdstealerror(ep, &cp->c_mde);
10497 
10498 out:
10499 	md_clr_setstatus(setno, (MD_SET_USETAG | MD_SET_KEEPTAG));
10500 
10501 	kmem_free(cp, sizeof (mddb_config_t));
10502 
10503 	if (trip)
10504 		free_rip(&trip);
10505 
10506 	if (s)
10507 		mddb_setexit(s);
10508 
10509 	return (err);
10510 }
10511 
10512 int
10513 accept_ioctl(mddb_accept_parm_t *accpp, int mode)
10514 {
10515 	mddb_set_t	*s;
10516 	int		err = 0;
10517 	mddb_config_t	*cp;
10518 	mddb_ri_t	*trip = NULL;
10519 	set_t		setno = accpp->accp_setno;
10520 	md_error_t	*ep = &accpp->accp_mde;
10521 
10522 	mdclrerror(ep);
10523 
10524 	if ((mode & FWRITE) == 0)
10525 		return (mdsyserror(ep, EACCES));
10526 
10527 	if (setno >= md_nsets)
10528 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10529 
10530 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10531 		return (0);
10532 
10533 	if ((md_get_setstatus(setno) & MD_SET_ACCOK) == 0)
10534 		return (mdmddberror(ep, MDE_DB_ACCNOTOK, NODEV32, setno));
10535 
10536 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10537 		return (mddbstatus2error(ep, err, NODEV32, setno));
10538 
10539 	/*
10540 	 * Data tags not supported on MN sets so return invalid operation.
10541 	 * mddb is guaranteed to be incore at this point, so this
10542 	 * check will catch all MN disksets.
10543 	 */
10544 	if (md_get_setstatus(setno) & MD_SET_MNSET) {
10545 		mddb_setexit(s);
10546 		return (mderror(ep, MDE_INVAL_MNOP));
10547 	}
10548 
10549 	cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP);
10550 
10551 	trip = save_rip(s);
10552 
10553 	cp->c_timestamp = s->s_ident.createtime;	/* struct assignment */
10554 	cp->c_setno = setno;
10555 	cp->c_sideno = s->s_sideno;
10556 	(void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME);
10557 	cp->c_setname[MD_MAX_SETNAME] = '\0';
10558 	cp->c_med = s->s_med;				/* struct assignment */
10559 
10560 	/* Tag the data */
10561 	if (err = set_dtag(s, ep)) {
10562 		err = mdsyserror(ep, err);
10563 		goto out;
10564 	}
10565 
10566 	/* If we had a BADTAG, it will be re-written, so clear the bit. */
10567 	if (md_get_setstatus(setno) & MD_SET_BADTAG)
10568 		md_clr_setstatus(setno, MD_SET_BADTAG);
10569 
10570 	if (err = dt_write(s)) {
10571 		err = mdsyserror(ep, err);
10572 		goto out;
10573 	}
10574 
10575 	mddb_setexit(s);
10576 
10577 	s = NULL;
10578 
10579 	/* shorthand */
10580 	setno = cp->c_setno;
10581 
10582 	/* Clear the keeptag */
10583 	md_clr_setstatus(setno, MD_SET_KEEPTAG);
10584 
10585 	/* Release the set */
10586 	if (err = release_set(cp, mode))
10587 		goto out;
10588 
10589 	if (! mdisok(&cp->c_mde)) {
10590 		(void) mdstealerror(ep, &cp->c_mde);
10591 		goto out;
10592 	}
10593 
10594 	/* Re-init set using the saved mddb_config_t structure */
10595 	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
10596 		if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
10597 			err = mddbstatus2error(ep, err, NODEV32, setno);
10598 			goto out;
10599 		}
10600 	}
10601 
10602 	ASSERT(s->s_rip == (mddb_ri_t *)NULL);
10603 
10604 	/* Free the allocated rip structure */
10605 	if (s->s_rip != (mddb_ri_t *)NULL)
10606 		free_rip(&s->s_rip);
10607 
10608 	/* use the saved rip structure */
10609 	s->s_rip = trip;
10610 	trip = (mddb_ri_t *)NULL;
10611 
10612 	/* Let the set init code know an accept is in progress */
10613 	md_set_setstatus(setno, MD_SET_ACCEPT);
10614 
10615 	mddb_setexit(s);
10616 
10617 	s = NULL;
10618 
10619 	/* Take the set */
10620 	if (err = take_set(cp, mode))
10621 		goto out;
10622 
10623 	if (! mdisok(&cp->c_mde))
10624 		(void) mdstealerror(ep, &cp->c_mde);
10625 
10626 out:
10627 	md_clr_setstatus(setno, (MD_SET_ACCOK | MD_SET_ACCEPT));
10628 
10629 	kmem_free(cp, sizeof (mddb_config_t));
10630 
10631 	if (trip)
10632 		free_rip(&trip);
10633 
10634 	if (s)
10635 		mddb_setexit(s);
10636 
10637 	return (err);
10638 }
10639 
10640 /*
10641  * mddb_getinvlb_devid - cycles through the locator block and determines
10642  *		if the device id's for any of the replica disks are invalid.
10643  *		If so, it returns the diskname in the ctdptr.
10644  *	RETURN
10645  *		-1	Error
10646  *		cnt	number of invalid device id's
10647  */
10648 int
10649 mddb_getinvlb_devid(
10650 	set_t	setno,
10651 	int	count,
10652 	int	size,
10653 	char	**ctdptr
10654 )
10655 {
10656 	mddb_set_t	*s;
10657 	int		err = 0;
10658 	mddb_lb_t	*lbp;
10659 	int		li;
10660 	mddb_did_blk_t	*did_blk;
10661 	mddb_did_info_t	*did_info;
10662 	int		len;
10663 	int		cnt = 0;
10664 	char		*cptr;
10665 	md_name_suffix	*sn;
10666 	int		i, dont_add_it;
10667 	char		*tmpctd, *diskname;
10668 	char		*tmpname;
10669 
10670 	cptr = *ctdptr;
10671 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
10672 		return (-1);
10673 	}
10674 
10675 	single_thread_start(s);
10676 	lbp = s->s_lbp;
10677 
10678 	if (lbp->lb_setno != setno) {
10679 		single_thread_end(s);
10680 		mddb_setexit(s);
10681 		return (-1);
10682 	}
10683 
10684 	/* check for lb being devid style */
10685 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
10686 		did_blk = s->s_did_icp->did_ic_blkp;
10687 		for (li = 0; li < lbp->lb_loccnt; li++) {
10688 			did_info = &(did_blk->blk_info[li]);
10689 			/* Only if devid exists and isn't valid */
10690 			if ((did_info->info_flags & MDDB_DID_EXISTS) &&
10691 			    !(did_info->info_flags & MDDB_DID_VALID)) {
10692 				/*
10693 				 * if we count more invalid did's than
10694 				 * was passed in there's an error somewhere
10695 				 */
10696 				if (cnt++ > count) {
10697 					single_thread_end(s);
10698 					mddb_setexit(s);
10699 					return (-1);
10700 				}
10701 
10702 				/*
10703 				 * Future note: Need to do something here
10704 				 * for the MN diskset case when device ids
10705 				 * are supported in disksets.
10706 				 * Can't add until merging devids_in_diskset
10707 				 * code into code base.
10708 				 */
10709 
10710 				sn = &s->s_lnp->ln_suffixes[0][li];
10711 				/*
10712 				 * check to make sure length of device name is
10713 				 * not greater than computed first time through
10714 				 */
10715 				len = sn->suf_len;
10716 				if (len > size) {
10717 					single_thread_end(s);
10718 					mddb_setexit(s);
10719 					return (-1);
10720 				}
10721 				tmpctd = *ctdptr;
10722 				/* strip off slice part */
10723 				diskname = md_strdup(sn->suf_data);
10724 				tmpname = strrchr(diskname, 's');
10725 				*tmpname = '\0';
10726 				dont_add_it = 0;
10727 				/* look to see if diskname is already in list */
10728 				for (i = 0; i < (cnt-1); i++) {
10729 					if (strcmp(diskname, tmpctd) == 0) {
10730 						/* already there, don't add */
10731 						dont_add_it = 1;
10732 						break;
10733 					}
10734 					/* point to next diskname in list */
10735 					tmpctd += size;
10736 				}
10737 				if (dont_add_it == 0) {
10738 					/* add diskname to list */
10739 					(void) strcpy(cptr, diskname);
10740 					cptr += size;
10741 				}
10742 				kmem_free(diskname, strlen(sn->suf_data) + 1);
10743 			}
10744 		}
10745 	}
10746 	/* null terminate the list */
10747 	*cptr = '\0';
10748 	/*
10749 	 * need to save the new pointer so that calling routine can continue
10750 	 * to add information onto the end.
10751 	 */
10752 	*ctdptr = cptr;
10753 	single_thread_end(s);
10754 	mddb_setexit(s);
10755 	return (cnt);
10756 }
10757 
10758 /*
10759  * mddb_validate_lb - count the number of lb's with invalid device id's. Keep
10760  *		track of length of longest devicename.
10761  *	RETURN
10762  *		-1	error
10763  *		 cnt	number of lb's with invalid devid's
10764  */
10765 int
10766 mddb_validate_lb(
10767 	set_t	setno,
10768 	int	*rmaxsz
10769 )
10770 {
10771 	mddb_set_t	*s;
10772 	int		err = 0;
10773 	mddb_lb_t	*lbp;
10774 	int		li;
10775 	mddb_did_blk_t	*did_blk;
10776 	mddb_did_info_t	*did_info;
10777 	int		len;
10778 	int		cnt = 0;
10779 
10780 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10781 		return (-1);
10782 
10783 	single_thread_start(s);
10784 	lbp = s->s_lbp;
10785 
10786 	if (lbp->lb_setno != setno) {
10787 		single_thread_end(s);
10788 		mddb_setexit(s);
10789 		return (-1);
10790 	}
10791 
10792 	/* lb must be in devid style */
10793 	if ((lbp->lb_flags & MDDB_DEVID_STYLE) == 0)
10794 		goto mvl_out;
10795 
10796 	did_blk = s->s_did_icp->did_ic_blkp;
10797 	for (li = 0; li < lbp->lb_loccnt; li++) {
10798 		char		*minor_name;
10799 		mddb_locator_t	*lp;
10800 		dev_t		ddi_dev;
10801 		ddi_devid_t	devid;
10802 		ddi_devid_t	rtn_devid = NULL;
10803 		int		get_rval;
10804 
10805 		did_info = &(did_blk->blk_info[li]);
10806 		if (((did_info->info_flags & MDDB_DID_EXISTS) == 0) ||
10807 		    (did_info->info_flags & MDDB_DID_VALID))
10808 			continue;
10809 
10810 		/* Here we know, did exists but isn't valid */
10811 
10812 		lp = &lbp->lb_locators[li];
10813 		ddi_dev = expldev(lp->l_dev);
10814 		get_rval = mddb_devid_get(s, li, &devid, &minor_name);
10815 		ASSERT(get_rval == 1);
10816 		if ((ddi_lyr_get_devid(ddi_dev, &rtn_devid) == DDI_SUCCESS) &&
10817 		    (ddi_devid_compare(rtn_devid, devid) == 0)) {
10818 			did_info->info_flags = MDDB_DID_VALID |
10819 			    MDDB_DID_EXISTS | MDDB_DID_UPDATED;
10820 		} else {
10821 			cnt++;
10822 			/*
10823 			 * Future note: Need to do something here
10824 			 * for the MN diskset case when device ids
10825 			 * are supported in disksets.
10826 			 * Can't add until merging devids_in_diskset
10827 			 * code into code base.
10828 			 */
10829 			len = (&s->s_lnp->ln_suffixes[0][li])-> suf_len;
10830 			if (*rmaxsz < len)
10831 				*rmaxsz = len;
10832 		}
10833 		if (rtn_devid != NULL)
10834 			ddi_devid_free(rtn_devid);
10835 	}
10836 
10837 mvl_out:
10838 
10839 	if (push_lb(s) != 0)
10840 		cnt = -1;
10841 	(void) upd_med(s, "mddb_validate_lb(0)");
10842 	single_thread_end(s);
10843 	mddb_setexit(s);
10844 	return (cnt);
10845 }
10846 
10847 int
10848 check_active_locators()
10849 {
10850 	mddb_set_t	*s;
10851 	mddb_lb_t	*lbp;
10852 	int		li;
10853 	int		active = 0;
10854 
10855 	mutex_enter(&mddb_lock);
10856 	/* there is nothing here..so we can unload */
10857 	if ((mddb_set_t *)md_set[MD_LOCAL_SET].s_db == NULL) {
10858 		mutex_exit(&mddb_lock);
10859 		return (0);
10860 	}
10861 	s = (mddb_set_t *)md_set[MD_LOCAL_SET].s_db;
10862 	lbp = s->s_lbp;
10863 	if (lbp == NULL) {
10864 		mutex_exit(&mddb_lock);
10865 		return (0);
10866 	}
10867 
10868 	for (li = 0; li < lbp->lb_loccnt; li++) {
10869 		mddb_locator_t *lp = &lbp->lb_locators[li];
10870 		if (lp->l_flags & MDDB_F_ACTIVE) {
10871 			active = 1;
10872 			break;
10873 		}
10874 	}
10875 	mutex_exit(&mddb_lock);
10876 	return (active);
10877 }
10878 
10879 /*
10880  * regetoptrecord:
10881  * --------------
10882  *	Update the in-core optimized resync record contents by re-reading the
10883  *	record from the on-disk metadb.
10884  *	The contents of the resync record will be overwritten by calling this
10885  *	routine. This means that callers that require the previous contents to
10886  *	be preserved must save the data before calling this routine.
10887  *	Return values:
10888  *	0 - successfully read in resync record from a mddb
10889  *	1 - failure.  Unable to read resync record from either mddb.
10890  */
10891 static int
10892 regetoptrecord(
10893 	mddb_set_t	*s,
10894 	mddb_de_ic_t	*dep
10895 )
10896 {
10897 	mddb_lb_t	*lbp;
10898 	mddb_locator_t	*lp;
10899 	mddb_rb32_t	*rbp, *crbp;
10900 	int		li;
10901 	int		i;
10902 	int		err = 0;
10903 	size_t		recsize;
10904 
10905 #if defined(_ILP32) && !defined(lint)
10906 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
10907 #endif
10908 
10909 	recsize = dep->de_recsize;
10910 	crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
10911 
10912 	single_thread_start(s);
10913 	rbp = dep->de_rb;
10914 
10915 	dep->de_optinfo[0].o_flags |= MDDB_F_EDATA;
10916 	dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
10917 
10918 	lbp = s->s_lbp;
10919 
10920 	for (i = 0; i < 2; i++) {
10921 		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
10922 			continue;
10923 		li = dep->de_optinfo[i].o_li;
10924 		lp = &lbp->lb_locators[li];
10925 
10926 		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
10927 		    (lp->l_flags & MDDB_F_EMASTER))
10928 			continue;
10929 
10930 		/*
10931 		 * re-read the optimized resync record with failfast set
10932 		 * since a failed disk could lead to a very long wait.
10933 		 */
10934 		err = readblklst(s, (caddr_t)rbp, dep->de_blks,
10935 		    dep->de_blkcount, li, B_FAILFAST);
10936 
10937 		if (err)
10938 			continue;
10939 
10940 		if (rbp->rb_magic != MDDB_MAGIC_RB)
10941 			continue;
10942 
10943 		if (revchk(MDDB_REV_RB, rbp->rb_revision))
10944 			continue;
10945 
10946 		/* Check the crc for this record */
10947 		if (rec_crcchk(s, dep, rbp)) {
10948 			continue;
10949 		}
10950 		dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE;
10951 
10952 		if (rbp == crbp) {
10953 			if (rbp->rb_checksum != crbp->rb_checksum)
10954 				dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
10955 			break;
10956 		}
10957 		rbp = crbp;
10958 	}
10959 
10960 	single_thread_end(s);
10961 
10962 	if (rbp == crbp) {
10963 		rbp->rb_private = 0;
10964 		kmem_free((caddr_t)crbp, recsize);
10965 		return (0);
10966 	}
10967 	uniqtime32(&rbp->rb_timestamp);
10968 	/* Generate the crc for this record */
10969 	rec_crcgen(s, dep, rbp);
10970 	kmem_free((caddr_t)crbp, recsize);
10971 	return (1);
10972 }
10973 
10974 /*
10975  * mddb_reread_rr:
10976  *	Re-read the resync record from the on-disk copy. This is required for
10977  *	multi-node support so that a new mirror-owner can determine if a resync
10978  *	operation is required to guarantee data integrity.
10979  *
10980  * Arguments:
10981  *	setno	Associated set
10982  *	id	Resync record ID
10983  *
10984  * Return Value:
10985  *	0	successful reread
10986  *	-1	invalid set (not multi-node or non-existant)
10987  *	>0	metadb state invalid, failed to reread
10988  */
10989 int
10990 mddb_reread_rr(
10991 	set_t		setno,
10992 	mddb_recid_t	id
10993 )
10994 {
10995 	mddb_set_t	*s;
10996 	int		err = 0;
10997 	mddb_db_t	*dbp;
10998 	mddb_de_ic_t	*dep;
10999 
11000 	if (setno >= md_nsets)
11001 		return (-1);
11002 
11003 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
11004 		return (-1);
11005 
11006 	if ((setno == MD_LOCAL_SET) || !(s->s_lbp->lb_flags & MDDB_MNSET)) {
11007 		mddb_setexit(s);
11008 		return (-1);
11009 	}
11010 
11011 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11012 		dep = dbp->db_firstentry;
11013 		while (dep && (dep->de_recid != DBID(id)))
11014 			dep = dep->de_next;
11015 		if (dep != NULL)
11016 			break;
11017 	}
11018 
11019 	if (dep != NULL) {
11020 		err = regetoptrecord(s, dep);
11021 	} else {
11022 		err = -1;
11023 	}
11024 	mddb_setexit(s);
11025 	return (err);
11026 }
11027 
11028 /*
11029  * Set owner associated with MN optimized resync record.
11030  *
11031  * Optimized records have an owner node associated with them in
11032  * a MN diskset.  The owner is only set on a node that is actively
11033  * writing to that record.  The other nodes will show that record
11034  * as having an invalid owner.  The owner for an optimized record
11035  * is used during fixoptrecord to determine which node should
11036  * write out the record when the replicas associated with that
11037  * optimized record have been changed.
11038  *
11039  * Called directly from mirror driver and not from an ioctl.
11040  *
11041  * Returns
11042  *	NULL if successful.
11043  *	MDDB_E_NORECORD if record not found.
11044  */
11045 int
11046 mddb_setowner(
11047 	mddb_recid_t		id,
11048 	md_mn_nodeid_t		owner
11049 )
11050 {
11051 	mddb_set_t		*s;
11052 	mddb_db_t		*dbp;
11053 	mddb_de_ic_t		*dep;
11054 	int			found = 0;
11055 
11056 
11057 	if (DBSET(id) >= md_nsets)
11058 		return (MDDB_E_NORECORD);
11059 
11060 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
11061 		return (MDDB_E_NORECORD);
11062 
11063 	id = DBID(id);
11064 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11065 		for (dep = dbp->db_firstentry;
11066 		    dep != NULL; dep = dep->de_next) {
11067 			if (dep->de_recid != id)
11068 				continue;
11069 			dep->de_owner_nodeid = owner;
11070 			found = 1;
11071 			break;
11072 		}
11073 		if (found)
11074 			break;
11075 	}
11076 
11077 	mddb_setexit(s);
11078 
11079 	if (!found) {
11080 		return (MDDB_E_NORECORD);
11081 	}
11082 
11083 	return (NULL);
11084 }
11085 
11086 /*
11087  * mddb_parse re-reads portions of the mddb from disk given a list
11088  * of good replicas to read from and flags describing
11089  * which portion of the mddb to read in.
11090  *
11091  * Used in a MN diskset when the master has made a change to some part
11092  * of the mddb and wants to relay this information to the slaves.
11093  */
11094 int
11095 mddb_parse(mddb_parse_parm_t *mpp)
11096 {
11097 	mddb_set_t	*s;
11098 	int		err = 0;
11099 	mddb_locator_t	*lp, *old_lp;
11100 	mddb_lb_t	*lbp, *old_lbp;
11101 	int		rval = 0;
11102 	int		i, li;
11103 	int		found_good_one = 0;
11104 	mddb_ln_t	*lnp;
11105 	mddb_block_t	ln_blkcnt;
11106 	md_error_t	*ep = &mpp->c_mde;
11107 
11108 	if (mpp->c_setno >= md_nsets)
11109 		return (EINVAL);
11110 
11111 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11112 		return (0);
11113 
11114 	if ((s = mddb_setenter(mpp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
11115 		return (mddbstatus2error(ep, err, NODEV32, mpp->c_setno));
11116 	}
11117 
11118 	if (!(MD_MNSET_SETNO(mpp->c_setno))) {
11119 		mddb_setexit_no_parse(s);
11120 		return (EINVAL);
11121 	}
11122 
11123 	/*
11124 	 * Master node initiated this request, so there's no work for
11125 	 * the master node to do.
11126 	 */
11127 	if (md_set[mpp->c_setno].s_am_i_master) {
11128 		mddb_setexit_no_parse(s);
11129 		return (rval);
11130 	}
11131 
11132 	single_thread_start(s);
11133 
11134 	if (mpp->c_parse_flags & MDDB_PARSE_LOCBLK) {
11135 		lbp = 0;
11136 		for (i = 0; i < MDDB_NLB; i++) {
11137 			/* Walk through master's active list */
11138 			if (!(mpp->c_lb_flags[i] & MDDB_F_ACTIVE))
11139 				continue;
11140 			if (s->s_mbiarray[i] == NULL)
11141 				continue;
11142 
11143 			/* Assumes master blocks are already setup */
11144 			if (lbp == (mddb_lb_t *)NULL) {
11145 				lbp = (mddb_lb_t *)kmem_zalloc(
11146 				    dbtob(MDDB_MNLBCNT), KM_SLEEP);
11147 			}
11148 			err |= readblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, i);
11149 
11150 			if (err)
11151 				continue;
11152 
11153 			if (lbp->lb_magic != MDDB_MAGIC_LB)
11154 				continue;
11155 			if (lbp->lb_blkcnt != MDDB_MNLBCNT)
11156 				continue;
11157 			if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
11158 				continue;
11159 			if (crcchk(lbp, &lbp->lb_checksum, dbtob(MDDB_MNLBCNT),
11160 			    NULL))
11161 				continue;
11162 			if (lbp->lb_setno != s->s_setno)
11163 				continue;
11164 			/*
11165 			 * a commit count of zero means this locator has
11166 			 * been deleted
11167 			 */
11168 			if (lbp->lb_commitcnt == 0) {
11169 				continue;
11170 			}
11171 			/* Found a good locator - keep it */
11172 			found_good_one = 1;
11173 			break;
11174 		}
11175 
11176 		/*
11177 		 * If found a good copy of the mddb, then read it into
11178 		 * this node's locator block.  Fix up the set's s_mbiarray
11179 		 * pointer (master block incore array pointer) to be
11180 		 * in sync with the newly read in locator block.  If a
11181 		 * new mddb was added, read in the master blocks associated
11182 		 * with the new mddb.  If an mddb was deleted, free the
11183 		 * master blocks associated with deleted mddb.
11184 		 */
11185 		if (found_good_one)  {
11186 			/* Compare old and new view of mddb locator blocks */
11187 			old_lbp = s->s_lbp;
11188 			for (li = 0; li < lbp->lb_loccnt; li++) {
11189 				int	mn_set;
11190 
11191 				lp = &lbp->lb_locators[li];
11192 				old_lp = &old_lbp->lb_locators[li];
11193 
11194 				/* If old and new views match, continue */
11195 				if ((lp->l_flags & MDDB_F_ACTIVE) ==
11196 				    (old_lp->l_flags & MDDB_F_ACTIVE))
11197 					continue;
11198 
11199 				if (lp->l_flags & MDDB_F_ACTIVE) {
11200 					/*
11201 					 * If new mddb has been added - delete
11202 					 * old mbiarray and get new one.
11203 					 *
11204 					 * When devids are supported, will
11205 					 * need to get dev from devid.
11206 					 */
11207 					if (s->s_mbiarray[li]) {
11208 						free_mbipp(&s->s_mbiarray[li]);
11209 					}
11210 					/*
11211 					 * If getmasters fails, getmasters
11212 					 * will set appropriate error flags.
11213 					 */
11214 					s->s_mbiarray[li] = getmasters(s,
11215 					    md_expldev(lp->l_dev), lp->l_blkno,
11216 					    (uint_t *)&(lp->l_flags), &mn_set);
11217 				} else if (lp->l_flags & MDDB_F_DELETED) {
11218 					/*
11219 					 * If old one has been deleted -
11220 					 * delete old mbiarray.
11221 					 */
11222 					if (s->s_mbiarray[li]) {
11223 						free_mbipp(&s->s_mbiarray[li]);
11224 					}
11225 				}
11226 			}
11227 
11228 			/* Free this node's old view of mddb locator blocks */
11229 			kmem_free((caddr_t)s->s_lbp,
11230 			    dbtob(s->s_lbp->lb_blkcnt));
11231 			s->s_lbp = lbp;
11232 		} else {
11233 			if (lbp)
11234 				kmem_free(lbp, dbtob(MDDB_MNLBCNT));
11235 		}
11236 	}
11237 
11238 	if (mpp->c_parse_flags & MDDB_PARSE_LOCNM) {
11239 		lnp = s->s_lnp;
11240 		lbp = s->s_lbp;
11241 		ln_blkcnt = lbp->lb_lnblkcnt;
11242 		s->s_lnp = NULL; /* readlocnames does this anyway */
11243 		for (li = 0; li < lbp->lb_loccnt; li++) {
11244 			lp = &lbp->lb_locators[li];
11245 
11246 			if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
11247 			    (lp->l_flags & MDDB_F_EMASTER))
11248 				continue;
11249 
11250 			/* Successfully read the locator names */
11251 			if (readlocnames(s, li) == 0)
11252 				break;
11253 		}
11254 
11255 		if (li == lbp->lb_loccnt) {
11256 			/* Did not successfully read locnames; restore lnp */
11257 			s->s_lnp = lnp;
11258 		} else {
11259 			/* readlocnames successful, free old struct */
11260 			kmem_free((caddr_t)lnp, dbtob(ln_blkcnt));
11261 		}
11262 	}
11263 
11264 	if (mpp->c_parse_flags & MDDB_PARSE_OPTRECS) {
11265 		mddb_de_ic_t	*dep, *tdep, *first_dep, *dep2;
11266 		mddb_db_t	*dbp;
11267 		mddb_db32_t	*db32p;
11268 		mddb_de32_t	*de32p, *de32p2;
11269 		int		writeout;
11270 
11271 		lbp = s->s_lbp;
11272 		/*
11273 		 * Walk through directory block and directory entry incore
11274 		 * linked list looking for optimized resync records.
11275 		 * For each opt record found, re-read in directory block.
11276 		 * The directoy block consists of a number of directory
11277 		 * entries.  The directory entry for this opt record will
11278 		 * describe which 2 mddbs actually contain the resync record
11279 		 * since it could have been relocated by the master node
11280 		 * due to mddb failure or mddb deletion.  If this node
11281 		 * is the record owner for this opt record, then write out
11282 		 * the record to the 2 mddbs listed in the directory entry
11283 		 * if the mddbs locations are different than previously known.
11284 		 */
11285 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11286 			for (dep = dbp->db_firstentry; dep;
11287 			    dep = dep->de_next) {
11288 				/* Found an opt record */
11289 				if (dep->de_flags & MDDB_F_OPT)
11290 					break;
11291 			}
11292 			/* If no opt records found, go to next dbp */
11293 			if (dep == NULL)
11294 				continue;
11295 
11296 			/*
11297 			 * Reread directory block from disk since
11298 			 * master could have rewritten in during fixoptrecord.
11299 			 */
11300 			db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE,
11301 			    KM_SLEEP);
11302 			create_db32rec(db32p, dbp);
11303 			for (li = 0; li < lbp->lb_loccnt; li++) {
11304 				lp = &lbp->lb_locators[li];
11305 
11306 				if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
11307 				    (lp->l_flags & MDDB_F_EMASTER))
11308 					continue;
11309 
11310 				err = readblks(s, (caddr_t)db32p,
11311 				    db32p->db32_blknum, 1, li);
11312 				if (err)
11313 					continue;
11314 
11315 				/* Reverify db; go to next mddb if bad */
11316 				if ((db32p->db32_magic != MDDB_MAGIC_DB) ||
11317 				    (revchk(MDDB_REV_DB,
11318 				    db32p->db32_revision)) ||
11319 				    (crcchk(db32p, &db32p->db32_checksum,
11320 				    MDDB_BSIZE, NULL))) {
11321 					continue;
11322 				} else {
11323 					break;
11324 				}
11325 			}
11326 			/*
11327 			 * If all mddbs are unavailable then panic since
11328 			 * this slave cannot be allowed to continue out-of-sync
11329 			 * with the master node.  Since the optimized resync
11330 			 * records are written by all nodes, all nodes must
11331 			 * stay in sync with the master.
11332 			 *
11333 			 * This also handles the case when all storage
11334 			 * connectivity to a slave node has failed.  The
11335 			 * slave node will send an MDDB_OPTRECERR message to
11336 			 * the master node when the slave node has been unable
11337 			 * to write an optimized resync record to both
11338 			 * designated mddbs.  After the master has fixed the
11339 			 * optimized records to be on available mddbs, the
11340 			 * MDDB_PARSE message (with the flag MDDB_PARSE_OPTRECS)
11341 			 * is sent to all slave nodes.  If a slave node is
11342 			 * unable to access any mddb in order to read in the
11343 			 * relocated optimized resync record, then the slave
11344 			 * node must panic.
11345 			 */
11346 			if (li == lbp->lb_loccnt) {
11347 				kmem_free((caddr_t)db32p, MDDB_BSIZE);
11348 				cmn_err(CE_PANIC, "md: mddb: Node unable to "
11349 				    "access any SVM state database "
11350 				    "replicas for diskset %s\n", s->s_setname);
11351 			}
11352 			/*
11353 			 * Setup temp copy of linked list of de's.
11354 			 * Already have an incore copy, but need to walk
11355 			 * the directory entry list contained in the
11356 			 * new directory block that was just read in above.
11357 			 * After finding the directory entry of an opt record
11358 			 * by walking the incore list, find the corresponding
11359 			 * entry in the temporary list and then update
11360 			 * the incore directory entry record with
11361 			 * the (possibly changed) mddb location stored
11362 			 * for the optimized resync records.
11363 			 */
11364 			de32p = (mddb_de32_t *)
11365 			    ((void *) ((caddr_t)
11366 			    (&db32p->db32_firstentry)
11367 			    + sizeof (db32p->db32_firstentry)));
11368 			tdep = (mddb_de_ic_t *)
11369 			    kmem_zalloc(sizeof (mddb_de_ic_t) -
11370 			    sizeof (mddb_block_t) +
11371 			    sizeof (mddb_block_t) *
11372 			    de32p->de32_blkcount, KM_SLEEP);
11373 			de32tode(de32p, tdep);
11374 			first_dep = tdep;
11375 			while (de32p && de32p->de32_next) {
11376 				de32p2 = nextentry(de32p);
11377 				dep2 = (mddb_de_ic_t *)kmem_zalloc(
11378 				    sizeof (mddb_de_ic_t) -
11379 				    sizeof (mddb_block_t) +
11380 				    sizeof (mddb_block_t) *
11381 				    de32p2->de32_blkcount, KM_SLEEP);
11382 				de32tode(de32p2, dep2);
11383 				tdep->de_next = dep2;
11384 				tdep = dep2;
11385 				de32p = de32p2;
11386 			}
11387 
11388 			/* Now, walk the incore directory entry list */
11389 			for (dep = dbp->db_firstentry; dep;
11390 			    dep = dep->de_next) {
11391 				if (! (dep->de_flags & MDDB_F_OPT))
11392 					continue;
11393 				/*
11394 				 * Found an opt record in the incore copy.
11395 				 * Find the corresponding entry in the temp
11396 				 * list.  If anything has changed in the
11397 				 * opt record info between the incore copy
11398 				 * and the temp copy, update the incore copy
11399 				 * and set a flag to writeout the opt record
11400 				 * to the new mddb locations.
11401 				 */
11402 				for (tdep = first_dep; tdep;
11403 				    tdep = tdep->de_next) {
11404 					if (dep->de_recid == tdep->de_recid) {
11405 						writeout = 0;
11406 						/* Check first mddb location */
11407 						if ((dep->de_optinfo[0].o_li !=
11408 						    tdep->de_optinfo[0].o_li) ||
11409 						    (dep->de_optinfo[0].
11410 						    o_flags != tdep->de_optinfo
11411 						    [0].o_flags)) {
11412 							dep->de_optinfo[0] =
11413 							    tdep->de_optinfo[0];
11414 							writeout = 1;
11415 						}
11416 						/* Check second mddb location */
11417 						if ((dep->de_optinfo[1].o_li !=
11418 						    tdep->de_optinfo[1].o_li) ||
11419 						    (dep->de_optinfo[1].
11420 						    o_flags != tdep->de_optinfo
11421 						    [1].o_flags)) {
11422 							dep->de_optinfo[1] =
11423 							    tdep->de_optinfo[1];
11424 							writeout = 1;
11425 						}
11426 						/*
11427 						 * Record owner should rewrite
11428 						 * it
11429 						 */
11430 						if ((writeout) &&
11431 						    (dep->de_owner_nodeid ==
11432 						    md_set[mpp->c_setno].
11433 						    s_nodeid))
11434 							(void) writeoptrecord(s,
11435 							    dep);
11436 						break;
11437 					}
11438 				}
11439 			}
11440 			/*
11441 			 * Update the incore checksum information for this
11442 			 * directory block to match the newly read in checksum.
11443 			 * This should have only changed if the incore and
11444 			 * temp directory entries differed, but it takes
11445 			 * more code to do the check than to just update
11446 			 * the information everytime.
11447 			 */
11448 			dbp->db_checksum = db32p->db32_checksum;
11449 
11450 			/* Now free everything */
11451 			tdep = first_dep;
11452 			while (tdep) {
11453 				dep2 = tdep->de_next;
11454 				kmem_free((caddr_t)tdep,
11455 				    sizeofde(tdep));
11456 				tdep = dep2;
11457 			}
11458 			kmem_free((caddr_t)db32p, MDDB_BSIZE);
11459 		}
11460 		rval = 0;
11461 	}
11462 out:
11463 	single_thread_end(s);
11464 	mddb_setexit_no_parse(s);
11465 	return (rval);
11466 }
11467 
11468 int
11469 mddb_block(mddb_block_parm_t *mbp)
11470 {
11471 	mddb_set_t	*s;
11472 	int		err = 0;
11473 	md_error_t	*ep = &mbp->c_mde;
11474 
11475 	if (mbp->c_setno >= md_nsets)
11476 		return (EINVAL);
11477 
11478 	/*
11479 	 * If the new_master flag is set for this setno we are in the middle
11480 	 * of a reconfig cycle, and blocking or unblocking is not needed.
11481 	 * Hence we can return success immediately
11482 	 */
11483 	if (md_get_setstatus(mbp->c_setno) & MD_SET_MN_NEWMAS_RC) {
11484 		return (0);
11485 	}
11486 
11487 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11488 		return (0);
11489 
11490 	if ((s = mddb_setenter(mbp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
11491 		return (mddbstatus2error(ep, err, NODEV32, mbp->c_setno));
11492 	}
11493 
11494 	if (!(MD_MNSET_SETNO(mbp->c_setno))) {
11495 		mddb_setexit_no_parse(s);
11496 		return (EINVAL);
11497 	}
11498 
11499 	single_thread_start(s);
11500 
11501 	if (mbp->c_blk_flags & MDDB_BLOCK_PARSE)
11502 		md_set_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK);
11503 
11504 	if (mbp->c_blk_flags & MDDB_UNBLOCK_PARSE)
11505 		md_clr_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK);
11506 
11507 	single_thread_end(s);
11508 	mddb_setexit_no_parse(s);
11509 	return (err);
11510 }
11511 
11512 /*
11513  * mddb_optrecfix marks up to 2 mddbs as failed and calls fixoptrecords
11514  * to relocate any optimized resync records to available mddbs.
11515  * This routine is only called on the master node.
11516  *
11517  * Used in a MN diskset when a slave node has failed to write an optimized
11518  * resync record.  The failed mddb information is sent to the master node
11519  * so the master can relocate the optimized records, if possible.  If the
11520  * failed mddb information has a mddb marked as failed that was previously
11521  * marked active on the master, the master sets its incore mddb state to
11522  * EWRITE and sets the PARSE_LOCBLK flag.  The master node then attempts
11523  * to relocate any optimized records on the newly failed mddbs by calling
11524  * fixoptrecords.  (fixoptrecords will set the PARSE_OPTRECS flag if any
11525  * optimized records are relocated.)
11526  *
11527  * When mddb_optrecfix is finished, the ioctl exit code will notice the PARSE
11528  * flags and will send a PARSE message to the slave nodes.  The PARSE_LOCBLK
11529  * flag causes the slave node to re-read in the locator block from disk.
11530  * The PARSE_OPTRECS flag causes the slave node to re-read in the directory
11531  * blocks and write out any optimized resync records that have been
11532  * relocated to a different mddb.
11533  */
11534 int
11535 mddb_optrecfix(mddb_optrec_parm_t *mop)
11536 {
11537 	mddb_set_t		*s;
11538 	int			err = 0;
11539 	mddb_lb_t		*lbp;
11540 	mddb_mnlb_t		*mnlbp;
11541 	mddb_locator_t		*lp;
11542 	int			li;
11543 	mddb_mnsidelocator_t	*mnslp;
11544 	mddb_drvnm_t		*dn;
11545 	int			i, j;
11546 	md_replica_recerr_t	*recerr;
11547 	md_error_t		*ep = &mop->c_mde;
11548 	int			something_changed = 0;
11549 	int			alc, lc;
11550 	int			setno;
11551 
11552 	setno = mop->c_setno;
11553 	if (mop->c_setno >= md_nsets)
11554 		return (EINVAL);
11555 
11556 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11557 		return (0);
11558 
11559 	if ((s = mddb_setenter(mop->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
11560 		return (mddbstatus2error(ep, err, NODEV32, mop->c_setno));
11561 	}
11562 
11563 	if (!(MD_MNSET_SETNO(mop->c_setno))) {
11564 		mddb_setexit(s);
11565 		return (EINVAL);
11566 	}
11567 
11568 	single_thread_start(s);
11569 	lbp = s->s_lbp;
11570 	mnlbp = (mddb_mnlb_t *)lbp;
11571 
11572 	/*
11573 	 * If slave node has seen an mddb failure, but the master node
11574 	 * hasn't encountered this failure, mark the mddb as failed on
11575 	 * the master node and set the something_changed flag to 1.
11576 	 */
11577 	for (i = 0; i < 2; i++) {
11578 		recerr = &mop->c_recerr[i];
11579 		if (recerr->r_flags & MDDB_F_EWRITE) {
11580 			li = recerr->r_li;
11581 			lp = &lbp->lb_locators[li];
11582 			for (j = 0; j < MD_MNMAXSIDES; j++) {
11583 				mnslp = &mnlbp->lb_mnsidelocators[j][li];
11584 				if (mnslp->mnl_sideno == s->s_sideno)
11585 					break;
11586 			}
11587 			/* Do quick check using li */
11588 			if (j != MD_MNMAXSIDES)
11589 				dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
11590 
11591 			if ((j != MD_MNMAXSIDES) &&
11592 			    (strncmp(dn->dn_data, recerr->r_driver_name,
11593 			    MD_MAXDRVNM) == 0) &&
11594 			    (recerr->r_blkno == lp->l_blkno) &&
11595 			    (recerr->r_mnum == mnslp->mnl_mnum)) {
11596 				if ((lp->l_flags & MDDB_F_ACTIVE) ||
11597 				    ((lp->l_flags & MDDB_F_EWRITE) == 0)) {
11598 					something_changed = 1;
11599 					lp->l_flags |= MDDB_F_EWRITE;
11600 					lp->l_flags &= ~MDDB_F_ACTIVE;
11601 				}
11602 			} else {
11603 				/*
11604 				 * Passed in li from slave does not match
11605 				 * the replica in the master's structures.
11606 				 * This could have occurred if a delete
11607 				 * mddb command was running when the
11608 				 * optimized resync record had a failure.
11609 				 * Search all replicas for this entry.
11610 				 * If no match, just ignore.
11611 				 * If a match, set replica in error.
11612 				 */
11613 				for (li = 0; li < lbp->lb_loccnt; li++) {
11614 					lp = &lbp->lb_locators[li];
11615 					if (lp->l_flags & MDDB_F_DELETED)
11616 						continue;
11617 
11618 					for (j = 0; j < MD_MNMAXSIDES; j++) {
11619 						mnslp =
11620 						    &mnlbp->
11621 						    lb_mnsidelocators[j][li];
11622 						if (mnslp->mnl_sideno ==
11623 						    s->s_sideno)
11624 							break;
11625 					}
11626 					if (j == MD_MNMAXSIDES)
11627 						continue;
11628 
11629 					dn = &lbp->
11630 					    lb_drvnm[mnslp->mnl_drvnm_index];
11631 					if ((strncmp(dn->dn_data,
11632 					    recerr->r_driver_name,
11633 					    MD_MAXDRVNM) == 0) &&
11634 					    (recerr->r_blkno == lp->l_blkno) &&
11635 					    (recerr->r_mnum ==
11636 					    mnslp->mnl_mnum)) {
11637 						if ((lp->l_flags &
11638 						    MDDB_F_ACTIVE) ||
11639 						    ((lp->l_flags &
11640 						    MDDB_F_EWRITE) == 0)) {
11641 							something_changed = 1;
11642 							lp->l_flags |=
11643 							    MDDB_F_EWRITE;
11644 							lp->l_flags &=
11645 							    ~MDDB_F_ACTIVE;
11646 						}
11647 						break;
11648 					}
11649 				}
11650 			}
11651 		}
11652 	}
11653 
11654 	/*
11655 	 * If this message changed nothing, then we're done since this
11656 	 * failure has already been handled.
11657 	 * If some mddb state has been changed, send a parse message to
11658 	 * the slave nodes so that the slaves will re-read the locator
11659 	 * block from disk.
11660 	 */
11661 	if (something_changed == 0) {
11662 		single_thread_end(s);
11663 		mddb_setexit(s);
11664 		return (0);
11665 	} else {
11666 		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
11667 	}
11668 
11669 	/*
11670 	 * Scan replicas setting MD_SET_TOOFEW if
11671 	 * 50% or more of the mddbs have seen errors.
11672 	 * Note: Don't call selectreplicas or writeretry
11673 	 * since these routines may end up setting the ACTIVE flag
11674 	 * on a failed mddb if the master is able to access the mddb
11675 	 * but the slave node couldn't.  Need to have the ACTIVE flag
11676 	 * turned off in order to relocate the optimized records to
11677 	 * mddbs that are (hopefully) available on all nodes.
11678 	 */
11679 	alc = 0;
11680 	lc = 0;
11681 	for (li = 0; li < lbp->lb_loccnt; li++) {
11682 		lp = &lbp->lb_locators[li];
11683 		if (lp->l_flags & MDDB_F_DELETED)
11684 			continue;
11685 		lc++;
11686 		if (! (lp->l_flags & MDDB_F_ACTIVE))
11687 			continue;
11688 		alc++;
11689 	}
11690 
11691 	/*
11692 	 * If more than 50% mddbs have failed, then don't relocate opt recs.
11693 	 * The node sending the mddb failure information will detect TOOFEW
11694 	 * and will panic when it attempts to re-write the optimized record.
11695 	 */
11696 	if (alc < ((lc + 1) / 2)) {
11697 		md_set_setstatus(setno, MD_SET_TOOFEW);
11698 		(void) push_lb(s);
11699 		(void) upd_med(s, "mddb_optrecfix(0)");
11700 		single_thread_end(s);
11701 		mddb_setexit(s);
11702 		return (0);
11703 	}
11704 
11705 	/* Attempt to relocate optimized records that are on failed mddbs */
11706 	(void) fixoptrecords(s);
11707 
11708 	/* Push changed locator block out to disk */
11709 	(void) push_lb(s);
11710 	(void) upd_med(s, "mddb_optrecfix(1)");
11711 
11712 	/* Recheck for TOOFEW after writing out locator blocks */
11713 	alc = 0;
11714 	lc = 0;
11715 	for (li = 0; li < lbp->lb_loccnt; li++) {
11716 		lp = &lbp->lb_locators[li];
11717 		if (lp->l_flags & MDDB_F_DELETED)
11718 			continue;
11719 		lc++;
11720 		if (! (lp->l_flags & MDDB_F_ACTIVE))
11721 			continue;
11722 		alc++;
11723 	}
11724 
11725 	/* If more than 50% mddbs have failed, then don't relocate opt recs */
11726 	if (alc < ((lc + 1) / 2)) {
11727 		md_set_setstatus(setno, MD_SET_TOOFEW);
11728 		single_thread_end(s);
11729 		mddb_setexit(s);
11730 		return (0);
11731 	}
11732 
11733 	single_thread_end(s);
11734 	mddb_setexit(s);
11735 	return (0);
11736 }
11737 
11738 /*
11739  * Check if incore mddb on master node matches ondisk mddb.
11740  * If not, master writes out incore view to all mddbs.
11741  * Have previously verified that master is an owner of the
11742  * diskset (master has snarfed diskset) and that diskset is
11743  * not stale.
11744  *
11745  * Meant to be called during reconfig cycle during change of master.
11746  * Previous master in diskset may have changed the mddb and
11747  * panic'd before relaying information to slave nodes.  New
11748  * master node just writes out its incore view of the mddb and
11749  * the replay of the change log will resync all the nodes.
11750  *
11751  * Only supported for MN disksets.
11752  *
11753  * Return values:
11754  *	0 - success
11755  *	non-zero - failure
11756  */
11757 int
11758 mddb_check_write_ioctl(mddb_config_t *info)
11759 {
11760 	int			err = 0;
11761 	set_t			setno = info->c_setno;
11762 	mddb_set_t		*s;
11763 	int			li;
11764 	mddb_locator_t		*lp;
11765 	mddb_lb_t		*lbp;
11766 	mddb_mnlb_t		*mnlbp_od;
11767 	mddb_ln_t		*lnp;
11768 	mddb_mnln_t		*mnlnp_od;
11769 	mddb_db_t		*dbp;
11770 	mddb_de_ic_t		*dep;
11771 	int			write_out_mddb;
11772 	md_error_t		*ep = &info->c_mde;
11773 	int			mddb_err = 0;
11774 	int			prev_li = 0;
11775 	int			rval = 0;
11776 	int			alc, lc;
11777 	int			mddbs_present = 0;
11778 
11779 	/* Verify that setno is in valid range */
11780 	if (setno >= md_nsets)
11781 		return (EINVAL);
11782 
11783 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11784 		return (0);
11785 
11786 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
11787 		return (mddbstatus2error(ep, err, NODEV32, setno));
11788 	}
11789 
11790 	/* Calling diskset must be a MN diskset */
11791 	if (!(MD_MNSET_SETNO(setno))) {
11792 		mddb_setexit(s);
11793 		return (EINVAL);
11794 	}
11795 
11796 	/* Re-verify that set is not stale */
11797 	if (md_get_setstatus(setno) & MD_SET_STALE) {
11798 		mddb_setexit(s);
11799 		return (mdmddberror(ep, MDE_DB_STALE, NODEV32, setno));
11800 	}
11801 
11802 	lbp = s->s_lbp;
11803 	lnp = s->s_lnp;
11804 
11805 	/*
11806 	 * Previous master could have died during the write of data to
11807 	 * the mddbs so that the ondisk mddbs may not be consistent.
11808 	 * So, need to check the contents of the first and last active mddb
11809 	 * to see if the mddbs need to be rewritten.
11810 	 */
11811 	for (li = 0; li < lbp->lb_loccnt; li++) {
11812 		int	checkcopy_err;
11813 
11814 		lp = &lbp->lb_locators[li];
11815 		/* Find replica that is active */
11816 		if (lp->l_flags & MDDB_F_DELETED)
11817 			continue;
11818 		mddbs_present = 1;
11819 		if (! (lp->l_flags & MDDB_F_ACTIVE))
11820 			continue;
11821 		if (s->s_mbiarray[li] == NULL)
11822 			continue;
11823 		/* Check locator block */
11824 		mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT),
11825 		    KM_SLEEP);
11826 		/* read in on-disk locator block */
11827 		err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li);
11828 
11829 		/* If err, try next mddb */
11830 		if (err) {
11831 			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
11832 			continue;
11833 		}
11834 
11835 		/*
11836 		 * We resnarf all changelog entries for this set.
11837 		 * They may have been altered by the previous master
11838 		 */
11839 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11840 			for (dep = dbp->db_firstentry; dep; dep =
11841 			    dep->de_next) {
11842 				if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) {
11843 					continue;
11844 				}
11845 				/*
11846 				 * This has been alloc'ed while
11847 				 * joining the set
11848 				 */
11849 				if (dep->de_rb) {
11850 					kmem_free(dep->de_rb, dep->de_recsize);
11851 					dep->de_rb = (mddb_rb32_t *)NULL;
11852 				}
11853 				if (dep->de_rb_userdata) {
11854 					kmem_free(dep->de_rb_userdata,
11855 					    dep->de_reqsize);
11856 					dep->de_rb_userdata = (caddr_t)NULL;
11857 				}
11858 
11859 				err = getrecord(s, dep, li);
11860 				if (err) {
11861 					/*
11862 					 * When we see on error while reading
11863 					 * the changelog entries, we move on
11864 					 * to the next mddb
11865 					 */
11866 					err = 1;
11867 					break; /* out of inner for-loop */
11868 				}
11869 				allocuserdata(dep);
11870 			}
11871 			if (err)
11872 				break; /* out of outer for-loop */
11873 		}
11874 
11875 		/* If err, try next mddb */
11876 		if (err) {
11877 			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
11878 			continue;
11879 		}
11880 
11881 		/* Is incore locator block same as ondisk? */
11882 		if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
11883 		    == 1) {
11884 			write_out_mddb = 1;
11885 			kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11886 			break;
11887 		}
11888 
11889 		kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11890 
11891 		/* If lb ok, check locator names */
11892 		mnlnp_od = (mddb_mnln_t *)kmem_zalloc(dbtob(MDDB_MNLNCNT),
11893 		    KM_SLEEP);
11894 		/* read in on-disk locator names */
11895 		err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
11896 		    lbp->lb_lnblkcnt, li);
11897 
11898 		/* If err, try next mddb */
11899 		if (err) {
11900 			kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT));
11901 			continue;
11902 		}
11903 
11904 		/* Are incore locator names same as ondisk? */
11905 		if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
11906 		    == 1) {
11907 			kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11908 			write_out_mddb = 1;
11909 			break;
11910 		}
11911 
11912 		kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11913 
11914 		/*
11915 		 * Check records in mddb.
11916 		 * If a read error is encountered, set the error flag and
11917 		 * continue to the next mddb.  Otherwise, if incore data is
11918 		 * different from ondisk, then set the flag to write out
11919 		 * the mddb and break out.
11920 		 */
11921 		checkcopy_err = checkcopy(s, li);
11922 		if (checkcopy_err == MDDB_F_EREAD) {
11923 			lp->l_flags |= MDDB_F_EREAD;
11924 			mddb_err = 1;
11925 			continue;
11926 		} else if (checkcopy_err == 1) {
11927 			write_out_mddb = 1;
11928 			break;
11929 		}
11930 		/*
11931 		 * Have found first active mddb and the data is the same as
11932 		 * incore - break out of loop
11933 		 */
11934 		write_out_mddb = 0;
11935 		break;
11936 	}
11937 
11938 	/*
11939 	 * Skip checking for last active mddb if:
11940 	 *	- already found a mismatch in the first active mddb
11941 	 *		(write_out_mddb is 1)  OR
11942 	 * 	- didn't find a readable mddb when looking for first
11943 	 *	  active mddb (there are mddbs present but all failed
11944 	 *	  when read was attempted).
11945 	 *
11946 	 * In either case, go to write_out_mddb label in order to attempt
11947 	 * to write out the data. If < 50% mddbs are available, panic.
11948 	 */
11949 	if ((write_out_mddb == 1) ||
11950 	    ((li == lbp->lb_loccnt) && mddbs_present)) {
11951 		write_out_mddb = 1;
11952 		goto write_out_mddb;
11953 	}
11954 
11955 	/*
11956 	 * Save which index was checked for the first active mddb.  If only 1
11957 	 * active mddb, don't want to recheck the same mddb when looking for
11958 	 * last active mddb.
11959 	 */
11960 	prev_li = li;
11961 
11962 	/*
11963 	 * Now, checking for last active mddb.  If found same index as before
11964 	 * (only 1 active mddb), then skip.
11965 	 */
11966 	for (li = (lbp->lb_loccnt - 1); li >= 0; li--) {
11967 		int	checkcopy_err;
11968 
11969 		lp = &lbp->lb_locators[li];
11970 		/* Find replica that is active */
11971 		if (! (lp->l_flags & MDDB_F_ACTIVE))
11972 			continue;
11973 		if (lp->l_flags & MDDB_F_DELETED)
11974 			continue;
11975 		if (s->s_mbiarray[li] == NULL)
11976 			continue;
11977 		/* If already checked mddb, bail out */
11978 		if (li == prev_li)
11979 			break;
11980 		/* Check locator block */
11981 		mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT),
11982 		    KM_SLEEP);
11983 		/* read in on-disk locator block */
11984 		err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li);
11985 
11986 		/* If err, try next mddb */
11987 		if (err) {
11988 			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
11989 			continue;
11990 		}
11991 
11992 
11993 		/* Is incore locator block same as ondisk? */
11994 		if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
11995 		    == 1) {
11996 			kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11997 			write_out_mddb = 1;
11998 			break;
11999 		}
12000 
12001 		kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
12002 
12003 		/* If lb ok, check locator names */
12004 		mnlnp_od = (mddb_mnln_t *)
12005 		    kmem_zalloc(dbtob(MDDB_MNLNCNT), KM_SLEEP);
12006 
12007 		/* read in on-disk locator names */
12008 		err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
12009 		    lbp->lb_lnblkcnt, li);
12010 
12011 		/* If err, try next mddb */
12012 		if (err) {
12013 			kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT));
12014 			continue;
12015 		}
12016 
12017 		/* Are incore locator names same as ondisk? */
12018 		if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
12019 		    == 1) {
12020 			kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
12021 			write_out_mddb = 1;
12022 			break;
12023 		}
12024 
12025 		kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
12026 
12027 		/*
12028 		 * Check records in mddb.
12029 		 * If a read error is encountered, set the error flag and
12030 		 * continue to the next mddb.  Otherwise, if incore data is
12031 		 * different from ondisk, then set the flag to write out
12032 		 * the mddb and break out.
12033 		 */
12034 		checkcopy_err = checkcopy(s, li);
12035 		if (checkcopy_err == MDDB_F_EREAD) {
12036 			lp->l_flags |= MDDB_F_EREAD;
12037 			mddb_err = 1;
12038 			continue;
12039 		} else if (checkcopy_err == 1) {
12040 			write_out_mddb = 1;
12041 			break;
12042 		}
12043 		/*
12044 		 * Have found last active mddb and the data is the same as
12045 		 * incore - break out of loop
12046 		 */
12047 		write_out_mddb = 0;
12048 		break;
12049 	}
12050 
12051 	/*
12052 	 * If ondisk and incore versions of the mddb don't match, then
12053 	 * write out this node's incore version to disk.
12054 	 * Or, if unable to read a copy of the mddb, attempt to write
12055 	 * out a new one.
12056 	 */
12057 write_out_mddb:
12058 	if (write_out_mddb) {
12059 		/* Recompute free blocks based on incore information */
12060 		computefreeblks(s); /* set up free block bits */
12061 
12062 		/*
12063 		 * Write directory entries and record blocks.
12064 		 * Use flag MDDB_WRITECOPY_SYNC so that writecopy
12065 		 * routine won't write out change log records.
12066 		 */
12067 		for (li = 0; li < lbp->lb_loccnt; li++) {
12068 			lp = &lbp->lb_locators[li];
12069 			/* Don't write to inactive or deleted mddbs */
12070 			if (! (lp->l_flags & MDDB_F_ACTIVE))
12071 				continue;
12072 			if (lp->l_flags & MDDB_F_DELETED)
12073 				continue;
12074 			if (s->s_mbiarray[li] == NULL)
12075 				continue;
12076 			/* If encounter a write error, save it for later */
12077 			if (writecopy(s, li, MDDB_WRITECOPY_SYNC)) {
12078 				lp->l_flags |= MDDB_F_EWRITE;
12079 				mddb_err = 1;
12080 			}
12081 		}
12082 
12083 		/*
12084 		 * Write out locator blocks to all replicas.
12085 		 * push_lb will set MDDB_F_EWRITE on replicas that fail.
12086 		 */
12087 		if (push_lb(s))
12088 			mddb_err = 1;
12089 		(void) upd_med(s, "mddb_check_write_ioctl(0)");
12090 
12091 		/* Write out locator names to all replicas */
12092 		lnp = s->s_lnp;
12093 		uniqtime32(&lnp->ln_timestamp);
12094 		lnp->ln_revision = MDDB_REV_MNLN;
12095 		crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
12096 
12097 		/* writeall sets MDDB_F_EWRITE if writes fails to replica */
12098 		if (writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
12099 		    lbp->lb_lnblkcnt, 0))
12100 			mddb_err = 1;
12101 
12102 		/*
12103 		 * The writes to the replicas above would have set
12104 		 * the MDDB_F_EWRITE flags if any write error was
12105 		 * encountered.
12106 		 * If < 50% of the mddbs are available, panic.
12107 		 */
12108 		lc = alc = 0;
12109 		for (li = 0; li < lbp->lb_loccnt; li++) {
12110 			lp = &lbp->lb_locators[li];
12111 			if (lp->l_flags & MDDB_F_DELETED)
12112 				continue;
12113 			lc++;
12114 			/*
12115 			 * If mddb:
12116 			 *	- is not active (previously had an error)
12117 			 *	- had an error reading the master blocks  or
12118 			 *	- had an error in writing to the mddb
12119 			 * then don't count this mddb in the active count.
12120 			 */
12121 			if (! (lp->l_flags & MDDB_F_ACTIVE) ||
12122 			    (lp->l_flags & MDDB_F_EMASTER) ||
12123 			    (lp->l_flags & MDDB_F_EWRITE))
12124 				continue;
12125 			alc++;
12126 		}
12127 		if (alc < ((lc + 1) / 2)) {
12128 			cmn_err(CE_PANIC,
12129 			    "md: Panic due to lack of DiskSuite state\n"
12130 			    " database replicas. Fewer than 50%% of "
12131 			    "the total were available,\n so panic to "
12132 			    "ensure data integrity.");
12133 		}
12134 	}
12135 
12136 	/*
12137 	 * If encountered an error during checking or writing of
12138 	 * mddbs, call selectreplicas so that replica error can
12139 	 * be properly handled. This will involve another attempt
12140 	 * to write the mddb out to any mddb marked MDDB_F_EWRITE.
12141 	 * If mddb still fails, it will have the MDDB_F_ACTIVE bit
12142 	 * turned off. Set the MDDB_SCANALLSYNC flag so that
12143 	 * selectreplicas doesn't overwrite the change log entries.
12144 	 *
12145 	 * Set the PARSE_LOCBLK flag in the mddb_set structure to show
12146 	 * that the locator block has been changed.
12147 	 */
12148 	if (mddb_err) {
12149 		(void) selectreplicas(s, MDDB_SCANALLSYNC);
12150 		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
12151 	}
12152 
12153 write_out_end:
12154 	mddb_setexit(s);
12155 	return (rval);
12156 }
12157 
12158 /*
12159  * Set/reset/get set flags in set structure.
12160  * Used during reconfig cycle
12161  * Only supported for MN disksets.
12162  *
12163  * Return values:
12164  *	0 - success
12165  *	non-zero - failure
12166  */
12167 int
12168 mddb_setflags_ioctl(mddb_setflags_config_t *info)
12169 {
12170 	set_t			setno = info->sf_setno;
12171 
12172 	/* Verify that setno is in valid range */
12173 	if (setno >= md_nsets)
12174 		return (EINVAL);
12175 
12176 	/*
12177 	 * When setting the flags, the set may not
12178 	 * be snarfed yet. So, don't check for SNARFED or MNset
12179 	 * and don't call mddb_setenter.
12180 	 * In order to discourage bad ioctl calls,
12181 	 * verify that magic field in structure is set correctly.
12182 	 */
12183 	if (info->sf_magic != MDDB_SETFLAGS_MAGIC)
12184 		return (EINVAL);
12185 
12186 	switch (info->sf_flags) {
12187 	case MDDB_NM_SET:
12188 		if (info->sf_setflags & MD_SET_MN_NEWMAS_RC)
12189 			md_set_setstatus(setno, MD_SET_MN_NEWMAS_RC);
12190 		if (info->sf_setflags & MD_SET_MN_START_RC)
12191 			md_set_setstatus(setno, MD_SET_MN_START_RC);
12192 		if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC)
12193 			md_set_setstatus(setno, MD_SET_MN_MIR_STATE_RC);
12194 		break;
12195 
12196 	case MDDB_NM_RESET:
12197 		if (info->sf_setflags & MD_SET_MN_NEWMAS_RC)
12198 			md_clr_setstatus(setno, MD_SET_MN_NEWMAS_RC);
12199 		if (info->sf_setflags & MD_SET_MN_START_RC)
12200 			md_clr_setstatus(setno, MD_SET_MN_START_RC);
12201 		if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC)
12202 			md_clr_setstatus(setno, MD_SET_MN_MIR_STATE_RC);
12203 		break;
12204 
12205 	case MDDB_NM_GET:
12206 		info->sf_setflags = md_get_setstatus(setno) &
12207 		    (MD_SET_MN_NEWMAS_RC|MD_SET_MN_START_RC|
12208 		    MD_SET_MN_MIR_STATE_RC);
12209 		break;
12210 	}
12211 
12212 	return (0);
12213 }
12214 
12215 /*
12216  * md_update_minor
12217  *
12218  * This function updates the minor in the namespace entry for an
12219  * underlying metadevice.  The function is called in mod_imp_set
12220  * where mod is sp, stripe, mirror and raid.
12221  *
12222  */
12223 int
12224 md_update_minor(
12225 	set_t	setno,
12226 	side_t	side,
12227 	mdkey_t	key
12228 )
12229 {
12230 	struct nm_next_hdr	*nh;
12231 	struct nm_name		*n;
12232 	char			*shn;
12233 	int			retval = 1;
12234 
12235 	/*
12236 	 * Load the devid name space if it exists
12237 	 */
12238 	(void) md_load_namespace(setno, NULL, NM_DEVID);
12239 	if (! md_load_namespace(setno, NULL, 0L)) {
12240 		/*
12241 		 * Unload the devid namespace
12242 		 */
12243 		(void) md_unload_namespace(setno, NM_DEVID);
12244 		return (0);
12245 	}
12246 
12247 	rw_enter(&nm_lock.lock, RW_READER);
12248 
12249 	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
12250 		retval = 0;
12251 		goto out;
12252 	}
12253 
12254 	/*
12255 	 * Look up the key
12256 	 */
12257 	if ((n = lookup_entry(nh, setno, side, key, NODEV64, 0L)) != NULL) {
12258 		/*
12259 		 * Find the entry, update its n_minor if metadevice
12260 		 */
12261 		if ((shn = (char *)getshared_name(setno, n->n_drv_key, 0L))
12262 		    == NULL) {
12263 			retval = 0;
12264 			goto out;
12265 		}
12266 
12267 		if (strcmp(shn, "md") == 0) {
12268 			n->n_minor = MD_MKMIN(setno, MD_MIN2UNIT(n->n_minor));
12269 		}
12270 	}
12271 
12272 out:
12273 	rw_exit(&nm_lock.lock);
12274 	return (retval);
12275 }
12276 
12277 /*
12278  * md_update_top_device_minor
12279  *
12280  * This function updates the minor in the namespace entry for a top
12281  * level metadevice.  The function is called in mod_imp_set where
12282  * mod is sp, stripe, mirror and raid.
12283  *
12284  */
12285 int
12286 md_update_top_device_minor(
12287 	set_t	setno,
12288 	side_t	side,
12289 	md_dev64_t dev
12290 )
12291 {
12292 	struct nm_next_hdr	*nh;
12293 	struct nm_name		*n;
12294 	char			*shn;
12295 	int			retval = 1;
12296 
12297 	/*
12298 	 * Load the devid name space if it exists
12299 	 */
12300 	(void) md_load_namespace(setno, NULL, NM_DEVID);
12301 	if (! md_load_namespace(setno, NULL, 0L)) {
12302 		/*
12303 		 * Unload the devid namespace
12304 		 */
12305 		(void) md_unload_namespace(setno, NM_DEVID);
12306 		return (0);
12307 	}
12308 
12309 	rw_enter(&nm_lock.lock, RW_READER);
12310 
12311 	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
12312 		retval = 0;
12313 		goto out;
12314 	}
12315 
12316 	/*
12317 	 * Look up the key
12318 	 */
12319 	if ((n = lookup_entry(nh, setno, side, MD_KEYWILD, dev, 0L)) != NULL) {
12320 		/*
12321 		 * Find the entry, update its n_minor if metadevice
12322 		 */
12323 		if ((shn = (char *)getshared_name(setno, n->n_drv_key, 0L))
12324 		    == NULL) {
12325 			retval = 0;
12326 			goto out;
12327 		}
12328 
12329 		if (strcmp(shn, "md") == 0) {
12330 			n->n_minor = MD_MKMIN(setno, MD_MIN2UNIT(n->n_minor));
12331 		}
12332 	}
12333 
12334 out:
12335 	rw_exit(&nm_lock.lock);
12336 	return (retval);
12337 }
12338 
12339 static void
12340 md_imp_nm(
12341 	mddb_set_t	*s
12342 )
12343 {
12344 	mddb_db_t		*dbp;
12345 	mddb_de_ic_t		*dep;
12346 	struct nm_rec_hdr	*hdr;
12347 	struct nm_header	*hhdr;
12348 	set_t			setno = s->s_setno;
12349 
12350 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
12351 		for (dep = dbp->db_firstentry; dep != NULL;
12352 		    dep = dep->de_next) {
12353 			switch (dep->de_type1) {
12354 
12355 			case MDDB_NM_HDR:
12356 			case MDDB_DID_NM_HDR:
12357 
12358 				hhdr = (struct nm_header *)
12359 				    dep->de_rb_userdata;
12360 
12361 				hdr = &hhdr->h_names;
12362 				if (hdr->r_next_recid > 0) {
12363 					hdr->r_next_recid = MAKERECID(setno,
12364 					    DBID(hdr->r_next_recid));
12365 				}
12366 
12367 				hdr = &hhdr->h_shared;
12368 				if (hdr->r_next_recid > 0) {
12369 					hdr->r_next_recid = MAKERECID(setno,
12370 					    DBID(hdr->r_next_recid));
12371 				}
12372 				break;
12373 
12374 			case MDDB_NM:
12375 			case MDDB_DID_NM:
12376 			case MDDB_SHR_NM:
12377 			case MDDB_DID_SHR_NM:
12378 
12379 				hdr = (struct nm_rec_hdr *)
12380 				    dep->de_rb_userdata;
12381 
12382 				if (hdr->r_next_recid > 0) {
12383 					hdr->r_next_recid = MAKERECID
12384 					    (setno, DBID(hdr->r_next_recid));
12385 				}
12386 				break;
12387 
12388 			default:
12389 				break;
12390 			}
12391 		}
12392 	}
12393 }
12394 
12395 static int
12396 update_db_rec(
12397 	mddb_set_t	*s
12398 )
12399 {
12400 	mddb_db_t	*dbp;
12401 	mddb_de_ic_t	*dep;
12402 	mddb_recid_t	ids[2];
12403 
12404 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
12405 		for (dep = dbp->db_firstentry; dep != NULL;
12406 		    dep = dep->de_next) {
12407 			if (! (dep->de_flags & MDDB_F_OPT)) {
12408 				ids[0] = MAKERECID(s->s_setno, dep->de_recid);
12409 				ids[1] = 0;
12410 				if (mddb_commitrecs(ids)) {
12411 					return (MDDB_E_NORECORD);
12412 				}
12413 			}
12414 		}
12415 	}
12416 	return (0);
12417 }
12418 
12419 static int
12420 update_mb(
12421 	mddb_set_t	*s
12422 )
12423 {
12424 	mddb_ri_t	*rip;
12425 	int	err = 0;
12426 
12427 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
12428 		if (rip->ri_flags & MDDB_F_EMASTER)
12429 			/* disk is powered off or not there */
12430 			continue;
12431 
12432 		if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) {
12433 			/*
12434 			 * It is a replicated set
12435 			 */
12436 			if (rip->ri_devid == (ddi_devid_t)NULL) {
12437 				return (-1);
12438 			}
12439 			err = update_mb_devid(s, rip, rip->ri_devid);
12440 		} else {
12441 			/*
12442 			 * It is a non-replicated set
12443 			 * and there is no need to update
12444 			 * devid
12445 			 */
12446 			err = update_mb_devid(s, rip, NULL);
12447 		}
12448 
12449 		if (err)
12450 			return (err);
12451 	}
12452 
12453 	return (0);
12454 }
12455 
12456 static int
12457 update_setname(
12458 	set_t	setno
12459 )
12460 {
12461 	struct nm_next_hdr	*nh;
12462 	struct nm_shared_name	*shn, *new_shn;
12463 	char			*prefix = "/dev/md/";
12464 	char			*shrname;
12465 	int			len;
12466 	mdkey_t			o_key;
12467 	uint32_t		o_count, o_data;
12468 	mddb_recid_t		recid, ids[3];
12469 	int			err = 0;
12470 	mddb_set_t		*dbp;
12471 
12472 	/* Import setname */
12473 	dbp = (mddb_set_t *)md_set[setno].s_db;
12474 	len = strlen(prefix) + strlen(dbp->s_setname) + strlen("/dsk/") + 1;
12475 	shrname = kmem_zalloc(len, KM_SLEEP);
12476 	(void) sprintf(shrname, "%s%s%s", prefix, dbp->s_setname, "/dsk/");
12477 
12478 	rw_enter(&nm_lock.lock, RW_WRITER);
12479 	if ((nh = get_first_record(setno, 0, NM_SHARED)) == NULL) {
12480 		/*
12481 		 * No namespace is okay
12482 		 */
12483 		err = 0;
12484 		goto out;
12485 	}
12486 
12487 	if ((shn = (struct nm_shared_name *)lookup_shared_entry(nh,
12488 	    0, prefix, NULL, NM_SHARED | NM_IMP_SHARED)) == NULL) {
12489 		/*
12490 		 * No metadevice is okay
12491 		 */
12492 		err = 0;
12493 		goto out;
12494 	}
12495 
12496 	/*
12497 	 * We have it, go ahead and update the namespace.
12498 	 */
12499 	o_key = shn->sn_key;
12500 	o_count = shn->sn_count;
12501 	o_data = shn->sn_data;
12502 
12503 	if (remove_shared_entry(nh, o_key, NULL, 0L | NM_IMP_SHARED |
12504 	    NM_NOCOMMIT)) {
12505 		err = MDDB_E_NORECORD;
12506 		goto out;
12507 	}
12508 	if ((new_shn = (struct nm_shared_name *)alloc_entry(
12509 	    nh, md_set[setno].s_nmid, len, NM_SHARED |
12510 	    NM_NOCOMMIT, &recid)) == NULL) {
12511 		err = MDDB_E_NORECORD;
12512 		goto out;
12513 	}
12514 
12515 	new_shn->sn_key = o_key;
12516 	new_shn->sn_count = o_count;
12517 	new_shn->sn_data = o_data;
12518 	new_shn->sn_namlen = (ushort_t)len;
12519 	(void) strcpy(new_shn->sn_name, shrname);
12520 
12521 	ids[0] = recid;
12522 	ids[1] = md_set[setno].s_nmid;
12523 	ids[2] = 0;
12524 	err = mddb_commitrecs(ids);
12525 
12526 out:
12527 	if (shrname)
12528 		kmem_free(shrname, len);
12529 	rw_exit(&nm_lock.lock);
12530 	return (err);
12531 }
12532 
12533 /*
12534  * Returns 0 on success.
12535  * Returns -1 on failure with ep filled in.
12536  */
12537 static int
12538 md_imp_db(
12539 	set_t		setno,
12540 	int		stale_flag,
12541 	md_error_t	*ep
12542 )
12543 {
12544 	mddb_set_t	*s;
12545 	int		err = 0;
12546 	mddb_dt_t	*dtp;
12547 	mddb_lb_t	*lbp;
12548 	int		i;
12549 	int		loccnt;
12550 
12551 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
12552 		return (mddbstatus2error(ep, err, NODEV32, setno));
12553 	}
12554 
12555 	/* Update dt */
12556 	if ((dtp = (mddb_dt_t *)md_set[setno].s_dtp) != NULL) {
12557 		crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);
12558 	}
12559 
12560 	if ((err = dt_write(s)) != 0) {
12561 		err = mdsyserror(ep, err);
12562 		mddb_setexit(s);
12563 		return (err);
12564 	}
12565 
12566 	/*
12567 	 * Update lb, no need to update the mediator because
12568 	 * the diskset will only exist on the importing node
12569 	 * and as such a mediator adds no value.
12570 	 */
12571 
12572 	/* Update lb */
12573 	if (stale_flag & MD_IMP_STALE_SET) {
12574 		lbp = s->s_lbp;
12575 		loccnt = lbp->lb_loccnt;
12576 		for (i = 0; i < loccnt; i++) {
12577 			mddb_locator_t	*lp = &lbp->lb_locators[i];
12578 			md_dev64_t	ndev = md_expldev(lp->l_dev);
12579 			ddi_devid_t	devid_ptr;
12580 
12581 			devid_ptr = s->s_did_icp->did_ic_devid[i];
12582 			if (devid_ptr == NULL) {
12583 				/*
12584 				 * Already deleted, go to next one.
12585 				 */
12586 				continue;
12587 			}
12588 			if (mddb_devid_validate((ddi_devid_t)devid_ptr, &ndev,
12589 			    NULL)) {
12590 				/* disk unavailable, mark deleted */
12591 				lp->l_flags = MDDB_F_DELETED;
12592 				/* then remove the device id from the list */
12593 				free_mbipp(&s->s_mbiarray[i]);
12594 				(void) mddb_devid_delete(s, i);
12595 			}
12596 		}
12597 		md_clr_setstatus(setno, MD_SET_STALE);
12598 	}
12599 
12600 	if ((err = writelocall(s)) != 0) {
12601 		err = mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno);
12602 		mddb_setexit(s);
12603 		return (err);
12604 	}
12605 
12606 	mddb_setexit(s);
12607 
12608 	/* Update db records */
12609 	if ((err = update_db_rec(s)) != 0) {
12610 		return (mddbstatus2error(ep, err, NODEV32, setno));
12611 	}
12612 
12613 	/* Update setname embedded in the namespace */
12614 	if ((err = update_setname(setno)) != 0)
12615 		return (mddbstatus2error(ep, err, NODEV32, setno));
12616 
12617 	return (err);
12618 }
12619 
12620 static void
12621 md_dr_add(
12622 	md_set_record	*sr,
12623 	md_drive_record	*dr
12624 )
12625 {
12626 	md_drive_record	*drv;
12627 
12628 	if (sr->sr_driverec == 0) {
12629 		sr->sr_driverec = dr->dr_selfid;
12630 		return;
12631 	}
12632 
12633 	for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec);
12634 	    drv->dr_nextrec != 0;
12635 	    drv = (md_drive_record *)mddb_getrecaddr(drv->dr_nextrec))
12636 		;
12637 	drv->dr_nextrec = dr->dr_selfid;
12638 }
12639 
12640 static void
12641 md_setup_recids(
12642 	md_set_record	*sr,
12643 	mddb_recid_t	**ids,
12644 	size_t		size
12645 )
12646 {
12647 	md_drive_record	*drv;
12648 	int		cnt;
12649 	mddb_recid_t	*recids;
12650 
12651 	recids = (mddb_recid_t *)kmem_zalloc(sizeof (mddb_recid_t)
12652 	    * size, KM_SLEEP);
12653 	recids[0] = sr->sr_selfid;
12654 	cnt = 1;
12655 
12656 	for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec);
12657 	    /* CSTYLED */
12658 	    drv != NULL;) {
12659 		recids[cnt++] = drv->dr_selfid;
12660 		if (drv->dr_nextrec != 0)
12661 			drv = (md_drive_record *)mddb_getrecaddr
12662 			    (drv->dr_nextrec);
12663 		else
12664 			drv = NULL;
12665 	}
12666 	recids[cnt] = 0;
12667 	*ids = &recids[0];
12668 }
12669 
/*
 * The purpose of this function is to replace the old_devid with the
 * new_devid in the given namespace.   This is used for importing
 * remotely replicated drives.
 *
 * Arguments:
 *	cp - c_setno selects the set; c_locator.l_old_devid and
 *	     c_locator.l_devid carry the old and new device ids and
 *	     c_locator.l_devid_sz is the size requested for the new
 *	     shared entry.
 *
 * Return values:
 *	0      - success, or the set has no non-shared devid namespace
 *	EIO    - the set's namespace is not loaded
 *	ENOENT - the shared devid namespace record, or the shared entry
 *		 referenced by a namespace entry, cannot be found
 *	ENOMEM - a replacement shared entry cannot be allocated
 */
int
md_update_namespace_rr_did(
	mddb_config_t	*cp
)
{
	set_t			setno = cp->c_setno;
	struct nm_next_hdr	*nh;
	mdkey_t			key = MD_KEYWILD;
	side_t			side = MD_SIDEWILD;
	mddb_recid_t		recids[3];
	struct did_min_name	*n;
	struct nm_next_hdr	*did_shr_nh;
	struct did_shr_name	*shr_n;
	mdkey_t			ent_did_key;
	uint32_t		ent_did_count;
	uint32_t		ent_did_data;
	size_t			ent_size, size;
	ddi_devid_t		devid = NULL;
	struct did_shr_name	*shn;
	size_t			offset;
	struct nm_next_hdr	*this_did_shr_nh;
	void			*old_devid, *new_devid;

	if (!(md_get_setstatus(setno) & MD_SET_NM_LOADED))
		return (EIO);

	/* The device ids arrive as integers holding kernel addresses */
	old_devid = (void *)(uintptr_t)cp->c_locator.l_old_devid;
	new_devid = (void *)(uintptr_t)cp->c_locator.l_devid;

	/*
	 * It is okay if we dont have any configuration
	 */
	/*
	 * offset is the part of a devid_shr_rec that is not the
	 * did_shr_name itself - presumably the record header in front of
	 * the first entry (confirm against the structure definitions).
	 */
	offset = (sizeof (struct devid_shr_rec) - sizeof (struct did_shr_name));
	if ((nh = get_first_record(setno, 0, NM_DEVID | NM_NOTSHARED))
	    == NULL) {
		return (0);
	}
	/* Visit every key of the set, on any side */
	while ((key = md_getnextkey(setno, side, key, NULL)) != MD_KEYWILD) {
		/* check out every entry in the namespace */
		if ((n = (struct did_min_name *)lookup_entry(nh, setno,
		    side, key, NODEV64, NM_DEVID)) == NULL) {
			continue;
		} else {
			did_shr_nh = get_first_record(setno, 0, NM_DEVID |
			    NM_SHARED);
			if (did_shr_nh == NULL) {
				return (ENOENT);
			}
			this_did_shr_nh = did_shr_nh->nmn_nextp;
			/* Find the shared devid entry this name refers to */
			shr_n = (struct did_shr_name *)lookup_shared_entry(
			    did_shr_nh, n->min_devid_key, (char *)0,
			    &recids[0], NM_DEVID);
			if (shr_n == NULL) {
				return (ENOENT);
			}
			/*
			 * The lookups above run without nm_lock; the lock is
			 * taken as writer only around the compare and the
			 * in-place rewrite below.
			 */
			rw_enter(&nm_lock.lock, RW_WRITER);
			devid = (ddi_devid_t)shr_n->did_devid;
			/* find this devid in the incore replica  */
			if (ddi_devid_compare(devid, old_devid) == 0) {
				/*
				 * found the corresponding entry
				 * update with new devid
				 */
				/* first remove old devid info */
				ent_did_key = shr_n ->did_key;
				ent_did_count = shr_n->did_count;
				ent_did_data = shr_n->did_data;
				ent_size = DID_SHR_NAMSIZ(shr_n);
				/*
				 * size is the used space of the record minus
				 * the leading offset and this entry.
				 * NOTE(review): this equals the bytes that
				 * follow shr_n only if shr_n is the first
				 * entry of this_did_shr_nh's record - confirm.
				 */
				size = ((struct nm_rec_hdr *)
				    this_did_shr_nh->nmn_record)->
				    r_used_size - offset - ent_size;
				if (size == 0) {
					/* Only entry: just wipe it */
					(void) bzero(shr_n, ent_size);
				} else {
					/*
					 * Slide the following data down over
					 * the old entry and clear the space
					 * freed at the end.
					 */
					(void) ovbcopy((caddr_t)shr_n +
					    ent_size, shr_n, size);
					(void) bzero((caddr_t)shr_n +
					    size, ent_size);
				}
				((struct nm_rec_hdr *)this_did_shr_nh->
				    nmn_record)->r_used_size -=
				    ent_size;
				/* add in new devid info */
				if ((shn = (struct did_shr_name *)
				    alloc_entry(did_shr_nh,
				    md_set[setno].s_did_nmid,
				    cp->c_locator.l_devid_sz,
				    NM_DEVID | NM_SHARED | NM_NOCOMMIT,
				    &recids[0])) == NULL) {
						rw_exit(&nm_lock.lock);
						return (ENOMEM);
					}
					/*
					 * The statements below are indented as
					 * if they belonged to the failure
					 * branch above, but they run after it,
					 * on successful allocation.
					 */
					shn->did_key = ent_did_key;
					shn->did_count = ent_did_count;
					ent_did_data |= NM_DEVID_VALID;
					shn->did_data = ent_did_data;
					shn->did_size = ddi_devid_sizeof(
					    new_devid);
					bcopy((void *)new_devid, (void *)
					    shn->did_devid, shn->did_size);
					/*
					 * NOTE(review): the entry was
					 * allocated against s_did_nmid but
					 * s_nmid is what gets committed with
					 * it - confirm this is intended.
					 */
					recids[1] = md_set[setno].s_nmid;
					recids[2] = 0;
					mddb_commitrecs_wrapper(recids);
			}
			rw_exit(&nm_lock.lock);
		}
	}

	return (0);
}
12785 
12786 /*
12787  * namespace is loaded before this is called.
12788  * This function is a wrapper for md_update_namespace_rr_did.
12789  *
12790  * md_update_namespace_rr_did may be called twice if attempting to
12791  * resolve a replicated device id during the take of a diskset - once
12792  * for the diskset namespace and a second time for the local namespace.
12793  * The local namespace would need to be updated when a drive has been
12794  * found during a take of the diskset that hadn't been resolved during
12795  * the import (aka partial replicated import).
12796  *
12797  * If being called during the import of the diskset (IMPORT flag set)
12798  * md_update_namespace_rr_did will only be called once with the disket
12799  * namespace.
12800  */
12801 int
12802 md_update_nm_rr_did_ioctl(
12803 	mddb_config_t	*cp
12804 )
12805 {
12806 	int	rval = 0;
12807 
12808 	/* If update of diskset namespace fails, stop and return failure */
12809 	if ((rval = md_update_namespace_rr_did(cp)) != 0)
12810 		return (rval);
12811 
12812 	if (cp->c_flags & MDDB_C_IMPORT)
12813 		return (0);
12814 
12815 	/* If update of local namespace fails, return failure */
12816 	cp->c_setno = MD_LOCAL_SET;
12817 	rval = md_update_namespace_rr_did(cp);
12818 	return (rval);
12819 }
12820 
12821 /*ARGSUSED*/
12822 int
12823 md_imp_snarf_set(
12824 	mddb_config_t	*cp
12825 )
12826 {
12827 	set_t		setno;
12828 	int		stale_flag;
12829 	mddb_set_t	*s;
12830 	int		i, err = 0;
12831 	md_ops_t	*ops;
12832 	md_error_t	*ep = &cp->c_mde;
12833 
12834 	setno = cp->c_setno;
12835 	stale_flag = cp->c_flags;
12836 
12837 	mdclrerror(ep);
12838 	if (setno >= md_nsets) {
12839 		return (mdsyserror(ep, EINVAL));
12840 	}
12841 
12842 	md_haltsnarf_enter(setno);
12843 	if (md_get_setstatus(setno) & MD_SET_IMPORT) {
12844 		goto out;
12845 	}
12846 
12847 	/* Set the bit first otherwise load_old_replicas can fail */
12848 	md_set_setstatus(setno, MD_SET_IMPORT);
12849 
12850 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
12851 		err = mddbstatus2error(ep, err, NODEV32, setno);
12852 		goto out;
12853 	}
12854 
12855 	/*
12856 	 * Upon completion of load_old_replicas, the old setno is
12857 	 * restored from the disk so we need to reset
12858 	 */
12859 	s->s_lbp->lb_setno = setno;
12860 
12861 	/*
12862 	 * Fixup the NM records before loading namespace
12863 	 */
12864 	(void) md_imp_nm(s);
12865 	mddb_setexit(s);
12866 
12867 	/*
12868 	 * Load the devid name space if it exists
12869 	 * and ask each module to fixup unit records
12870 	 */
12871 	if (!md_load_namespace(setno, NULL, NM_DEVID)) {
12872 		err = mdsyserror(ep, ENOENT);
12873 		goto cleanup;
12874 	}
12875 	if (!md_load_namespace(setno, NULL, 0L)) {
12876 		(void) md_unload_namespace(setno, NM_DEVID);
12877 		err = mdsyserror(ep, ENOENT);
12878 		goto cleanup;
12879 	}
12880 
12881 	do {
12882 		i = 0;
12883 		for (ops = md_opslist; ops != NULL; ops = ops->md_next)
12884 			if (ops->md_imp_set != NULL)
12885 				i += ops->md_imp_set(setno);
12886 	} while (i);
12887 
12888 	/*
12889 	 * Fixup
12890 	 *	(1) locator block
12891 	 *	(2) locator name block if necessary
12892 	 *	(3) master block
12893 	 *	(4) directory block
12894 	 * calls appropriate writes to push changes out
12895 	 */
12896 	if ((err = md_imp_db(setno, stale_flag, ep)) != 0) {
12897 		goto cleanup;
12898 	}
12899 
12900 	/*
12901 	 * Don't unload namespace if importing a replicated diskset.
12902 	 * Namespace will be unloaded with an explicit RELEASE_SET ioctl.
12903 	 */
12904 	if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) {
12905 		md_haltsnarf_exit(setno);
12906 		return (err);
12907 	}
12908 
12909 cleanup:
12910 	/*
12911 	 * Halt the set
12912 	 */
12913 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
12914 	(void) md_halt_set(setno, MD_HALT_ALL);
12915 	rw_exit(&md_unit_array_rw.lock);
12916 
12917 	/*
12918 	 * Unload the namespace for the imported set
12919 	 */
12920 	mutex_enter(&mddb_lock);
12921 	mddb_unload_set(setno);
12922 	mutex_exit(&mddb_lock);
12923 
12924 out:
12925 	md_haltsnarf_exit(setno);
12926 	md_clr_setstatus(setno, MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);
12927 	return (err);
12928 }
12929 #endif	/* MDDB_FAKE */
12930