1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * NAME:	raid_ioctl.c
30  *
31  * DESCRIPTION: RAID driver source file containing IOCTL operations.
32  *
33  * ROUTINES PROVIDED FOR EXTERNAL USE:
34  *	  raid_commit() - commits MD database updates for a RAID metadevice
35  *	md_raid_ioctl() - RAID metadevice IOCTL operations entry point.
36  *
37  * ROUTINES PROVIDED FOR INTERNAL USE:
38  *	 raid_getun() - Performs unit checking on a RAID metadevice
39  *    init_col_nextio() - normal backend when zeroing column of RAID metadevice.
40  *	 init_col_int() - I/O interrupt while zeroing column of RAID metadevice.
41  *  raid_init_columns() - Zero one or more columns of a RAID metadevice.
42  *	     raid_set() - used to create a RAID metadevice
43  *	     raid_get() - used to get the unit structure of a RAID metadevice
44  *	 raid_replace() - used to replace a component of a RAID metadevice
45  *	    raid_grow() - Concatenate to a RAID metadevice
46  *	  raid_change() - change dynamic values of a RAID metadevice
47  *	   raid_reset() - used to reset (clear / remove) a RAID metadevice
48  *	raid_get_geom() - used to get the geometry of a RAID metadevice
49  *	raid_get_vtoc() - used to get the VTOC on a RAID metadevice
50  *	raid_set_vtoc() - used to set the VTOC on a RAID metadevice
51  *	 raid_getdevs() - return all devices within a RAID metadevice
52  *   raid_admin_ioctl() - IOCTL operations unique to metadevices and RAID
53  */
54 
55 
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/conf.h>
59 #include <sys/file.h>
60 #include <sys/user.h>
61 #include <sys/uio.h>
62 #include <sys/t_lock.h>
63 #include <sys/buf.h>
64 #include <sys/dkio.h>
65 #include <sys/vtoc.h>
66 #include <sys/kmem.h>
67 #include <vm/page.h>
68 #include <sys/sysmacros.h>
69 #include <sys/types.h>
70 #include <sys/mkdev.h>
71 #include <sys/stat.h>
72 #include <sys/open.h>
73 #include <sys/disp.h>
74 #include <sys/modctl.h>
75 #include <sys/ddi.h>
76 #include <sys/sunddi.h>
77 #include <sys/cred.h>
78 #include <sys/lvm/mdvar.h>
79 #include <sys/lvm/md_names.h>
80 #include <sys/lvm/md_mddb.h>
81 #include <sys/lvm/md_raid.h>
82 #include <sys/lvm/md_convert.h>
83 
84 #include <sys/sysevent/eventdefs.h>
85 #include <sys/sysevent/svm.h>
86 
87 extern int		md_status;
88 extern unit_t		md_nunits;
89 extern set_t		md_nsets;
90 extern md_set_t		md_set[];
91 extern md_ops_t		raid_md_ops;
92 extern major_t		md_major;
93 extern md_krwlock_t	md_unit_array_rw;
94 extern mdq_anchor_t	md_done_daemon;
95 extern mdq_anchor_t	md_ff_daemonq;
96 extern	int		mdopen();
97 extern	int		mdclose();
98 extern	void		md_probe_one();
99 extern int		md_init_probereq(md_probedev_impl_t *,
100 				daemon_queue_t **);
101 extern md_resync_t	md_cpr_resync;
102 
103 
104 extern void dump_mr_unit(mr_unit_t *);
105 
106 typedef struct raid_ci {
107 	DAEMON_QUEUE
108 	struct raid_ci	*ci_next;
109 	mr_unit_t	*ci_un;
110 	int		ci_col;
111 	int		ci_err;
112 	int		ci_flag;
113 	size_t		ci_zerosize;
114 	diskaddr_t	ci_blkno;
115 	diskaddr_t	ci_lastblk;
116 	buf_t		ci_buf;
117 } raid_ci_t;
118 /* values for the ci_flag */
119 #define	COL_INITING	(0x0001)
120 #define	COL_INIT_DONE	(0x0002)
121 #define	COL_READY	(0x0004)
122 
123 /*
124  * NAME:	raid_getun
125  * DESCRIPTION: performs unit checking and locking on a RAID metadevice
126  * PARAMETERS:	minor_t	      mnum - minor device number for RAID unit
127  *		md_error_t    *mde - pointer to error reporting structure
128  *		int	     flags - unit checking and locking flags:
129  *					STALE_OK - allow stale MD memory
130  *					  NO_OLD - unit must not exist
131  *					 NO_LOCK - no IOCTL lock needed
132  *					 WR_LOCK - write IOCTL lock needed
133  *					 RD_LOCK - read IOCTL lock needed
134  *		IOLOCK	     *lock - pointer to IOCTL lock
135  *
136  * LOCKS:	obtains unit reader or writer lock via IOLOCK
137  *
138  */
139 static mr_unit_t *
140 raid_getun(minor_t mnum, md_error_t *mde, int flags, IOLOCK *lock)
141 {
142 	mr_unit_t	*un;
143 	mdi_unit_t	*ui;
144 	set_t		setno = MD_MIN2SET(mnum);
145 
146 	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
147 		(void) mdmderror(mde, MDE_INVAL_UNIT, mnum);
148 		return (NULL);
149 	}
150 
151 	if (!(flags & STALE_OK)) {
152 		if (md_get_setstatus(setno) & MD_SET_STALE) {
153 			(void) mdmddberror(mde, MDE_DB_STALE, mnum, setno);
154 			return (NULL);
155 		}
156 	}
157 
158 	ui = MDI_UNIT(mnum);
159 	if (flags & NO_OLD) {
160 		if (ui != NULL) {
161 			(void) mdmderror(mde, MDE_UNIT_ALREADY_SETUP, mnum);
162 			return (NULL);
163 		}
164 		return ((mr_unit_t *)1);
165 	}
166 
167 	if (ui == NULL) {
168 		(void) mdmderror(mde, MDE_UNIT_NOT_SETUP, mnum);
169 		return (NULL);
170 	}
171 	if (flags & ARRAY_WRITER)
172 		md_array_writer(lock);
173 	else if (flags & ARRAY_READER)
174 		md_array_reader(lock);
175 
176 	if (!(flags & NO_LOCK)) {
177 		if (flags & WR_LOCK) {
178 			(void) md_ioctl_io_lock(lock, ui);
179 			(void) md_ioctl_writerlock(lock, ui);
180 		} else /* RD_LOCK */
181 			(void) md_ioctl_readerlock(lock, ui);
182 	}
183 	un = (mr_unit_t *)MD_UNIT(mnum);
184 
185 	if (un->c.un_type != MD_METARAID) {
186 		(void) mdmderror(mde, MDE_NOT_RAID, mnum);
187 		return (NULL);
188 	}
189 
190 	return (un);
191 }
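/*
 * Illustrative sketch (kept out of the build) of the calling pattern
 * the ioctl handlers below use with raid_getun(): look the unit up
 * with the desired lock flags and return quietly on failure, since
 * raid_getun() has already recorded the failure in the md_error_t.
 * The handler name is hypothetical.
 */
#if 0
static int
example_handler(md_i_get_t *p, IOLOCK *lock)
{
	mr_unit_t	*un;

	if ((un = raid_getun(p->id, &p->mde, RD_LOCK, lock)) == NULL)
		return (0);	/* error details are already in p->mde */
	/* ... inspect *un under the unit reader lock ... */
	return (0);
}
#endif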
192 
193 
194 /*
195  * NAME:	raid_commit
196  * DESCRIPTION: commits MD database updates for a RAID metadevice
197  * PARAMETERS:	mr_unit_t	 *un - RAID unit to update in the MD database
198  *		mddb_recid_t *extras - array of other record IDs to update
199  *
200  * LOCKS:	assumes caller holds unit writer lock
201  *
202  */
203 void
204 raid_commit(mr_unit_t *un, mddb_recid_t	*extras)
205 {
206 	mddb_recid_t	*recids;
207 	int 		ri = 0;
208 	int		nrecids = 0;
209 
210 	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
211 		return;
212 
213 	/* Count the extra recids */
214 	if (extras != NULL) {
215 		while (extras[nrecids] != 0) {
216 			nrecids++;
217 		}
218 	}
219 
220 	/*
221 	 * Allocate space for two recids in addition to the extras:
222 	 * one for the unit structure, one for the null terminator.
223 	 */
224 	nrecids += 2;
225 	recids = (mddb_recid_t *)
226 	    kmem_zalloc(nrecids * sizeof (mddb_recid_t), KM_SLEEP);
227 
228 	if (un != NULL) {
229 		ASSERT(MDI_UNIT(MD_SID(un)) ? UNIT_WRITER_HELD(un) : 1);
230 		recids[ri++] = un->c.un_record_id;
231 	}
232 
233 	if (extras != NULL) {
234 		while (*extras != 0) {
235 			recids[ri++] = *extras;
236 			extras++;
237 		}
238 	}
239 
240 	if (ri > 0) {
241 		mddb_commitrecs_wrapper(recids);
242 	}
243 
244 	kmem_free(recids, nrecids * sizeof (mddb_recid_t));
245 }
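/*
 * Illustrative sketch (kept out of the build): the second argument to
 * raid_commit() is a null-terminated array of additional record ids
 * committed atomically with the unit record, as raid_replace() does
 * for reparented component metadevices.  example_commit() and
 * comp_recid are hypothetical.
 */
#if 0
static void
example_commit(mr_unit_t *un, mddb_recid_t comp_recid)
{
	mddb_recid_t	extra_recids[2];

	extra_recids[0] = comp_recid;	/* e.g. a component's record id */
	extra_recids[1] = 0;		/* zero terminates the list */
	raid_commit(un, extra_recids);
}
#endif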
246 
247 static int
248 raid_check_pw(mr_unit_t *un)
249 {
250 	buf_t		bp;
251 	char		*buf;
252 	mr_column_t	*colptr;
253 	minor_t		mnum = MD_SID(un);
254 	int		i;
255 	int		err = 0;
256 	minor_t		unit;
257 
258 	buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP);
259 
260 	for (i = 0; i < un->un_totalcolumncnt; i++) {
261 		md_dev64_t tmpdev;
262 
263 		colptr = &un->un_column[i];
264 
265 		tmpdev = colptr->un_dev;
266 		/*
267 		 * Open by device id
268 		 * If this device is hotspared
269 		 * use the hotspare key
270 		 */
271 		tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
272 			colptr->un_hs_key : colptr->un_orig_key);
273 		if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
274 			colptr->un_dev = tmpdev;
			kmem_free(buf, DEV_BSIZE);	/* don't leak buf */
275 			return (1);
276 		}
277 		colptr->un_dev = tmpdev;
278 
279 		bzero((caddr_t)&bp, sizeof (buf_t));
280 		bp.b_back = &bp;
281 		bp.b_forw = &bp;
282 		bp.b_flags = B_READ | B_BUSY;
283 		sema_init(&bp.b_io, 0, NULL,
284 		    SEMA_DEFAULT, NULL);
285 		sema_init(&bp.b_sem, 0, NULL,
286 		    SEMA_DEFAULT, NULL);
287 		bp.b_edev = md_dev64_to_dev(colptr->un_dev);
288 		bp.b_lblkno = colptr->un_pwstart;
289 		bp.b_bcount = DEV_BSIZE;
290 		bp.b_bufsize = DEV_BSIZE;
291 		bp.b_un.b_addr = (caddr_t)buf;
292 		bp.b_offset = -1;
293 		(void) md_call_strategy(&bp, 0, NULL);
294 		if (biowait(&bp))
295 			err = 1;
296 		if (i == 0) {
297 			if (un->c.un_revision & MD_64BIT_META_DEV) {
298 				unit = ((raid_pwhdr_t *)buf)->rpw_unit;
299 			} else {
300 				unit = ((raid_pwhdr32_od_t *)buf)->rpw_unit;
301 			}
302 		}
303 		/*
304 		 * depending upon whether this is a 64-bit or a 32-bit raid,
305 		 * the pre-write headers have different layouts
306 		 */
307 		if (un->c.un_revision & MD_64BIT_META_DEV) {
308 			if ((((raid_pwhdr_t *)buf)->rpw_column != i) ||
309 			    (((raid_pwhdr_t *)buf)->rpw_unit != unit))
310 				err = 1;
311 		} else {
312 			if ((((raid_pwhdr32_od_t *)buf)->rpw_column != i) ||
313 			    (((raid_pwhdr32_od_t *)buf)->rpw_unit != unit))
314 				err = 1;
315 		}
316 		md_layered_close(colptr->un_dev, MD_OFLG_NULL);
317 		if (err)
318 			break;
319 	}
320 	kmem_free(buf, DEV_BSIZE);
321 	return (err);
322 }
323 
324 /*
325  * NAME:	init_col_nextio
326  * DESCRIPTION: normal backend process when zeroing column of a RAID metadevice.
327  * PARAMETERS:	raid_ci_t *cur - struct for column being zeroed
328  *
329  * LOCKS:	assumes caller holds unit reader lock,
330  *		periodically releases and reacquires unit reader lock,
331  *		broadcasts on unit conditional variable (un_cv)
332  *
333  */
334 #define	INIT_RLS_CNT	10
335 static void
336 init_col_nextio(raid_ci_t *cur)
337 {
338 	mr_unit_t	*un;
339 
340 	un = cur->ci_un;
341 
342 	cur->ci_blkno += cur->ci_zerosize;
343 
344 	mutex_enter(&un->un_mx);
345 	/* ===> update un_percent_done */
346 	un->un_init_iocnt += btodb(cur->ci_buf.b_bcount);
347 	mutex_exit(&un->un_mx);
348 
349 	/*
350 	 * When growing a device, normal I/O is still going on.
351 	 * The init thread still holds the unit reader lock which
352 	 * prevents I/O from doing state changes.
353 	 * So every INIT_RLS_CNT init I/Os, we will release the
354 	 * unit reader lock.
355 	 *
356 	 * CAVEAT:
357 	 * We know we are in the middle of a grow operation and the
358 	 * unit cannot be grown or removed (through reset or halt)
359 	 * so the mr_unit_t structure will not move or disappear.
360 	 * In addition, we know that only one of the init I/Os
361 	 * can be in init_col_nextio at a time because they are
362 	 * placed on the md_done_daemon queue and md only processes
363 	 * one element of this queue at a time. In addition, any
364 	 * code that needs to acquire the unit writer lock to change
365 	 * state is supposed to be on the md_mstr_daemon queue so
366 	 * it can be processing while we sit here waiting to get the
367 	 * unit reader lock back.
368 	 */
369 
370 	if (cur->ci_blkno < cur->ci_lastblk) {
371 		/* truncate last chunk to end_addr if needed */
372 		if (cur->ci_blkno + cur->ci_zerosize > cur->ci_lastblk) {
373 			cur->ci_zerosize = (size_t)
374 				(cur->ci_lastblk - cur->ci_blkno);
375 		}
376 
377 		/* set address and length for I/O bufs */
378 		cur->ci_buf.b_bufsize = dbtob(cur->ci_zerosize);
379 		cur->ci_buf.b_bcount = dbtob(cur->ci_zerosize);
380 		cur->ci_buf.b_lblkno = cur->ci_blkno;
381 
382 		(void) md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
383 		return;
384 	}
385 	/* finished initializing this column */
386 	mutex_enter(&un->un_mx);
387 	cur->ci_flag = COL_INIT_DONE;
388 	uniqtime32(&un->un_column[cur->ci_col].un_devtimestamp);
389 	mutex_exit(&un->un_mx);
390 	cv_broadcast(&un->un_cv);
391 }
392 
393 /*
394  * NAME:	init_col_int
395  * DESCRIPTION: I/O interrupt while zeroing column of a RAID metadevice.
396  * PARAMETERS:	buf_t	  *cb - I/O buffer for which interrupt occurred
397  *
398  * LOCKS:	assumes caller holds unit reader or writer lock
399  *
400  */
401 static int
402 init_col_int(buf_t *cb)
403 {
404 	raid_ci_t	*cur;
405 
406 	cur = (raid_ci_t *)cb->b_chain;
407 	if (cb->b_flags & B_ERROR) {
408 		mutex_enter(&cur->ci_un->un_mx);
409 		cur->ci_err = EIO;
410 		mutex_exit(&cur->ci_un->un_mx);
411 		cv_broadcast(&cur->ci_un->un_cv);
412 		return (1);
413 	}
414 	daemon_request(&md_done_daemon, init_col_nextio,
415 			(daemon_queue_t *)cur, REQ_OLD);
416 	return (1);
417 }
418 
419 /*
420  * NAME:	raid_init_columns
421  * DESCRIPTION: Zero one or more columns of a RAID metadevice.
422  * PARAMETERS:	minor_t	 mnum - RAID unit minor identifier
423  *
424  * LOCKS:	obtains and releases unit reader lock,
425  *		obtains and releases unit writer lock,
426  *		obtains and releases md_unit_array_rw write lock,
427  *		obtains and releases unit mutex (un_mx) lock,
428  *		waits on unit conditional variable (un_cv)
429  *
430  */
431 static void
432 raid_init_columns(minor_t mnum)
433 {
434 	mr_unit_t	*un;
435 	mdi_unit_t	*ui;
436 	raid_ci_t	*ci_chain = NULL, *cur;
437 	rus_state_t	state;
438 	caddr_t		zero_addr;
439 	diskaddr_t	end_off;
440 	size_t		zerosize;
441 	int		err = 0;
442 	int		ix;
443 	int		colcnt = 0;
444 	int		col;
445 	set_t		setno = MD_MIN2SET(mnum);
446 
447 	/*
448 	 * Increment the raid resync count for cpr
449 	 */
450 	mutex_enter(&md_cpr_resync.md_resync_mutex);
451 	md_cpr_resync.md_raid_resync++;
452 	mutex_exit(&md_cpr_resync.md_resync_mutex);
453 
454 	/*
455 	 * initialization is a multiple step process.  The first step
456 	 * is to go through the unit structure and start each device
457 	 * in the init state writing zeros over the component.
458 	 * Next initialize the prewrite areas, so the device can be
459 	 * used if a metainit -k is done.  Now close the components.
460 	 *
461 	 * Once this is complete, set the state of each component being
462 	 * zeroed and set the correct state for the unit.
463 	 *
464 	 * Last, commit the records.
465 	 */
466 
467 	ui = MDI_UNIT(mnum);
468 	un = md_unit_readerlock(ui);
469 
470 	/* check for active init on this column */
471 	/* exiting is cpr safe */
472 	if ((un->un_init_colcnt > 0) && (un->un_resync_index != -1)) {
473 		md_unit_readerexit(ui);
474 		(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
475 		/*
476 		 * Decrement the raid resync count for cpr
477 		 */
478 		mutex_enter(&md_cpr_resync.md_resync_mutex);
479 		md_cpr_resync.md_raid_resync--;
480 		mutex_exit(&md_cpr_resync.md_resync_mutex);
481 		thread_exit();
482 	}
483 
484 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_START, SVM_TAG_METADEVICE, setno,
485 	    MD_SID(un));
486 	un->un_init_colcnt = 0;
487 	un->un_init_iocnt = 0;
488 	end_off = un->un_pwsize + (un->un_segsize * un->un_segsincolumn);
489 	zerosize = (size_t)MIN((diskaddr_t)un->un_maxio, end_off);
490 
491 	/* allocate zero-filled buffer */
492 	zero_addr = kmem_zalloc(dbtob(zerosize), KM_SLEEP);
493 
494 	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
495 		if (un->un_column[ix].un_devstate != RCS_INIT)
496 			continue;
497 		/* allocate new column init structure */
498 		cur = (raid_ci_t *)kmem_zalloc((sizeof (raid_ci_t)), KM_SLEEP);
499 		ASSERT(cur != NULL);
500 		un->un_init_colcnt++;
501 		cur->ci_next = ci_chain;
502 		ci_chain = cur;
503 		cur->ci_un = un;
504 		cur->ci_col = ix;
505 		cur->ci_err = 0;
506 		cur->ci_flag = COL_INITING;
507 		cur->ci_zerosize = zerosize;
508 		cur->ci_blkno = un->un_column[ix].un_pwstart;
509 		cur->ci_lastblk = cur->ci_blkno + un->un_pwsize
510 		    + (un->un_segsize * un->un_segsincolumn);
511 		/* initialize static buf fields */
512 		cur->ci_buf.b_un.b_addr = zero_addr;
513 		cur->ci_buf.b_chain = (buf_t *)cur;
514 		cur->ci_buf.b_back = &cur->ci_buf;
515 		cur->ci_buf.b_forw = &cur->ci_buf;
516 		cur->ci_buf.b_iodone = init_col_int;
517 		cur->ci_buf.b_flags = B_BUSY | B_WRITE;
518 		cur->ci_buf.b_edev = md_dev64_to_dev(un->un_column[ix].un_dev);
519 		sema_init(&cur->ci_buf.b_io, 0, NULL,
520 			SEMA_DEFAULT, NULL);
521 		sema_init(&cur->ci_buf.b_sem, 0, NULL,
522 			SEMA_DEFAULT, NULL);
523 		/* set address and length for I/O bufs */
524 		cur->ci_buf.b_bufsize = dbtob(zerosize);
525 		cur->ci_buf.b_bcount = dbtob(zerosize);
526 		cur->ci_buf.b_lblkno = un->un_column[ix].un_pwstart;
527 		cur->ci_buf.b_offset = -1;
528 
529 		if (! (un->un_column[ix].un_devflags & MD_RAID_DEV_ISOPEN)) {
530 			md_dev64_t tmpdev = un->un_column[ix].un_dev;
531 			/*
532 			 * Open by device id
533 			 * If this column is hotspared then
534 			 * use the hotspare key
535 			 */
536 			tmpdev = md_resolve_bydevid(mnum, tmpdev,
537 				HOTSPARED(un, ix) ?
538 				un->un_column[ix].un_hs_key :
539 				un->un_column[ix].un_orig_key);
540 			if ((cur->ci_err = md_layered_open(mnum, &tmpdev,
541 			    MD_OFLG_NULL)) == 0)
542 				un->un_column[ix].un_devflags |=
543 				    MD_RAID_DEV_ISOPEN;
544 			un->un_column[ix].un_dev = tmpdev;
545 		}
546 		if (cur->ci_err == 0)
547 			md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
548 	}
549 
550 	md_unit_readerexit(ui);
551 	state = un->un_state;
552 	colcnt = un->un_init_colcnt;
553 	mutex_enter(&un->un_mx);
554 	while (colcnt) {
555 		cv_wait(&un->un_cv, &un->un_mx);
556 
557 		colcnt = 0;
558 		for (cur = ci_chain; cur != NULL; cur = cur->ci_next) {
559 			col = cur->ci_col;
560 			if ((cur->ci_flag != COL_INITING) || (cur->ci_err)) {
561 				if (cur->ci_err)
562 					err = cur->ci_err;
563 				else if (cur->ci_flag == COL_INIT_DONE) {
564 					(void) init_pw_area(un,
565 						un->un_column[col].un_dev,
566 						un->un_column[col].un_pwstart,
567 						col);
568 					cur->ci_flag = COL_READY;
569 				}
570 			} else {
571 				colcnt++;
572 			}
573 		}
574 	}
575 	mutex_exit(&un->un_mx);
576 
577 	/* This prevents new opens */
578 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
579 	(void) md_io_writerlock(ui);
580 	un = (mr_unit_t *)md_unit_writerlock(ui);
581 	while (ci_chain) {
582 		cur = ci_chain;
583 
584 		/* take this element out of the chain */
585 		ci_chain = cur->ci_next;
586 		/* free this element */
587 		sema_destroy(&cur->ci_buf.b_io);
588 		sema_destroy(&cur->ci_buf.b_sem);
589 		if (cur->ci_err)
590 			raid_set_state(cur->ci_un, cur->ci_col,
591 			    RCS_INIT_ERRED, 0);
592 		else
593 			raid_set_state(cur->ci_un, cur->ci_col,
594 			    RCS_OKAY, 0);
595 		kmem_free(cur, sizeof (raid_ci_t));
596 	}
597 
598 	/* free the zeroed buffer */
599 	kmem_free(zero_addr, dbtob(zerosize));
600 
601 	/* determine new unit state */
602 	if (err == 0) {
603 		if (state == RUS_INIT)
604 			un->un_state = RUS_OKAY;
605 		else {
606 			un->c.un_total_blocks = un->un_grow_tb;
607 			un->un_grow_tb = 0;
608 			if (raid_state_cnt(un, RCS_OKAY) ==
609 			    un->un_totalcolumncnt)
610 				un->un_state = RUS_OKAY;
611 		}
612 	} else {  /* error occurred */
613 		if (state & RUS_INIT)
614 			un->un_state = RUS_DOI;
615 	}
616 	uniqtime32(&un->un_timestamp);
617 	MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
618 	un->un_init_colcnt = 0;
619 	un->un_init_iocnt = 0;
620 	raid_commit(un, NULL);
621 	md_unit_writerexit(ui);
622 	(void) md_io_writerexit(ui);
623 	rw_exit(&md_unit_array_rw.lock);
624 	if (err) {
625 		if (un->un_state & RUS_DOI) {
626 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
627 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
628 		} else {
629 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
630 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
631 		}
632 	} else {
633 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_SUCCESS,
634 		    SVM_TAG_METADEVICE, setno, MD_SID(un));
635 	}
636 	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
637 	/*
638 	 * Decrement the raid resync count for cpr
639 	 */
640 	mutex_enter(&md_cpr_resync.md_resync_mutex);
641 	md_cpr_resync.md_raid_resync--;
642 	mutex_exit(&md_cpr_resync.md_resync_mutex);
643 	thread_exit();
644 	/*NOTREACHED*/
645 }
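/*
 * Per-column init state machine, for reference (ci_flag values):
 *
 *	COL_INITING --(last zeroing I/O completes)--> COL_INIT_DONE
 *	COL_INIT_DONE --(init_pw_area() succeeds)---> COL_READY
 *
 * An I/O error instead sets ci_err, which the cleanup loop above maps
 * to RCS_INIT_ERRED for that column (RCS_OKAY otherwise).
 */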
646 
647 static int
648 raid_init_unit(minor_t mnum, md_error_t *ep)
649 {
650 	mdi_unit_t	*ui;
651 	mr_unit_t	*un;
652 	int		rval, i;
653 	set_t		setno = MD_MIN2SET(mnum);
654 
655 	ui = MDI_UNIT(mnum);
656 	if (md_get_setstatus(setno) & MD_SET_STALE)
657 		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
658 
659 	/* Don't start an init if the device is not available */
660 	if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
661 		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
662 	}
663 
664 	if (raid_internal_open(mnum, (FREAD | FWRITE),
665 			OTYP_LYR, MD_OFLG_ISINIT)) {
666 		rval = mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum);
667 		goto out;
668 	}
669 
670 	un = md_unit_readerlock(ui);
671 	un->un_percent_done = 0;
672 	md_unit_readerexit(ui);
673 	/* start resync_unit thread */
674 	(void) thread_create(NULL, 0, raid_init_columns,
675 	    (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);
676 
677 	return (0);
678 
679 out:
680 	un = md_unit_writerlock(ui);
681 	MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
682 	/* recover state */
683 	for (i = 0; i < un->un_totalcolumncnt; i++)
684 		if (COLUMN_STATE(un, i) == RCS_INIT)
685 			raid_set_state(un, i, RCS_ERRED, 0);
686 	if (un->un_state & RUS_INIT)
687 		un->un_state = RUS_DOI;
688 	raid_commit(un, NULL);
689 	md_unit_writerexit(ui);
690 	if (un->un_state & RUS_DOI) {
691 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
692 		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
693 	} else {
694 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
695 		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
696 	}
697 	return (rval);
698 }
699 
700 /*
701  * NAME:	regen_unit
702  *
703  * DESCRIPTION:	regenerate all the parity on the raid device.  This
704  *		routine runs as a separate thread (started by
705  *		raid_regen_unit).  If an I/O error occurs during
706  *		this process the entire device is placed in error.
707  *
708  * PARAMETERS:	minor_t mnum - RAID unit minor identifier
709  */
710 static void
711 regen_unit(minor_t mnum)
712 {
713 	mdi_unit_t	*ui = MDI_UNIT(mnum);
714 	mr_unit_t	*un = MD_UNIT(mnum);
715 	buf_t		buf, *bp;
716 	caddr_t		buffer;
717 	int		err = 0;
718 	diskaddr_t	total_segments;
719 	diskaddr_t	line;
720 	size_t		iosize;
721 
722 	/*
723 	 * Increment raid resync count for cpr
724 	 */
725 	mutex_enter(&md_cpr_resync.md_resync_mutex);
726 	md_cpr_resync.md_raid_resync++;
727 	mutex_exit(&md_cpr_resync.md_resync_mutex);
728 
729 	iosize = dbtob(un->un_segsize);
730 	buffer = kmem_alloc(iosize, KM_SLEEP);
731 	bp = &buf;
732 	total_segments = un->un_segsincolumn;
733 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_START, SVM_TAG_METADEVICE,
734 	    MD_UN2SET(un), MD_SID(un));
735 	un->un_percent_done = 0;
736 	init_buf(bp, B_READ | B_BUSY, iosize);
737 
738 	for (line = 0; line < total_segments; line++) {
739 		bp->b_lblkno = line *
740 				((un->un_origcolumncnt - 1) * un->un_segsize);
741 		bp->b_un.b_addr = buffer;
742 		bp->b_bcount = iosize;
743 		bp->b_iodone = NULL;
744 		/*
745 		 * The following assignment is only correct because
746 		 * md_raid_strategy is fine when it's only a minor number
747 		 * and not a real dev_t. Yuck.
748 		 */
749 		bp->b_edev = mnum;
750 		md_raid_strategy(bp, MD_STR_NOTTOP, NULL);
751 		if (biowait(bp)) {
752 			err = 1;
753 			break;
754 		}
755 		un->un_percent_done = (uint_t)((line * 1000) /
756 						un->un_segsincolumn);
757 		/* just to avoid rounding errors */
758 		if (un->un_percent_done > 1000)
759 			un->un_percent_done = 1000;
760 		reset_buf(bp, B_READ | B_BUSY, iosize);
761 	}
762 	destroy_buf(bp);
763 	kmem_free(buffer, iosize);
764 
765 	(void) md_io_writerlock(ui);
766 	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
767 	(void) md_io_writerexit(ui);
768 	un = md_unit_writerlock(ui);
769 	if (!err &&
770 		(raid_state_cnt(un, RCS_OKAY) == un->un_totalcolumncnt))
771 			un->un_state = RUS_OKAY;
772 	raid_commit(un, NULL);
773 	md_unit_writerexit(ui);
774 	if (err ||
775 		raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) {
776 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_FAILED,
777 		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
778 	} else {
779 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_DONE, SVM_TAG_METADEVICE,
780 		    MD_UN2SET(un), MD_SID(un));
781 	}
782 
783 	/*
784 	 * Decrement the raid resync count for cpr
785 	 */
786 	mutex_enter(&md_cpr_resync.md_resync_mutex);
787 	md_cpr_resync.md_raid_resync--;
788 	mutex_exit(&md_cpr_resync.md_resync_mutex);
789 	thread_exit();
790 }
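/*
 * Note on units: un_percent_done is kept in tenths of a percent.
 * For example, with 1000 total lines, finishing line 250 yields
 * (250 * 1000) / 1000 = 250, i.e. 25.0%; the clamp above keeps
 * rounding from ever reporting more than 1000 (100%).
 */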
791 
792 static int
793 raid_regen_unit(minor_t mnum, md_error_t *ep)
794 {
795 	mdi_unit_t	*ui;
796 	mr_unit_t	*un;
797 	int		i;
798 	set_t		setno = MD_MIN2SET(mnum);
799 
800 	ui = MDI_UNIT(mnum);
801 	un = (mr_unit_t *)MD_UNIT(mnum);
802 
803 	if (md_get_setstatus(setno) & MD_SET_STALE)
804 		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
805 
806 	/* Don't start a regen if the device is not available */
807 	if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
808 		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
809 	}
810 
811 	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
812 		(void) md_unit_writerlock(ui);
813 		for (i = 0; i < un->un_totalcolumncnt; i++)
814 			raid_set_state(un, i, RCS_ERRED, 0);
815 		md_unit_writerexit(ui);
816 		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
817 	}
818 
819 	/* start resync_unit thread */
820 	(void) thread_create(NULL, 0, regen_unit,
821 	    (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);
822 
823 	return (0);
824 }
825 
826 static int
827 raid_regen(md_regen_param_t *mrp, IOLOCK *lock)
828 {
829 	minor_t		mnum = mrp->mnum;
830 	mr_unit_t	*un;
831 
832 	mdclrerror(&mrp->mde);
833 
834 	un = md_unit_readerlock(MDI_UNIT(mnum));
835 
836 	if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
837 		md_unit_readerexit(MDI_UNIT(mnum));
838 		return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
839 	}
840 
841 	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
842 	    (raid_state_cnt(un, RCS_RESYNC))) {
843 		md_unit_readerexit(MDI_UNIT(mnum));
844 		return (mdmderror(&mrp->mde, MDE_RESYNC_ACTIVE, mnum));
845 	}
846 
847 	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
848 		md_unit_readerexit(MDI_UNIT(mnum));
849 		return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
850 	}
851 
852 	if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
853 	    (! (un->un_state & RUS_OKAY))) {
854 		md_unit_readerexit(MDI_UNIT(mnum));
855 		return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
856 	}
857 
858 	md_unit_readerexit(MDI_UNIT(mnum));
859 
860 	/* get locks and recheck to be sure something did not change */
861 	if ((un = raid_getun(mnum, &mrp->mde, WRITERS, lock)) == NULL)
862 		return (0);
863 
864 	if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
865 	    (! (un->un_state & RUS_OKAY))) {
866 		return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
867 	}
868 
869 	raid_set_state(un, 0, RCS_REGEN, 0);
870 	raid_commit(un, NULL);
871 	md_ioctl_droplocks(lock);
872 	return (raid_regen_unit(mnum, &mrp->mde));
873 }
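/*
 * Locking pattern note: raid_regen(), like raid_replace() and
 * raid_grow(), performs its feasibility checks twice: first under a
 * cheap unit reader lock, then again after reacquiring the full
 * WRITERS ioctl locks through raid_getun(), because the unit state
 * may change in the window between the two acquisitions.
 */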
874 
875 /*
876  * NAME:	raid_set
877  * DESCRIPTION: used to create a RAID metadevice
878  * PARAMETERS:	md_set_params_t *d   - pointer to set data structure
879  *		int		mode - must be FWRITE
880  *
881  * LOCKS:	none
882  *
883  */
884 static int
885 raid_set(void	*d, int mode)
886 {
887 	minor_t		mnum;
888 	mr_unit_t	*un;
889 	mddb_recid_t	mr_recid;
890 	mddb_recid_t	*recids;
891 	mddb_type_t	typ1;
892 	int		err;
893 	set_t		setno;
894 	int		num_recs;
895 	int		rid;
896 	int		col;
897 	md_set_params_t	*msp = d;
898 
899 
900 	mnum = msp->mnum;
901 	setno = MD_MIN2SET(mnum);
902 
903 	mdclrerror(&msp->mde);
904 
905 	if (raid_getun(mnum, &msp->mde, NO_OLD, NULL) == NULL)
906 		return (0);
907 
908 	typ1 = (mddb_type_t)md_getshared_key(setno,
909 	    raid_md_ops.md_driver.md_drivername);
910 
911 	/* create the db record for this mdstruct */
912 
913 	if (msp->options & MD_CRO_64BIT) {
914 #if defined(_ILP32)
915 		return (mdmderror(&msp->mde, MDE_UNIT_TOO_LARGE, mnum));
916 #else
917 		mr_recid = mddb_createrec(msp->size, typ1, 0,
918 			MD_CRO_64BIT | MD_CRO_RAID | MD_CRO_FN, setno);
919 #endif
920 	} else {
921 		mr_recid = mddb_createrec(msp->size, typ1, 0,
922 			MD_CRO_32BIT | MD_CRO_RAID | MD_CRO_FN, setno);
923 	}
924 
925 	if (mr_recid < 0)
926 		return (mddbstatus2error(&msp->mde,
927 				(int)mr_recid, mnum, setno));
928 
929 	/* get the address of the mdstruct */
930 	un = (mr_unit_t *)mddb_getrecaddr(mr_recid);
931 	/*
932 	 * It is okay that we muck with the mdstruct here,
933 	 * since no one else will know about the mdstruct
934 	 * until we commit it. If we crash, the record will
935 	 * be automatically purged, since we haven't
936 	 * committed it yet.
937 	 */
938 
939 	/* copy in the user's mdstruct */
940 	if (err = ddi_copyin((caddr_t)(uintptr_t)msp->mdp, un,
941 	    msp->size, mode)) {
942 		mddb_deleterec_wrapper(mr_recid);
943 		return (EFAULT);
944 	}
945 	/* All 64 bit metadevices only support EFI labels. */
946 	if (msp->options & MD_CRO_64BIT) {
947 		un->c.un_flag |= MD_EFILABEL;
948 	}
949 
950 	/*
951 	 * allocate the real recids array.  since we may have to commit
952 	 * underlying metadevice records, we need an array of size:
953 	 * total number of components in raid + 3 (1 for the raid itself,
954 	 * one for the hotspare, one for the end marker).
955 	 */
956 	num_recs = un->un_totalcolumncnt + 3;
957 	rid = 0;
958 	recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
959 	recids[rid++] = mr_recid;
960 
961 	MD_SID(un) = mnum;
962 	MD_RECID(un) = recids[0];
963 	MD_CAPAB(un) = MD_CAN_PARENT | MD_CAN_SP;
964 	MD_PARENT(un) = MD_NO_PARENT;
965 	un->un_resync_copysize = 0;
966 	un->c.un_revision |= MD_FN_META_DEV;
967 
968 	if (UNIT_STATE(un) == RUS_INIT)
969 		MD_STATUS(un) |= MD_UN_GROW_PENDING;
970 
971 	if ((UNIT_STATE(un) != RUS_INIT) && raid_check_pw(un)) {
972 		mddb_deleterec_wrapper(mr_recid);
973 		err = mderror(&msp->mde, MDE_RAID_INVALID);
974 		goto out;
975 	}
976 
977 	if (err = raid_build_incore(un, 0)) {
978 		if (un->mr_ic) {
979 			kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
980 				un->un_totalcolumncnt);
981 			kmem_free(un->mr_ic, sizeof (*un->mr_ic));
982 		}
983 		MD_UNIT(mnum) = NULL;
984 		mddb_deleterec_wrapper(mr_recid);
985 		goto out;
986 	}
987 
988 	/*
989 	 * Update unit availability
990 	 */
991 	md_set[setno].s_un_avail--;
992 
993 	recids[rid] = 0;
994 	if (un->un_hsp_id != -1) {
995 		/* increment the reference count of the hot spare pool */
996 		err = md_hot_spare_ifc(HSP_INCREF, un->un_hsp_id, 0, 0,
997 		    &recids[rid], NULL, NULL, NULL);
998 		if (err) {
999 			MD_UNIT(mnum) = NULL;
1000 			mddb_deleterec_wrapper(mr_recid);
1001 			goto out;
1002 		}
1003 		rid++;
1004 	}
1005 
1006 	/*
1007 	 * set the parent on any metadevice components.
1008 	 * NOTE: currently soft partitions are the only metadevices
1009 	 * which can appear within a RAID metadevice.
1010 	 */
1011 	for (col = 0; col < un->un_totalcolumncnt; col++) {
1012 		mr_column_t	*mr_col = &un->un_column[col];
1013 		md_unit_t	*comp_un;
1014 
1015 		if (md_getmajor(mr_col->un_dev) == md_major) {
1016 			comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
1017 			recids[rid++] = MD_RECID(comp_un);
1018 			md_set_parent(mr_col->un_dev, MD_SID(un));
1019 		}
1020 	}
1021 
1022 	/* set the end marker */
1023 	recids[rid] = 0;
1024 
1025 	mddb_commitrecs_wrapper(recids);
1026 	md_create_unit_incore(mnum, &raid_md_ops, 1);
1027 
1028 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_METADEVICE, setno,
1029 	    MD_SID(un));
1030 
1031 out:
1032 	kmem_free(recids, (num_recs * sizeof (mddb_recid_t)));
1033 	if (err)
1034 		return (err);
1035 
1036 	/* only attempt to init a device that is in the init state */
1037 	if (UNIT_STATE(un) != RUS_INIT)
1038 		return (0);
1039 
1040 	return (raid_init_unit(mnum, &msp->mde));
1041 }
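/*
 * Illustrative sketch (kept out of the build) of the recids array
 * raid_set() commits, matching the "+ 3" sizing above; hsp_recid and
 * comp_recid are hypothetical stand-ins for the values obtained via
 * md_hot_spare_ifc() and MD_RECID() in the code above.
 */
#if 0
	recids[0] = mr_recid;	/* the RAID unit record itself */
	recids[1] = hsp_recid;	/* hot spare pool, when un_hsp_id != -1 */
	recids[2] = comp_recid;	/* one per component metadevice, if any */
	recids[3] = 0;		/* end marker */
#endif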
1042 
1043 /*
1044  * NAME:	raid_get
1045  * DESCRIPTION: used to get the unit structure of a RAID metadevice
1046  * PARAMETERS:	md_i_get_t   *migp - pointer to get data structure
1047  *		int	      mode - must be FREAD
1048  *		IOLOCK	     *lock - pointer to IOCTL lock
1049  *
1050  * LOCKS:	obtains unit reader lock via IOLOCK
1051  *
1052  */
1053 static int
1054 raid_get(
1055 	void		*migp,
1056 	int		mode,
1057 	IOLOCK		*lock
1058 )
1059 {
1060 	minor_t		mnum;
1061 	mr_unit_t	*un;
1062 	md_i_get_t	*migph = migp;
1063 
1064 
1065 	mnum = migph->id;
1066 
1067 	mdclrerror(&migph->mde);
1068 
1069 	if ((un = raid_getun(mnum, &migph->mde,
1070 		RD_LOCK, lock)) == NULL)
1071 		return (0);
1072 
1073 	if (migph->size == 0) {
1074 		migph->size = un->c.un_size;
1075 		return (0);
1076 	}
1077 
1078 	if (migph->size < un->c.un_size) {
1079 		return (EFAULT);
1080 	}
1081 	if (ddi_copyout(un, (void *)(uintptr_t)migph->mdp,
1082 	    un->c.un_size, mode))
1083 		return (EFAULT);
1084 
1085 	return (0);
1086 }
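/*
 * Illustrative user-land sketch (kept out of the build) of the
 * two-step size negotiation raid_get() implements: a first call with
 * size == 0 just reports the unit structure size, and a second call
 * with a buffer at least that large copies the structure out.  Only
 * the md_i_get_t protocol is shown; how the request reaches the
 * driver is omitted, and "buffer" is hypothetical.
 */
#if 0
	md_i_get_t	mig;
	caddr_t		buffer;

	bzero(&mig, sizeof (mig));
	mig.id = mnum;
	mig.size = 0;	/* first request: handler reports un->c.un_size */
	/* ... issue the first request, allocate mig.size bytes ... */
	mig.mdp = (uintptr_t)buffer;
	/* ... second request: handler copies out the unit structure ... */
#endif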
1087 
1088 
1089 /*
1090  * NAME:	raid_replace
1091  * DESCRIPTION: used to replace a component of a RAID metadevice
1092  * PARAMETERS:	replace_params_t *mrp - pointer to replace data structure
1093  *		IOLOCK	     *lock - pointer to IOCTL lock
1094  *
1095  * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
1096  *		obtains and releases md_unit_array_rw write lock
1097  *
1098  */
1099 static int
1100 raid_replace(
1101 	replace_params_t	*mrp,
1102 	IOLOCK			*lock
1103 )
1104 {
1105 	minor_t		mnum = mrp->mnum;
1106 	md_dev64_t	odev = mrp->old_dev;
1107 	md_error_t	*ep = &mrp->mde;
1108 	mr_unit_t	*un;
1109 	rcs_state_t	state;
1110 	int		ix, col = -1;
1111 	int		force = 0;
1112 	int		err = 0;
1113 	replace_cmd_t	cmd;
1114 	set_t		setno;
1115 	side_t		side;
1116 	mdkey_t		devkey;
1117 	int		nkeys;
1118 	mddb_recid_t	extra_recids[3] = { 0, 0, 0 };
1119 	int		extra_rids = 0;
1120 	md_error_t	mde = mdnullerror;
1121 	sv_dev_t	sv = {MD_SET_BAD, MD_SIDEWILD, MD_KEYWILD};
1122 
1123 	mdclrerror(ep);
1124 	setno = MD_MIN2SET(mnum);
1125 	side = mddb_getsidenum(setno);
1126 
1127 	un = md_unit_readerlock(MDI_UNIT(mnum));
1128 
1129 	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
1130 	    (raid_state_cnt(un, RCS_RESYNC) != 0)) {
1131 		md_unit_readerexit(MDI_UNIT(mnum));
1132 		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
1133 	}
1134 
1135 	if (un->un_state & RUS_DOI) {
1136 		md_unit_readerexit(MDI_UNIT(mnum));
1137 		return (mdmderror(ep, MDE_RAID_DOI, mnum));
1138 	}
1139 
1140 	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
1141 	    (MD_STATUS(un) & MD_UN_GROW_PENDING)) {
1142 		md_unit_readerexit(MDI_UNIT(mnum));
1143 		return (mdmderror(ep, MDE_IN_USE, mnum));
1144 	}
1145 
1146 	md_unit_readerexit(MDI_UNIT(mnum));
1147 
1148 	/* get locks and recheck to be sure something did not change */
1149 	if ((un = raid_getun(mnum, ep, WRITERS, lock)) == NULL)
1150 		return (0);
1151 
1152 	if (md_getkeyfromdev(setno, side, odev, &devkey, &nkeys) != 0) {
1153 		return (mddeverror(ep, MDE_NAME_SPACE, odev));
1154 	}
1155 
1156 	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
1157 		md_dev64_t tmpdevt = un->un_column[ix].un_orig_dev;
1158 		/*
1159 		 * Try to resolve devt again if NODEV64
1160 		 */
1161 		if (tmpdevt == NODEV64) {
1162 			tmpdevt = md_resolve_bydevid(mnum, tmpdevt,
1163 				un->un_column[ix].un_orig_key);
1164 			un->un_column[ix].un_orig_dev = tmpdevt;
1165 		}
1166 
1167 		if (un->un_column[ix].un_orig_dev == odev) {
1168 			col = ix;
1169 			break;
1170 		} else {
1171 			if (un->un_column[ix].un_orig_dev == NODEV64) {
1172 				/*
1173 				 * Now we use the keys to match.
1174 				 * If no key found, continue.
1175 				 */
1176 				if (nkeys == 0) {
1177 					continue;
1178 				}
1179 				if (un->un_column[ix].un_orig_key == devkey) {
1180 					if (nkeys > 1)
1181 						return (mddeverror(ep,
1182 						    MDE_MULTNM, odev));
1183 					col = ix;
1184 					break;
1185 				}
1186 			}
1187 		}
1188 	}
1189 
1190 	if (col == -1)
1191 		return (mdcomperror(ep, MDE_CANT_FIND_COMP,
1192 		    mnum, odev));
1193 
1194 	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
1195 	    (raid_state_cnt(un, RCS_RESYNC) != 0))
1196 		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
1197 
1198 	if (un->un_state & RUS_DOI)
1199 		return (mdcomperror(ep, MDE_REPL_INVAL_STATE, mnum,
1200 			un->un_column[col].un_dev));
1201 
1202 	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
1203 	    (MD_STATUS(un) & MD_UN_GROW_PENDING))
1204 		return (mdmderror(ep, MDE_IN_USE, mnum));
1205 
1206 	if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == FORCE_REPLACE_COMP))
1207 		force = 1;
1208 	if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == ENABLE_COMP))
1209 		cmd = ENABLE_COMP;
1210 	if ((mrp->cmd == FORCE_REPLACE_COMP) || (mrp->cmd == REPLACE_COMP))
1211 		cmd = REPLACE_COMP;
1212 
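	/*
	 * Command mapping, for reference (from the checks above):
	 *	REPLACE_COMP / FORCE_REPLACE_COMP -> cmd = REPLACE_COMP
	 *	ENABLE_COMP  / FORCE_ENABLE_COMP  -> cmd = ENABLE_COMP
	 * The FORCE_* variants additionally set "force", which is
	 * required when the unit is in the LAST_ERRED state.
	 */
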
1213 	if (un->un_state == RUS_LAST_ERRED) {
1214 		/* Must use -f force flag for unit in LAST_ERRED state */
1215 		if (!force)
1216 			return (mdmderror(ep,
1217 				MDE_RAID_NEED_FORCE, mnum));
1218 
1219 		/* Must use -f force flag on ERRED column first */
1220 		if (un->un_column[col].un_devstate != RCS_ERRED) {
1221 			for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
1222 				if (un->un_column[ix].un_devstate & RCS_ERRED)
1223 					return (mdcomperror(ep,
1224 						MDE_RAID_COMP_ERRED, mnum,
1225 						un->un_column[ix].un_dev));
1226 			}
1227 		}
1228 
1229 		/* must use -f force flag on LAST_ERRED columns next */
1230 		if ((un->un_column[col].un_devstate != RCS_LAST_ERRED) &&
1231 		    (un->un_column[col].un_devstate != RCS_ERRED))
1232 			return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
1233 				mnum, un->un_column[col].un_dev));
1234 	}
1235 
1236 	if (un->un_state == RUS_ERRED) {
1237 		if (! (un->un_column[col].un_devstate &
1238 		    (RCS_ERRED | RCS_INIT_ERRED)))
1239 			return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
1240 			    mnum, un->un_column[col].un_dev));
1241 	}
1242 
1243 	ASSERT(!(un->un_column[col].un_devflags & MD_RAID_ALT_ISOPEN));
1244 	ASSERT(!(un->un_column[col].un_devflags & MD_RAID_WRITE_ALT));
1245 
1246 	state = un->un_column[col].un_devstate;
1247 	if (state & RCS_INIT_ERRED) {
1248 		MD_STATUS(un) |= MD_UN_GROW_PENDING;
1249 		un->un_percent_done = 0;
1250 		raid_set_state(un, col, RCS_INIT, 0);
1251 	} else if (((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) &&
1252 	    resync_request(mnum, col, 0, ep))
1253 		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
1254 
1255 
1256 	if (cmd == REPLACE_COMP) {
1257 		md_dev64_t tmpdev = mrp->new_dev;
1258 
1259 		/*
1260 		 * open the device by device id
1261 		 */
1262 		tmpdev = md_resolve_bydevid(mnum, tmpdev, mrp->new_key);
1263 		if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
1264 			return (mdcomperror(ep, MDE_COMP_OPEN_ERR, mnum,
1265 			    tmpdev));
1266 		}
1267 
1268 		/*
1269 		 * If it's a metadevice, make sure it gets reparented
1270 		 */
1271 		if (md_getmajor(tmpdev) == md_major) {
1272 			minor_t		new_mnum = md_getminor(tmpdev);
1273 			md_unit_t	*new_un = MD_UNIT(new_mnum);
1274 
1275 			md_set_parent(tmpdev, MD_SID(un));
1276 			extra_recids[extra_rids++] = MD_RECID(new_un);
1277 		}
1278 
1279 		mrp->new_dev = tmpdev;
1280 		un->un_column[col].un_orig_dev = tmpdev;
1281 		un->un_column[col].un_orig_key = mrp->new_key;
1282 		un->un_column[col].un_orig_pwstart = mrp->start_blk;
1283 		un->un_column[col].un_orig_devstart =
1284 		    mrp->start_blk + un->un_pwsize;
1285 
1286 		/*
1287 		 * If the old device was a metadevice, make sure to
1288 		 * reset its parent.
1289 		 */
1290 		if (md_getmajor(odev) == md_major) {
1291 			minor_t		old_mnum = md_getminor(odev);
1292 			md_unit_t	*old_un = MD_UNIT(old_mnum);
1293 
1294 			md_reset_parent(odev);
1295 			extra_recids[extra_rids++] =
1296 			    MD_RECID(old_un);
1297 		}
1298 
1299 		if (HOTSPARED(un, col)) {
1300 			md_layered_close(mrp->new_dev, MD_OFLG_NULL);
1301 			un->un_column[col].un_alt_dev = mrp->new_dev;
1302 			un->un_column[col].un_alt_pwstart = mrp->start_blk;
1303 			un->un_column[col].un_alt_devstart =
1304 			    mrp->start_blk + un->un_pwsize;
1305 			un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
1306 		} else {
1307 			/*
1308 			 * not hot spared.  Close the old device and
1309 			 * move the new device in.
1310 			 */
1311 			if (un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN)
1312 				md_layered_close(odev, MD_OFLG_NULL);
1313 			un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
1314 			un->un_column[col].un_dev = mrp->new_dev;
1315 			un->un_column[col].un_pwstart = mrp->start_blk;
1316 			un->un_column[col].un_devstart =
1317 			    mrp->start_blk + un->un_pwsize;
1318 			if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) {
1319 				un->un_column[col].un_devflags |=
1320 				    MD_RAID_REGEN_RESYNC;
1321 			}
1322 		}
1323 		/*
1324 		 * If the old device is not a metadevice then
1325 		 * save off the set number and key so that it
1326 		 * can be removed from the namespace later.
1327 		 */
1328 		if (md_getmajor(odev) != md_major) {
1329 			sv.setno = setno;
1330 			sv.key = devkey;
1331 		}
1332 	}
1333 
1334 	if (cmd == ENABLE_COMP) {
1335 		md_dev64_t tmpdev = un->un_column[col].un_orig_dev;
1336 		mdkey_t raidkey =  un->un_column[col].un_orig_key;
1337 
1338 		/*
1339 		 * We trust the dev_t because we cannot determine the
1340 		 * dev_t from the device id since a new disk is in the
1341 		 * same location. Since this is a call from metareplace -e dx
1342 		 * AND it is SCSI, a new dev_t is not generated.  So the
1343 		 * dev_t from the mddb is used. Before enabling the device
1344 		 * we check to make sure that multiple entries for the same
1345 		 * device do not exist in the namespace. If they do, we
1346 		 * fail the ioctl.
1347 		 * One of the many ways multiple entries in the name space
1348 		 * can occur is if one removed the failed component in a
1349 		 * RAID metadevice and put another disk that was part of
1350 		 * another metadevice. After reboot metadevadm would correctly
1351 		 * update the device name for the metadevice whose component
1352 		 * has moved. However now in the metadb there are two entries
1353 		 * for the same name (ctds) that belong to different
1354 		 * metadevices. One is valid, the other is a ghost or "last
1355 		 * known as" ctds.
1356 		 */
1357 		tmpdev = md_resolve_bydevid(mnum, tmpdev, raidkey);
1358 		if (tmpdev == NODEV64)
1359 			tmpdev = md_getdevnum(setno, side, raidkey,
1360 			    MD_TRUST_DEVT);
1361 		/*
1362 		 * check for multiple entries in namespace for the
1363 		 * same dev
1364 		 */
1365 
1366 		if (md_getkeyfromdev(setno, side, tmpdev, &devkey,
1367 		    &nkeys) != 0)
1368 			return (mddeverror(ep, MDE_NAME_SPACE, tmpdev));
1369 		/*
1370 		 * If the number of keys is greater than
1371 		 * 1, then we have an invalid
1372 		 * namespace. STOP and return.
1373 		 */
1374 		if (nkeys > 1)
1375 			return (mddeverror(ep, MDE_MULTNM, tmpdev));
1376 		if (devkey != raidkey)
1377 			return (mdcomperror(ep, MDE_CANT_FIND_COMP,
1378 			    mnum, tmpdev));
1379 
1380 		if (un->un_column[col].un_orig_dev == NODEV64)
1381 			un->un_column[col].un_orig_dev = tmpdev;
1382 
1383 		if (HOTSPARED(un, col)) {
1384 			un->un_column[col].un_alt_dev =
1385 			    un->un_column[col].un_orig_dev;
1386 			un->un_column[col].un_alt_pwstart =
1387 			    un->un_column[col].un_orig_pwstart;
1388 			un->un_column[col].un_alt_devstart =
1389 			    un->un_column[col].un_orig_devstart;
1390 			un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
1391 		} else {
1392 			if (!(un->un_column[col].un_devflags &
1393 				MD_RAID_DEV_ISOPEN)) {
1394 				if (md_layered_open(mnum, &tmpdev,
1395 				    MD_OFLG_NULL)) {
1396 					un->un_column[col].un_dev = tmpdev;
1397 					return (mdcomperror(ep,
1398 					    MDE_COMP_OPEN_ERR, mnum, tmpdev));
1399 				}
1400 				ASSERT(tmpdev != NODEV64 &&
1401 				    tmpdev != 0);
1402 
1403 				if ((md_getmajor(tmpdev) != md_major) &&
1404 					(md_devid_found(setno, side, raidkey)
1405 						== 1)) {
1406 					if (md_update_namespace_did(setno, side,
1407 					    raidkey, &mde) != 0) {
1408 						cmn_err(CE_WARN,
1409 						    "md: could not"
1410 							" update namespace\n");
1411 					}
1412 				}
1413 				un->un_column[col].un_dev =
1414 					un->un_column[col].un_orig_dev;
1415 			}
1416 			un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
1417 			un->un_column[col].un_devflags |= MD_RAID_REGEN_RESYNC;
1418 		}
1419 	}
1420 	if (mrp->has_label) {
1421 		un->un_column[col].un_devflags |= MD_RAID_HAS_LABEL;
1422 	} else {
1423 		un->un_column[col].un_devflags &= ~MD_RAID_HAS_LABEL;
1424 	}
1425 
1426 	raid_commit(un, extra_recids);
1427 
1428 	/* If the component has been replaced - clean up the name space */
1429 	if (sv.setno != MD_SET_BAD) {
1430 		md_rem_names(&sv, 1);
1431 	}
1432 
1433 	md_ioctl_droplocks(lock);
1434 
1435 	if ((cmd == ENABLE_COMP) || (cmd == FORCE_ENABLE_COMP)) {
1436 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ENABLE, SVM_TAG_METADEVICE,
1437 		    setno, MD_SID(un));
1438 	} else {
1439 		SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REPLACE, SVM_TAG_METADEVICE,
1440 		    setno, MD_SID(un));
1441 	}
1442 
1443 	if (un->un_column[col].un_devstate & RCS_INIT)
1444 		err = raid_init_unit(mnum, ep);
1445 	else if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0)
1446 		err = raid_resync_unit(mnum, ep);
1447 
1448 	mdclrerror(ep);
1449 	if (!err)
1450 		return (0);
1451 
1452 	/* be sure state */
1453 	/* is already set by this time */
1454 	/* fix state  and commit record */
1455 	un = md_unit_writerlock(MDI_UNIT(mnum));
1456 	if (state & RCS_INIT_ERRED)
1457 		raid_set_state(un, col, state, 1);
1458 	else if (state & RCS_OKAY)
1459 		raid_set_state(un, col, RCS_ERRED, 0);
1460 	else
1461 		raid_set_state(un, col, state, 1);
1462 	raid_commit(un, NULL);
1463 	md_unit_writerexit(MDI_UNIT(mnum));
1464 	mdclrerror(ep);
1465 	return (0);
1466 }
1467 
1468 
1469 /*
1470  * NAME:	raid_set_sync
1471  * DESCRIPTION: used to sync a component of a RAID metadevice
1472  * PARAMETERS:	md_resync_ioctl_t *rip - pointer to resync data
1473  *					structure
1474  *		IOLOCK	     *lock - pointer to IOCTL lock
1475  *
1476  * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
1477  *		obtains and releases md_unit_array_rw write lock
1478  *
1479  */
1480 static int
1481 raid_set_sync(
1482 	md_resync_ioctl_t	*rip,
1483 	IOLOCK			*lock
1484 )
1485 {
1486 	minor_t			mnum = rip->ri_mnum;
1487 	mr_unit_t		*un;
1488 	int			init = 0;
1489 	int			resync = 0;
1490 	int			regen = 0;
1491 	int			ix;
1492 	int			err;
1493 
1494 	mdclrerror(&rip->mde);
1495 
1496 	if ((un = raid_getun(mnum, &rip->mde, WRITERS, lock)) == NULL)
1497 		return (0);
1498 
1499 	if (un->un_state & RUS_DOI)
1500 		return (mdmderror(&rip->mde, MDE_RAID_DOI, mnum));
1501 
1502 	if (un->c.un_status & MD_UN_RESYNC_ACTIVE)
1503 		return (mdmderror(&rip->mde, MDE_RESYNC_ACTIVE, mnum));
1504 
1505 	/* This prevents new opens */
1506 
1507 	rip->ri_flags = 0;
1508 	if (un->un_state & RUS_REGEN)
1509 		regen++;
1510 
1511 	if (raid_state_cnt(un, RCS_RESYNC))
1512 		resync++;
1513 
1514 	if (raid_state_cnt(un, RCS_INIT) || (un->un_state & RUS_INIT))
1515 		init++;
1516 
1517 	ASSERT(!(resync && init && regen));
1518 	md_ioctl_droplocks(lock);
1519 	rip->ri_percent_done = 0;
1520 
1521 	if (init) {
1522 		MD_STATUS(un) |= MD_UN_GROW_PENDING;
1523 		return (raid_init_unit(mnum, &rip->mde));
1524 	}
1525 
1526 	/*
1527 	 * If resync is needed, it will call raid_internal_open forcing
1528 	 * replay before the open completes.
1529 	 * Otherwise, call raid_internal_open directly to force
1530 	 * replay to complete during boot (metasync -r).
1531 	 * NOTE: the unit writer lock must remain held while setting
1532 	 *	 MD_UN_RESYNC_ACTIVE but must be released before
1533 	 *	 calling raid_resync_unit or raid_internal_open.
1534 	 */
1535 	if (resync) {
1536 		ASSERT(resync < 2);
1537 		un = md_unit_writerlock(MDI_UNIT(mnum));
1538 		MD_STATUS(un) |= MD_UN_RESYNC_ACTIVE;
1539 		/* Must release unit writer lock for resync */
1540 		/*
1541 		 * correctly setup the devices before trying to start the
1542 		 * resync operation.
1543 		 */
1544 		for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
1545 			if (un->un_column[ix].un_devstate & RCS_RESYNC) {
1546 				if ((un->un_column[ix].un_devflags &
1547 				    MD_RAID_COPY_RESYNC) &&
1548 				    HOTSPARED(un, ix)) {
1549 					un->un_column[ix].un_alt_dev =
1550 					    un->un_column[ix].un_orig_dev;
1551 					un->un_column[ix].un_alt_devstart =
1552 					    un->un_column[ix].un_orig_devstart;
1553 					un->un_column[ix].un_alt_pwstart =
1554 					    un->un_column[ix].un_orig_pwstart;
1555 				}
1556 				break;
1557 			}
1558 		}
1559 		ASSERT(un->un_column[ix].un_devflags &
1560 		    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC));
1561 		rip->ri_percent_done = 0;
1562 		un->un_column[ix].un_devflags |= MD_RAID_RESYNC;
1563 		(void) resync_request(mnum, ix, 0, NULL);
1564 		md_unit_writerexit(MDI_UNIT(mnum));
1565 		err = raid_resync_unit(mnum, &rip->mde);
1566 		return (err);
1567 	}
1568 
1569 	if (regen) {
1570 		err = raid_regen_unit(mnum, &rip->mde);
1571 		return (err);
1572 	}
1573 
1574 	/* The unit requires no work, so just force replay of the device */
1575 	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0))
1576 		return (mdmderror(&rip->mde,
1577 			MDE_RAID_OPEN_FAILURE, mnum));
1578 	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
1579 
1580 	return (0);
1581 }
1582 
1583 /*
1584  * NAME:	raid_get_resync
1585  * DESCRIPTION: used to check resync status on a component of a RAID metadevice
1586  * PARAMETERS:	md_resync_ioctl_t *rip - pointer to resync data
1587  *					structure
1588  *		IOLOCK	     *lock - pointer to IOCTL lock
1589  *
1590  * LOCKS:	none
1591  *
1592  */
1593 static int
1594 raid_get_resync(
1595 	md_resync_ioctl_t	*rip,
1596 	IOLOCK			*lock
1597 )
1598 {
1599 	minor_t			mnum = rip->ri_mnum;
1600 	mr_unit_t		*un;
1601 	u_longlong_t		percent;
1602 	int			cnt;
1603 	int			ix;
1604 	uint64_t		d;
1605 
1606 	mdclrerror(&rip->mde);
1607 
1608 	if ((un = raid_getun(mnum, &rip->mde, RD_LOCK, lock)) == NULL)
1609 		return (0);
1610 
1611 	rip->ri_flags = 0;
1612 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
1613 		d = un->un_segsincolumn;
1614 		percent = d ? ((1000 * un->un_resync_line_index) / d) : 0;
1615 		if (percent > 1000)
1616 			percent = 1000;	/* can't go over 100% */
1617 		rip->ri_percent_done = (int)percent;
1618 		rip->ri_flags |= MD_RI_INPROGRESS;
1619 	}
1620 
1621 	if (UNIT_STATE(un) & RUS_INIT) {
1622 		d = un->un_segsize * un->un_segsincolumn *
1623 		    un->un_totalcolumncnt;
1624 		percent =
1625 		    d ? ((1000 * (u_longlong_t)un->un_init_iocnt) / d) : 0;
1626 		if (percent > 1000)
1627 			percent = 1000;	/* can't go over 100% */
1628 		rip->ri_percent_done = (int)percent;
1629 		rip->ri_flags |= MD_GROW_INPROGRESS;
1630 	} else if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
1631 		d = un->un_segsize * un->un_segsincolumn * un->un_init_colcnt;
1632 		percent =
1633 		    d ? (((u_longlong_t)un->un_init_iocnt * 1000) / d) : 0;
1634 		if (percent > 1000)
1635 			percent = 1000;
1636 		rip->ri_percent_done = (int)percent;
1637 		rip->ri_flags |= MD_GROW_INPROGRESS;
1638 	}
1639 
1640 	if (un->un_state & RUS_REGEN)
1641 		rip->ri_percent_done = un->un_percent_done;
1642 
1643 	cnt = 0;
1644 	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
1645 		switch (un->un_column[ix].un_devstate) {
1646 		case RCS_INIT:
1647 		case RCS_ERRED:
1648 		case RCS_LAST_ERRED:
1649 			cnt++;
1650 			break;
1651 		default:
1652 			break;
1653 		}
1654 	}
1655 	d = un->un_totalcolumncnt;
1656 	rip->ri_percent_dirty = d ? (((u_longlong_t)cnt * 100) / d) : 0;
1657 	return (0);
1658 }
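/*
 * Note on units: ri_percent_done above is reported in tenths of a
 * percent (0-1000), while ri_percent_dirty is a plain percentage
 * (0-100) of columns in the INIT, ERRED or LAST_ERRED states.
 */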
1659 
1660 /*
1661  * NAME:	raid_grow
1662  * DESCRIPTION: Concatenate to a RAID metadevice
1663  * PARAMETERS:	md_grow_params_t *mgp
1664  *			      - pointer to IOCGROW data structure
1665  *		int	 mode - must be FWRITE
1666  *		IOLOCK *lockp - IOCTL read/write and unit_array_rw lock
1667  *
1668  * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
1669  *		obtains and releases md_unit_array_rw write lock
1670  *
1671  */
1672 static int
1673 raid_grow(void *mgp, int mode, IOLOCK *lock)
1674 {
1675 	minor_t		mnum;
1676 	mr_unit_t	*un, *new_un;
1677 	mdi_unit_t	*ui;
1678 	mddb_type_t	typ1;
1679 	mddb_recid_t	mr_recid;
1680 	mddb_recid_t	old_vtoc = 0;
1681 	mddb_recid_t	*recids;
1682 	md_create_rec_option_t options;
1683 	int		err;
1684 	int		col, i;
1685 	int64_t		tb, atb;
1686 	u_longlong_t	unrev;
1687 	int		tc;
1688 	int		rval = 0;
1689 	set_t		setno;
1690 	mr_column_ic_t	*mrc;
1691 	int		num_recs, rid;
1692 	md_grow_params_t	*mgph = mgp;
1693 
1694 
1695 	mnum = mgph->mnum;
1696 
1697 	mdclrerror(&mgph->mde);
1698 
1699 	ui = MDI_UNIT(mnum);
1700 	un = md_unit_readerlock(ui);
1701 
1702 	if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
1703 		md_unit_readerexit(ui);
1704 		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
1705 	}
1706 
1707 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
1708 		md_unit_readerexit(ui);
1709 		return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE,
1710 			mnum));
1711 	}
1712 
1713 	if (UNIT_STATE(un) & RUS_LAST_ERRED) {
1714 		md_unit_readerexit(ui);
1715 		return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED,
1716 			mnum));
1717 	}
1718 
1719 	if (UNIT_STATE(un) & RUS_DOI) {
1720 		md_unit_readerexit(ui);
1721 		return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));
1722 	}
1723 
1724 	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
1725 		md_unit_readerexit(ui);
1726 		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
1727 	}
1728 
1729 	md_unit_readerexit(ui);
1730 
1731 	if ((un = raid_getun(mnum, &mgph->mde, WRITERS, lock)) ==
1732 		NULL)
1733 		return (0);
1734 
1735 	if (MD_STATUS(un) & MD_UN_GROW_PENDING)
1736 		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
1737 
1738 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
1739 		return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE,
1740 			mnum));
1741 
1742 	if (un->c.un_size >= mgph->size)
1743 		return (EINVAL);
1744 
1745 	if (UNIT_STATE(un) & RUS_LAST_ERRED)
1746 		return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED,
1747 			mnum));
1748 
1749 	if (UNIT_STATE(un) & RUS_DOI)
1750 		return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));
1751 
1752 	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT))
1753 		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
1754 
1755 	setno = MD_MIN2SET(mnum);
1756 
1757 	typ1 = (mddb_type_t)md_getshared_key(setno,
1758 	    raid_md_ops.md_driver.md_drivername);
1759 
1760 	/*
1761 	 * Preserve the friendly name nature of the device that is
1762 	 * growing.
1763 	 */
1764 	options = MD_CRO_RAID;
1765 	if (un->c.un_revision & MD_FN_META_DEV)
1766 		options |= MD_CRO_FN;
1767 	if (mgph->options & MD_CRO_64BIT) {
1768 #if defined(_ILP32)
1769 		return (mdmderror(&mgph->mde, MDE_UNIT_TOO_LARGE, mnum));
1770 #else
1771 		mr_recid = mddb_createrec(mgph->size, typ1, 0,
1772 				MD_CRO_64BIT | options, setno);
1773 #endif
1774 	} else {
1775 		mr_recid = mddb_createrec(mgph->size, typ1, 0,
1776 				MD_CRO_32BIT | options, setno);
1777 	}
1778 	if (mr_recid < 0) {
1779 		rval = mddbstatus2error(&mgph->mde, (int)mr_recid,
1780 			mnum, setno);
1781 		return (rval);
1782 	}
1783 
1784 	/* get the address of the new unit */
1785 	new_un = (mr_unit_t *)mddb_getrecaddr(mr_recid);
1786 
1787 	/*
1788 	 * It is okay that we muck with the new unit here,
1789 	 * since no one else will know about the unit struct
1790 	 * until we commit it. If we crash, the record will
1791 	 * be automatically purged, since we haven't
1792 	 * committed it yet and the old unit struct will be found.
1793 	 */
1794 
1795 	/* copy in the user's unit struct */
1796 	err = ddi_copyin((void *)(uintptr_t)mgph->mdp, new_un,
1797 	    mgph->size, mode);
1798 	if (err) {
1799 		mddb_deleterec_wrapper(mr_recid);
1800 		return (EFAULT);
1801 	}
1802 
1803 	/* make sure columns are being added */
1804 	if (un->un_totalcolumncnt >= new_un->un_totalcolumncnt) {
1805 		mddb_deleterec_wrapper(mr_recid);
1806 		return (EINVAL);
1807 	}
1808 
1809 	/*
1810 	 * Save a few of the new unit struct's fields
1811 	 * before they get clobbered.
1812 	 */
1813 	tc = new_un->un_totalcolumncnt;
1814 	tb = new_un->c.un_total_blocks;
1815 	atb = new_un->c.un_actual_tb;
1816 	unrev = new_un->c.un_revision;
1817 
1818 	/*
1819 	 * Copy the old unit struct (static stuff)
1820 	 * into new unit struct
1821 	 */
1822 	bcopy((caddr_t)un, (caddr_t)new_un, un->c.un_size);
1823 
1824 	/*
1825 	 * Restore a few of the new unit struct values.
1826 	 */
1827 	new_un->un_totalcolumncnt = tc;
1828 	new_un->c.un_actual_tb = atb;
1829 	new_un->un_grow_tb = tb;
1830 	new_un->c.un_revision = unrev;
1831 	new_un->c.un_record_id = mr_recid;
1832 	new_un->c.un_size = mgph->size;
1833 
1834 	ASSERT(new_un->mr_ic == un->mr_ic);
1835 
1836 	/*
1837 	 * Save old column slots
1838 	 */
1839 	mrc = un->un_column_ic;
1840 
1841 	/*
1842 	 * Allocate new column slots
1843 	 */
1844 	new_un->un_column_ic = (mr_column_ic_t *)
1845 	    kmem_zalloc(sizeof (mr_column_ic_t) * new_un->un_totalcolumncnt,
1846 		KM_SLEEP);
1847 
1848 	/*
1849 	 * Copy the old column slots into the new array,
1850 	 * then free the old allocation.
1851 	 */
1852 	bcopy(mrc, new_un->un_column_ic,
1853 		sizeof (mr_column_ic_t) * un->un_totalcolumncnt);
1854 	kmem_free(mrc, sizeof (mr_column_ic_t) * un->un_totalcolumncnt);
1855 
1856 	/* All 64-bit metadevices support only EFI labels. */
1857 	if (mgph->options & MD_CRO_64BIT) {
1858 		new_un->c.un_flag |= MD_EFILABEL;
1859 		/*
1860 		 * If the device was previously smaller than a terabyte,
1861 		 * and had a vtoc record attached to it, we remove the
1862 		 * vtoc record, because the layout has changed completely.
1863 		 */
1864 		if (((un->c.un_revision & MD_64BIT_META_DEV) == 0) &&
1865 		    (un->c.un_vtoc_id != 0)) {
1866 			old_vtoc = un->c.un_vtoc_id;
1867 			new_un->c.un_vtoc_id =
1868 				md_vtoc_to_efi_record(old_vtoc, setno);
1869 		}
1870 	}
1871 
1872 
1873 	/*
1874 	 * Allocate the real recids array.  Since we may have to commit
1875 	 * underlying metadevice records, we need an array of size:
1876 	 * total number of new components being attached + 2 (one for
1877 	 * the raid itself, one for the end marker).
1878 	 */
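
	/*
	 * Worked example (hypothetical numbers): growing to a total of
	 * five columns gives num_recs = 7.  Slot 0 always holds the new
	 * RAID record, one slot is used per newly attached column that
	 * is itself a metadevice, and the last slot written is the zero
	 * end marker; any remaining slots simply go unused.
	 */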
1879 	num_recs = new_un->un_totalcolumncnt + 2;
1880 	rid = 0;
1881 	recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
1882 	recids[rid++] = mr_recid;
1883 
1884 	for (col = un->un_totalcolumncnt;
1885 	    (col < new_un->un_totalcolumncnt); col++) {
1886 		mr_column_t	*mr_col = &new_un->un_column[col];
1887 		md_unit_t	*comp_un;
1888 
1889 		if (raid_build_pw_reservation(new_un, col) != 0) {
1890 			/* release pwslots already allocated by grow */
1891 			for (i = un->un_totalcolumncnt; i < col; i++) {
1892 				raid_free_pw_reservation(new_un, i);
1893 			}
1894 			kmem_free(new_un->un_column_ic,
1895 			    sizeof (mr_column_ic_t) *
1896 			    new_un->un_totalcolumncnt);
1897 			kmem_free(new_un->mr_ic, sizeof (*un->mr_ic));
1898 			kmem_free(recids, num_recs * sizeof (mddb_recid_t));
1899 			mddb_deleterec_wrapper(mr_recid);
1900 			return (EINVAL);
1901 		}
1902 		/*
1903 		 * set parent on metadevices being added.
1904 		 * NOTE: currently soft partitions are the only metadevices
1905 		 * which can appear within a RAID metadevice.
1906 		 */
1907 		if (md_getmajor(mr_col->un_dev) == md_major) {
1908 			comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
1909 			recids[rid++] = MD_RECID(comp_un);
1910 			md_set_parent(mr_col->un_dev, MD_SID(new_un));
1911 		}
1912 		new_un->un_column[col].un_devflags = 0;
1913 	}
1914 
1915 	/* set end marker */
1916 	recids[rid] = 0;
1917 
1918 	/* commit new unit struct */
1919 	mddb_commitrecs_wrapper(recids);
1920 
1921 	/* delete old unit struct */
1922 	mddb_deleterec_wrapper(un->c.un_record_id);
1923 	MD_UNIT(mnum) = new_un;
1924 
1925 	/*
1926 	 * If old_vtoc has a non-zero value, we know:
1927 	 * - this unit crossed the one terabyte boundary while growing,
1928 	 * - there was a vtoc record for the unit, and
1929 	 * - that vtoc record is no longer needed, because a new EFI
1930 	 *   record has been created for this unit.
1931 	 */
1932 	if (old_vtoc != 0) {
1933 		mddb_deleterec_wrapper(old_vtoc);
1934 	}
1935 
1936 	/* free recids */
1937 	kmem_free(recids, num_recs * sizeof (mddb_recid_t));
1938 
1939 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE,
1940 	    MD_UN2SET(new_un), MD_SID(new_un));
1941 	MD_STATUS(new_un) |= MD_UN_GROW_PENDING;
1942 
1943 	/*
1944 	 * Since md_ioctl_writelock acquires the unit write lock
1945 	 * and open/close acquires the unit reader lock, it is
1946 	 * necessary to drop the unit write lock here and then
1947 	 * reacquire it as needed later.
1948 	 */
1949 	md_unit_writerexit(ui);
1950 
1951 	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
1952 		rval = mdmderror(&mgph->mde, MDE_RAID_OPEN_FAILURE,
1953 			mnum);
1954 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
1955 		    MD_UN2SET(new_un), MD_SID(new_un));
1956 		return (rval);
1957 	}
1958 	(void) md_unit_writerlock(ui);
1959 	for (i = 0; i < new_un->un_totalcolumncnt; i++) {
1960 		if (new_un->un_column[i].un_devstate & RCS_OKAY)
1961 			(void) init_pw_area(new_un, new_un->un_column[i].un_dev,
1962 				new_un->un_column[i].un_pwstart, i);
1963 	}
1964 	md_unit_writerexit(ui);
1965 	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
1966 	(void) md_unit_writerlock(ui);
1967 	/* create a background thread to initialize the columns */
1968 	md_ioctl_droplocks(lock);
1969 
1970 	return (raid_init_unit(mnum, &mgph->mde));
1971 }
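
/*
 * A minimal user-level sketch of driving the grow path above.  Growing
 * is normally requested via metattach(1M); the admin device path and
 * the exact md_grow_params_t usage below are illustrative assumptions:
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	md_grow_params_t mgp;
 *	int fd;
 *
 *	(void) memset(&mgp, 0, sizeof (mgp));
 *	mgp.mnum = mnum;		unit to grow (assumed field)
 *	mgp.size = new_size;		size of the grown unit struct
 *	mgp.mdp = (uintptr_t)new_unp;	user copy of the grown unit
 *	fd = open("/dev/md/admin", O_RDWR);	assumed admin node
 *	if (ioctl(fd, MD_IOCGROW, &mgp) != 0)
 *		mgp.mde then holds the metadevice error details
 */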
1972 
1973 /*
1974  * NAME:	raid_reset
1975  * DESCRIPTION: used to reset (clear / remove) a RAID metadevice
1976  * PARAMETERS:	md_i_reset_t *mirp - pointer to reset data structure
1977  *
1978  * LOCKS:	obtains and releases md_unit_array_rw write lock
1979  *
1980  */
1981 static int
1982 raid_reset(md_i_reset_t	*mirp)
1983 {
1984 	minor_t		mnum = mirp->mnum;
1985 	mr_unit_t	*un;
1986 	mdi_unit_t	*ui;
1987 	set_t		setno = MD_MIN2SET(mnum);
1988 
1989 	mdclrerror(&mirp->mde);
1990 
1991 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
1992 	/*
1993 	 * NOTE: need to get md_unit_writerlock to avoid conflict
1994 	 * with raid_init thread.
1995 	 */
1996 	if ((un = raid_getun(mnum, &mirp->mde, NO_LOCK, NULL)) ==
1997 	    NULL) {
1998 		rw_exit(&md_unit_array_rw.lock);
1999 		return (0);
2000 	}
2001 	ui = MDI_UNIT(mnum);
2002 
2003 	if (MD_HAS_PARENT(MD_PARENT(un))) {
2004 		rw_exit(&md_unit_array_rw.lock);
2005 		return (mdmderror(&mirp->mde, MDE_IN_USE, mnum));
2006 	}
2007 
2008 	un = (mr_unit_t *)md_unit_openclose_enter(ui);
2009 	if (md_unit_isopen(ui)) {
2010 		md_unit_openclose_exit(ui);
2011 		rw_exit(&md_unit_array_rw.lock);
2012 		return (mdmderror(&mirp->mde, MDE_IS_OPEN, mnum));
2013 	}
2014 	md_unit_openclose_exit(ui);
2015 	if (UNIT_STATE(un) != RUS_OKAY && !mirp->force) {
2016 		rw_exit(&md_unit_array_rw.lock);
2017 		return (mdmderror(&mirp->mde, MDE_RAID_NEED_FORCE, mnum));
2018 	}
2019 
2020 	reset_raid(un, mnum, 1);
2021 
2022 	/*
2023 	 * Update unit availability
2024 	 */
2025 	md_set[setno].s_un_avail++;
2026 
2027 	/*
2028 	 * If this is a MN set, reset s_un_next so all nodes have
2029 	 * the same view of the next available slot as nodes are
2030 	 * withdrawn (-w) and joined (-j).
2031 	 */
2032 	if (MD_MNSET_SETNO(setno)) {
2033 		(void) md_upd_set_unnext(setno, MD_MIN2UNIT(mnum));
2034 	}
2035 
2036 	rw_exit(&md_unit_array_rw.lock);
2037 
2038 	return (0);
2039 }
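
/*
 * Sketch of the corresponding user-level request (this is what
 * metaclear(1M) ultimately issues; the admin file descriptor is
 * assumed to be open on the md admin device):
 *
 *	md_i_reset_t mir;
 *
 *	(void) memset(&mir, 0, sizeof (mir));
 *	mir.mnum = mnum;	unit to clear
 *	mir.force = 1;		clear even when the unit is not RUS_OKAY
 *	(void) ioctl(admin_fd, MD_IOCRESET, &mir);
 */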
2040 
2041 /*
2042  * NAME:	raid_get_geom
2043  * DESCRIPTION: used to get the geometry of a RAID metadevice
2044  * PARAMETERS:	mr_unit_t    *un - RAID unit to get the geometry for
2045  *		struct dk_geom *gp - pointer to geometry data structure
2046  *
2047  * LOCKS:	none
2048  *
2049  */
2050 static int
2051 raid_get_geom(
2052 	mr_unit_t	*un,
2053 	struct dk_geom	*geomp
2054 )
2055 {
2056 	md_get_geom((md_unit_t *)un, geomp);
2057 
2058 	return (0);
2059 }
2060 
2061 /*
2062  * NAME:	raid_get_vtoc
2063  * DESCRIPTION: used to get the VTOC on a RAID metadevice
2064  * PARAMETERS:	mr_unit_t    *un - RAID unit to get the VTOC from
2065  *		struct vtoc *vtocp - pointer to VTOC data structure
2066  *
2067  * LOCKS:	none
2068  *
2069  */
2070 static int
2071 raid_get_vtoc(
2072 	mr_unit_t	*un,
2073 	struct vtoc	*vtocp
2074 )
2075 {
2076 	md_get_vtoc((md_unit_t *)un, vtocp);
2077 
2078 	return (0);
2079 }
2080 
2081 /*
2082  * NAME:	raid_set_vtoc
2083  * DESCRIPTION: used to set the VTOC on a RAID metadevice
2084  * PARAMETERS:	mr_unit_t    *un - RAID unit to set the VTOC on
2085  *		struct vtoc *vtocp - pointer to VTOC data structure
2086  *
2087  * LOCKS:	none
2088  *
2089  */
2090 static int
2091 raid_set_vtoc(
2092 	mr_unit_t	*un,
2093 	struct vtoc	*vtocp
2094 )
2095 {
2096 	return (md_set_vtoc((md_unit_t *)un, vtocp));
2097 }
2098 
2099 
2100 
2101 /*
2102  * NAME:	raid_get_cgapart
2103  * DESCRIPTION: used to get the dk_map on a RAID metadevice
2104  * PARAMETERS:	mr_unit_t    *un - RAID unit to set the VTOC on
2105  * PARAMETERS:	mr_unit_t    *un - RAID unit to get the dk_map from
2106  *		struct dk_map *dkmapp - pointer to dk_map data structure
2107  * LOCKS:	none
2108  *
2109  */
2110 
2111 static int
2112 raid_get_cgapart(
2113 	mr_unit_t	*un,
2114 	struct dk_map	*dkmapp
2115 )
2116 {
2117 	md_get_cgapart((md_unit_t *)un, dkmapp);
2118 	return (0);
2119 }
2120 
2121 /*
2122  * NAME:	raid_getdevs
2123  * DESCRIPTION: return all devices within a RAID metadevice
2124  * PARAMETERS:	md_getdevs_params_t *mgdp
2125  *			      - pointer to getdevs IOCTL data structure
2126  *		int	 mode - should be FREAD
2127  *		IOLOCK *lockp - IOCTL read/write lock
2128  *
2129  * LOCKS:	obtains unit reader lock via IOLOCK
2130  *
2131  */
2132 static int
2133 raid_getdevs(
2134 	void			*mgdp,
2135 	int			mode,
2136 	IOLOCK			*lock
2137 )
2138 {
2139 	minor_t			mnum;
2140 	mr_unit_t		*un;
2141 	md_dev64_t		*udevs;
2142 	int			i, cnt;
2143 	md_dev64_t		unit_dev;
2144 	md_getdevs_params_t	*mgdph = mgdp;
2145 
2146 
2147 	mnum = mgdph->mnum;
2148 
2149 	/* check out unit */
2150 	mdclrerror(&mgdph->mde);
2151 
2152 	if ((un = raid_getun(mnum, &mgdph->mde, RD_LOCK,
2153 		lock)) == NULL)
2154 		return (0);
2155 
2156 	udevs = (md_dev64_t *)(uintptr_t)mgdph->devs;
2157 
2158 	for (cnt = 0, i = 0; i < un->un_totalcolumncnt; i++, cnt++) {
2159 		if (cnt < mgdph->cnt) {
2160 			unit_dev = un->un_column[i].un_orig_dev;
2161 			if (md_getmajor(unit_dev) != md_major) {
2162 				if ((unit_dev = md_xlate_mini_2_targ
2163 				    (unit_dev)) == NODEV64)
2164 					return (ENODEV);
2165 			}
2166 
2167 			if (ddi_copyout((caddr_t)&unit_dev,
2168 					(caddr_t)&udevs[cnt], sizeof (*udevs),
2169 					mode) != 0)
2170 				return (EFAULT);
2171 		}
2172 		if (HOTSPARED(un, i)) {
2173 			cnt++;
2174 			if (cnt >= mgdph->cnt)
2175 				continue;
2176 
2177 			unit_dev = un->un_column[i].un_dev;
2178 			if (md_getmajor(unit_dev) != md_major) {
2179 				if ((unit_dev = md_xlate_mini_2_targ
2180 				    (unit_dev)) == NODEV64)
2181 					return (ENODEV);
2182 			}
2183 
2184 			if (ddi_copyout((caddr_t)&unit_dev,
2185 					(caddr_t)&udevs[cnt], sizeof (*udevs),
2186 					mode) != 0)
2187 				return (EFAULT);
2188 		}
2189 	}
2190 	mgdph->cnt = cnt;
2191 	return (0);
2192 }
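
/*
 * The cnt field above is in/out: device ids are copied out only while
 * the running count is below the caller-supplied cnt, and the total
 * (every column, plus a second slot for each hotspared column) is
 * written back on return.  That permits a two-pass caller; a sketch,
 * assuming the field names match the structure used above:
 *
 *	mgd.cnt = 0;
 *	(void) ioctl(fd, MD_IOCGET_DEVS, &mgd);	learn required count
 *	mgd.devs = (uintptr_t)calloc(mgd.cnt, sizeof (md_dev64_t));
 *	(void) ioctl(fd, MD_IOCGET_DEVS, &mgd);	fill the array
 */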
2193 
2194 /*
2195  * NAME:	raid_change
2196  * DESCRIPTION: used to change the following dynamic values:
2197  *			the hot spare pool
2198  *		in the unit structure of a RAID metadevice
2199  * PARAMETERS:	md_raid_params_t   *mrp - pointer to change data structure
2200  *		IOLOCK	     *lock - pointer to IOCTL lock
2201  *
2202  * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun)
2203  *
2204  */
2205 static int
2206 raid_change(
2207 	md_raid_params_t	*mrp,
2208 	IOLOCK			*lock
2209 )
2210 {
2211 	minor_t		mnum = mrp->mnum;
2212 	mr_unit_t	*un;
2213 	int		ix;
2214 	mddb_recid_t	recids[3] = {0, 0, 0};
2215 	int		err;
2216 	int		irecid;
2217 	int		inc_new_hsp = 0;
2218 
2219 	mdclrerror(&mrp->mde);
2220 
2221 	if ((un = raid_getun(mnum, &mrp->mde, WR_LOCK, lock)) == NULL)
2222 		return (0);
2223 
2224 	if (!mrp->params.change_hsp_id)
2225 		return (0);
2226 
2227 	/* verify that no hotspare is in use */
2228 	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
2229 		if (HOTSPARED(un, ix)) {
2230 			return (mdmderror(&mrp->mde, MDE_HS_IN_USE, mnum));
2231 		}
2232 	}
2233 
2234 	/* replace the hot spare pool */
2235 
2236 	irecid = 0;
2237 	if (mrp->params.hsp_id != -1) {
2238 		/* increment the reference count of the new hsp */
2239 		err = md_hot_spare_ifc(HSP_INCREF, mrp->params.hsp_id, 0, 0,
2240 		    &recids[0], NULL, NULL, NULL);
2241 		if (err) {
2242 			return (mdhsperror(&mrp->mde, MDE_INVAL_HSP,
2243 			    mrp->params.hsp_id));
2244 		}
2245 		inc_new_hsp = 1;
2246 		irecid++;
2247 	}
2248 
2249 	if (un->un_hsp_id != -1) {
2250 		/* decrement the reference count of the old hsp */
2251 		err = md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
2252 		    &recids[irecid], NULL, NULL, NULL);
2253 		if (err) {
2254 			err = mdhsperror(&mrp->mde, MDE_INVAL_HSP,
2255 			    un->un_hsp_id);
2256 			if (inc_new_hsp) {
2257 				(void) md_hot_spare_ifc(HSP_DECREF,
2258 				    mrp->params.hsp_id, 0, 0,
2259 				    &recids[0], NULL, NULL, NULL);
2260 				/*
2261 				 * Don't need to commit the record,
2262 				 * because it wasn't committed before
2263 				 */
2264 			}
2265 			return (err);
2266 		}
2267 	}
2268 
2269 	un->un_hsp_id = mrp->params.hsp_id;
2270 
2271 	raid_commit(un, recids);
2272 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_CHANGE, SVM_TAG_METADEVICE,
2273 	    MD_UN2SET(un), MD_SID(un));
2274 
2275 	/* Now trigger hot spare processing in case one is needed. */
2276 	if ((un->un_hsp_id != -1) && (un->un_state == RUS_ERRED))
2277 		(void) raid_hotspares();
2278 
2279 	return (0);
2280 }
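
/*
 * Note the ordering in raid_change(): the new hot spare pool is
 * referenced (HSP_INCREF) before the old one is released (HSP_DECREF),
 * and the new reference is rolled back if the release fails, so the
 * unit can never be left pointing at a pool it holds no reference on.
 */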
2281 
2282 /*
2283  * NAME:	raid_admin_ioctl
2284  * DESCRIPTION: IOCTL operations unique to metadevices and RAID
2285  * PARAMETERS:	int	  cmd - IOCTL command to be executed
2286  *		void	*data - pointer to IOCTL data structure
2287  *		int	 mode - either FREAD or FWRITE
2288  *		IOLOCK *lockp - IOCTL read/write lock
2289  *
2290  * LOCKS:	none
2291  *
2292  */
2293 static int
2294 raid_admin_ioctl(
2295 	int		cmd,
2296 	void		*data,
2297 	int		mode,
2298 	IOLOCK		*lockp
2299 )
2300 {
2301 	size_t		sz = 0;
2302 	void		*d = NULL;
2303 	int		err = 0;
2304 
2305 	/* We can only handle 32-bit clients for internal commands */
2306 	if ((mode & DATAMODEL_MASK) != DATAMODEL_ILP32) {
2307 		return (EINVAL);
2308 	}
2309 
2310 
2311 	/* dispatch ioctl */
2312 	switch (cmd) {
2313 
2314 	case MD_IOCSET:
2315 	{
2316 		if (! (mode & FWRITE))
2317 			return (EACCES);
2318 
2319 		sz = sizeof (md_set_params_t);
2320 		d = kmem_alloc(sz, KM_SLEEP);
2321 
2322 		if (ddi_copyin(data, d, sz, mode)) {
2323 			err = EFAULT;
2324 			break;
2325 		}
2326 
2327 		err = raid_set(d, mode);
2328 		break;
2329 	}
2330 
2331 	case MD_IOCGET:
2332 	{
2333 		if (! (mode & FREAD))
2334 			return (EACCES);
2335 
2336 		sz = sizeof (md_i_get_t);
2337 		d = kmem_alloc(sz, KM_SLEEP);
2338 
2339 		if (ddi_copyin(data, d, sz, mode)) {
2340 			err = EFAULT;
2341 			break;
2342 		}
2343 
2344 		err = raid_get(d, mode, lockp);
2345 		break;
2346 	}
2347 
2348 	case MD_IOCREPLACE:
2349 	{
2350 		if (! (mode & FWRITE))
2351 			return (EACCES);
2352 
2353 		sz = sizeof (replace_params_t);
2354 		d = kmem_alloc(sz, KM_SLEEP);
2355 
2356 		if (ddi_copyin(data, d, sz, mode)) {
2357 			err = EFAULT;
2358 			break;
2359 		}
2360 
2361 		err = raid_replace((replace_params_t *)d, lockp);
2362 		break;
2363 	}
2364 
2365 	case MD_IOCSETSYNC:
2366 	{
2367 		if (! (mode & FWRITE))
2368 			return (EACCES);
2369 
2370 		sz = sizeof (md_resync_ioctl_t);
2371 		d = kmem_alloc(sz, KM_SLEEP);
2372 
2373 		if (ddi_copyin(data, d, sz, mode)) {
2374 			err = EFAULT;
2375 			break;
2376 		}
2377 
2378 		err = raid_set_sync((md_resync_ioctl_t *)d, lockp);
2379 		break;
2380 	}
2381 
2382 	case MD_IOCGETSYNC:
2383 	{
2384 		if (! (mode & FREAD))
2385 			return (EACCES);
2386 
2387 		sz = sizeof (md_resync_ioctl_t);
2388 		d = kmem_alloc(sz, KM_SLEEP);
2389 
2390 		if (ddi_copyin(data, d, sz, mode)) {
2391 			err = EFAULT;
2392 			break;
2393 		}
2394 		err = raid_get_resync((md_resync_ioctl_t *)d, lockp);
2395 
2396 		break;
2397 	}
2398 
2399 	case MD_IOCGROW:
2400 	{
2401 		if (! (mode & FWRITE))
2402 			return (EACCES);
2403 
2404 		sz = sizeof (md_grow_params_t);
2405 		d = kmem_alloc(sz, KM_SLEEP);
2406 
2407 		if (ddi_copyin(data, d, sz, mode)) {
2408 			err = EFAULT;
2409 			break;
2410 		}
2411 
2412 		err = raid_grow(d, mode, lockp);
2413 		break;
2414 	}
2415 
2416 	case MD_IOCCHANGE:
2417 	{
2418 		if (! (mode & FWRITE))
2419 			return (EACCES);
2420 
2421 		sz = sizeof (md_raid_params_t);
2422 		d = kmem_alloc(sz, KM_SLEEP);
2423 
2424 		if (ddi_copyin(data, d, sz, mode)) {
2425 			err = EFAULT;
2426 			break;
2427 		}
2428 
2429 		err = raid_change((md_raid_params_t *)d, lockp);
2430 		break;
2431 	}
2432 
2433 	case MD_IOCRESET:
2434 	{
2435 		if (! (mode & FWRITE))
2436 			return (EACCES);
2437 
2438 		sz = sizeof (md_i_reset_t);
2439 		d = kmem_alloc(sz, KM_SLEEP);
2440 
2441 		if (ddi_copyin(data, d, sz, mode)) {
2442 			err = EFAULT;
2443 			break;
2444 		}
2445 
2446 		err = raid_reset((md_i_reset_t *)d);
2447 		break;
2448 	}
2449 
2450 	case MD_IOCGET_DEVS:
2451 	{
2452 		if (! (mode & FREAD))
2453 			return (EACCES);
2454 
2455 		sz = sizeof (md_getdevs_params_t);
2456 		d = kmem_alloc(sz, KM_SLEEP);
2457 
2458 		if (ddi_copyin(data, d, sz, mode)) {
2459 			err = EFAULT;
2460 			break;
2461 		}
2462 
2463 		err = raid_getdevs(d, mode, lockp);
2464 		break;
2465 	}
2466 
2467 	case MD_IOCSETREGEN:
2468 	{
2469 		if (! (mode & FWRITE))
2470 			return (EACCES);
2471 
2472 		sz = sizeof (md_regen_param_t);
2473 		d = kmem_alloc(sz, KM_SLEEP);
2474 
2475 		if (ddi_copyin(data, d, sz, mode)) {
2476 			err = EFAULT;
2477 			break;
2478 		}
2479 
2480 		err = raid_regen((md_regen_param_t *)d, lockp);
2481 		break;
2482 	}
2483 
2484 	case MD_IOCPROBE_DEV:
2485 	{
2486 		md_probedev_impl_t	*p = NULL;
2487 		md_probedev_t		*ph = NULL;
2488 		daemon_queue_t		*hdr = NULL;
2489 		int			i;
2490 		size_t			sz1 = 0;
2491 
2492 
2493 		if (! (mode & FREAD))
2494 			return (EACCES);
2495 
2496 		sz = sizeof (md_probedev_t);
2497 
2498 		d = kmem_alloc(sz, KM_SLEEP);
2499 
2500 		/* now copy in the data */
2501 		if (ddi_copyin(data, d, sz, mode)) {
2502 			err = EFAULT;
2503 			goto free_mem;
2504 		}
2505 
2506 		/*
2507 		 * Sanity-check the args.  The test name must contain
2508 		 * the keyword "probe".
2509 		 */
2510 		p = kmem_alloc(sizeof (md_probedev_impl_t), KM_SLEEP);
2511 		p->probe_sema = NULL;
2512 		p->probe_mx = NULL;
2513 		p->probe.mnum_list = (uint64_t)NULL;
2514 
2515 		ph = (md_probedev_t *)d;
2516 		p->probe.nmdevs = ph->nmdevs;
2517 		(void) strcpy(p->probe.test_name, ph->test_name);
2518 		bcopy(&ph->md_driver, &(p->probe.md_driver),
2519 				sizeof (md_driver_t));
2520 
2521 		if ((p->probe.nmdevs < 1) ||
2522 			(strstr(p->probe.test_name, "probe") == NULL)) {
2523 			err = EINVAL;
2524 			goto free_mem;
2525 		}
2526 
2527 		sz1 = sizeof (minor_t) * p->probe.nmdevs;
2528 
2529 		p->probe.mnum_list = (uint64_t)(uintptr_t)kmem_alloc(sz1,
2530 		    KM_SLEEP);
2531 
2532 		if (ddi_copyin((caddr_t)(uintptr_t)ph->mnum_list,
2533 		    (caddr_t)(uintptr_t)p->probe.mnum_list, sz1, mode)) {
2534 			err = EFAULT;
2535 			goto free_mem;
2536 		}
2537 
2538 		if (err = md_init_probereq(p, &hdr))
2539 			goto free_mem;
2540 
2541 		/*
2542 		 * put the request on the queue and wait.
2543 		 */
2544 
2545 		daemon_request_new(&md_ff_daemonq, md_probe_one, hdr, REQ_NEW);
2546 
2547 		(void) IOLOCK_RETURN(0, lockp);
2548 		/* wait for the events to occur */
2549 		for (i = 0; i < p->probe.nmdevs; i++) {
2550 			sema_p(PROBE_SEMA(p));
2551 		}
2552 		while (md_ioctl_lock_enter() == EINTR);
2553 
2554 		/*
2555 		 * Clean up.  The hdr list is freed in the probe routines,
2556 		 * so it is already NULL by the time we get here.
2557 		 */
2558 free_mem:
2559 		if (p) {
2560 			if (p->probe_sema != NULL) {
2561 				sema_destroy(PROBE_SEMA(p));
2562 				kmem_free(p->probe_sema, sizeof (ksema_t));
2563 			}
2564 			if (p->probe_mx != NULL) {
2565 				mutex_destroy(PROBE_MX(p));
2566 				kmem_free(p->probe_mx, sizeof (kmutex_t));
2567 			}
2568 			if (p->probe.mnum_list)
2569 				kmem_free((caddr_t)(uintptr_t)
2570 				    p->probe.mnum_list, sz1);
2571 
2572 			kmem_free(p, sizeof (md_probedev_impl_t));
2573 		}
2574 		break;
2575 	}
2576 
2577 	default:
2578 		return (ENOTTY);
2579 	}
2580 
2581 	/*
2582 	 * copyout and free any args
2583 	 */
2584 	if (sz != 0) {
2585 		if (err == 0) {
2586 			if (ddi_copyout(d, data, sz, mode) != 0) {
2587 				err = EFAULT;
2588 			}
2589 		}
2590 		kmem_free(d, sz);
2591 	}
2592 	return (err);
2593 }
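
/*
 * Every case in raid_admin_ioctl() follows the same template, which
 * keeps all handling of user addresses in one place:
 *
 *	check FREAD/FWRITE against the direction of the request;
 *	d = kmem_alloc(sz, KM_SLEEP);
 *	if (ddi_copyin(data, d, sz, mode)) err = EFAULT;
 *	else err = raid_<operation>(d, ...);
 *	on success the common tail ddi_copyout()s d back to data;
 *	kmem_free(d, sz);
 */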
2594 
2595 /*
2596  * NAME:	md_raid_ioctl
2597  * DESCRIPTION: RAID metadevice IOCTL operations entry point.
2598  * PARAMETERS:	dev_t	   dev - RAID device identifier
2599  *		int	  cmd  - IOCTL command to be executed
2600  *		void	*data  - pointer to IOCTL data structure
2601  *		int	 mode  - either FREAD or FWRITE
2602  *		IOLOCK *lockp  - IOCTL read/write lock
2603  *
2604  * LOCKS:	none
2605  *
2606  */
2607 int
2608 md_raid_ioctl(
2609 	dev_t		dev,
2610 	int		cmd,
2611 	void		*data,
2612 	int		mode,
2613 	IOLOCK		*lockp
2614 )
2615 {
2616 	minor_t		mnum = getminor(dev);
2617 	mr_unit_t	*un;
2618 	int		err = 0;
2619 
2620 	/* handle admin ioctls */
2621 	if (mnum == MD_ADM_MINOR)
2622 		return (raid_admin_ioctl(cmd, data, mode, lockp));
2623 
2624 	/* check unit */
2625 	if ((MD_MIN2SET(mnum) >= md_nsets) ||
2626 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
2627 	    ((un = MD_UNIT(mnum)) == NULL))
2628 		return (ENXIO);
2629 
2630 	/* is this a supported ioctl? */
2631 	err = md_check_ioctl_against_efi(cmd, un->c.un_flag);
2632 	if (err != 0) {
2633 		return (err);
2634 	}
2635 
2636 	/* dispatch ioctl */
2637 	switch (cmd) {
2638 
2639 	case DKIOCINFO:
2640 	{
2641 		struct dk_cinfo *p;
2642 
2643 		if (! (mode & FREAD))
2644 			return (EACCES);
2645 
2646 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
2647 
2648 		get_info(p, mnum);
2649 		if (ddi_copyout((caddr_t)p, data, sizeof (*p), mode) != 0)
2650 			err = EFAULT;
2651 
2652 		kmem_free(p, sizeof (*p));
2653 		return (err);
2654 	}
2655 
2656 	case DKIOCGMEDIAINFO:
2657 	{
2658 		struct dk_minfo	p;
2659 
2660 		if (! (mode & FREAD))
2661 			return (EACCES);
2662 
2663 		get_minfo(&p, mnum);
2664 		if (ddi_copyout(&p, data, sizeof (struct dk_minfo), mode) != 0)
2665 			err = EFAULT;
2666 
2667 		return (err);
2668 	}
2669 
2670 	case DKIOCGGEOM:
2671 	{
2672 		struct dk_geom	*p;
2673 
2674 		if (! (mode & FREAD))
2675 			return (EACCES);
2676 
2677 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
2678 
2679 		if ((err = raid_get_geom(un, p)) == 0) {
2680 			if (ddi_copyout((caddr_t)p, data, sizeof (*p),
2681 			    mode) != 0)
2682 				err = EFAULT;
2683 		}
2684 
2685 		kmem_free(p, sizeof (*p));
2686 		return (err);
2687 	}
2688 
2689 	case DKIOCGVTOC:
2690 	{
2691 		struct vtoc	vtoc;
2692 
2693 		if (! (mode & FREAD))
2694 			return (EACCES);
2695 
2696 		if ((err = raid_get_vtoc(un, &vtoc)) != 0) {
2697 			return (err);
2698 		}
2699 
2700 		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2701 			if (ddi_copyout(&vtoc, data, sizeof (vtoc), mode))
2702 				err = EFAULT;
2703 		}
2704 #ifdef _SYSCALL32
2705 		else {
2706 			struct vtoc32 vtoc32;
2707 			vtoctovtoc32(vtoc, vtoc32);
2708 			if (ddi_copyout(&vtoc32, data, sizeof (vtoc32), mode))
2709 				err = EFAULT;
2710 		}
2711 #endif /* _SYSCALL32 */
2712 
2713 		return (err);
2714 	}
2715 
2716 	case DKIOCSVTOC:
2717 	{
2718 		struct vtoc	vtoc;
2719 
2720 		if (! (mode & FWRITE))
2721 			return (EACCES);
2722 
2723 		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2724 			if (ddi_copyin(data, &vtoc, sizeof (vtoc), mode)) {
2725 				err = EFAULT;
2726 			}
2727 		}
2728 #ifdef _SYSCALL32
2729 		else {
2730 			struct vtoc32 vtoc32;
2731 			if (ddi_copyin(data, &vtoc32, sizeof (vtoc32), mode)) {
2732 				err = EFAULT;
2733 			} else {
2734 				vtoc32tovtoc(vtoc32, vtoc);
2735 			}
2736 		}
2737 #endif /* _SYSCALL32 */
2738 
2739 		if (err == 0)
2740 			err = raid_set_vtoc(un, &vtoc);
2741 
2742 		return (err);
2743 	}
2744 
2745 	case DKIOCGAPART:
2746 	{
2747 		struct dk_map	dmp;
2748 
2749 		if ((err = raid_get_cgapart(un, &dmp)) != 0) {
2750 			return (err);
2751 		}
2752 
2753 		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2754 			if (ddi_copyout((caddr_t)&dmp, data, sizeof (dmp),
2755 				mode) != 0)
2756 				err = EFAULT;
2757 		}
2758 #ifdef _SYSCALL32
2759 		else {
2760 			struct dk_map32 dmp32;
2761 
2762 			dmp32.dkl_cylno = dmp.dkl_cylno;
2763 			dmp32.dkl_nblk = dmp.dkl_nblk;
2764 
2765 			if (ddi_copyout((caddr_t)&dmp32, data, sizeof (dmp32),
2766 				mode) != 0)
2767 				err = EFAULT;
2768 		}
2769 #endif /* _SYSCALL32 */
2770 
2771 		return (err);
2772 	}
2773 	case DKIOCGETEFI:
2774 	{
2775 		/*
2776 		 * This can be handled centrally; there is no need to
2777 		 * duplicate the code for every metadevice type.
2778 		 */
2779 		return (md_dkiocgetefi(mnum, data, mode));
2780 	}
2781 
2782 	case DKIOCSETEFI:
2783 	{
2784 		/*
2785 		 * This can be handled centrally; there is no need to
2786 		 * duplicate the code for every metadevice type.
2787 		 */
2788 		return (md_dkiocsetefi(mnum, data, mode));
2789 	}
2790 
2791 	case DKIOCPARTITION:
2792 	{
2793 		return (md_dkiocpartition(mnum, data, mode));
2794 	}
2795 
2796 	default:
2797 		return (ENOTTY);
2798 	}
2799 }
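
/*
 * Because the disk ioctls above are standard dkio(7I) requests, a
 * consumer written for a raw disk device works unchanged on a RAID
 * metadevice node.  A minimal user-level sketch (the /dev/md/rdsk
 * naming is the usual SVM convention, assumed here):
 *
 *	#include <sys/dkio.h>
 *	#include <fcntl.h>
 *
 *	struct dk_cinfo ci;
 *	int fd = open("/dev/md/rdsk/d0", O_RDONLY);
 *
 *	if (fd >= 0 && ioctl(fd, DKIOCINFO, &ci) == 0)
 *		ci.dki_unit now identifies the unit number
 */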
2800 
2801 /*
2802  * Rename/exchange named service entry points and support functions follow.
2803  * Most functions are handled generically, except for RAID-specific locking
2804  * and checking.
2805  */
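
/*
 * The rename framework is expected to drive these entry points roughly
 * in the following order (a sketch, not a literal transcript):
 *
 *	MDRNM_CHECK	raid_rename_check()	may this unit take part?
 *	MDRNM_LOCK	raid_rename_lock()	freeze io and unit state
 *	(generic namespace and record updates happen here)
 *	MDRNM_UNLOCK	raid_rename_unlock()	thaw, re-init pw areas
 */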
2806 
2807 /*
2808  * NAME:	raid_may_renexch_self
2809  * DESCRIPTION: support routine for rename check ("MDRNM_CHECK") named service
2810  * PARAMETERS:	mr_unit_t	*un - unit struct of raid unit to be renamed
2811  *		mdi_unit_t	*ui - in-core unit struct of same raid unit
2812  *		md_rentxn_t	*rtxnp - rename transaction state
2813  *
2814  * LOCKS:	none
2815  *
2816  */
2817 static int
2818 raid_may_renexch_self(
2819 	mr_unit_t	*un,
2820 	mdi_unit_t	*ui,
2821 	md_rentxn_t	*rtxnp)
2822 {
2823 	minor_t	from_min;
2824 	minor_t	to_min;
2825 	bool_t	toplevel;
2826 	bool_t	related;
2827 
2828 	from_min = rtxnp->from.mnum;
2829 	to_min = rtxnp->to.mnum;
2830 
2831 	if (!un || !ui) {
2832 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
2833 								from_min);
2834 		return (EINVAL);
2835 	}
2836 
2837 	ASSERT(!(MD_CAPAB(un) & MD_CAN_META_CHILD));
2838 	if (MD_CAPAB(un) & MD_CAN_META_CHILD) {
2839 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
2840 		return (EINVAL);
2841 	}
2842 
2843 	if (MD_PARENT(un) == MD_MULTI_PARENT) {
2844 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
2845 		return (EINVAL);
2846 	}
2847 
2848 	toplevel = !MD_HAS_PARENT(MD_PARENT(un));
2849 
2850 	/* we're related if trying to swap with our parent */
2851 	related = (!toplevel) && (MD_PARENT(un) == to_min);
2852 
2853 	switch (rtxnp->op) {
2854 	case MDRNOP_EXCHANGE:
2855 
2856 		if (!related) {
2857 			(void) mdmderror(&rtxnp->mde,
2858 					MDE_RENAME_TARGET_UNRELATED, to_min);
2859 			return (EINVAL);
2860 		}
2861 
2862 		break;
2863 
2864 	case MDRNOP_RENAME:
2865 		/*
2866 		 * if from is top-level and is open, then the kernel is using
2867 		 * the md_dev64_t.
2868 		 */
2869 
2870 		if (toplevel && md_unit_isopen(ui)) {
2871 			(void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
2872 								from_min);
2873 			return (EBUSY);
2874 		}
2875 		break;
2876 
2877 	default:
2878 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
2879 								from_min);
2880 		return (EINVAL);
2881 	}
2882 
2883 	return (0);	/* ok */
2884 }
2885 
2886 /*
2887  * NAME:	raid_rename_check
2888  * DESCRIPTION: ("MDRNM_CHECK") rename/exchange named service entry point
2889  * PARAMETERS:	md_rendelta_t	*delta - describes changes to be made to this
2890  *					 raid device for rename transaction
2891  *		md_rentxn_t	*rtxnp - rename transaction state
2892  *
2893  * LOCKS:	none
2894  *
2895  */
2896 intptr_t
2897 raid_rename_check(
2898 	md_rendelta_t	*delta,
2899 	md_rentxn_t	*rtxnp)
2900 {
2901 	int		 err	= 0;
2902 	int		 column;
2903 	mr_unit_t	*un;
2904 
2905 	ASSERT(delta);
2906 	ASSERT(rtxnp);
2907 	ASSERT(delta->unp);
2908 	ASSERT(delta->uip);
2909 
2910 	if (!delta || !rtxnp || !delta->unp || !delta->uip) {
2911 		(void) mdsyserror(&rtxnp->mde, EINVAL);
2912 		return (EINVAL);
2913 	}
2914 
2915 	un = (mr_unit_t *)delta->unp;
2916 
2917 	for (column = 0; column < un->un_totalcolumncnt; column++) {
2918 		rcs_state_t	colstate;
2919 
2920 		colstate = un->un_column[column].un_devstate;
2921 
2922 		if (colstate & RCS_LAST_ERRED) {
2923 			(void) mdmderror(&rtxnp->mde, MDE_RAID_LAST_ERRED,
2924 						md_getminor(delta->dev));
2925 			return (EINVAL);
2926 		}
2927 
2928 		if (colstate & RCS_INIT_ERRED) {
2929 			(void) mdmderror(&rtxnp->mde, MDE_RAID_DOI,
2930 						md_getminor(delta->dev));
2931 			return (EINVAL);
2932 		}
2933 
2934 		/* How did we get this far before detecting this? */
2935 		if (colstate & RCS_RESYNC) {
2936 			(void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
2937 						md_getminor(delta->dev));
2938 			return (EBUSY);
2939 		}
2940 
2941 		if (colstate & RCS_ERRED) {
2942 			(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
2943 						md_getminor(delta->dev));
2944 			return (EINVAL);
2945 		}
2946 
2947 		if (!(colstate & RCS_OKAY)) {
2948 			(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
2949 						md_getminor(delta->dev));
2950 			return (EINVAL);
2951 		}
2952 
2953 		if (HOTSPARED(un, column)) {
2954 			(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
2955 						md_getminor(delta->dev));
2956 			return (EINVAL);
2957 		}
2958 	}
2959 
2960 	/* self does additional checks */
2961 	if (delta->old_role == MDRR_SELF) {
2962 		err = raid_may_renexch_self((mr_unit_t *)delta->unp,
2963 							delta->uip, rtxnp);
2964 	}
2965 	return (err);
2966 }
2967 
2968 /*
2969  * NAME:	raid_rename_lock
2970  * DESCRIPTION: ("MDRNM_LOCK") rename/exchange named service entry point
2971  * PARAMETERS:	md_rendelta_t	*delta - describes changes to be made to this
2972  *					 raid device for rename transaction
2973  *		md_rentxn_t	*rtxnp - rename transaction state
2974  *
2975  * LOCKS:	io and unit locks (taken explicitly *not* via ioctl wrappers)
2976  *
2977  */
2978 intptr_t
2979 raid_rename_lock(
2980 	md_rendelta_t	*delta,
2981 	md_rentxn_t	*rtxnp)
2982 {
2983 	minor_t		mnum;
2984 
2985 	ASSERT(delta);
2986 	ASSERT(rtxnp);
2987 
2988 	mnum = md_getminor(delta->dev);
2989 	if (mnum == rtxnp->to.mnum && rtxnp->op == MDRNOP_RENAME) {
2990 		return (0);
2991 	}
2992 
2993 	ASSERT(delta->uip);
2994 	if (!delta->uip) {
2995 		(void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, mnum);
2996 		return (ENODEV);
2997 	}
2998 
2999 	ASSERT(delta->unp);
3000 	if (!delta->unp) {
3001 		return (ENODEV);
3002 	}
3004 
3005 	ASSERT(!IO_WRITER_HELD(delta->unp));
3006 	(void) md_io_writerlock(delta->uip);
3007 	ASSERT(IO_WRITER_HELD(delta->unp));
3008 
3009 
3010 	ASSERT(!UNIT_WRITER_HELD(delta->unp));
3011 	(void) md_unit_writerlock(delta->uip);
3012 	ASSERT(UNIT_WRITER_HELD(delta->unp));
3013 
3014 	return (0);
3015 }
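
/*
 * The io writerlock is taken before the unit writerlock above, and
 * raid_rename_unlock() drops them in the reverse order.  Keeping this
 * ordering consistent across the lock/unlock pair keeps the rename
 * transaction deadlock-free.
 */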
3016 
3017 /*
3018  * NAME:	raid_rename_unlock
3019  * DESCRIPTION: ("MDRNM_UNLOCK") rename/exchange named service entry point
3020  * PARAMETERS:	md_rendelta_t	*delta - describes changes to be made to this
3021  *					 raid device for rename transaction
3022  *		md_rentxn_t	*rtxnp - rename transaction state
3023  *
3024  * LOCKS:	drops io and unit locks
3025  *
3026  */
3027 /* ARGSUSED */
3028 void
3029 raid_rename_unlock(
3030 	md_rendelta_t	*delta,
3031 	md_rentxn_t	*rtxnp)
3032 {
3033 	mr_unit_t	*un = (mr_unit_t *)delta->unp;
3034 	minor_t		mnum = MD_SID(un);
3035 	int		col;
3036 
3037 	ASSERT(delta);
3038 	ASSERT(delta->unp);
3039 	ASSERT(delta->uip);
3040 
3041 	ASSERT(UNIT_WRITER_HELD(delta->unp));
3042 	md_unit_writerexit(delta->uip);
3043 	ASSERT(!UNIT_WRITER_HELD(delta->unp));
3044 
3045 	if (! (delta->txn_stat.role_swapped) || ! (delta->txn_stat.is_open)) {
3046 		goto out;
3047 	}
3048 	if (raid_internal_open(mnum, (FREAD | FWRITE),
3049 			OTYP_LYR, MD_OFLG_ISINIT) == 0) {
3050 		for (col = 0; col < un->un_totalcolumncnt; col++) {
3051 			if (un->un_column[col].un_devstate & RCS_OKAY)
3052 				(void) init_pw_area(un,
3053 						un->un_column[col].un_dev,
3054 						un->un_column[col].un_pwstart,
3055 						col);
3056 		}
3057 		(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
3058 	}
3059 
3060 out:
3061 	ASSERT(IO_WRITER_HELD(delta->unp));
3062 	md_io_writerexit(delta->uip);
3063 	ASSERT(!IO_WRITER_HELD(delta->unp));
3064 }
3065 /* end of rename/exchange named service and support functions */
3066