1*0Sstevel@tonic-gate /*
2*0Sstevel@tonic-gate  * CDDL HEADER START
3*0Sstevel@tonic-gate  *
4*0Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*0Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
6*0Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
7*0Sstevel@tonic-gate  * with the License.
8*0Sstevel@tonic-gate  *
9*0Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*0Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
11*0Sstevel@tonic-gate  * See the License for the specific language governing permissions
12*0Sstevel@tonic-gate  * and limitations under the License.
13*0Sstevel@tonic-gate  *
14*0Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
15*0Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*0Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
17*0Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
18*0Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
19*0Sstevel@tonic-gate  *
20*0Sstevel@tonic-gate  * CDDL HEADER END
21*0Sstevel@tonic-gate  */
22*0Sstevel@tonic-gate /*
23*0Sstevel@tonic-gate  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24*0Sstevel@tonic-gate  * Use is subject to license terms.
25*0Sstevel@tonic-gate  */
26*0Sstevel@tonic-gate 
27*0Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
28*0Sstevel@tonic-gate 
29*0Sstevel@tonic-gate /*
30*0Sstevel@tonic-gate  * NAME:	raid.c
31*0Sstevel@tonic-gate  *
32*0Sstevel@tonic-gate  * DESCRIPTION: Main RAID driver source file containing open, close and I/O
33*0Sstevel@tonic-gate  *		operations.
34*0Sstevel@tonic-gate  *
35*0Sstevel@tonic-gate  * ROUTINES PROVIDED FOR EXTERNAL USE:
36*0Sstevel@tonic-gate  *  raid_open()			- open the RAID metadevice for access.
 *  raid_internal_open()	- internal open routine of RAID metadevice.
38*0Sstevel@tonic-gate  *  md_raid_strategy()		- perform normal I/O operations,
39*0Sstevel@tonic-gate  *				    such as read and write.
40*0Sstevel@tonic-gate  *  raid_close()		- close the RAID metadevice.
41*0Sstevel@tonic-gate  *  raid_internal_close()	- internal close routine of RAID metadevice.
42*0Sstevel@tonic-gate  *  raid_snarf()		- initialize and clean up MDD records.
43*0Sstevel@tonic-gate  *  raid_halt()			- reset the RAID metadevice
44*0Sstevel@tonic-gate  *  raid_line()			- return the line # of this segment
45*0Sstevel@tonic-gate  *  raid_dcolumn()		- return the data column # of this segment
46*0Sstevel@tonic-gate  *  raid_pcolumn()		- return the parity column # of this segment
47*0Sstevel@tonic-gate  */
48*0Sstevel@tonic-gate 
49*0Sstevel@tonic-gate #include <sys/param.h>
50*0Sstevel@tonic-gate #include <sys/systm.h>
51*0Sstevel@tonic-gate #include <sys/conf.h>
52*0Sstevel@tonic-gate #include <sys/file.h>
53*0Sstevel@tonic-gate #include <sys/user.h>
54*0Sstevel@tonic-gate #include <sys/uio.h>
55*0Sstevel@tonic-gate #include <sys/t_lock.h>
56*0Sstevel@tonic-gate #include <sys/buf.h>
57*0Sstevel@tonic-gate #include <sys/dkio.h>
58*0Sstevel@tonic-gate #include <sys/vtoc.h>
59*0Sstevel@tonic-gate #include <sys/kmem.h>
60*0Sstevel@tonic-gate #include <vm/page.h>
61*0Sstevel@tonic-gate #include <sys/cmn_err.h>
62*0Sstevel@tonic-gate #include <sys/sysmacros.h>
63*0Sstevel@tonic-gate #include <sys/types.h>
64*0Sstevel@tonic-gate #include <sys/mkdev.h>
65*0Sstevel@tonic-gate #include <sys/stat.h>
66*0Sstevel@tonic-gate #include <sys/open.h>
67*0Sstevel@tonic-gate #include <sys/modctl.h>
68*0Sstevel@tonic-gate #include <sys/ddi.h>
69*0Sstevel@tonic-gate #include <sys/sunddi.h>
70*0Sstevel@tonic-gate #include <sys/debug.h>
71*0Sstevel@tonic-gate #include <sys/lvm/md_raid.h>
72*0Sstevel@tonic-gate #include <sys/lvm/mdvar.h>
73*0Sstevel@tonic-gate #include <sys/lvm/md_convert.h>
74*0Sstevel@tonic-gate 
75*0Sstevel@tonic-gate #include <sys/sysevent/eventdefs.h>
76*0Sstevel@tonic-gate #include <sys/sysevent/svm.h>
77*0Sstevel@tonic-gate 
78*0Sstevel@tonic-gate md_ops_t		raid_md_ops;
79*0Sstevel@tonic-gate #ifndef lint
80*0Sstevel@tonic-gate static char		_depends_on[] = "drv/md";
81*0Sstevel@tonic-gate md_ops_t		*md_interface_ops = &raid_md_ops;
82*0Sstevel@tonic-gate #endif	/* lint */
83*0Sstevel@tonic-gate 
84*0Sstevel@tonic-gate extern unit_t		md_nunits;
85*0Sstevel@tonic-gate extern unit_t		md_nsets;
86*0Sstevel@tonic-gate extern md_set_t		md_set[];
87*0Sstevel@tonic-gate extern int		md_status;
88*0Sstevel@tonic-gate extern major_t		md_major;
89*0Sstevel@tonic-gate extern mdq_anchor_t	md_done_daemon;
90*0Sstevel@tonic-gate extern mdq_anchor_t	md_mstr_daemon;
91*0Sstevel@tonic-gate extern int		md_sleep_for_test;
92*0Sstevel@tonic-gate extern clock_t		md_hz;
93*0Sstevel@tonic-gate 
94*0Sstevel@tonic-gate extern md_event_queue_t	*md_event_queue;
95*0Sstevel@tonic-gate 
96*0Sstevel@tonic-gate 
97*0Sstevel@tonic-gate int pchunks		= 16;
98*0Sstevel@tonic-gate int phigh		= 1024;
99*0Sstevel@tonic-gate int plow		= 128;
100*0Sstevel@tonic-gate int cchunks		= 64;
101*0Sstevel@tonic-gate int chigh		= 1024;
102*0Sstevel@tonic-gate int clow		= 512;
103*0Sstevel@tonic-gate int bchunks		= 32;
104*0Sstevel@tonic-gate int bhigh		= 256;
105*0Sstevel@tonic-gate int blow		= 128;
106*0Sstevel@tonic-gate 
107*0Sstevel@tonic-gate int raid_total_io		= 0;
108*0Sstevel@tonic-gate int raid_reads			= 0;
109*0Sstevel@tonic-gate int raid_writes			= 0;
110*0Sstevel@tonic-gate int raid_no_bpmaps		= 0;
111*0Sstevel@tonic-gate int raid_512			= 0;
112*0Sstevel@tonic-gate int raid_1024			= 0;
113*0Sstevel@tonic-gate int raid_1024_8192		= 0;
114*0Sstevel@tonic-gate int raid_8192			= 0;
115*0Sstevel@tonic-gate int raid_8192_bigger		= 0;
116*0Sstevel@tonic-gate int raid_line_lock_wait	= 0;
117*0Sstevel@tonic-gate 
118*0Sstevel@tonic-gate int data_buffer_waits		= 0;
119*0Sstevel@tonic-gate int parity_buffer_waits	= 0;
120*0Sstevel@tonic-gate 
121*0Sstevel@tonic-gate /* writer line locks */
122*0Sstevel@tonic-gate int raid_writer_locks		= 0; /* total writer locks */
123*0Sstevel@tonic-gate int raid_write_waits		= 0; /* total writer locks that waited */
124*0Sstevel@tonic-gate int raid_full_line_writes	= 0; /* total full line writes */
125*0Sstevel@tonic-gate int raid_write_queue_length	= 0; /* wait queue length */
126*0Sstevel@tonic-gate int raid_max_write_q_length	= 0; /* maximum queue length */
127*0Sstevel@tonic-gate int raid_write_locks_active	= 0; /* writer locks at any time */
128*0Sstevel@tonic-gate int raid_max_write_locks	= 0; /* maximum writer locks active */
129*0Sstevel@tonic-gate 
130*0Sstevel@tonic-gate /* read line locks */
131*0Sstevel@tonic-gate int raid_reader_locks		= 0; /* total reader locks held */
132*0Sstevel@tonic-gate int raid_reader_locks_active	= 0; /* reader locks held */
133*0Sstevel@tonic-gate int raid_max_reader_locks	= 0; /* maximum reader locks held in run */
134*0Sstevel@tonic-gate int raid_read_overlaps		= 0; /* number of times 2 reads hit same line */
135*0Sstevel@tonic-gate int raid_read_waits		= 0; /* times a reader waited on writer */
136*0Sstevel@tonic-gate 
137*0Sstevel@tonic-gate /* prewrite stats */
138*0Sstevel@tonic-gate int raid_prewrite_waits		= 0; /* number of waits for a pw slot */
139*0Sstevel@tonic-gate int raid_pw			= 0; /* number of pw slots in use */
140*0Sstevel@tonic-gate int raid_prewrite_max		= 0; /* maximum number of pw slots in use */
141*0Sstevel@tonic-gate int raid_pw_invalidates		= 0;
142*0Sstevel@tonic-gate 
143*0Sstevel@tonic-gate static clock_t md_wr_wait	= 0;
144*0Sstevel@tonic-gate 
145*0Sstevel@tonic-gate int nv_available	= 0; /* presence of nv-ram support in device */
146*0Sstevel@tonic-gate int nv_prewrite		= 1; /* mark prewrites with nv_available */
147*0Sstevel@tonic-gate int nv_parity		= 1; /* mark parity with nv_available */
148*0Sstevel@tonic-gate 
149*0Sstevel@tonic-gate kmem_cache_t	*raid_parent_cache = NULL;
150*0Sstevel@tonic-gate kmem_cache_t	*raid_child_cache = NULL;
151*0Sstevel@tonic-gate kmem_cache_t	*raid_cbuf_cache = NULL;
152*0Sstevel@tonic-gate 
153*0Sstevel@tonic-gate int			raid_internal_open(minor_t mnum, int flag, int otyp,
154*0Sstevel@tonic-gate 			    int md_oflags);
155*0Sstevel@tonic-gate 
156*0Sstevel@tonic-gate static void		freebuffers(md_raidcs_t *cs);
157*0Sstevel@tonic-gate static int		raid_read(mr_unit_t *un, md_raidcs_t *cs);
158*0Sstevel@tonic-gate static void		raid_read_io(mr_unit_t *un, md_raidcs_t *cs);
159*0Sstevel@tonic-gate static int		raid_write(mr_unit_t *un, md_raidcs_t *cs);
160*0Sstevel@tonic-gate static void		raid_write_io(mr_unit_t *un, md_raidcs_t *cs);
161*0Sstevel@tonic-gate static void		raid_stage(md_raidcs_t *cs);
162*0Sstevel@tonic-gate static void		raid_enqueue(md_raidcs_t *cs);
163*0Sstevel@tonic-gate static diskaddr_t	raid_line(diskaddr_t segment, mr_unit_t *un);
164*0Sstevel@tonic-gate uint_t			raid_dcolumn(diskaddr_t segment, mr_unit_t *un);
165*0Sstevel@tonic-gate static void		getpbuffer(md_raidcs_t *cs);
166*0Sstevel@tonic-gate static void		getdbuffer(md_raidcs_t *cs);
167*0Sstevel@tonic-gate static void		raid_done(buf_t *bp);
168*0Sstevel@tonic-gate static void		raid_io_startup(mr_unit_t *un);
169*0Sstevel@tonic-gate 
170*0Sstevel@tonic-gate static rus_state_t
171*0Sstevel@tonic-gate raid_col2unit(rcs_state_t state, rus_state_t unitstate)
172*0Sstevel@tonic-gate {
173*0Sstevel@tonic-gate 	switch (state) {
174*0Sstevel@tonic-gate 	case RCS_INIT:
175*0Sstevel@tonic-gate 		return (RUS_INIT);
176*0Sstevel@tonic-gate 	case RCS_OKAY:
177*0Sstevel@tonic-gate 		return (RUS_OKAY);
178*0Sstevel@tonic-gate 	case RCS_RESYNC:
179*0Sstevel@tonic-gate 		if (unitstate & RUS_LAST_ERRED)
180*0Sstevel@tonic-gate 			return (RUS_LAST_ERRED);
181*0Sstevel@tonic-gate 		else
182*0Sstevel@tonic-gate 			return (RUS_ERRED);
183*0Sstevel@tonic-gate 	case RCS_ERRED:
184*0Sstevel@tonic-gate 		return (RUS_ERRED);
185*0Sstevel@tonic-gate 	case RCS_LAST_ERRED:
186*0Sstevel@tonic-gate 		return (RUS_ERRED);
187*0Sstevel@tonic-gate 	default:
188*0Sstevel@tonic-gate 		break;
189*0Sstevel@tonic-gate 	}
190*0Sstevel@tonic-gate 	panic("raid_col2unit");
191*0Sstevel@tonic-gate 	/*NOTREACHED*/
192*0Sstevel@tonic-gate }
193*0Sstevel@tonic-gate 
/*
 * NAME:	raid_set_state
 *
 * DESCRIPTION: Set a column of the RAID metadevice to a new state and
 *		derive the resulting unit state from the states of all
 *		columns.  On the unit's first transition into an errored
 *		state a console warning is logged and the failed component
 *		is closed.
 *
 * PARAMETERS:	mr_unit_t	*un - RAID unit to modify
 *		int		col - index of the column changing state
 *		rcs_state_t	newstate - state the column is entering
 *		int		force - when non-zero, install newstate and
 *			the state derived by raid_col2unit() without any of
 *			the consistency checks or device close processing
 *
 * LOCKS:	Expects the unit writer lock to be held when the unit is
 *		in-core (see ASSERT below).
 */
void
raid_set_state(mr_unit_t *un, int col, rcs_state_t newstate, int force)
{

	rus_state_t	unitstate, origstate;
	rcs_state_t	colstate;
	rcs_state_t	orig_colstate;
	int		errcnt = 0,
			okaycnt = 0,
			resynccnt = 0;
	int		i;
	char		*devname;

	ASSERT(un);
	ASSERT(col < un->un_totalcolumncnt);
	/* newstate must contain at least one valid column state bit ... */
	ASSERT(newstate &
	    (RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED |
	    RCS_LAST_ERRED | RCS_REGEN));
	/* ... and no bits outside the valid set. */
	ASSERT((newstate &
	    ~(RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED |
	    RCS_LAST_ERRED | RCS_REGEN))
	    == 0);

	ASSERT(MDI_UNIT(MD_SID(un)) ? UNIT_WRITER_HELD(un) : 1);

	unitstate = un->un_state;
	origstate = unitstate;

	/* A forced update bypasses all of the derivation logic below. */
	if (force) {
		un->un_column[col].un_devstate = newstate;
		un->un_state = raid_col2unit(newstate, unitstate);
		uniqtime32(&un->un_column[col].un_devtimestamp);
		uniqtime32(&un->un_timestamp);
		return;
	}

	/* The current unit state must itself be valid and exact. */
	ASSERT(un->un_state &
	    (RUS_INIT | RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED |
	    RUS_REGEN));
	ASSERT((un->un_state & ~(RUS_INIT |
	    RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | RUS_REGEN)) == 0);

	/* no-op if the column is already in the requested state */
	if (un->un_column[col].un_devstate == newstate)
		return;

	/* RCS_REGEN is honored only when every column is currently okay. */
	if (newstate == RCS_REGEN) {
		if (raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt)
			return;
		un->un_state = RUS_REGEN;
		return;
	}

	orig_colstate = un->un_column[col].un_devstate;

	/*
	 * if there is another column in the error state then this
	 * column should go to the last errored state
	 */
	for (i = 0; i < un->un_totalcolumncnt; i++) {
		if (i == col)
			colstate = newstate;
		else
			colstate = un->un_column[i].un_devstate;
		if (colstate & (RCS_ERRED | RCS_LAST_ERRED | RCS_INIT_ERRED))
			errcnt++;
		if (colstate & RCS_OKAY)
			okaycnt++;
		if (colstate & RCS_RESYNC)
			resynccnt++;
	}
	/* at most one column may be resyncing at any time */
	ASSERT(resynccnt < 2);

	if (okaycnt == un->un_totalcolumncnt)
		unitstate = RUS_OKAY;
	else if (errcnt > 1) {
		/* two or more failed columns: redundancy is exhausted */
		unitstate = RUS_LAST_ERRED;
		if (newstate & RCS_ERRED)
			newstate = RCS_LAST_ERRED;
	} else if (errcnt == 1)
		if (!(unitstate & RUS_LAST_ERRED))
			unitstate = RUS_ERRED;

	/* a unit in the DOI state stays there regardless of column states */
	if (un->un_state == RUS_DOI)
		unitstate = RUS_DOI;

	un->un_column[col].un_devstate = newstate;
	uniqtime32(&un->un_column[col].un_devtimestamp);
	/*
	 * if there are last errored column being brought back online
	 * by open or snarf, then be sure to clear the RUS_LAST_ERRED
	 * bit to allow writes.  If there is a real error then the
	 * column will go back into last erred.
	 */
	if ((raid_state_cnt(un, RCS_LAST_ERRED) == 0) &&
	    (raid_state_cnt(un, RCS_ERRED) == 1))
		unitstate = RUS_ERRED;

	un->un_state = unitstate;
	uniqtime32(&un->un_timestamp);

	/* first transition into an errored unit state: warn and clean up */
	if ((! (origstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) &&
	    (unitstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) {
		devname = md_devname(MD_UN2SET(un),
			un->un_column[col].un_dev, NULL, 0);

		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
		    md_shortname(MD_SID(un)), devname);

		if (unitstate & RUS_LAST_ERRED) {
			cmn_err(CE_WARN, "md: %s: %s last erred",
			    md_shortname(MD_SID(un)), devname);

		} else if (un->un_column[col].un_devflags &
		    MD_RAID_DEV_ISOPEN) {
			/*
			 * Close the broken device and clear the open flag on
			 * it.  We have to check that the device is open,
			 * otherwise the first open on it has resulted in the
			 * error that is being processed and the actual un_dev
			 * will be NODEV64.
			 */
			md_layered_close(un->un_column[col].un_dev,
			    MD_OFLG_NULL);
			un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN;
		}
	} else if (orig_colstate == RCS_LAST_ERRED && newstate == RCS_ERRED &&
	    un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN) {
		/*
		 * Similar to logic above except no log messages since we
		 * are just transitioning from Last Erred to Erred.
		 */
		md_layered_close(un->un_column[col].un_dev, MD_OFLG_NULL);
		un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN;
	}

	/*
	 * If a resync has completed, see if there is a Last Erred
	 * component that we can change to the Erred state.
	 */
	if ((orig_colstate == RCS_RESYNC) && (newstate == RCS_OKAY)) {
		for (i = 0; i < un->un_totalcolumncnt; i++) {
			if (i != col &&
			    (un->un_column[i].un_devstate & RCS_LAST_ERRED)) {
				raid_set_state(un, i, RCS_ERRED, 0);
				break;
			}
		}
	}
}
343*0Sstevel@tonic-gate 
344*0Sstevel@tonic-gate /*
345*0Sstevel@tonic-gate  * NAME:	erred_check_line
346*0Sstevel@tonic-gate  *
347*0Sstevel@tonic-gate  * DESCRIPTION: Return the type of write to perform on an erred column based
348*0Sstevel@tonic-gate  *		upon any resync activity.
349*0Sstevel@tonic-gate  *
350*0Sstevel@tonic-gate  *		if a column is being resynced and the write is above the
351*0Sstevel@tonic-gate  *		resync point may have to write to the target being resynced.
352*0Sstevel@tonic-gate  *
353*0Sstevel@tonic-gate  *		Column state may make it impossible to do the write
354*0Sstevel@tonic-gate  *		in which case RCL_EIO or RCL_ENXIO is returned.
355*0Sstevel@tonic-gate  *
356*0Sstevel@tonic-gate  *		If a column cannot be written directly, RCL_ERRED is
357*0Sstevel@tonic-gate  *		returned and processing should proceed accordingly.
358*0Sstevel@tonic-gate  *
359*0Sstevel@tonic-gate  * PARAMETERS:	minor_t		 mnum - minor number identity of metadevice
360*0Sstevel@tonic-gate  *		md_raidcs_t	 *cs - child save structure
361*0Sstevel@tonic-gate  *		mr_column_t	 *dcolumn - pointer to data column structure
362*0Sstevel@tonic-gate  *		mr_column_t	 *pcolumn - pointer to parity column structure
363*0Sstevel@tonic-gate  *
364*0Sstevel@tonic-gate  * RETURNS:	RCL_OKAY, RCL_ERRED
365*0Sstevel@tonic-gate  *
366*0Sstevel@tonic-gate  * LOCKS:	Expects Line Writer Lock and Unit Resource Lock to be held
367*0Sstevel@tonic-gate  *		across call.
368*0Sstevel@tonic-gate  */
369*0Sstevel@tonic-gate 
370*0Sstevel@tonic-gate static int
371*0Sstevel@tonic-gate erred_check_line(mr_unit_t *un, md_raidcs_t *cs, mr_column_t *column)
372*0Sstevel@tonic-gate {
373*0Sstevel@tonic-gate 
374*0Sstevel@tonic-gate 	ASSERT(un != NULL);
375*0Sstevel@tonic-gate 	ASSERT(cs->cs_flags & MD_RCS_LLOCKD);
376*0Sstevel@tonic-gate 
377*0Sstevel@tonic-gate 	if (column->un_devstate & RCS_OKAY)
378*0Sstevel@tonic-gate 		return (RCL_OKAY);
379*0Sstevel@tonic-gate 
380*0Sstevel@tonic-gate 	if (column->un_devstate & RCS_ERRED)
381*0Sstevel@tonic-gate 		return (RCL_ERRED);  /* do not read from errored disk */
382*0Sstevel@tonic-gate 
383*0Sstevel@tonic-gate 	/*
384*0Sstevel@tonic-gate 	 * for the last errored case their are two considerations.
385*0Sstevel@tonic-gate 	 * When the last errored column is the only errored column then
386*0Sstevel@tonic-gate 	 * do treat it like a maintenance column, not doing I/O from
387*0Sstevel@tonic-gate 	 * it.   When it there are other failures then just attempt
388*0Sstevel@tonic-gate 	 * to use it.
389*0Sstevel@tonic-gate 	 */
390*0Sstevel@tonic-gate 	if (column->un_devstate & RCS_LAST_ERRED)
391*0Sstevel@tonic-gate 		return (RCL_ERRED);
392*0Sstevel@tonic-gate 
393*0Sstevel@tonic-gate 	ASSERT(column->un_devstate & RCS_RESYNC);
394*0Sstevel@tonic-gate 
395*0Sstevel@tonic-gate 	/*
396*0Sstevel@tonic-gate 	 * When a resync from a hotspare is being done (copy resync)
397*0Sstevel@tonic-gate 	 * then always treat it as an OKAY column, since no regen
398*0Sstevel@tonic-gate 	 * is required.
399*0Sstevel@tonic-gate 	 */
400*0Sstevel@tonic-gate 	if (column->un_devflags & MD_RAID_COPY_RESYNC) {
401*0Sstevel@tonic-gate 		return (RCL_OKAY);
402*0Sstevel@tonic-gate 	}
403*0Sstevel@tonic-gate 
404*0Sstevel@tonic-gate 	mutex_enter(&un->un_mx);
405*0Sstevel@tonic-gate 	if (cs->cs_line < un->un_resync_line_index) {
406*0Sstevel@tonic-gate 		mutex_exit(&un->un_mx);
407*0Sstevel@tonic-gate 		return (RCL_OKAY);
408*0Sstevel@tonic-gate 	}
409*0Sstevel@tonic-gate 	mutex_exit(&un->un_mx);
410*0Sstevel@tonic-gate 	return (RCL_ERRED);
411*0Sstevel@tonic-gate 
412*0Sstevel@tonic-gate }
413*0Sstevel@tonic-gate 
414*0Sstevel@tonic-gate /*
415*0Sstevel@tonic-gate  * NAMES:	raid_state_cnt
416*0Sstevel@tonic-gate  *
417*0Sstevel@tonic-gate  * DESCRIPTION: counts number of column in a specific state
418*0Sstevel@tonic-gate  *
419*0Sstevel@tonic-gate  * PARAMETERS:	md_raid_t *un
420*0Sstevel@tonic-gate  *		rcs_state state
421*0Sstevel@tonic-gate  */
422*0Sstevel@tonic-gate int
423*0Sstevel@tonic-gate raid_state_cnt(mr_unit_t *un, rcs_state_t state)
424*0Sstevel@tonic-gate {
425*0Sstevel@tonic-gate 	int	i, retval = 0;
426*0Sstevel@tonic-gate 
427*0Sstevel@tonic-gate 	for (i = 0; i < un->un_totalcolumncnt; i++)
428*0Sstevel@tonic-gate 		if (un->un_column[i].un_devstate & state)
429*0Sstevel@tonic-gate 			retval++;
430*0Sstevel@tonic-gate 	return (retval);
431*0Sstevel@tonic-gate }
432*0Sstevel@tonic-gate 
433*0Sstevel@tonic-gate /*
434*0Sstevel@tonic-gate  * NAMES:	raid_io_overlaps
435*0Sstevel@tonic-gate  *
436*0Sstevel@tonic-gate  * DESCRIPTION: checkst for overlap of 2 child save structures
437*0Sstevel@tonic-gate  *
438*0Sstevel@tonic-gate  * PARAMETERS:	md_raidcs_t cs1
439*0Sstevel@tonic-gate  *		md_raidcs_t cs2
440*0Sstevel@tonic-gate  *
441*0Sstevel@tonic-gate  * RETURNS:	0 - no overlap
442*0Sstevel@tonic-gate  *		1 - overlap
443*0Sstevel@tonic-gate  */
444*0Sstevel@tonic-gate int
445*0Sstevel@tonic-gate raid_io_overlaps(md_raidcs_t *cs1, md_raidcs_t *cs2)
446*0Sstevel@tonic-gate {
447*0Sstevel@tonic-gate 	if (cs1->cs_blkno > cs2->cs_lastblk)
448*0Sstevel@tonic-gate 		return (0);
449*0Sstevel@tonic-gate 	if (cs1->cs_lastblk < cs2->cs_blkno)
450*0Sstevel@tonic-gate 		return (0);
451*0Sstevel@tonic-gate 	return (1);
452*0Sstevel@tonic-gate }
453*0Sstevel@tonic-gate 
454*0Sstevel@tonic-gate /*
455*0Sstevel@tonic-gate  * NAMES:	raid_parent_constructor
456*0Sstevel@tonic-gate  * DESCRIPTION: parent structure constructor routine
457*0Sstevel@tonic-gate  * PARAMETERS:
458*0Sstevel@tonic-gate  */
459*0Sstevel@tonic-gate /*ARGSUSED1*/
460*0Sstevel@tonic-gate static int
461*0Sstevel@tonic-gate raid_parent_constructor(void *p, void *d1, int d2)
462*0Sstevel@tonic-gate {
463*0Sstevel@tonic-gate 	mutex_init(&((md_raidps_t *)p)->ps_mx,
464*0Sstevel@tonic-gate 	    NULL, MUTEX_DEFAULT, NULL);
465*0Sstevel@tonic-gate 	mutex_init(&((md_raidps_t *)p)->ps_mapin_mx,
466*0Sstevel@tonic-gate 	    NULL, MUTEX_DEFAULT, NULL);
467*0Sstevel@tonic-gate 	return (0);
468*0Sstevel@tonic-gate }
469*0Sstevel@tonic-gate 
470*0Sstevel@tonic-gate void
471*0Sstevel@tonic-gate raid_parent_init(md_raidps_t *ps)
472*0Sstevel@tonic-gate {
473*0Sstevel@tonic-gate 	bzero(ps, offsetof(md_raidps_t, ps_mx));
474*0Sstevel@tonic-gate 	((md_raidps_t *)ps)->ps_flags = MD_RPS_INUSE;
475*0Sstevel@tonic-gate 	((md_raidps_t *)ps)->ps_magic = RAID_PSMAGIC;
476*0Sstevel@tonic-gate }
477*0Sstevel@tonic-gate 
478*0Sstevel@tonic-gate /*ARGSUSED1*/
479*0Sstevel@tonic-gate static void
480*0Sstevel@tonic-gate raid_parent_destructor(void *p, void *d)
481*0Sstevel@tonic-gate {
482*0Sstevel@tonic-gate 	mutex_destroy(&((md_raidps_t *)p)->ps_mx);
483*0Sstevel@tonic-gate 	mutex_destroy(&((md_raidps_t *)p)->ps_mapin_mx);
484*0Sstevel@tonic-gate }
485*0Sstevel@tonic-gate 
486*0Sstevel@tonic-gate /*
487*0Sstevel@tonic-gate  * NAMES:	raid_child_constructor
488*0Sstevel@tonic-gate  * DESCRIPTION: child structure constructor routine
489*0Sstevel@tonic-gate  * PARAMETERS:
490*0Sstevel@tonic-gate  */
491*0Sstevel@tonic-gate /*ARGSUSED1*/
492*0Sstevel@tonic-gate static int
493*0Sstevel@tonic-gate raid_child_constructor(void *p, void *d1, int d2)
494*0Sstevel@tonic-gate {
495*0Sstevel@tonic-gate 	md_raidcs_t	*cs = (md_raidcs_t *)p;
496*0Sstevel@tonic-gate 	mutex_init(&cs->cs_mx, NULL, MUTEX_DEFAULT, NULL);
497*0Sstevel@tonic-gate 	bioinit(&cs->cs_dbuf);
498*0Sstevel@tonic-gate 	bioinit(&cs->cs_pbuf);
499*0Sstevel@tonic-gate 	bioinit(&cs->cs_hbuf);
500*0Sstevel@tonic-gate 	return (0);
501*0Sstevel@tonic-gate }
502*0Sstevel@tonic-gate 
503*0Sstevel@tonic-gate void
504*0Sstevel@tonic-gate raid_child_init(md_raidcs_t *cs)
505*0Sstevel@tonic-gate {
506*0Sstevel@tonic-gate 	bzero(cs, offsetof(md_raidcs_t, cs_mx));
507*0Sstevel@tonic-gate 
508*0Sstevel@tonic-gate 	md_bioreset(&cs->cs_dbuf);
509*0Sstevel@tonic-gate 	md_bioreset(&cs->cs_pbuf);
510*0Sstevel@tonic-gate 	md_bioreset(&cs->cs_hbuf);
511*0Sstevel@tonic-gate 
512*0Sstevel@tonic-gate 	((md_raidcs_t *)cs)->cs_dbuf.b_chain =
513*0Sstevel@tonic-gate 	    ((md_raidcs_t *)cs)->cs_pbuf.b_chain =
514*0Sstevel@tonic-gate 	    ((md_raidcs_t *)cs)->cs_hbuf.b_chain =
515*0Sstevel@tonic-gate 	    (struct buf *)(cs);
516*0Sstevel@tonic-gate 
517*0Sstevel@tonic-gate 	cs->cs_magic = RAID_CSMAGIC;
518*0Sstevel@tonic-gate 	cs->cs_line = MD_DISKADDR_ERROR;
519*0Sstevel@tonic-gate 	cs->cs_dpwslot = -1;
520*0Sstevel@tonic-gate 	cs->cs_ppwslot = -1;
521*0Sstevel@tonic-gate }
522*0Sstevel@tonic-gate 
523*0Sstevel@tonic-gate /*ARGSUSED1*/
524*0Sstevel@tonic-gate static void
525*0Sstevel@tonic-gate raid_child_destructor(void *p, void *d)
526*0Sstevel@tonic-gate {
527*0Sstevel@tonic-gate 	biofini(&((md_raidcs_t *)p)->cs_dbuf);
528*0Sstevel@tonic-gate 	biofini(&((md_raidcs_t *)p)->cs_hbuf);
529*0Sstevel@tonic-gate 	biofini(&((md_raidcs_t *)p)->cs_pbuf);
530*0Sstevel@tonic-gate 	mutex_destroy(&((md_raidcs_t *)p)->cs_mx);
531*0Sstevel@tonic-gate }
532*0Sstevel@tonic-gate 
533*0Sstevel@tonic-gate /*ARGSUSED1*/
534*0Sstevel@tonic-gate static int
535*0Sstevel@tonic-gate raid_cbuf_constructor(void *p, void *d1, int d2)
536*0Sstevel@tonic-gate {
537*0Sstevel@tonic-gate 	bioinit(&((md_raidcbuf_t *)p)->cbuf_bp);
538*0Sstevel@tonic-gate 	return (0);
539*0Sstevel@tonic-gate }
540*0Sstevel@tonic-gate 
541*0Sstevel@tonic-gate static void
542*0Sstevel@tonic-gate raid_cbuf_init(md_raidcbuf_t *cb)
543*0Sstevel@tonic-gate {
544*0Sstevel@tonic-gate 	bzero(cb, offsetof(md_raidcbuf_t, cbuf_bp));
545*0Sstevel@tonic-gate 	md_bioreset(&cb->cbuf_bp);
546*0Sstevel@tonic-gate 	cb->cbuf_magic = RAID_BUFMAGIC;
547*0Sstevel@tonic-gate 	cb->cbuf_pwslot = -1;
548*0Sstevel@tonic-gate 	cb->cbuf_flags = CBUF_WRITE;
549*0Sstevel@tonic-gate }
550*0Sstevel@tonic-gate 
551*0Sstevel@tonic-gate /*ARGSUSED1*/
552*0Sstevel@tonic-gate static void
553*0Sstevel@tonic-gate raid_cbuf_destructor(void *p, void *d)
554*0Sstevel@tonic-gate {
555*0Sstevel@tonic-gate 	biofini(&((md_raidcbuf_t *)p)->cbuf_bp);
556*0Sstevel@tonic-gate }
557*0Sstevel@tonic-gate 
558*0Sstevel@tonic-gate /*
559*0Sstevel@tonic-gate  * NAMES:	raid_run_queue
560*0Sstevel@tonic-gate  * DESCRIPTION: spawn a backend processing daemon for RAID metadevice.
561*0Sstevel@tonic-gate  * PARAMETERS:
562*0Sstevel@tonic-gate  */
563*0Sstevel@tonic-gate /*ARGSUSED*/
564*0Sstevel@tonic-gate static void
565*0Sstevel@tonic-gate raid_run_queue(void *d)
566*0Sstevel@tonic-gate {
567*0Sstevel@tonic-gate 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
568*0Sstevel@tonic-gate 		md_daemon(1, &md_done_daemon);
569*0Sstevel@tonic-gate }
570*0Sstevel@tonic-gate 
571*0Sstevel@tonic-gate /*
572*0Sstevel@tonic-gate  * NAME:	raid_build_pwslot
573*0Sstevel@tonic-gate  * DESCRIPTION: builds mr_pw_reserve for the column
574*0Sstevel@tonic-gate  * PARAMETERS:	un is the pointer to the unit structure
575*0Sstevel@tonic-gate  *		colindex is the column to create the structure for
576*0Sstevel@tonic-gate  */
577*0Sstevel@tonic-gate int
578*0Sstevel@tonic-gate raid_build_pw_reservation(mr_unit_t *un, int colindex)
579*0Sstevel@tonic-gate {
580*0Sstevel@tonic-gate 	mr_pw_reserve_t	*pw;
581*0Sstevel@tonic-gate 	mr_scoreboard_t	*sb;
582*0Sstevel@tonic-gate 	int		i;
583*0Sstevel@tonic-gate 
584*0Sstevel@tonic-gate 	pw = (mr_pw_reserve_t *) kmem_zalloc(sizeof (mr_pw_reserve_t) +
585*0Sstevel@tonic-gate 	    (sizeof (mr_scoreboard_t) * un->un_pwcnt), KM_SLEEP);
586*0Sstevel@tonic-gate 	pw->pw_magic = RAID_PWMAGIC;
587*0Sstevel@tonic-gate 	pw->pw_column = colindex;
588*0Sstevel@tonic-gate 	pw->pw_free = un->un_pwcnt;
589*0Sstevel@tonic-gate 	sb = &pw->pw_sb[0];
590*0Sstevel@tonic-gate 	for (i = 0; i < un->un_pwcnt; i++) {
591*0Sstevel@tonic-gate 		sb[i].sb_column = colindex;
592*0Sstevel@tonic-gate 		sb[i].sb_flags = SB_UNUSED;
593*0Sstevel@tonic-gate 		sb[i].sb_start_blk = 0;
594*0Sstevel@tonic-gate 		sb[i].sb_last_blk = 0;
595*0Sstevel@tonic-gate 		sb[i].sb_cs = NULL;
596*0Sstevel@tonic-gate 	}
597*0Sstevel@tonic-gate 	un->un_column_ic[colindex].un_pw_reserve = pw;
598*0Sstevel@tonic-gate 	return (0);
599*0Sstevel@tonic-gate }
600*0Sstevel@tonic-gate /*
601*0Sstevel@tonic-gate  * NAME:	raid_free_pw_reservation
602*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice pre-write slot structure destroy routine
603*0Sstevel@tonic-gate  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
604*0Sstevel@tonic-gate  *		int colindex  - index of the column whose pre-write slot struct
605*0Sstevel@tonic-gate  *			is to be destroyed.
606*0Sstevel@tonic-gate  */
607*0Sstevel@tonic-gate void
608*0Sstevel@tonic-gate raid_free_pw_reservation(mr_unit_t *un, int colindex)
609*0Sstevel@tonic-gate {
610*0Sstevel@tonic-gate 	mr_pw_reserve_t	*pw = un->un_column_ic[colindex].un_pw_reserve;
611*0Sstevel@tonic-gate 
612*0Sstevel@tonic-gate 	kmem_free(pw, sizeof (mr_pw_reserve_t) +
613*0Sstevel@tonic-gate 	    (sizeof (mr_scoreboard_t) * un->un_pwcnt));
614*0Sstevel@tonic-gate }
615*0Sstevel@tonic-gate 
616*0Sstevel@tonic-gate /*
617*0Sstevel@tonic-gate  * NAME:	raid_cancel_pwslot
618*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice write routine
619*0Sstevel@tonic-gate  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
620*0Sstevel@tonic-gate  */
621*0Sstevel@tonic-gate static void
622*0Sstevel@tonic-gate raid_cancel_pwslot(md_raidcs_t *cs)
623*0Sstevel@tonic-gate {
624*0Sstevel@tonic-gate 	mr_unit_t		*un = cs->cs_un;
625*0Sstevel@tonic-gate 	mr_pw_reserve_t		*pw;
626*0Sstevel@tonic-gate 	mr_scoreboard_t		*sb;
627*0Sstevel@tonic-gate 	mr_column_ic_t		*col;
628*0Sstevel@tonic-gate 	md_raidcbuf_t		*cbuf;
629*0Sstevel@tonic-gate 	int			broadcast = 0;
630*0Sstevel@tonic-gate 
631*0Sstevel@tonic-gate 	if (cs->cs_ps->ps_flags & MD_RPS_READ)
632*0Sstevel@tonic-gate 		return;
633*0Sstevel@tonic-gate 	if (cs->cs_dpwslot != -1) {
634*0Sstevel@tonic-gate 		col = &un->un_column_ic[cs->cs_dcolumn];
635*0Sstevel@tonic-gate 		pw = col->un_pw_reserve;
636*0Sstevel@tonic-gate 		sb = &pw->pw_sb[cs->cs_dpwslot];
637*0Sstevel@tonic-gate 		sb->sb_flags = SB_AVAIL;
638*0Sstevel@tonic-gate 		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
639*0Sstevel@tonic-gate 			broadcast++;
640*0Sstevel@tonic-gate 		sb->sb_cs = NULL;
641*0Sstevel@tonic-gate 	}
642*0Sstevel@tonic-gate 
643*0Sstevel@tonic-gate 	if (cs->cs_ppwslot != -1) {
644*0Sstevel@tonic-gate 		col = &un->un_column_ic[cs->cs_pcolumn];
645*0Sstevel@tonic-gate 		pw = col->un_pw_reserve;
646*0Sstevel@tonic-gate 		sb = &pw->pw_sb[cs->cs_ppwslot];
647*0Sstevel@tonic-gate 		sb->sb_flags = SB_AVAIL;
648*0Sstevel@tonic-gate 		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
649*0Sstevel@tonic-gate 			broadcast++;
650*0Sstevel@tonic-gate 		sb->sb_cs = NULL;
651*0Sstevel@tonic-gate 	}
652*0Sstevel@tonic-gate 
653*0Sstevel@tonic-gate 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
654*0Sstevel@tonic-gate 		if (cbuf->cbuf_pwslot == -1)
655*0Sstevel@tonic-gate 			continue;
656*0Sstevel@tonic-gate 		col = &un->un_column_ic[cbuf->cbuf_column];
657*0Sstevel@tonic-gate 		pw = col->un_pw_reserve;
658*0Sstevel@tonic-gate 		sb = &pw->pw_sb[cbuf->cbuf_pwslot];
659*0Sstevel@tonic-gate 		sb->sb_flags = SB_AVAIL;
660*0Sstevel@tonic-gate 		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
661*0Sstevel@tonic-gate 			broadcast++;
662*0Sstevel@tonic-gate 		sb->sb_cs = NULL;
663*0Sstevel@tonic-gate 	}
664*0Sstevel@tonic-gate 	if (broadcast) {
665*0Sstevel@tonic-gate 		cv_broadcast(&un->un_cv);
666*0Sstevel@tonic-gate 		return;
667*0Sstevel@tonic-gate 	}
668*0Sstevel@tonic-gate 	mutex_enter(&un->un_mx);
669*0Sstevel@tonic-gate 	if (un->un_rflags & MD_RFLAG_NEEDPW)
670*0Sstevel@tonic-gate 		cv_broadcast(&un->un_cv);
671*0Sstevel@tonic-gate 	mutex_exit(&un->un_mx);
672*0Sstevel@tonic-gate }
673*0Sstevel@tonic-gate 
/*
 * NAME:	raid_free_pwinvalidate
 * DESCRIPTION: complete pending pre-write slot invalidations for a child:
 *		marks each listed slot SB_UNUSED, frees the associated
 *		column buffers, and wakes any threads waiting for slots
 *		or for queued I/O to restart.
 * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure whose
 *		cs_pw_inval_list is non-empty.
 */
static void
raid_free_pwinvalidate(md_raidcs_t *cs)
{
	md_raidcbuf_t		*cbuf;
	md_raidcbuf_t		*cbuf_to_free;
	mr_unit_t		*un = cs->cs_un;
	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
	mr_pw_reserve_t		*pw;
	mr_scoreboard_t		*sb;
	int			broadcast = 0;

	cbuf = cs->cs_pw_inval_list;
	ASSERT(cbuf);
	mutex_enter(&un->un_linlck_mx);
	/*
	 * Walk the invalidation list under un_linlck_mx: each entry's
	 * scoreboard slot must be in SB_INVAL_PEND state; return it to
	 * the pool and free the entry's data buffer and cbuf structure.
	 */
	while (cbuf) {
		pw = un->un_column_ic[cbuf->cbuf_column].un_pw_reserve;
		sb = &pw->pw_sb[0];
		ASSERT(sb[cbuf->cbuf_pwslot].sb_flags & SB_INVAL_PEND);
		sb[cbuf->cbuf_pwslot].sb_flags = SB_UNUSED;
		sb[cbuf->cbuf_pwslot].sb_cs = NULL;
		/* remember to wake waiters if the pool was exhausted */
		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
			broadcast++;
		/* advance before freeing the current entry */
		cbuf_to_free = cbuf;
		cbuf = cbuf->cbuf_next;
		kmem_free(cbuf_to_free->cbuf_buffer, dbtob(un->un_iosize));
		kmem_cache_free(raid_cbuf_cache, cbuf_to_free);
	}
	cs->cs_pw_inval_list = (md_raidcbuf_t *)NULL;
	/*
	 * Now that there is a free pre-write slot, check to see if there
	 * are any I/O operations waiting: first wake up raid_io_startup,
	 * then signal the processes waiting in raid_write.
	 */
	if (ui->ui_io_lock->io_list_front)
		raid_io_startup(un);
	mutex_exit(&un->un_linlck_mx);
	if (broadcast) {
		cv_broadcast(&un->un_cv);
		return;
	}
	/* no slot freed from empty; still wake a declared slot-waiter */
	mutex_enter(&un->un_mx);
	if (un->un_rflags & MD_RFLAG_NEEDPW)
		cv_broadcast(&un->un_cv);
	mutex_exit(&un->un_mx);
}
719*0Sstevel@tonic-gate 
720*0Sstevel@tonic-gate 
721*0Sstevel@tonic-gate static int
722*0Sstevel@tonic-gate raid_get_pwslot(md_raidcs_t *cs, int column)
723*0Sstevel@tonic-gate {
724*0Sstevel@tonic-gate 	mr_scoreboard_t	*sb;
725*0Sstevel@tonic-gate 	mr_pw_reserve_t	*pw;
726*0Sstevel@tonic-gate 	mr_unit_t	*un = cs->cs_un;
727*0Sstevel@tonic-gate 	diskaddr_t	start_blk = cs->cs_blkno;
728*0Sstevel@tonic-gate 	diskaddr_t	last_blk = cs->cs_lastblk;
729*0Sstevel@tonic-gate 	int		i;
730*0Sstevel@tonic-gate 	int		pwcnt = un->un_pwcnt;
731*0Sstevel@tonic-gate 	int		avail = -1;
732*0Sstevel@tonic-gate 	int		use = -1;
733*0Sstevel@tonic-gate 	int		flags;
734*0Sstevel@tonic-gate 
735*0Sstevel@tonic-gate 
736*0Sstevel@tonic-gate 	/* start with the data column */
737*0Sstevel@tonic-gate 	pw = cs->cs_un->un_column_ic[column].un_pw_reserve;
738*0Sstevel@tonic-gate 	sb = &pw->pw_sb[0];
739*0Sstevel@tonic-gate 	ASSERT(pw->pw_free > 0);
740*0Sstevel@tonic-gate 	for (i = 0; i < pwcnt; i++) {
741*0Sstevel@tonic-gate 		flags = sb[i].sb_flags;
742*0Sstevel@tonic-gate 		if (flags & SB_INVAL_PEND)
743*0Sstevel@tonic-gate 			continue;
744*0Sstevel@tonic-gate 
745*0Sstevel@tonic-gate 		if ((avail == -1) && (flags & (SB_AVAIL | SB_UNUSED)))
746*0Sstevel@tonic-gate 			avail = i;
747*0Sstevel@tonic-gate 
748*0Sstevel@tonic-gate 		if ((start_blk > sb[i].sb_last_blk) ||
749*0Sstevel@tonic-gate 		    (last_blk < sb[i].sb_start_blk))
750*0Sstevel@tonic-gate 			continue;
751*0Sstevel@tonic-gate 
752*0Sstevel@tonic-gate 		/* OVERLAP */
753*0Sstevel@tonic-gate 		ASSERT(! (sb[i].sb_flags & SB_INUSE));
754*0Sstevel@tonic-gate 
755*0Sstevel@tonic-gate 		/*
756*0Sstevel@tonic-gate 		 * raid_invalidate_pwslot attempts to zero out prewrite entry
757*0Sstevel@tonic-gate 		 * in parallel with other disk reads/writes related to current
758*0Sstevel@tonic-gate 		 * transaction. however cs_frags accounting for this case is
759*0Sstevel@tonic-gate 		 * broken because raid_write_io resets cs_frags i.e. ignoring
760*0Sstevel@tonic-gate 		 * that it could have been been set to > 0 value by
761*0Sstevel@tonic-gate 		 * raid_invalidate_pwslot. While this can be fixed an
762*0Sstevel@tonic-gate 		 * additional problem is that we don't seem to handle
763*0Sstevel@tonic-gate 		 * correctly the case of getting a disk error for prewrite
764*0Sstevel@tonic-gate 		 * entry invalidation.
765*0Sstevel@tonic-gate 		 * It does not look like we really need
766*0Sstevel@tonic-gate 		 * to invalidate prewrite slots because raid_replay sorts
767*0Sstevel@tonic-gate 		 * prewrite id's in ascending order and during recovery the
768*0Sstevel@tonic-gate 		 * latest prewrite entry for the same block will be replay
769*0Sstevel@tonic-gate 		 * last. That's why i ifdef'd out the call to
770*0Sstevel@tonic-gate 		 * raid_invalidate_pwslot. --aguzovsk@east
771*0Sstevel@tonic-gate 		 */
772*0Sstevel@tonic-gate 
773*0Sstevel@tonic-gate 		if (use == -1) {
774*0Sstevel@tonic-gate 			use = i;
775*0Sstevel@tonic-gate 		}
776*0Sstevel@tonic-gate 	}
777*0Sstevel@tonic-gate 
778*0Sstevel@tonic-gate 	ASSERT(avail != -1);
779*0Sstevel@tonic-gate 	pw->pw_free--;
780*0Sstevel@tonic-gate 	if (use == -1)
781*0Sstevel@tonic-gate 		use = avail;
782*0Sstevel@tonic-gate 
783*0Sstevel@tonic-gate 	ASSERT(! (sb[use].sb_flags & SB_INUSE));
784*0Sstevel@tonic-gate 	sb[use].sb_flags = SB_INUSE;
785*0Sstevel@tonic-gate 	sb[use].sb_cs = cs;
786*0Sstevel@tonic-gate 	sb[use].sb_start_blk = start_blk;
787*0Sstevel@tonic-gate 	sb[use].sb_last_blk = last_blk;
788*0Sstevel@tonic-gate 	ASSERT((use >= 0) && (use < un->un_pwcnt));
789*0Sstevel@tonic-gate 	return (use);
790*0Sstevel@tonic-gate }
791*0Sstevel@tonic-gate 
792*0Sstevel@tonic-gate static int
793*0Sstevel@tonic-gate raid_check_pw(md_raidcs_t *cs)
794*0Sstevel@tonic-gate {
795*0Sstevel@tonic-gate 
796*0Sstevel@tonic-gate 	mr_unit_t	*un = cs->cs_un;
797*0Sstevel@tonic-gate 	int		i;
798*0Sstevel@tonic-gate 
799*0Sstevel@tonic-gate 	ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS));
800*0Sstevel@tonic-gate 	/*
801*0Sstevel@tonic-gate 	 * check to be sure there is a prewrite slot available
802*0Sstevel@tonic-gate 	 * if not just return.
803*0Sstevel@tonic-gate 	 */
804*0Sstevel@tonic-gate 	if (cs->cs_flags & MD_RCS_LINE) {
805*0Sstevel@tonic-gate 		for (i = 0; i < un->un_totalcolumncnt; i++)
806*0Sstevel@tonic-gate 			if (un->un_column_ic[i].un_pw_reserve->pw_free <= 0)
807*0Sstevel@tonic-gate 				return (1);
808*0Sstevel@tonic-gate 		return (0);
809*0Sstevel@tonic-gate 	}
810*0Sstevel@tonic-gate 
811*0Sstevel@tonic-gate 	if (un->un_column_ic[cs->cs_dcolumn].un_pw_reserve->pw_free <= 0)
812*0Sstevel@tonic-gate 		return (1);
813*0Sstevel@tonic-gate 	if (un->un_column_ic[cs->cs_pcolumn].un_pw_reserve->pw_free <= 0)
814*0Sstevel@tonic-gate 		return (1);
815*0Sstevel@tonic-gate 	return (0);
816*0Sstevel@tonic-gate }
817*0Sstevel@tonic-gate static int
818*0Sstevel@tonic-gate raid_alloc_pwslot(md_raidcs_t *cs)
819*0Sstevel@tonic-gate {
820*0Sstevel@tonic-gate 	mr_unit_t	*un = cs->cs_un;
821*0Sstevel@tonic-gate 	md_raidcbuf_t	*cbuf;
822*0Sstevel@tonic-gate 
823*0Sstevel@tonic-gate 	ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS));
824*0Sstevel@tonic-gate 	if (raid_check_pw(cs))
825*0Sstevel@tonic-gate 		return (1);
826*0Sstevel@tonic-gate 
827*0Sstevel@tonic-gate 	mutex_enter(&un->un_mx);
828*0Sstevel@tonic-gate 	un->un_pwid++;
829*0Sstevel@tonic-gate 	cs->cs_pwid = un->un_pwid;
830*0Sstevel@tonic-gate 	mutex_exit(&un->un_mx);
831*0Sstevel@tonic-gate 
832*0Sstevel@tonic-gate 	cs->cs_dpwslot = raid_get_pwslot(cs, cs->cs_dcolumn);
833*0Sstevel@tonic-gate 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
834*0Sstevel@tonic-gate 		cbuf->cbuf_pwslot = raid_get_pwslot(cs, cbuf->cbuf_column);
835*0Sstevel@tonic-gate 	}
836*0Sstevel@tonic-gate 	cs->cs_ppwslot = raid_get_pwslot(cs, cs->cs_pcolumn);
837*0Sstevel@tonic-gate 
838*0Sstevel@tonic-gate 	cs->cs_flags |= MD_RCS_HAVE_PW_SLOTS;
839*0Sstevel@tonic-gate 
840*0Sstevel@tonic-gate 	return (0);
841*0Sstevel@tonic-gate }
842*0Sstevel@tonic-gate 
/*
 * NAMES:	raid_build_incore
 * DESCRIPTION: RAID metadevice incore structure building routine.
 *		Allocates the incore (mr_ic / column_ic) structures, builds
 *		the per-column pre-write reservations, and - when snarfing -
 *		re-resolves component devts, reattaches hotspares and fixes
 *		up resync/error state left over from the previous boot.
 * PARAMETERS:	void *p - pointer to a unit structure
 *		int snarfing - a flag to indicate snarfing is required
 * RETURNS:	0 on success (or if the unit is already incore);
 *		1 if the unit is being reset or allocation of the pre-write
 *		reservations failed.
 */
int
raid_build_incore(void *p, int snarfing)
{
	mr_unit_t	*un = (mr_unit_t *)p;
	minor_t		mnum = MD_SID(un);
	mddb_recid_t	hs_recid = 0;
	int		i;
	int		preserve_flags;
	mr_column_t	*column;
	int		iosize;
	md_dev64_t	hs, dev;
	int		resync_cnt = 0,
			error_cnt = 0;

	hs = NODEV64;
	dev = NODEV64;

	/* clear out bogus pointer incase we return(1) prior to alloc */
	un->mr_ic = NULL;

	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
		return (1);
	}

	/* already built incore: nothing to do */
	if (MD_UNIT(mnum) != NULL)
		return (0);

	if (snarfing)
		MD_STATUS(un) = 0;

	un->mr_ic = (mr_unit_ic_t *)kmem_zalloc(sizeof (*un->mr_ic),
	    KM_SLEEP);

	/* zalloc'd: un_pw_reserve pointers start out NULL for every column */
	un->un_column_ic = (mr_column_ic_t *)
	    kmem_zalloc(sizeof (mr_column_ic_t) *
		un->un_totalcolumncnt, KM_SLEEP);

	for (i = 0; i < un->un_totalcolumncnt; i++) {

		column	= &un->un_column[i];
		/* remember only the resync direction bits across rebuild */
		preserve_flags = column->un_devflags &
		    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC);
		column->un_devflags &=
		    ~(MD_RAID_ALT_ISOPEN | MD_RAID_DEV_ISOPEN |
		    MD_RAID_WRITE_ALT);
		if (raid_build_pw_reservation(un, i) != 0) {
			/*
			 * could not build pwslot
			 * NOTE(review): mr_ic, un_column_ic and the
			 * reservations built for earlier columns appear to
			 * be leaked on this path unless the caller runs
			 * reset_raid() - confirm caller behavior.
			 */
			return (1);
		}

		if (snarfing) {
			set_t		setno = MD_MIN2SET(mnum);
			dev =  md_getdevnum(setno, mddb_getsidenum(setno),
			    column->un_orig_key, MD_NOTRUST_DEVT);
			/*
			 * Comment out instead of remove so we have history
			 * In the pre-SVM releases stored devt is used so
			 * as long as there is one snarf is always happy
			 * even the component is powered off.  This is not
			 * the case in current SVM implementation.  NODEV64
			 * can be returned and in this case since we resolve
			 * the devt at 'open' time (first use of metadevice)
			 * we will allow snarf continue.
			 *
			 * if (dev == NODEV64)
			 *	return (1);
			 */

			/*
			 * Setup un_orig_dev from device id info if the device
			 * is valid (not NODEV64).
			 */
			if (dev != NODEV64)
				column->un_orig_dev = dev;

			/* tally state for the RESYNC+ERRED fixup pass below */
			if (column->un_devstate & RCS_RESYNC)
				resync_cnt++;
			if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED))
				error_cnt++;

			if (HOTSPARED(un, i)) {
				(void) md_hot_spare_ifc(HS_MKDEV,
				    0, 0, 0, &column->un_hs_id, NULL,
				    &hs, NULL);
				/*
				 * Same here
				 *
				 * if (hs == NODEV64)
				 *	return (1);
				 */
			}

			if (HOTSPARED(un, i)) {
				if (column->un_devstate &
				    (RCS_OKAY | RCS_LAST_ERRED)) {
					/* hotspare carries the live data */
					column->un_dev = hs;
					column->un_pwstart =
					    column->un_hs_pwstart;
					column->un_devstart =
					    column->un_hs_devstart;
					preserve_flags &=
					    ~(MD_RAID_COPY_RESYNC |
					    MD_RAID_REGEN_RESYNC);
				} else  if (column->un_devstate & RCS_RESYNC) {
					/*
					 * if previous system was 4.0 set
					 * the direction flags
					 */
					if ((preserve_flags &
					    (MD_RAID_COPY_RESYNC |
					    MD_RAID_REGEN_RESYNC)) == 0) {
					if (column->un_alt_dev != NODEV64)
						preserve_flags |=
						MD_RAID_COPY_RESYNC;
					else
					    preserve_flags |=
						MD_RAID_REGEN_RESYNC;
					}
				}
			} else { /* no hot spares */
				column->un_dev = dev;
				column->un_pwstart = column->un_orig_pwstart;
				column->un_devstart = column->un_orig_devstart;
				if (column->un_devstate & RCS_RESYNC) {
					preserve_flags |= MD_RAID_REGEN_RESYNC;
					preserve_flags &= ~MD_RAID_COPY_RESYNC;
				}
			}
			/* resync direction is meaningless when not resyncing */
			if (! (column->un_devstate & RCS_RESYNC)) {
				preserve_flags &=
				    ~(MD_RAID_REGEN_RESYNC |
				    MD_RAID_COPY_RESYNC);
			}

			column->un_devflags = preserve_flags;
			column->un_alt_dev = NODEV64;
			column->un_alt_pwstart = 0;
			column->un_alt_devstart = 0;
			un->un_resync_line_index = 0;
			un->un_resync_index = 0;
			un->un_percent_done = 0;
		}
	}

	/*
	 * A unit with both resyncing and erred columns cannot complete the
	 * resync: release data-less hotspares, demote ERRED to LAST_ERRED
	 * and RESYNC to ERRED so the unit comes up in a consistent state.
	 */
	if (resync_cnt && error_cnt) {
		for (i = 0; i < un->un_totalcolumncnt; i++) {
			column  = &un->un_column[i];
			if (HOTSPARED(un, i) &&
			    (column->un_devstate & RCS_RESYNC) &&
			    (column->un_devflags & MD_RAID_COPY_RESYNC))
				/* hotspare has data */
				continue;

			if (HOTSPARED(un, i) &&
			    (column->un_devstate & RCS_RESYNC)) {
				/* hotspare does not have data */
				raid_hs_release(HS_FREE, un, &hs_recid, i);
				column->un_dev = column->un_orig_dev;
				column->un_pwstart = column->un_orig_pwstart;
				column->un_devstart = column->un_orig_devstart;
				mddb_setrecprivate(hs_recid, MD_PRV_PENDCOM);
			}

			if (column->un_devstate & RCS_ERRED)
				column->un_devstate = RCS_LAST_ERRED;

			if (column->un_devstate & RCS_RESYNC)
				column->un_devstate = RCS_ERRED;
		}
	}
	mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM);

	un->un_pwid = 1; /* or some other possible value */
	un->un_magic = RAID_UNMAGIC;
	iosize = un->un_iosize;
	un->un_pbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP);
	un->un_dbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP);
	mutex_init(&un->un_linlck_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&un->un_linlck_cv, NULL, CV_DEFAULT, NULL);
	un->un_linlck_chn = NULL;
	/* publish the unit; from here on it is visible to the rest of md */
	MD_UNIT(mnum) = un;


	return (0);
}
1035*0Sstevel@tonic-gate 
/*
 * NAMES:	reset_raid
 * DESCRIPTION: RAID metadevice reset routine.  Tears down the incore
 *		structures built by raid_build_incore(); when 'removing'
 *		also releases hotspares, deparents component metadevices
 *		and deletes the unit's MDDB records.
 * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
 *		minor_t mnum - RAID metadevice minor number
 *		int removing - a flag to imply removing device name from
 *			MDDB database.
 */
void
reset_raid(mr_unit_t *un, minor_t mnum, int removing)
{
	int		i, n = 0;
	sv_dev_t	*sv;
	mr_column_t	*column;
	int		column_cnt = un->un_totalcolumncnt;
	mddb_recid_t	*recids, vtoc_id;
	int		hserr;

	/* no I/O may be queued against the unit at reset time */
	ASSERT((MDI_UNIT(mnum)->ui_io_lock->io_list_front == NULL) &&
	    (MDI_UNIT(mnum)->ui_io_lock->io_list_back == NULL));

	md_destroy_unit_incore(mnum, &raid_md_ops);

	MD_UNIT(mnum) = NULL;

	if (un->un_pbuffer) {
		kmem_free(un->un_pbuffer, dbtob(un->un_iosize));
		un->un_pbuffer = NULL;
	}
	if (un->un_dbuffer) {
		kmem_free(un->un_dbuffer, dbtob(un->un_iosize));
		un->un_dbuffer = NULL;
	}

	/* free all pre-write slots created during build incore */
	for (i = 0; i < un->un_totalcolumncnt; i++)
		raid_free_pw_reservation(un, i);

	kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
		un->un_totalcolumncnt);

	kmem_free(un->mr_ic, sizeof (*un->mr_ic));

	/* incore teardown only; leave the MDDB records alone */
	if (!removing)
		return;

	/* sv: one entry per column for name removal, plus a terminator */
	sv = (sv_dev_t *)kmem_zalloc((column_cnt + 1) * sizeof (sv_dev_t),
	    KM_SLEEP);

	/* recids: worst case one per column plus hsp entry plus terminator */
	recids = (mddb_recid_t *)
	    kmem_zalloc((column_cnt + 2) * sizeof (mddb_recid_t), KM_SLEEP);

	for (i = 0; i < column_cnt; i++) {
		md_unit_t	*comp_un;
		md_dev64_t	comp_dev;

		column = &un->un_column[i];
		sv[i].setno = MD_MIN2SET(mnum);
		sv[i].key = column->un_orig_key;
		if (HOTSPARED(un, i)) {
			/* erred hotspares go back to the pool marked bad */
			if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED))
				hserr = HS_BAD;
			else
				hserr = HS_FREE;
			raid_hs_release(hserr, un, &recids[n++], i);
		}
		/*
		 * deparent any metadevices.
		 * NOTE: currently soft partitions are the only metadevices
		 * allowed in RAID metadevices.
		 */
		comp_dev = column->un_dev;
		if (md_getmajor(comp_dev) == md_major) {
			comp_un = MD_UNIT(md_getminor(comp_dev));
			recids[n++] = MD_RECID(comp_un);
			md_reset_parent(comp_dev);
		}
	}
	/* decrement the reference count of the old hsp */
	if (un->un_hsp_id != -1)
		(void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
		    &recids[n++], NULL, NULL, NULL);
	recids[n] = 0;
	MD_STATUS(un) |= MD_UN_BEING_RESET;
	/* save before the unit record (and un) is deleted below */
	vtoc_id = un->c.un_vtoc_id;

	/* commit the unit and all touched records in one transaction */
	raid_commit(un, recids);


	/* Remove the unit structure */
	mddb_deleterec_wrapper(un->c.un_record_id);

	/* Remove the vtoc, if present */
	if (vtoc_id)
		mddb_deleterec_wrapper(vtoc_id);
	md_rem_names(sv, column_cnt);
	kmem_free(sv, (column_cnt + 1) * sizeof (sv_dev_t));
	kmem_free(recids, (column_cnt + 2) * sizeof (mddb_recid_t));

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
	    MD_MIN2SET(mnum), mnum);
}
1138*0Sstevel@tonic-gate 
1139*0Sstevel@tonic-gate /*
1140*0Sstevel@tonic-gate  * NAMES:	raid_error_parent
1141*0Sstevel@tonic-gate  * DESCRIPTION: mark a parent structure in error
1142*0Sstevel@tonic-gate  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1143*0Sstevel@tonic-gate  *		int	error - error value to set
1144*0Sstevel@tonic-gate  * NOTE:	(TBR) - this routine currently is not in use.
1145*0Sstevel@tonic-gate  */
1146*0Sstevel@tonic-gate static void
1147*0Sstevel@tonic-gate raid_error_parent(md_raidps_t *ps, int error)
1148*0Sstevel@tonic-gate {
1149*0Sstevel@tonic-gate 	mutex_enter(&ps->ps_mx);
1150*0Sstevel@tonic-gate 	ps->ps_flags |= MD_RPS_ERROR;
1151*0Sstevel@tonic-gate 	ps->ps_error = error;
1152*0Sstevel@tonic-gate 	mutex_exit(&ps->ps_mx);
1153*0Sstevel@tonic-gate }
1154*0Sstevel@tonic-gate 
1155*0Sstevel@tonic-gate /*
1156*0Sstevel@tonic-gate  * The following defines tell raid_free_parent
1157*0Sstevel@tonic-gate  *	RFP_RLS_LOCK		release the unit reader lock when done.
1158*0Sstevel@tonic-gate  *	RFP_DECR_PWFRAGS	decrement ps_pwfrags
1159*0Sstevel@tonic-gate  *	RFP_DECR_FRAGS		decrement ps_frags
1160*0Sstevel@tonic-gate  *	RFP_DECR_READFRAGS	read keeps FRAGS and PWFRAGS in lockstep
1161*0Sstevel@tonic-gate  */
1162*0Sstevel@tonic-gate #define	RFP_RLS_LOCK		0x00001
1163*0Sstevel@tonic-gate #define	RFP_DECR_PWFRAGS	0x00002
1164*0Sstevel@tonic-gate #define	RFP_DECR_FRAGS		0x00004
1165*0Sstevel@tonic-gate #define	RFP_DECR_READFRAGS	(RFP_DECR_PWFRAGS | RFP_DECR_FRAGS)
1166*0Sstevel@tonic-gate 
/*
 * NAMES:	raid_free_parent
 * DESCRIPTION: drop fragment references on a parent structure; completes
 *		the original buf when the pre-write fragments drain and
 *		frees the parent when all fragments are gone.
 * PARAMETERS:	md_raidps_t *ps - pointer to the parent structure
 *		int	todo - RFP_* flags indicating what needs to be done
 */
static void
raid_free_parent(md_raidps_t *ps, int todo)
{
	mdi_unit_t	*ui = ps->ps_ui;

	ASSERT(ps->ps_magic == RAID_PSMAGIC);
	ASSERT(ps->ps_flags & MD_RPS_INUSE);
	mutex_enter(&ps->ps_mx);
	if (todo & RFP_DECR_PWFRAGS) {
		ASSERT(ps->ps_pwfrags);
		ps->ps_pwfrags--;
		/*
		 * Last pre-write fragment: the caller's buf can be
		 * completed now, exactly once (MD_RPS_IODONE guards
		 * against a second biodone).
		 */
		if (ps->ps_pwfrags == 0 && (! (ps->ps_flags & MD_RPS_IODONE))) {
			if (ps->ps_flags & MD_RPS_ERROR) {
				ps->ps_bp->b_flags |= B_ERROR;
				ps->ps_bp->b_error = ps->ps_error;
			}
			md_kstat_done(ui, ps->ps_bp, 0);
			biodone(ps->ps_bp);
			ps->ps_flags |= MD_RPS_IODONE;
		}
	}

	if (todo & RFP_DECR_FRAGS) {
		ASSERT(ps->ps_frags);
		ps->ps_frags--;
	}

	/* other fragments still outstanding; the parent lives on */
	if (ps->ps_frags != 0) {
		mutex_exit(&ps->ps_mx);
		return;
	}

	ASSERT((ps->ps_frags == 0) && (ps->ps_pwfrags == 0));
	mutex_exit(&ps->ps_mx);

	if (todo & RFP_RLS_LOCK)
		md_io_readerexit(ui);

	/* during panic dump, don't touch caches or hotspare machinery */
	if (panicstr) {
		ps->ps_flags |= MD_RPS_DONE;
		return;
	}

	if (ps->ps_flags & MD_RPS_HSREQ)
		(void) raid_hotspares();

	ASSERT(todo & RFP_RLS_LOCK);
	ps->ps_flags &= ~MD_RPS_INUSE;

	md_dec_iocount(MD_MIN2SET(ps->ps_un->c.un_self_id));

	kmem_cache_free(raid_parent_cache, ps);
}
1226*0Sstevel@tonic-gate 
1227*0Sstevel@tonic-gate /*
1228*0Sstevel@tonic-gate  * NAMES:	raid_free_child
1229*0Sstevel@tonic-gate  * DESCRIPTION: free a parent structure
1230*0Sstevel@tonic-gate  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1231*0Sstevel@tonic-gate  *		int drop_locks	- 0 for no locks held
1232*0Sstevel@tonic-gate  * NOTE:	(TBR) - this routine currently is not in use.
1233*0Sstevel@tonic-gate  */
1234*0Sstevel@tonic-gate static void
1235*0Sstevel@tonic-gate raid_free_child(md_raidcs_t *cs, int drop_locks)
1236*0Sstevel@tonic-gate {
1237*0Sstevel@tonic-gate 	mr_unit_t	*un = cs->cs_un;
1238*0Sstevel@tonic-gate 	md_raidcbuf_t	*cbuf, *cbuf1;
1239*0Sstevel@tonic-gate 
1240*0Sstevel@tonic-gate 	if (cs->cs_pw_inval_list)
1241*0Sstevel@tonic-gate 		raid_free_pwinvalidate(cs);
1242*0Sstevel@tonic-gate 
1243*0Sstevel@tonic-gate 	if (drop_locks) {
1244*0Sstevel@tonic-gate 		ASSERT(cs->cs_flags & MD_RCS_LLOCKD &&
1245*0Sstevel@tonic-gate 		    (cs->cs_flags & (MD_RCS_READER | MD_RCS_WRITER)));
1246*0Sstevel@tonic-gate 		md_unit_readerexit(MDI_UNIT(MD_SID(un)));
1247*0Sstevel@tonic-gate 		raid_line_exit(cs);
1248*0Sstevel@tonic-gate 	} else {
1249*0Sstevel@tonic-gate 		ASSERT(!(cs->cs_flags & MD_RCS_LLOCKD));
1250*0Sstevel@tonic-gate 	}
1251*0Sstevel@tonic-gate 
1252*0Sstevel@tonic-gate 	freebuffers(cs);
1253*0Sstevel@tonic-gate 	cbuf = cs->cs_buflist;
1254*0Sstevel@tonic-gate 	while (cbuf) {
1255*0Sstevel@tonic-gate 		cbuf1 = cbuf->cbuf_next;
1256*0Sstevel@tonic-gate 		kmem_cache_free(raid_cbuf_cache, cbuf);
1257*0Sstevel@tonic-gate 		cbuf = cbuf1;
1258*0Sstevel@tonic-gate 	}
1259*0Sstevel@tonic-gate 	if (cs->cs_dbuf.b_flags & B_REMAPPED)
1260*0Sstevel@tonic-gate 		bp_mapout(&cs->cs_dbuf);
1261*0Sstevel@tonic-gate 	kmem_cache_free(raid_child_cache, cs);
1262*0Sstevel@tonic-gate }
1263*0Sstevel@tonic-gate 
1264*0Sstevel@tonic-gate /*
1265*0Sstevel@tonic-gate  * NAME:	raid_regen_parity
1266*0Sstevel@tonic-gate  *
1267*0Sstevel@tonic-gate  * DESCRIPTION:	This routine is used to regenerate the parity blocks
1268*0Sstevel@tonic-gate  *		for the entire raid device.  It is called from
1269*0Sstevel@tonic-gate  *		both the regen thread and the IO path.
1270*0Sstevel@tonic-gate  *
1271*0Sstevel@tonic-gate  *		On error the entire device is marked as in error by
1272*0Sstevel@tonic-gate  *		placing the erroring device in error and all other
1273*0Sstevel@tonic-gate  *		devices in last_errored.
1274*0Sstevel@tonic-gate  *
1275*0Sstevel@tonic-gate  * PARAMETERS:	md_raidcs_t	*cs
1276*0Sstevel@tonic-gate  */
1277*0Sstevel@tonic-gate void
1278*0Sstevel@tonic-gate raid_regen_parity(md_raidcs_t *cs)
1279*0Sstevel@tonic-gate {
1280*0Sstevel@tonic-gate 	mr_unit_t	*un = cs->cs_un;
1281*0Sstevel@tonic-gate 	mdi_unit_t	*ui = MDI_UNIT(un->c.un_self_id);
1282*0Sstevel@tonic-gate 	caddr_t		buffer;
1283*0Sstevel@tonic-gate 	caddr_t		parity_buffer;
1284*0Sstevel@tonic-gate 	buf_t		*bp;
1285*0Sstevel@tonic-gate 	uint_t		*dbuf, *pbuf;
1286*0Sstevel@tonic-gate 	uint_t		colcnt = un->un_totalcolumncnt;
1287*0Sstevel@tonic-gate 	int		column;
1288*0Sstevel@tonic-gate 	int		parity_column = cs->cs_pcolumn;
1289*0Sstevel@tonic-gate 	size_t		bcount;
1290*0Sstevel@tonic-gate 	int		j;
1291*0Sstevel@tonic-gate 
1292*0Sstevel@tonic-gate 	/*
1293*0Sstevel@tonic-gate 	 * This routine uses the data and parity buffers allocated to a
1294*0Sstevel@tonic-gate 	 * write.  In the case of a read the buffers are allocated and
1295*0Sstevel@tonic-gate 	 * freed at the end.
1296*0Sstevel@tonic-gate 	 */
1297*0Sstevel@tonic-gate 
1298*0Sstevel@tonic-gate 	ASSERT(IO_READER_HELD(un));
1299*0Sstevel@tonic-gate 	ASSERT(cs->cs_flags & MD_RCS_LLOCKD);
1300*0Sstevel@tonic-gate 	ASSERT(UNIT_READER_HELD(un));
1301*0Sstevel@tonic-gate 
1302*0Sstevel@tonic-gate 	if (raid_state_cnt(un, RCS_OKAY) != colcnt)
1303*0Sstevel@tonic-gate 		return;
1304*0Sstevel@tonic-gate 
1305*0Sstevel@tonic-gate 	if (cs->cs_flags & MD_RCS_READER) {
1306*0Sstevel@tonic-gate 		getpbuffer(cs);
1307*0Sstevel@tonic-gate 		getdbuffer(cs);
1308*0Sstevel@tonic-gate 	}
1309*0Sstevel@tonic-gate 	ASSERT(cs->cs_dbuffer && cs->cs_pbuffer);
1310*0Sstevel@tonic-gate 	bcount = cs->cs_bcount;
1311*0Sstevel@tonic-gate 	buffer = cs->cs_dbuffer;
1312*0Sstevel@tonic-gate 	parity_buffer = cs->cs_pbuffer;
1313*0Sstevel@tonic-gate 	bzero(parity_buffer, bcount);
1314*0Sstevel@tonic-gate 	bp = &cs->cs_dbuf;
1315*0Sstevel@tonic-gate 	for (column = 0; column < colcnt; column++) {
1316*0Sstevel@tonic-gate 		if (column == parity_column)
1317*0Sstevel@tonic-gate 			continue;
1318*0Sstevel@tonic-gate 		reset_buf(bp, B_READ | B_BUSY, bcount);
1319*0Sstevel@tonic-gate 		bp->b_un.b_addr = buffer;
1320*0Sstevel@tonic-gate 		bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev);
1321*0Sstevel@tonic-gate 		bp->b_lblkno = cs->cs_blkno + un->un_column[column].un_devstart;
1322*0Sstevel@tonic-gate 		bp->b_bcount = bcount;
1323*0Sstevel@tonic-gate 		bp->b_bufsize = bcount;
1324*0Sstevel@tonic-gate 		(void) md_call_strategy(bp, MD_STR_NOTTOP, NULL);
1325*0Sstevel@tonic-gate 		if (biowait(bp))
1326*0Sstevel@tonic-gate 			goto bail;
1327*0Sstevel@tonic-gate 		pbuf = (uint_t *)(void *)parity_buffer;
1328*0Sstevel@tonic-gate 		dbuf = (uint_t *)(void *)buffer;
1329*0Sstevel@tonic-gate 		for (j = 0; j < (bcount / (sizeof (uint_t))); j++) {
1330*0Sstevel@tonic-gate 			*pbuf = *pbuf ^ *dbuf;
1331*0Sstevel@tonic-gate 			pbuf++;
1332*0Sstevel@tonic-gate 			dbuf++;
1333*0Sstevel@tonic-gate 		}
1334*0Sstevel@tonic-gate 	}
1335*0Sstevel@tonic-gate 
1336*0Sstevel@tonic-gate 	reset_buf(bp, B_WRITE | B_BUSY, cs->cs_bcount);
1337*0Sstevel@tonic-gate 	bp->b_un.b_addr = parity_buffer;
1338*0Sstevel@tonic-gate 	bp->b_edev = md_dev64_to_dev(un->un_column[parity_column].un_dev);
1339*0Sstevel@tonic-gate 	bp->b_lblkno = cs->cs_blkno + un->un_column[parity_column].un_devstart;
1340*0Sstevel@tonic-gate 	bp->b_bcount = bcount;
1341*0Sstevel@tonic-gate 	bp->b_bufsize = bcount;
1342*0Sstevel@tonic-gate 	(void) md_call_strategy(bp, MD_STR_NOTTOP, NULL);
1343*0Sstevel@tonic-gate 	if (biowait(bp))
1344*0Sstevel@tonic-gate 		goto bail;
1345*0Sstevel@tonic-gate 
1346*0Sstevel@tonic-gate 	if (cs->cs_flags & MD_RCS_READER) {
1347*0Sstevel@tonic-gate 		freebuffers(cs);
1348*0Sstevel@tonic-gate 		cs->cs_pbuffer = NULL;
1349*0Sstevel@tonic-gate 		cs->cs_dbuffer = NULL;
1350*0Sstevel@tonic-gate 	}
1351*0Sstevel@tonic-gate 	bp->b_chain = (struct buf *)cs;
1352*0Sstevel@tonic-gate 	return;
1353*0Sstevel@tonic-gate bail:
1354*0Sstevel@tonic-gate 	if (cs->cs_flags & MD_RCS_READER) {
1355*0Sstevel@tonic-gate 		freebuffers(cs);
1356*0Sstevel@tonic-gate 		cs->cs_pbuffer = NULL;
1357*0Sstevel@tonic-gate 		cs->cs_dbuffer = NULL;
1358*0Sstevel@tonic-gate 	}
1359*0Sstevel@tonic-gate 	md_unit_readerexit(ui);
1360*0Sstevel@tonic-gate 	un = md_unit_writerlock(ui);
1361*0Sstevel@tonic-gate 	raid_set_state(un, column, RCS_ERRED, 0);
1362*0Sstevel@tonic-gate 	for (column = 0; column < colcnt; column++)
1363*0Sstevel@tonic-gate 		raid_set_state(un, column, RCS_ERRED, 0);
1364*0Sstevel@tonic-gate 	raid_commit(un, NULL);
1365*0Sstevel@tonic-gate 	md_unit_writerexit(ui);
1366*0Sstevel@tonic-gate 	un = md_unit_readerlock(ui);
1367*0Sstevel@tonic-gate 	bp->b_chain = (struct buf *)cs;
1368*0Sstevel@tonic-gate }
1369*0Sstevel@tonic-gate 
1370*0Sstevel@tonic-gate /*
1371*0Sstevel@tonic-gate  * NAMES:	raid_error_state
1372*0Sstevel@tonic-gate  * DESCRIPTION: check unit and column states' impact on I/O error
1373*0Sstevel@tonic-gate  *		NOTE:	the state now may not be the state when the
1374*0Sstevel@tonic-gate  *			I/O completed due to race conditions.
 * PARAMETERS:	mr_unit_t *un - pointer to raid unit structure
 *		buf_t	  *bp - pointer to the failed buffer structure
 * RETURNS:	0 if no matching column was found (e.g. a replace snuck
 *		in), otherwise EIO.
1378*0Sstevel@tonic-gate  */
1379*0Sstevel@tonic-gate static int
1380*0Sstevel@tonic-gate raid_error_state(mr_unit_t *un, buf_t *bp)
1381*0Sstevel@tonic-gate {
1382*0Sstevel@tonic-gate 	int		column;
1383*0Sstevel@tonic-gate 	int		i;
1384*0Sstevel@tonic-gate 
1385*0Sstevel@tonic-gate 	ASSERT(IO_READER_HELD(un));
1386*0Sstevel@tonic-gate 	ASSERT(UNIT_WRITER_HELD(un));
1387*0Sstevel@tonic-gate 
1388*0Sstevel@tonic-gate 	column = -1;
1389*0Sstevel@tonic-gate 	for (i = 0; i < un->un_totalcolumncnt; i++) {
1390*0Sstevel@tonic-gate 		if (un->un_column[i].un_dev == md_expldev(bp->b_edev)) {
1391*0Sstevel@tonic-gate 			column = i;
1392*0Sstevel@tonic-gate 			break;
1393*0Sstevel@tonic-gate 		}
1394*0Sstevel@tonic-gate 		if (un->un_column[i].un_alt_dev == md_expldev(bp->b_edev)) {
1395*0Sstevel@tonic-gate 			column = i;
1396*0Sstevel@tonic-gate 			break;
1397*0Sstevel@tonic-gate 		}
1398*0Sstevel@tonic-gate 	}
1399*0Sstevel@tonic-gate 
1400*0Sstevel@tonic-gate 	/* in case a replace snuck in while waiting on unit writer lock */
1401*0Sstevel@tonic-gate 
1402*0Sstevel@tonic-gate 	if (column == -1) {
1403*0Sstevel@tonic-gate 		return (0);
1404*0Sstevel@tonic-gate 	}
1405*0Sstevel@tonic-gate 
1406*0Sstevel@tonic-gate 	(void) raid_set_state(un, column, RCS_ERRED, 0);
1407*0Sstevel@tonic-gate 	ASSERT(un->un_state & (RUS_ERRED | RUS_LAST_ERRED));
1408*0Sstevel@tonic-gate 
1409*0Sstevel@tonic-gate 	raid_commit(un, NULL);
1410*0Sstevel@tonic-gate 	if (un->un_state & RUS_ERRED) {
1411*0Sstevel@tonic-gate 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE,
1412*0Sstevel@tonic-gate 		    MD_UN2SET(un), MD_SID(un));
1413*0Sstevel@tonic-gate 	} else if (un->un_state & RUS_LAST_ERRED) {
1414*0Sstevel@tonic-gate 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE,
1415*0Sstevel@tonic-gate 		    MD_UN2SET(un), MD_SID(un));
1416*0Sstevel@tonic-gate 	}
1417*0Sstevel@tonic-gate 
1418*0Sstevel@tonic-gate 	return (EIO);
1419*0Sstevel@tonic-gate }
1420*0Sstevel@tonic-gate 
1421*0Sstevel@tonic-gate /*
1422*0Sstevel@tonic-gate  * NAME:	raid_mapin_buf
 * DESCRIPTION:	wait for the input buffer header to be mapped in
1424*0Sstevel@tonic-gate  * PARAMETERS:	md_raidps_t *ps
1425*0Sstevel@tonic-gate  */
1426*0Sstevel@tonic-gate static void
1427*0Sstevel@tonic-gate raid_mapin_buf(md_raidcs_t *cs)
1428*0Sstevel@tonic-gate {
1429*0Sstevel@tonic-gate 	md_raidps_t	*ps = cs->cs_ps;
1430*0Sstevel@tonic-gate 
1431*0Sstevel@tonic-gate 	/*
1432*0Sstevel@tonic-gate 	 * check to see if the buffer is maped.  If all is ok return the
1433*0Sstevel@tonic-gate 	 * offset of the data and return.  Since it is expensive to grab
1434*0Sstevel@tonic-gate 	 * a mutex this is only done if the mapin is not complete.
1435*0Sstevel@tonic-gate 	 * Once the mutex is aquired it is possible that the mapin was
1436*0Sstevel@tonic-gate 	 * not done so recheck and if necessary do the mapin.
1437*0Sstevel@tonic-gate 	 */
1438*0Sstevel@tonic-gate 	if (ps->ps_mapin > 0) {
1439*0Sstevel@tonic-gate 		cs->cs_addr = ps->ps_addr + cs->cs_offset;
1440*0Sstevel@tonic-gate 		return;
1441*0Sstevel@tonic-gate 	}
1442*0Sstevel@tonic-gate 	mutex_enter(&ps->ps_mapin_mx);
1443*0Sstevel@tonic-gate 	if (ps->ps_mapin > 0) {
1444*0Sstevel@tonic-gate 		cs->cs_addr = ps->ps_addr + cs->cs_offset;
1445*0Sstevel@tonic-gate 		mutex_exit(&ps->ps_mapin_mx);
1446*0Sstevel@tonic-gate 		return;
1447*0Sstevel@tonic-gate 	}
1448*0Sstevel@tonic-gate 	bp_mapin(ps->ps_bp);
1449*0Sstevel@tonic-gate 	/*
1450*0Sstevel@tonic-gate 	 * get the new b_addr out of the parent since bp_mapin just changed it
1451*0Sstevel@tonic-gate 	 */
1452*0Sstevel@tonic-gate 	ps->ps_addr = ps->ps_bp->b_un.b_addr;
1453*0Sstevel@tonic-gate 	cs->cs_addr = ps->ps_addr + cs->cs_offset;
1454*0Sstevel@tonic-gate 	ps->ps_mapin++;
1455*0Sstevel@tonic-gate 	mutex_exit(&ps->ps_mapin_mx);
1456*0Sstevel@tonic-gate }
1457*0Sstevel@tonic-gate 
1458*0Sstevel@tonic-gate /*
1459*0Sstevel@tonic-gate  * NAMES:	raid_read_no_retry
1460*0Sstevel@tonic-gate  * DESCRIPTION: I/O retry routine for a RAID metadevice read
 *		the read failed while attempting to regenerate the data;
 *		no retry is possible, the error occurred in raid_raidregenloop().
1463*0Sstevel@tonic-gate  * PARAMETERS:	mr_unit_t   *un - pointer to raid unit structure
1464*0Sstevel@tonic-gate  *		md_raidcs_t *cs - pointer to child structure
1465*0Sstevel@tonic-gate  */
1466*0Sstevel@tonic-gate /*ARGSUSED*/
1467*0Sstevel@tonic-gate static void
1468*0Sstevel@tonic-gate raid_read_no_retry(mr_unit_t *un, md_raidcs_t *cs)
1469*0Sstevel@tonic-gate {
1470*0Sstevel@tonic-gate 	md_raidps_t	*ps = cs->cs_ps;
1471*0Sstevel@tonic-gate 
1472*0Sstevel@tonic-gate 	raid_error_parent(ps, EIO);
1473*0Sstevel@tonic-gate 	raid_free_child(cs, 1);
1474*0Sstevel@tonic-gate 
1475*0Sstevel@tonic-gate 	/* decrement readfrags */
1476*0Sstevel@tonic-gate 	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
1477*0Sstevel@tonic-gate }
1478*0Sstevel@tonic-gate 
1479*0Sstevel@tonic-gate /*
1480*0Sstevel@tonic-gate  * NAMES:	raid_read_retry
1481*0Sstevel@tonic-gate  * DESCRIPTION: I/O retry routine for a RAID metadevice read
1482*0Sstevel@tonic-gate  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1483*0Sstevel@tonic-gate  */
1484*0Sstevel@tonic-gate static void
1485*0Sstevel@tonic-gate raid_read_retry(mr_unit_t *un, md_raidcs_t *cs)
1486*0Sstevel@tonic-gate {
1487*0Sstevel@tonic-gate 	/* re-initialize the buf_t structure for raid_read() */
1488*0Sstevel@tonic-gate 	cs->cs_dbuf.b_chain = (struct buf *)cs;
1489*0Sstevel@tonic-gate 	cs->cs_dbuf.b_back = &cs->cs_dbuf;
1490*0Sstevel@tonic-gate 	cs->cs_dbuf.b_forw = &cs->cs_dbuf;
1491*0Sstevel@tonic-gate 	cs->cs_dbuf.b_flags = B_BUSY;	/* initialize flags */
1492*0Sstevel@tonic-gate 	cs->cs_dbuf.b_error = 0;	/* initialize error */
1493*0Sstevel@tonic-gate 	cs->cs_dbuf.b_offset = -1;
1494*0Sstevel@tonic-gate 	/* Initialize semaphores */
1495*0Sstevel@tonic-gate 	sema_init(&cs->cs_dbuf.b_io, 0, NULL,
1496*0Sstevel@tonic-gate 	    SEMA_DEFAULT, NULL);
1497*0Sstevel@tonic-gate 	sema_init(&cs->cs_dbuf.b_sem, 0, NULL,
1498*0Sstevel@tonic-gate 	    SEMA_DEFAULT, NULL);
1499*0Sstevel@tonic-gate 
1500*0Sstevel@tonic-gate 	cs->cs_pbuf.b_chain = (struct buf *)cs;
1501*0Sstevel@tonic-gate 	cs->cs_pbuf.b_back = &cs->cs_pbuf;
1502*0Sstevel@tonic-gate 	cs->cs_pbuf.b_forw = &cs->cs_pbuf;
1503*0Sstevel@tonic-gate 	cs->cs_pbuf.b_flags = B_BUSY;	/* initialize flags */
1504*0Sstevel@tonic-gate 	cs->cs_pbuf.b_error = 0;	/* initialize error */
1505*0Sstevel@tonic-gate 	cs->cs_pbuf.b_offset = -1;
1506*0Sstevel@tonic-gate 	sema_init(&cs->cs_pbuf.b_io, 0, NULL,
1507*0Sstevel@tonic-gate 	    SEMA_DEFAULT, NULL);
1508*0Sstevel@tonic-gate 	sema_init(&cs->cs_pbuf.b_sem, 0, NULL,
1509*0Sstevel@tonic-gate 	    SEMA_DEFAULT, NULL);
1510*0Sstevel@tonic-gate 
1511*0Sstevel@tonic-gate 	cs->cs_flags &= ~MD_RCS_ERROR;	/* reset child error flag */
1512*0Sstevel@tonic-gate 	cs->cs_flags |= MD_RCS_RECOVERY;  /* set RECOVERY flag */
1513*0Sstevel@tonic-gate 
1514*0Sstevel@tonic-gate 	/*
1515*0Sstevel@tonic-gate 	 * re-scheduling I/O with raid_read_io() is simpler. basically,
1516*0Sstevel@tonic-gate 	 * raid_read_io() is invoked again with same child structure.
1517*0Sstevel@tonic-gate 	 * (NOTE: we aren`t supposed to do any error recovery when an I/O
1518*0Sstevel@tonic-gate 	 * error occured in raid_raidregenloop().
1519*0Sstevel@tonic-gate 	 */
1520*0Sstevel@tonic-gate 	raid_mapin_buf(cs);
1521*0Sstevel@tonic-gate 	raid_read_io(un, cs);
1522*0Sstevel@tonic-gate }
1523*0Sstevel@tonic-gate 
1524*0Sstevel@tonic-gate /*
1525*0Sstevel@tonic-gate  * NAMES:	raid_rderr
1526*0Sstevel@tonic-gate  * DESCRIPTION: I/O error handling routine for a RAID metadevice read
1527*0Sstevel@tonic-gate  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1528*0Sstevel@tonic-gate  * LOCKS:	must obtain unit writer lock while calling raid_error_state
1529*0Sstevel@tonic-gate  *		since a unit or column state transition may take place.
1530*0Sstevel@tonic-gate  *		must obtain unit reader lock to retry I/O.
1531*0Sstevel@tonic-gate  */
1532*0Sstevel@tonic-gate /*ARGSUSED*/
1533*0Sstevel@tonic-gate static void
1534*0Sstevel@tonic-gate raid_rderr(md_raidcs_t *cs)
1535*0Sstevel@tonic-gate {
1536*0Sstevel@tonic-gate 	md_raidps_t	*ps;
1537*0Sstevel@tonic-gate 	mdi_unit_t	*ui;
1538*0Sstevel@tonic-gate 	mr_unit_t	*un;
1539*0Sstevel@tonic-gate 	int		error = 0;
1540*0Sstevel@tonic-gate 
1541*0Sstevel@tonic-gate 	ps = cs->cs_ps;
1542*0Sstevel@tonic-gate 	ui = ps->ps_ui;
1543*0Sstevel@tonic-gate 	un = (mr_unit_t *)md_unit_writerlock(ui);
1544*0Sstevel@tonic-gate 	ASSERT(un != 0);
1545*0Sstevel@tonic-gate 
1546*0Sstevel@tonic-gate 	if (cs->cs_dbuf.b_flags & B_ERROR)
1547*0Sstevel@tonic-gate 		error = raid_error_state(un, &cs->cs_dbuf);
1548*0Sstevel@tonic-gate 	if (cs->cs_pbuf.b_flags & B_ERROR)
1549*0Sstevel@tonic-gate 		error |= raid_error_state(un, &cs->cs_pbuf);
1550*0Sstevel@tonic-gate 
1551*0Sstevel@tonic-gate 	md_unit_writerexit(ui);
1552*0Sstevel@tonic-gate 
1553*0Sstevel@tonic-gate 	ps->ps_flags |= MD_RPS_HSREQ;
1554*0Sstevel@tonic-gate 
1555*0Sstevel@tonic-gate 	un = (mr_unit_t *)md_unit_readerlock(ui);
1556*0Sstevel@tonic-gate 	ASSERT(un != 0);
1557*0Sstevel@tonic-gate 	/* now attempt the appropriate retry routine */
1558*0Sstevel@tonic-gate 	(*(cs->cs_retry_call))(un, cs);
1559*0Sstevel@tonic-gate }
1560*0Sstevel@tonic-gate 
1561*0Sstevel@tonic-gate 
1562*0Sstevel@tonic-gate /*
1563*0Sstevel@tonic-gate  * NAMES:	raid_read_error
1564*0Sstevel@tonic-gate  * DESCRIPTION: I/O error handling routine for a RAID metadevice read
1565*0Sstevel@tonic-gate  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1566*0Sstevel@tonic-gate  */
1567*0Sstevel@tonic-gate /*ARGSUSED*/
1568*0Sstevel@tonic-gate static void
1569*0Sstevel@tonic-gate raid_read_error(md_raidcs_t *cs)
1570*0Sstevel@tonic-gate {
1571*0Sstevel@tonic-gate 	md_raidps_t	*ps;
1572*0Sstevel@tonic-gate 	mdi_unit_t	*ui;
1573*0Sstevel@tonic-gate 	mr_unit_t	*un;
1574*0Sstevel@tonic-gate 	set_t		setno;
1575*0Sstevel@tonic-gate 
1576*0Sstevel@tonic-gate 	ps = cs->cs_ps;
1577*0Sstevel@tonic-gate 	ui = ps->ps_ui;
1578*0Sstevel@tonic-gate 	un = cs->cs_un;
1579*0Sstevel@tonic-gate 
1580*0Sstevel@tonic-gate 	setno = MD_UN2SET(un);
1581*0Sstevel@tonic-gate 
1582*0Sstevel@tonic-gate 	if ((cs->cs_dbuf.b_flags & B_ERROR) &&
1583*0Sstevel@tonic-gate 	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) &&
1584*0Sstevel@tonic-gate 	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED))
1585*0Sstevel@tonic-gate 		cmn_err(CE_WARN, "md %s: read error on %s",
1586*0Sstevel@tonic-gate 		    md_shortname(MD_SID(un)),
1587*0Sstevel@tonic-gate 		    md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0));
1588*0Sstevel@tonic-gate 
1589*0Sstevel@tonic-gate 	if ((cs->cs_pbuf.b_flags & B_ERROR) &&
1590*0Sstevel@tonic-gate 	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) &&
1591*0Sstevel@tonic-gate 	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED))
1592*0Sstevel@tonic-gate 		cmn_err(CE_WARN, "md %s: read error on %s",
1593*0Sstevel@tonic-gate 		    md_shortname(MD_SID(un)),
1594*0Sstevel@tonic-gate 		    md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0));
1595*0Sstevel@tonic-gate 
1596*0Sstevel@tonic-gate 	md_unit_readerexit(ui);
1597*0Sstevel@tonic-gate 
1598*0Sstevel@tonic-gate 	ASSERT(cs->cs_frags == 0);
1599*0Sstevel@tonic-gate 
1600*0Sstevel@tonic-gate 	/* now schedule processing for possible state change */
1601*0Sstevel@tonic-gate 	daemon_request(&md_mstr_daemon, raid_rderr,
1602*0Sstevel@tonic-gate 		(daemon_queue_t *)cs, REQ_OLD);
1603*0Sstevel@tonic-gate 
1604*0Sstevel@tonic-gate }
1605*0Sstevel@tonic-gate 
1606*0Sstevel@tonic-gate /*
1607*0Sstevel@tonic-gate  * NAMES:	getdbuffer
1608*0Sstevel@tonic-gate  * DESCRIPTION: data buffer allocation for a child structure
1609*0Sstevel@tonic-gate  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1610*0Sstevel@tonic-gate  *
1611*0Sstevel@tonic-gate  * NOTE: always get dbuffer before pbuffer
1612*0Sstevel@tonic-gate  *	 and get both buffers before pwslot
1613*0Sstevel@tonic-gate  *	 otherwise a deadlock could be introduced.
1614*0Sstevel@tonic-gate  */
1615*0Sstevel@tonic-gate static void
1616*0Sstevel@tonic-gate getdbuffer(md_raidcs_t *cs)
1617*0Sstevel@tonic-gate {
1618*0Sstevel@tonic-gate 	mr_unit_t	*un;
1619*0Sstevel@tonic-gate 
1620*0Sstevel@tonic-gate 	cs->cs_dbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP);
1621*0Sstevel@tonic-gate 	if (cs->cs_dbuffer != NULL)
1622*0Sstevel@tonic-gate 		return;
1623*0Sstevel@tonic-gate 	un = cs->cs_ps->ps_un;
1624*0Sstevel@tonic-gate 	mutex_enter(&un->un_mx);
1625*0Sstevel@tonic-gate 	while (un->un_dbuffer == NULL) {
1626*0Sstevel@tonic-gate 		STAT_INC(data_buffer_waits);
1627*0Sstevel@tonic-gate 		un->un_rflags |= MD_RFLAG_NEEDBUF;
1628*0Sstevel@tonic-gate 		cv_wait(&un->un_cv, &un->un_mx);
1629*0Sstevel@tonic-gate 	}
1630*0Sstevel@tonic-gate 	cs->cs_dbuffer = un->un_dbuffer;
1631*0Sstevel@tonic-gate 	cs->cs_flags |= MD_RCS_UNDBUF;
1632*0Sstevel@tonic-gate 	un->un_dbuffer = NULL;
1633*0Sstevel@tonic-gate 	mutex_exit(&un->un_mx);
1634*0Sstevel@tonic-gate }
1635*0Sstevel@tonic-gate 
1636*0Sstevel@tonic-gate /*
1637*0Sstevel@tonic-gate  * NAMES:	getpbuffer
1638*0Sstevel@tonic-gate  * DESCRIPTION: parity buffer allocation for a child structure
1639*0Sstevel@tonic-gate  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1640*0Sstevel@tonic-gate  *
1641*0Sstevel@tonic-gate  * NOTE: always get dbuffer before pbuffer
1642*0Sstevel@tonic-gate  *	 and get both buffers before pwslot
1643*0Sstevel@tonic-gate  *	 otherwise a deadlock could be introduced.
1644*0Sstevel@tonic-gate  */
1645*0Sstevel@tonic-gate static void
1646*0Sstevel@tonic-gate getpbuffer(md_raidcs_t *cs)
1647*0Sstevel@tonic-gate {
1648*0Sstevel@tonic-gate 	mr_unit_t *un;
1649*0Sstevel@tonic-gate 
1650*0Sstevel@tonic-gate 	cs->cs_pbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP);
1651*0Sstevel@tonic-gate 	if (cs->cs_pbuffer != NULL)
1652*0Sstevel@tonic-gate 		return;
1653*0Sstevel@tonic-gate 	un = cs->cs_ps->ps_un;
1654*0Sstevel@tonic-gate 	mutex_enter(&un->un_mx);
1655*0Sstevel@tonic-gate 	while (un->un_pbuffer == NULL) {
1656*0Sstevel@tonic-gate 		STAT_INC(parity_buffer_waits);
1657*0Sstevel@tonic-gate 		un->un_rflags |= MD_RFLAG_NEEDBUF;
1658*0Sstevel@tonic-gate 		cv_wait(&un->un_cv, &un->un_mx);
1659*0Sstevel@tonic-gate 	}
1660*0Sstevel@tonic-gate 	cs->cs_pbuffer = un->un_pbuffer;
1661*0Sstevel@tonic-gate 	cs->cs_flags |= MD_RCS_UNPBUF;
1662*0Sstevel@tonic-gate 	un->un_pbuffer = NULL;
1663*0Sstevel@tonic-gate 	mutex_exit(&un->un_mx);
1664*0Sstevel@tonic-gate }
1665*0Sstevel@tonic-gate static void
1666*0Sstevel@tonic-gate getresources(md_raidcs_t *cs)
1667*0Sstevel@tonic-gate {
1668*0Sstevel@tonic-gate 	md_raidcbuf_t	*cbuf;
1669*0Sstevel@tonic-gate 	/*
1670*0Sstevel@tonic-gate 	 * NOTE: always get dbuffer before pbuffer
1671*0Sstevel@tonic-gate 	 *	 and get both buffers before pwslot
1672*0Sstevel@tonic-gate 	 *	 otherwise a deadlock could be introduced.
1673*0Sstevel@tonic-gate 	 */
1674*0Sstevel@tonic-gate 	getdbuffer(cs);
1675*0Sstevel@tonic-gate 	getpbuffer(cs);
1676*0Sstevel@tonic-gate 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
1677*0Sstevel@tonic-gate 		cbuf->cbuf_buffer =
1678*0Sstevel@tonic-gate 		    kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_SLEEP);
1679*0Sstevel@tonic-gate }
1680*0Sstevel@tonic-gate /*
1681*0Sstevel@tonic-gate  * NAMES:	freebuffers
1682*0Sstevel@tonic-gate  * DESCRIPTION: child structure buffer freeing routine
1683*0Sstevel@tonic-gate  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1684*0Sstevel@tonic-gate  */
static void
freebuffers(md_raidcs_t *cs)
{
	mr_unit_t	*un;
	md_raidcbuf_t	*cbuf;

	/* free buffers used for full line write */
	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
		if (cbuf->cbuf_buffer == NULL)
			continue;
		kmem_free(cbuf->cbuf_buffer, cbuf->cbuf_bcount + DEV_BSIZE);
		cbuf->cbuf_buffer = NULL;
		cbuf->cbuf_bcount = 0;
	}

	/*
	 * If either buffer was borrowed from the unit's preallocated
	 * pair (get[dp]buffer() fell back to un_dbuffer/un_pbuffer and
	 * set MD_RCS_UN[DP]BUF), un_mx must be held while handing the
	 * buffer back; otherwise it was kmem_alloc'd and is simply freed.
	 */
	if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) {
		un = cs->cs_un;
		mutex_enter(&un->un_mx);
	}
	if (cs->cs_dbuffer) {
		if (cs->cs_flags & MD_RCS_UNDBUF)
			un->un_dbuffer = cs->cs_dbuffer;
		else
			kmem_free(cs->cs_dbuffer, cs->cs_bcount + DEV_BSIZE);
	}
	if (cs->cs_pbuffer) {
		if (cs->cs_flags & MD_RCS_UNPBUF)
			un->un_pbuffer = cs->cs_pbuffer;
		else
			kmem_free(cs->cs_pbuffer, cs->cs_bcount + DEV_BSIZE);
	}
	if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) {
		/* wake any threads blocked in get[dp]buffer() on un_cv */
		un->un_rflags &= ~MD_RFLAG_NEEDBUF;
		cv_broadcast(&un->un_cv);
		mutex_exit(&un->un_mx);
	}
}
1722*0Sstevel@tonic-gate 
1723*0Sstevel@tonic-gate /*
1724*0Sstevel@tonic-gate  * NAMES:	raid_line_reader_lock, raid_line_writer_lock
1725*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice line reader and writer lock routines
1726*0Sstevel@tonic-gate  *		data column # and parity column #.
1727*0Sstevel@tonic-gate  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1728*0Sstevel@tonic-gate  */
1729*0Sstevel@tonic-gate 
void
raid_line_reader_lock(md_raidcs_t *cs, int resync_thread)
{
	mr_unit_t	*un;
	md_raidcs_t	*cs1;

	ASSERT(cs->cs_line != MD_DISKADDR_ERROR);
	un = cs->cs_un;
	cs->cs_flags |= MD_RCS_READER;
	STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));
	/* during panic the mutex cannot be taken; proceed lockless */
	if (!panicstr)
		mutex_enter(&un->un_linlck_mx);
	/*
	 * Block until no overlapping WRITER lock is on the chain.
	 * Overlapping readers are allowed to coexist; only an
	 * MD_RCS_WRITER holder forces a cv_wait.  The chain is
	 * rescanned from the head after every wakeup.
	 */
	cs1 = un->un_linlck_chn;
	while (cs1 != NULL) {
		for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next)
			if (raid_io_overlaps(cs, cs1) == 1)
				if (cs1->cs_flags & MD_RCS_WRITER)
					break;

		if (cs1 != NULL) {
			if (panicstr)
				panic("md; raid line write lock held");
			un->un_linlck_flg = 1;
			cv_wait(&un->un_linlck_cv, &un->un_linlck_mx);
			STAT_INC(raid_read_waits);
		}
	}
	STAT_MAX(raid_max_reader_locks, raid_reader_locks_active);
	STAT_INC(raid_reader_locks);
	/* insert this child at the head of the line lock chain */
	cs1 = un->un_linlck_chn;
	if (cs1 != NULL)
		cs1->cs_linlck_prev = cs;
	cs->cs_linlck_next = cs1;
	cs->cs_linlck_prev = NULL;
	un->un_linlck_chn = cs;
	cs->cs_flags |= MD_RCS_LLOCKD;
	if (resync_thread) {
		diskaddr_t lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
		diskaddr_t line = (lastblk + 1) / un->un_segsize;
		ASSERT(raid_state_cnt(un, RCS_RESYNC));
		/* publish the resync position; un_mx protects the index */
		mutex_enter(&un->un_mx);
		un->un_resync_line_index = line;
		mutex_exit(&un->un_mx);
	}
	if (!panicstr)
		mutex_exit(&un->un_linlck_mx);
}
1777*0Sstevel@tonic-gate 
/*
 * Acquire the line writer lock for this child.  With `lock` non-zero
 * the routine takes un_linlck_mx itself; with `lock` zero the caller
 * must already hold it (see raid_io_startup()).  Returns 0 with the
 * line lock held; returns 1 when an overlapping I/O or a prewrite
 * slot shortage prevents acquisition, in which case the child is
 * queued via raid_enqueue() unless it was already waiting or the
 * caller holds the mutex.
 */
int
raid_line_writer_lock(md_raidcs_t *cs, int lock)
{
	mr_unit_t	*un;
	md_raidcs_t	*cs1;

	ASSERT(cs->cs_line != MD_DISKADDR_ERROR);
	cs->cs_flags |= MD_RCS_WRITER;
	un = cs->cs_ps->ps_un;

	STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));
	if (lock && !panicstr)
		mutex_enter(&un->un_linlck_mx);
	ASSERT(MUTEX_HELD(&un->un_linlck_mx));

	/* a writer conflicts with ANY overlapping I/O, reader or writer */
	cs1 = un->un_linlck_chn;
	for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next)
		if (raid_io_overlaps(cs, cs1))
			break;

	if (cs1 != NULL) {
		if (panicstr)
			panic("md: line writer lock inaccessible");
		goto no_lock_exit;
	}

	/* a writer also needs a prewrite slot before it may proceed */
	if (raid_alloc_pwslot(cs)) {
		if (panicstr)
			panic("md: no prewrite slots");
		STAT_INC(raid_prewrite_waits);
		goto no_lock_exit;
	}

	/* success: insert at the head of the line lock chain */
	cs1 = un->un_linlck_chn;
	if (cs1 != NULL)
		cs1->cs_linlck_prev = cs;
	cs->cs_linlck_next = cs1;
	cs->cs_linlck_prev = NULL;
	un->un_linlck_chn = cs;
	cs->cs_flags |= MD_RCS_LLOCKD;
	cs->cs_flags &= ~MD_RCS_WAITING;
	STAT_INC(raid_writer_locks);
	STAT_MAX(raid_max_write_locks, raid_write_locks_active);
	if (lock && !panicstr)
		mutex_exit(&un->un_linlck_mx);
	return (0);

no_lock_exit:
	/* if this is already queued then do not requeue it */
	ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD));
	if (!lock || (cs->cs_flags & MD_RCS_WAITING))
		return (1);
	cs->cs_flags |= MD_RCS_WAITING;
	cs->cs_un = un;
	raid_enqueue(cs);
	if (lock && !panicstr)
		mutex_exit(&un->un_linlck_mx);
	return (1);
}
1837*0Sstevel@tonic-gate 
1838*0Sstevel@tonic-gate static void
1839*0Sstevel@tonic-gate raid_startio(md_raidcs_t *cs)
1840*0Sstevel@tonic-gate {
1841*0Sstevel@tonic-gate 	mdi_unit_t	*ui = cs->cs_ps->ps_ui;
1842*0Sstevel@tonic-gate 	mr_unit_t	*un = cs->cs_un;
1843*0Sstevel@tonic-gate 
1844*0Sstevel@tonic-gate 	un = md_unit_readerlock(ui);
1845*0Sstevel@tonic-gate 	raid_write_io(un, cs);
1846*0Sstevel@tonic-gate }
1847*0Sstevel@tonic-gate 
/*
 * NAME:	raid_io_startup
 * DESCRIPTION:	walk the unit's waiting-I/O list and start every queued
 *		write that no longer overlaps an active line lock and
 *		that can obtain its prewrite resources.  Started I/Os
 *		are handed to the master daemon via raid_startio().
 * PARAMETERS:	mr_unit_t *un - pointer to raid unit structure
 * LOCKS:	caller must hold un_linlck_mx; io_list_mutex is taken here.
 */
void
raid_io_startup(mr_unit_t *un)
{
	md_raidcs_t	*waiting_list, *cs1;
	md_raidcs_t	*previous = NULL, *next = NULL;
	mdi_unit_t	*ui =  MDI_UNIT(un->c.un_self_id);
	kmutex_t	*io_list_mutex = &ui->ui_io_lock->io_list_mutex;

	ASSERT(MUTEX_HELD(&un->un_linlck_mx));
	mutex_enter(io_list_mutex);

	/*
	 * check to be sure there are no reader locks outstanding.  If
	 * there are not then pass on the writer lock.
	 */
	waiting_list = ui->ui_io_lock->io_list_front;
	while (waiting_list) {
		ASSERT(waiting_list->cs_flags & MD_RCS_WAITING);
		ASSERT(! (waiting_list->cs_flags & MD_RCS_LLOCKD));
		for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next)
			if (raid_io_overlaps(waiting_list, cs1) == 1)
				break;
		/*
		 * there was an IOs that overlaps this io so go onto
		 * the next io in the waiting list
		 */
		if (cs1) {
			previous = waiting_list;
			waiting_list = waiting_list->cs_linlck_next;
			continue;
		}

		/*
		 * There are no IOs that overlap this, so remove it from
		 * the waiting queue, and start it
		 */

		/* prewrite resources not available yet; leave it queued */
		if (raid_check_pw(waiting_list)) {
			ASSERT(waiting_list->cs_flags & MD_RCS_WAITING);
			previous = waiting_list;
			waiting_list = waiting_list->cs_linlck_next;
			continue;
		}
		ASSERT(waiting_list->cs_flags & MD_RCS_WAITING);

		/* unlink this child from the singly-linked waiting list */
		next = waiting_list->cs_linlck_next;
		if (previous)
			previous->cs_linlck_next = next;
		else
			ui->ui_io_lock->io_list_front = next;

		if (ui->ui_io_lock->io_list_front == NULL)
			ui->ui_io_lock->io_list_back = NULL;

		if (ui->ui_io_lock->io_list_back == waiting_list)
			ui->ui_io_lock->io_list_back = previous;

		waiting_list->cs_linlck_next = NULL;
		waiting_list->cs_flags &= ~MD_RCS_WAITING;
		STAT_DEC(raid_write_queue_length);
		/*
		 * lock == 0: we hold un_linlck_mx, and the overlap and
		 * prewrite checks above guarantee this cannot fail.
		 */
		if (raid_line_writer_lock(waiting_list, 0))
			panic("region locking corrupted");

		ASSERT(waiting_list->cs_flags & MD_RCS_LLOCKD);
		daemon_request(&md_mstr_daemon, raid_startio,
		    (daemon_queue_t *)waiting_list, REQ_OLD);
		waiting_list = next;

	}
	mutex_exit(io_list_mutex);
}
1919*0Sstevel@tonic-gate 
/*
 * NAME:	raid_line_exit
 * DESCRIPTION: Release the line (region) lock held by a child structure:
 *		unlink it from the unit's line-lock chain, wake any threads
 *		waiting for a line lock, cancel its prewrite slot and then
 *		try to start any queued writes that are now unblocked.
 * PARAMETERS:	md_raidcs_t *cs - child structure holding the line lock
 */
void
raid_line_exit(md_raidcs_t *cs)
{
	mr_unit_t	*un;

	un = cs->cs_ps->ps_un;
	STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));
	mutex_enter(&un->un_linlck_mx);
	/* account for the kind of lock being released */
	if (cs->cs_flags & MD_RCS_READER)
		STAT_DEC(raid_reader_locks_active);
	else
		STAT_DEC(raid_write_locks_active);

	/* unlink cs from the doubly-linked line-lock chain */
	if (cs->cs_linlck_prev)
		cs->cs_linlck_prev->cs_linlck_next = cs->cs_linlck_next;
	else
		un->un_linlck_chn = cs->cs_linlck_next;
	if (cs->cs_linlck_next)
		cs->cs_linlck_next->cs_linlck_prev = cs->cs_linlck_prev;

	cs->cs_flags &= ~MD_RCS_LLOCKD;

	/* wake up anybody blocked waiting for a line lock */
	if (un->un_linlck_flg)
		cv_broadcast(&un->un_linlck_cv);

	un->un_linlck_flg = 0;
	/* mark the child as no longer owning a line */
	cs->cs_line = MD_DISKADDR_ERROR;

	raid_cancel_pwslot(cs);
	/*
	 * now that the lock is dropped go ahead and see if there are any
	 * other writes that can be started up
	 */
	raid_io_startup(un);

	mutex_exit(&un->un_linlck_mx);
}
1957*0Sstevel@tonic-gate 
1958*0Sstevel@tonic-gate /*
1959*0Sstevel@tonic-gate  * NAMES:	raid_line, raid_pcolumn, raid_dcolumn
1960*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice APIs for mapping segment # to line #,
1961*0Sstevel@tonic-gate  *		data column # and parity column #.
 * PARAMETERS:	diskaddr_t segment - segment number
1963*0Sstevel@tonic-gate  *		mr_unit_t *un - pointer to an unit structure
1964*0Sstevel@tonic-gate  * RETURNS:	raid_line returns line #
1965*0Sstevel@tonic-gate  *		raid_dcolumn returns data column #
1966*0Sstevel@tonic-gate  *		raid_pcolumn returns parity column #
1967*0Sstevel@tonic-gate  */
1968*0Sstevel@tonic-gate static diskaddr_t
1969*0Sstevel@tonic-gate raid_line(diskaddr_t segment, mr_unit_t *un)
1970*0Sstevel@tonic-gate {
1971*0Sstevel@tonic-gate 	diskaddr_t	adj_seg;
1972*0Sstevel@tonic-gate 	diskaddr_t	line;
1973*0Sstevel@tonic-gate 	diskaddr_t	max_orig_segment;
1974*0Sstevel@tonic-gate 
1975*0Sstevel@tonic-gate 	max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
1976*0Sstevel@tonic-gate 	if (segment >= max_orig_segment) {
1977*0Sstevel@tonic-gate 		adj_seg = segment - max_orig_segment;
1978*0Sstevel@tonic-gate 		line = adj_seg % un->un_segsincolumn;
1979*0Sstevel@tonic-gate 	} else {
1980*0Sstevel@tonic-gate 		line = segment / (un->un_origcolumncnt - 1);
1981*0Sstevel@tonic-gate 	}
1982*0Sstevel@tonic-gate 	return (line);
1983*0Sstevel@tonic-gate }
1984*0Sstevel@tonic-gate 
1985*0Sstevel@tonic-gate uint_t
1986*0Sstevel@tonic-gate raid_dcolumn(diskaddr_t segment, mr_unit_t *un)
1987*0Sstevel@tonic-gate {
1988*0Sstevel@tonic-gate 	diskaddr_t	adj_seg;
1989*0Sstevel@tonic-gate 	diskaddr_t	line;
1990*0Sstevel@tonic-gate 	diskaddr_t	max_orig_segment;
1991*0Sstevel@tonic-gate 	uint_t		column;
1992*0Sstevel@tonic-gate 
1993*0Sstevel@tonic-gate 	max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
1994*0Sstevel@tonic-gate 	if (segment >= max_orig_segment) {
1995*0Sstevel@tonic-gate 		adj_seg = segment - max_orig_segment;
1996*0Sstevel@tonic-gate 		column = un->un_origcolumncnt  +
1997*0Sstevel@tonic-gate 			(uint_t)(adj_seg / un->un_segsincolumn);
1998*0Sstevel@tonic-gate 	} else {
1999*0Sstevel@tonic-gate 		line = segment / (un->un_origcolumncnt - 1);
2000*0Sstevel@tonic-gate 		column = (uint_t)((segment % (un->un_origcolumncnt - 1) + line)
2001*0Sstevel@tonic-gate 		    % un->un_origcolumncnt);
2002*0Sstevel@tonic-gate 	}
2003*0Sstevel@tonic-gate 	return (column);
2004*0Sstevel@tonic-gate }
2005*0Sstevel@tonic-gate 
2006*0Sstevel@tonic-gate uint_t
2007*0Sstevel@tonic-gate raid_pcolumn(diskaddr_t segment, mr_unit_t *un)
2008*0Sstevel@tonic-gate {
2009*0Sstevel@tonic-gate 	diskaddr_t	adj_seg;
2010*0Sstevel@tonic-gate 	diskaddr_t	line;
2011*0Sstevel@tonic-gate 	diskaddr_t	max_orig_segment;
2012*0Sstevel@tonic-gate 	uint_t		column;
2013*0Sstevel@tonic-gate 
2014*0Sstevel@tonic-gate 	max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
2015*0Sstevel@tonic-gate 	if (segment >= max_orig_segment) {
2016*0Sstevel@tonic-gate 		adj_seg = segment - max_orig_segment;
2017*0Sstevel@tonic-gate 		line = adj_seg % un->un_segsincolumn;
2018*0Sstevel@tonic-gate 	} else {
2019*0Sstevel@tonic-gate 		line = segment / (un->un_origcolumncnt - 1);
2020*0Sstevel@tonic-gate 	}
2021*0Sstevel@tonic-gate 	column = (uint_t)((line + (un->un_origcolumncnt - 1))
2022*0Sstevel@tonic-gate 				% un->un_origcolumncnt);
2023*0Sstevel@tonic-gate 	return (column);
2024*0Sstevel@tonic-gate }
2025*0Sstevel@tonic-gate 
2026*0Sstevel@tonic-gate 
2027*0Sstevel@tonic-gate /*
2028*0Sstevel@tonic-gate  * Is called in raid_iosetup to probe each column to insure
2029*0Sstevel@tonic-gate  * that all the columns are in 'okay' state and meet the
2030*0Sstevel@tonic-gate  * 'full line' requirement.  If any column is in error,
2031*0Sstevel@tonic-gate  * we don't want to enable the 'full line' flag.  Previously,
2032*0Sstevel@tonic-gate  * we would do so and disable it only when a error is
2033*0Sstevel@tonic-gate  * detected after the first 'full line' io which is too late
2034*0Sstevel@tonic-gate  * and leads to the potential data corruption.
2035*0Sstevel@tonic-gate  */
static int
raid_check_cols(mr_unit_t *un)
{
	buf_t		bp;
	char		*buf;
	mr_column_t	*colptr;
	minor_t		mnum = MD_SID(un);
	int		i;
	int		err = 0;	/* 0 == every column probed okay */

	/* one-block scratch buffer shared by all the probe reads */
	buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP);

	for (i = 0; i < un->un_totalcolumncnt; i++) {
		md_dev64_t tmpdev;

		colptr = &un->un_column[i];

		tmpdev = colptr->un_dev;
		/*
		 * Open by device id
		 * If this device is hotspared
		 * use the hotspare key
		 */
		tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
			colptr->un_hs_key : colptr->un_orig_key);

		/* unresolvable device: report failure for the unit */
		if (tmpdev == NODEV64) {
			err = 1;
			break;
		}

		colptr->un_dev = tmpdev;

		/*
		 * Hand-build a private buf and issue a synchronous
		 * one-block read at the column's prewrite area to make
		 * sure the device actually responds.
		 */
		bzero((caddr_t)&bp, sizeof (buf_t));
		bp.b_back = &bp;
		bp.b_forw = &bp;
		bp.b_flags = (B_READ | B_BUSY);
		sema_init(&bp.b_io, 0, NULL,
		    SEMA_DEFAULT, NULL);
		sema_init(&bp.b_sem, 0, NULL,
		    SEMA_DEFAULT, NULL);
		bp.b_edev = md_dev64_to_dev(colptr->un_dev);
		bp.b_lblkno = colptr->un_pwstart;
		bp.b_bcount = DEV_BSIZE;
		bp.b_bufsize = DEV_BSIZE;
		bp.b_un.b_addr = (caddr_t)buf;
		(void) md_call_strategy(&bp, 0, NULL);
		/* biowait() returns non-zero on I/O error */
		if (biowait(&bp)) {
			err = 1;
			break;
		}
	}

	kmem_free(buf, DEV_BSIZE);
	return (err);
}
2092*0Sstevel@tonic-gate 
2093*0Sstevel@tonic-gate /*
2094*0Sstevel@tonic-gate  * NAME:	raid_iosetup
2095*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice specific I/O set up routine which does
2096*0Sstevel@tonic-gate  *		all the necessary calculations to determine the location
2097*0Sstevel@tonic-gate  *		of the segement for the I/O.
 * PARAMETERS:	mr_unit_t *un - pointer to the RAID unit structure
2099*0Sstevel@tonic-gate  *		diskaddr_t	blkno - block number of the I/O attempt
2100*0Sstevel@tonic-gate  *		size_t		blkcnt - block count for this I/O
2101*0Sstevel@tonic-gate  *		md_raidcs_t *cs - child structure for each segmented I/O
2102*0Sstevel@tonic-gate  *
 * NOTE:	The following is an example of a raid disk layout:
2104*0Sstevel@tonic-gate  *
2105*0Sstevel@tonic-gate  *		Total Column = 5
2106*0Sstevel@tonic-gate  *		Original Column = 4
2107*0Sstevel@tonic-gate  *		Segment Per Column = 10
2108*0Sstevel@tonic-gate  *
2109*0Sstevel@tonic-gate  *			Col#0	Col#1	Col#2	Col#3	Col#4	Col#5	Col#6
2110*0Sstevel@tonic-gate  *		-------------------------------------------------------------
2111*0Sstevel@tonic-gate  *		line#0	Seg#0	Seg#1	Seg#2	Parity	Seg#30	Seg#40
2112*0Sstevel@tonic-gate  *		line#1	Parity	Seg#3	Seg#4	Seg#5	Seg#31
2113*0Sstevel@tonic-gate  *		line#2	Seg#8	Parity	Seg#6	Seg#7	Seg#32
2114*0Sstevel@tonic-gate  *		line#3	Seg#10	Seg#11	Parity	Seg#9	Seg#33
2115*0Sstevel@tonic-gate  *		line#4	Seg#12	Seg#13	Seg#14	Parity	Seg#34
2116*0Sstevel@tonic-gate  *		line#5	Parity	Seg#15	Seg#16	Seg#17	Seg#35
2117*0Sstevel@tonic-gate  *		line#6	Seg#20	Parity	Seg#18	Seg#19	Seg#36
2118*0Sstevel@tonic-gate  *		line#7	Seg#22	Seg#23	Parity	Seg#21	Seg#37
2119*0Sstevel@tonic-gate  *		line#8	Seg#24	Seg#25	Seg#26	Parity	Seg#38
2120*0Sstevel@tonic-gate  *		line#9	Parity	Seg#27	Seg#28	Seg#29	Seg#39
2121*0Sstevel@tonic-gate  */
static size_t
raid_iosetup(
	mr_unit_t	*un,
	diskaddr_t	blkno,
	size_t		blkcnt,
	md_raidcs_t	*cs
)
{
	diskaddr_t	segment;
	diskaddr_t	segstart;
	diskaddr_t	segoff;
	size_t		leftover;	/* blocks NOT covered by this child */
	diskaddr_t	line;
	uint_t		iosize;
	uint_t		colcnt;

	/* calculate the segment# and offset for the block */
	segment = blkno / un->un_segsize;
	segstart = segment * un->un_segsize;
	segoff = blkno - segstart;
	/* NOTE(review): un_iosize appears to include one header block — confirm */
	iosize = un->un_iosize - 1;
	colcnt = un->un_totalcolumncnt - 1;
	line = raid_line(segment, un);
	cs->cs_dcolumn = raid_dcolumn(segment, un);
	cs->cs_pcolumn = raid_pcolumn(segment, un);
	cs->cs_dflags = un->un_column[cs->cs_dcolumn].un_devflags;
	cs->cs_pflags = un->un_column[cs->cs_pcolumn].un_devflags;
	cs->cs_line = line;

	/*
	 * Full-line write: only taken for a write that starts exactly on
	 * a line boundary and covers at least a whole line of a fully
	 * populated unit whose columns are all okay; raid_check_cols()
	 * probes every column first so an errored column cannot slip in
	 * (see the comment above raid_check_cols()).
	 */
	if ((cs->cs_ps->ps_flags & MD_RPS_WRITE) &&
	    (UNIT_STATE(un) & RCS_OKAY) &&
	    (segoff == 0) &&
	    (un->un_totalcolumncnt == un->un_origcolumncnt) &&
	    (un->un_segsize < un->un_iosize) &&
	    (un->un_iosize <= un->un_maxio) &&
	    (blkno == line * un->un_segsize * colcnt) &&
	    (blkcnt >= ((un->un_totalcolumncnt -1) * un->un_segsize)) &&
	    (raid_state_cnt(un, RCS_OKAY) == un->un_origcolumncnt) &&
	    (raid_check_cols(un) == 0)) {

		md_raidcbuf_t	**cbufp;
		md_raidcbuf_t	*cbuf;
		int		i, j;

		STAT_INC(raid_full_line_writes);
		leftover = blkcnt - (un->un_segsize * colcnt);
		ASSERT(blkcnt >= (un->un_segsize * colcnt));
		cs->cs_blkno = line * un->un_segsize;
		cs->cs_blkcnt = un->un_segsize;
		cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
		cs->cs_bcount = dbtob(cs->cs_blkcnt);
		cs->cs_flags |= MD_RCS_LINE;

		/*
		 * Allocate a child buffer for every column of the line
		 * except the data and parity columns, which are carried
		 * by cs itself.
		 */
		cbufp = &cs->cs_buflist;
		for (i = 0; i < un->un_totalcolumncnt; i++) {
			j = cs->cs_dcolumn + i;
			j = j % un->un_totalcolumncnt;

			if ((j == cs->cs_dcolumn) || (j == cs->cs_pcolumn))
				continue;
			cbuf = kmem_cache_alloc(raid_cbuf_cache,
			    MD_ALLOCFLAGS);
			raid_cbuf_init(cbuf);
			cbuf->cbuf_un = cs->cs_un;
			cbuf->cbuf_ps = cs->cs_ps;
			cbuf->cbuf_column = j;
			cbuf->cbuf_bcount = dbtob(un->un_segsize);
			*cbufp = cbuf;
			cbufp = &cbuf->cbuf_next;
		}
		return (leftover);
	}

	/* clip this child's I/O to the remainder of the segment ... */
	leftover = blkcnt - (un->un_segsize - segoff);
	if (blkcnt > (un->un_segsize - segoff))
		blkcnt -= leftover;
	else
		leftover = 0;

	/* ... and to the unit's maximum data transfer size */
	if (blkcnt > (size_t)iosize) {
		leftover += (blkcnt - iosize);
		blkcnt = iosize;
	}

	/* calculate the line# and column# for the segment */
	cs->cs_flags &= ~MD_RCS_LINE;
	cs->cs_blkno = line * un->un_segsize + segoff;
	cs->cs_blkcnt = (uint_t)blkcnt;
	cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
	cs->cs_bcount = dbtob((uint_t)blkcnt);
	return (leftover);
}
2214*0Sstevel@tonic-gate 
2215*0Sstevel@tonic-gate /*
2216*0Sstevel@tonic-gate  * NAME:	raid_done
2217*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice I/O done interrupt routine
2218*0Sstevel@tonic-gate  * PARAMETERS:	struct buf *bp - pointer to a buffer structure
2219*0Sstevel@tonic-gate  */
static void
raid_done(struct buf *bp)
{
	md_raidcs_t	*cs;
	int		flags, frags;

	/* release any thread blocked in biowait() on this buffer */
	sema_v(&bp->b_io);
	cs = (md_raidcs_t *)bp->b_chain;

	ASSERT(cs != NULL);

	mutex_enter(&cs->cs_mx);
	if (bp->b_flags & B_ERROR) {
		/* record the error; never run the completion inline on error */
		cs->cs_flags |= MD_RCS_ERROR;
		cs->cs_flags &= ~(MD_RCS_ISCALL);
	}

	/* snapshot flags and the outstanding-fragment count under the lock */
	flags = cs->cs_flags;
	frags = --cs->cs_frags;
	mutex_exit(&cs->cs_mx);
	/* more fragments still in flight: nothing to do yet */
	if (frags != 0) {
		return;
	}

	if (flags & MD_RCS_ERROR) {
		/* last fragment with an error: queue the error handler */
		if (cs->cs_error_call) {
			daemon_request(&md_done_daemon, cs->cs_error_call,
				(daemon_queue_t *)cs, REQ_OLD);
		}
		return;
	}

	if (flags & MD_RCS_ISCALL) {
		/* run the completion routine directly instead of queueing it */
		cs->cs_flags &= ~(MD_RCS_ISCALL);
		(*(cs->cs_call))(cs);
		return;
	}
	/* default path: defer the completion routine to the done daemon */
	daemon_request(&md_done_daemon, cs->cs_call,
					(daemon_queue_t *)cs, REQ_OLD);
}
/*
 * the flag RIO_EXTRA is used when dealing with a column in the process
 * of being resynced. During the resync, writes may have to take place
 * on both the original component and a hotspare component.
 */
#define	RIO_DATA	0x00100		/* use data buffer & data column */
#define	RIO_PARITY	0x00200		/* use parity buffer & parity column */
#define	RIO_WRITE	0x00400		/* issue a write */
#define	RIO_READ	0x00800		/* issue a read */
#define	RIO_PWIO	0x01000		/* do the I/O to the prewrite entry */
#define	RIO_ALT		0x02000		/* do write to alternate device */
#define	RIO_EXTRA	0x04000		/* use extra buffer */

#define	RIO_COLMASK	0x000ff		/* 1-based column number, 0 == unset */

/*
 * Parenthesized so the macro expands safely next to any operator
 * (e.g. `flags & RIO_PREWRITE' would otherwise misparse).
 */
#define	RIO_PREWRITE	(RIO_WRITE | RIO_PWIO)
2276*0Sstevel@tonic-gate 
2277*0Sstevel@tonic-gate /*
2278*0Sstevel@tonic-gate  * NAME:	raidio
2279*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice write routine
2280*0Sstevel@tonic-gate  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
2281*0Sstevel@tonic-gate  */
static void
raidio(md_raidcs_t *cs, int flags)
{
	buf_t		*bp;
	int		column;
	int		flag;
	void		*private;
	mr_unit_t	*un;
	int		iosize;
	diskaddr_t	pwstart;
	diskaddr_t	devstart;
	md_dev64_t	dev;

	un = cs->cs_un;

	ASSERT(IO_READER_HELD(un));
	ASSERT(UNIT_READER_HELD(un));

	/*
	 * Pick the buffer/column pair: data or parity, with the extra
	 * (hotspare) buffer substituted when RIO_EXTRA is set.
	 */
	if (flags & RIO_DATA) {
		if (flags & RIO_EXTRA)
			bp = &cs->cs_hbuf;
		else
			bp = &cs->cs_dbuf;
		bp->b_un.b_addr = cs->cs_dbuffer;
		column = cs->cs_dcolumn;
	} else {
		if (flags & RIO_EXTRA)
			bp = &cs->cs_hbuf;
		else
			bp = &cs->cs_pbuf;
		bp->b_un.b_addr = cs->cs_pbuffer;
		column = cs->cs_pcolumn;
	}
	/* an explicit (1-based) column in the flags overrides the above */
	if (flags & RIO_COLMASK)
		column = (flags & RIO_COLMASK) - 1;

	bp->b_bcount = cs->cs_bcount;
	bp->b_bufsize = cs->cs_bcount;
	iosize = un->un_iosize;

	/* check if the hotspared device will be used */
	if (flags & RIO_ALT && (flags & RIO_WRITE)) {
		pwstart = un->un_column[column].un_alt_pwstart;
		devstart = un->un_column[column].un_alt_devstart;
		dev = un->un_column[column].un_alt_dev;
	} else {
		pwstart = un->un_column[column].un_pwstart;
		devstart = un->un_column[column].un_devstart;
		dev = un->un_column[column].un_dev;
	}

	/* if not writing to log skip log header */
	if ((flags & RIO_PWIO) == 0) {
		bp->b_lblkno = devstart + cs->cs_blkno;
		bp->b_un.b_addr += DEV_BSIZE;
	} else {
		/* prewrite I/O carries its one-block header as well */
		bp->b_bcount += DEV_BSIZE;
		bp->b_bufsize = bp->b_bcount;
		if (flags & RIO_DATA) {
			bp->b_lblkno = cs->cs_dpwslot * iosize + pwstart;
		} else { /* not DATA -> PARITY */
			bp->b_lblkno = cs->cs_ppwslot * iosize + pwstart;
		}
	}

	/* the buf is reused: clear stale direction/error/NVRAM bits */
	bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR | nv_available);
	bp->b_flags |= B_BUSY;
	if (flags & RIO_READ) {
		bp->b_flags |= B_READ;
	} else {
		bp->b_flags |= B_WRITE;
		/* tag parity/prewrite writes for NVRAM when enabled */
		if ((nv_available && nv_parity && (flags & RIO_PARITY)) ||
		    (nv_available && nv_prewrite && (flags & RIO_PWIO)))
			bp->b_flags |= nv_available;
	}
	bp->b_iodone = (int (*)())raid_done;
	bp->b_edev = md_dev64_to_dev(dev);

	ASSERT((bp->b_edev != 0) && (bp->b_edev != NODEV));

	private = cs->cs_strategy_private;
	flag = cs->cs_strategy_flag;

	md_call_strategy(bp, flag, private);
}
2367*0Sstevel@tonic-gate 
/*
 * NAME:	genstandardparity
 * DESCRIPTION: Copy the caller's data (cs_addr) into the child's data
 *		buffer and update the parity buffer in place:
 *		new parity = old parity ^ old data ^ new data.
 *		Checksums of the new data and new parity are accumulated
 *		and the prewrite headers of both buffers are filled in.
 * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
 */
static void
genstandardparity(md_raidcs_t *cs)
{
	uint_t		*dbuf, *pbuf;
	size_t		wordcnt;
	uint_t		dsum = 0;	/* checksum of the new data */
	uint_t		psum = 0;	/* checksum of the new parity */

	/* byte count must be word-aligned for the word-at-a-time XOR */
	ASSERT((cs->cs_bcount & 0x3) == 0);

	wordcnt = cs->cs_bcount / sizeof (uint_t);

	/* both buffers start with a one-block prewrite header; skip it */
	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);

	/* Word aligned */
	if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
		uint_t	*uwbuf = (uint_t *)(void *)(cs->cs_addr);
		uint_t	uval;

		while (wordcnt--) {
			uval = *uwbuf++;
			psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ uval));
			++pbuf;
			*dbuf = uval;
			dsum ^= uval;
			++dbuf;
		}
	} else {
		/* unaligned source: assemble each word a byte at a time */
		uchar_t	*ubbuf = (uchar_t *)(cs->cs_addr);
		union {
			uint_t	wb;
			uchar_t	bb[4];
		} cb;

		while (wordcnt--) {
			cb.bb[0] = *ubbuf++;
			cb.bb[1] = *ubbuf++;
			cb.bb[2] = *ubbuf++;
			cb.bb[3] = *ubbuf++;
			psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ cb.wb));
			++pbuf;
			*dbuf = cb.wb;
			dsum ^= cb.wb;
			++dbuf;
		}
	}

	/* fill in the prewrite headers for the data and parity buffers */
	RAID_FILLIN_RPW(cs->cs_dbuffer, cs->cs_un, dsum, cs->cs_pcolumn,
			cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
			2, cs->cs_dcolumn, RAID_PWMAGIC);

	RAID_FILLIN_RPW(cs->cs_pbuffer, cs->cs_un, psum, cs->cs_dcolumn,
			cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
			2, cs->cs_pcolumn, RAID_PWMAGIC);
}
2429*0Sstevel@tonic-gate 
2430*0Sstevel@tonic-gate static void
2431*0Sstevel@tonic-gate genlineparity(md_raidcs_t *cs)
2432*0Sstevel@tonic-gate {
2433*0Sstevel@tonic-gate 
2434*0Sstevel@tonic-gate 	mr_unit_t	*un = cs->cs_un;
2435*0Sstevel@tonic-gate 	md_raidcbuf_t	*cbuf;
2436*0Sstevel@tonic-gate 	uint_t		*pbuf, *dbuf;
2437*0Sstevel@tonic-gate 	uint_t		*uwbuf;
2438*0Sstevel@tonic-gate 	uchar_t		*ubbuf;
2439*0Sstevel@tonic-gate 	size_t		wordcnt;
2440*0Sstevel@tonic-gate 	uint_t		psum = 0, dsum = 0;
2441*0Sstevel@tonic-gate 	size_t		count = un->un_segsize * DEV_BSIZE;
2442*0Sstevel@tonic-gate 	uint_t		col;
2443*0Sstevel@tonic-gate 	buf_t		*bp;
2444*0Sstevel@tonic-gate 
2445*0Sstevel@tonic-gate 	ASSERT((cs->cs_bcount & 0x3) == 0);
2446*0Sstevel@tonic-gate 
2447*0Sstevel@tonic-gate 	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
2448*0Sstevel@tonic-gate 	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
2449*0Sstevel@tonic-gate 	uwbuf = (uint_t *)(void *)(cs->cs_addr);
2450*0Sstevel@tonic-gate 	ubbuf = (uchar_t *)(void *)(cs->cs_addr);
2451*0Sstevel@tonic-gate 
2452*0Sstevel@tonic-gate 	wordcnt = count / sizeof (uint_t);
2453*0Sstevel@tonic-gate 
2454*0Sstevel@tonic-gate 	/* Word aligned */
2455*0Sstevel@tonic-gate 	if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
2456*0Sstevel@tonic-gate 		uint_t	 uval;
2457*0Sstevel@tonic-gate 
2458*0Sstevel@tonic-gate 		while (wordcnt--) {
2459*0Sstevel@tonic-gate 			uval = *uwbuf++;
2460*0Sstevel@tonic-gate 			*dbuf = uval;
2461*0Sstevel@tonic-gate 			*pbuf = uval;
2462*0Sstevel@tonic-gate 			dsum ^= uval;
2463*0Sstevel@tonic-gate 			++pbuf;
2464*0Sstevel@tonic-gate 			++dbuf;
2465*0Sstevel@tonic-gate 		}
2466*0Sstevel@tonic-gate 	} else {
2467*0Sstevel@tonic-gate 		union {
2468*0Sstevel@tonic-gate 			uint_t	wb;
2469*0Sstevel@tonic-gate 			uchar_t	bb[4];
2470*0Sstevel@tonic-gate 		} cb;
2471*0Sstevel@tonic-gate 
2472*0Sstevel@tonic-gate 		while (wordcnt--) {
2473*0Sstevel@tonic-gate 			cb.bb[0] = *ubbuf++;
2474*0Sstevel@tonic-gate 			cb.bb[1] = *ubbuf++;
2475*0Sstevel@tonic-gate 			cb.bb[2] = *ubbuf++;
2476*0Sstevel@tonic-gate 			cb.bb[3] = *ubbuf++;
2477*0Sstevel@tonic-gate 			*dbuf = cb.wb;
2478*0Sstevel@tonic-gate 			*pbuf = cb.wb;
2479*0Sstevel@tonic-gate 			dsum ^= cb.wb;
2480*0Sstevel@tonic-gate 			++pbuf;
2481*0Sstevel@tonic-gate 			++dbuf;
2482*0Sstevel@tonic-gate 		}
2483*0Sstevel@tonic-gate 	}
2484*0Sstevel@tonic-gate 
2485*0Sstevel@tonic-gate 	RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, cs->cs_pcolumn,
2486*0Sstevel@tonic-gate 			cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
2487*0Sstevel@tonic-gate 			un->un_totalcolumncnt, cs->cs_dcolumn, RAID_PWMAGIC);
2488*0Sstevel@tonic-gate 
2489*0Sstevel@tonic-gate 	raidio(cs, RIO_PREWRITE | RIO_DATA);
2490*0Sstevel@tonic-gate 
2491*0Sstevel@tonic-gate 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
2492*0Sstevel@tonic-gate 
2493*0Sstevel@tonic-gate 		dsum = 0;
2494*0Sstevel@tonic-gate 		pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
2495*0Sstevel@tonic-gate 		dbuf = (uint_t *)(void *)(cbuf->cbuf_buffer + DEV_BSIZE);
2496*0Sstevel@tonic-gate 
2497*0Sstevel@tonic-gate 		wordcnt = count / sizeof (uint_t);
2498*0Sstevel@tonic-gate 
2499*0Sstevel@tonic-gate 		col = cbuf->cbuf_column;
2500*0Sstevel@tonic-gate 
2501*0Sstevel@tonic-gate 		/* Word aligned */
2502*0Sstevel@tonic-gate 		if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
2503*0Sstevel@tonic-gate 			uint_t	uval;
2504*0Sstevel@tonic-gate 
2505*0Sstevel@tonic-gate 			/*
2506*0Sstevel@tonic-gate 			 * Only calculate psum when working on the last
2507*0Sstevel@tonic-gate 			 * data buffer.
2508*0Sstevel@tonic-gate 			 */
2509*0Sstevel@tonic-gate 			if (cbuf->cbuf_next == NULL) {
2510*0Sstevel@tonic-gate 				psum = 0;
2511*0Sstevel@tonic-gate 				while (wordcnt--) {
2512*0Sstevel@tonic-gate 					uval = *uwbuf++;
2513*0Sstevel@tonic-gate 					*dbuf = uval;
2514*0Sstevel@tonic-gate 					psum ^= (*pbuf ^= uval);
2515*0Sstevel@tonic-gate 					dsum ^= uval;
2516*0Sstevel@tonic-gate 					++dbuf;
2517*0Sstevel@tonic-gate 					++pbuf;
2518*0Sstevel@tonic-gate 				}
2519*0Sstevel@tonic-gate 			} else {
2520*0Sstevel@tonic-gate 				while (wordcnt--) {
2521*0Sstevel@tonic-gate 					uval = *uwbuf++;
2522*0Sstevel@tonic-gate 					*dbuf = uval;
2523*0Sstevel@tonic-gate 					*pbuf ^= uval;
2524*0Sstevel@tonic-gate 					dsum ^= uval;
2525*0Sstevel@tonic-gate 					++dbuf;
2526*0Sstevel@tonic-gate 					++pbuf;
2527*0Sstevel@tonic-gate 				}
2528*0Sstevel@tonic-gate 			}
2529*0Sstevel@tonic-gate 		} else {
2530*0Sstevel@tonic-gate 			union {
2531*0Sstevel@tonic-gate 				uint_t	wb;
2532*0Sstevel@tonic-gate 				uchar_t	bb[4];
2533*0Sstevel@tonic-gate 			} cb;
2534*0Sstevel@tonic-gate 
2535*0Sstevel@tonic-gate 			/*
2536*0Sstevel@tonic-gate 			 * Only calculate psum when working on the last
2537*0Sstevel@tonic-gate 			 * data buffer.
2538*0Sstevel@tonic-gate 			 */
2539*0Sstevel@tonic-gate 			if (cbuf->cbuf_next == NULL) {
2540*0Sstevel@tonic-gate 				psum = 0;
2541*0Sstevel@tonic-gate 				while (wordcnt--) {
2542*0Sstevel@tonic-gate 					cb.bb[0] = *ubbuf++;
2543*0Sstevel@tonic-gate 					cb.bb[1] = *ubbuf++;
2544*0Sstevel@tonic-gate 					cb.bb[2] = *ubbuf++;
2545*0Sstevel@tonic-gate 					cb.bb[3] = *ubbuf++;
2546*0Sstevel@tonic-gate 					*dbuf = cb.wb;
2547*0Sstevel@tonic-gate 					psum ^= (*pbuf ^= cb.wb);
2548*0Sstevel@tonic-gate 					dsum ^= cb.wb;
2549*0Sstevel@tonic-gate 					++dbuf;
2550*0Sstevel@tonic-gate 					++pbuf;
2551*0Sstevel@tonic-gate 				}
2552*0Sstevel@tonic-gate 			} else {
2553*0Sstevel@tonic-gate 				while (wordcnt--) {
2554*0Sstevel@tonic-gate 					cb.bb[0] = *ubbuf++;
2555*0Sstevel@tonic-gate 					cb.bb[1] = *ubbuf++;
2556*0Sstevel@tonic-gate 					cb.bb[2] = *ubbuf++;
2557*0Sstevel@tonic-gate 					cb.bb[3] = *ubbuf++;
2558*0Sstevel@tonic-gate 					*dbuf = cb.wb;
2559*0Sstevel@tonic-gate 					*pbuf ^= cb.wb;
2560*0Sstevel@tonic-gate 					dsum ^= cb.wb;
2561*0Sstevel@tonic-gate 					++dbuf;
2562*0Sstevel@tonic-gate 					++pbuf;
2563*0Sstevel@tonic-gate 				}
2564*0Sstevel@tonic-gate 			}
2565*0Sstevel@tonic-gate 		}
2566*0Sstevel@tonic-gate 		RAID_FILLIN_RPW(cbuf->cbuf_buffer, un, dsum, cs->cs_pcolumn,
2567*0Sstevel@tonic-gate 				cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
2568*0Sstevel@tonic-gate 				un->un_totalcolumncnt, col, RAID_PWMAGIC);
2569*0Sstevel@tonic-gate 
2570*0Sstevel@tonic-gate 		/*
2571*0Sstevel@tonic-gate 		 * fill in buffer for write to prewrite area
2572*0Sstevel@tonic-gate 		 */
2573*0Sstevel@tonic-gate 		bp = &cbuf->cbuf_bp;
2574*0Sstevel@tonic-gate 		bp->b_un.b_addr = cbuf->cbuf_buffer;
2575*0Sstevel@tonic-gate 		bp->b_bcount = cbuf->cbuf_bcount + DEV_BSIZE;
2576*0Sstevel@tonic-gate 		bp->b_bufsize = bp->b_bcount;
2577*0Sstevel@tonic-gate 		bp->b_lblkno = (cbuf->cbuf_pwslot * un->un_iosize) +
2578*0Sstevel@tonic-gate 		    un->un_column[col].un_pwstart;
2579*0Sstevel@tonic-gate 		bp->b_flags = B_WRITE | B_BUSY;
2580*0Sstevel@tonic-gate 		if (nv_available && nv_prewrite)
2581*0Sstevel@tonic-gate 			bp->b_flags |= nv_available;
2582*0Sstevel@tonic-gate 		bp->b_iodone = (int (*)())raid_done;
2583*0Sstevel@tonic-gate 		bp->b_edev = md_dev64_to_dev(un->un_column[col].un_dev);
2584*0Sstevel@tonic-gate 		bp->b_chain = (struct buf *)cs;
2585*0Sstevel@tonic-gate 		md_call_strategy(bp,
2586*0Sstevel@tonic-gate 			cs->cs_strategy_flag, cs->cs_strategy_private);
2587*0Sstevel@tonic-gate 	}
2588*0Sstevel@tonic-gate 
2589*0Sstevel@tonic-gate 	RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, cs->cs_dcolumn,
2590*0Sstevel@tonic-gate 			cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
2591*0Sstevel@tonic-gate 			un->un_totalcolumncnt, cs->cs_pcolumn, RAID_PWMAGIC);
2592*0Sstevel@tonic-gate 
2593*0Sstevel@tonic-gate 	raidio(cs, RIO_PREWRITE | RIO_PARITY);
2594*0Sstevel@tonic-gate }
2595*0Sstevel@tonic-gate 
/*
 * NAME:	raid_readregenloop
 * DESCRIPTION: RAID metadevice read-regeneration loop.  Invoked (as
 *		cs_call) once per surviving column to reconstruct the data
 *		of the errored column by XORing each column's contents
 *		(read into cs_pbuffer by raidio()) into cs_dbuffer.  When
 *		all columns have been folded in, the regenerated data is
 *		copied back to the caller and the i/o is completed.
 * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
 */
static void
raid_readregenloop(md_raidcs_t *cs)
{
	mr_unit_t	*un;
	md_raidps_t	*ps;
	uint_t		*dbuf;
	uint_t		*pbuf;
	size_t		wordcnt;

	un = cs->cs_un;

	/*
	 * XOR the parity with data bytes, must skip the
	 * pre-write entry header in all data/parity buffers
	 */
	wordcnt = cs->cs_bcount / sizeof (uint_t);
	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
	while (wordcnt--)
		*dbuf++ ^= *pbuf++;

	/* bump up the loop count */
	cs->cs_loop++;

	/* skip the errored component */
	if (cs->cs_loop == cs->cs_dcolumn)
		cs->cs_loop++;

	if (cs->cs_loop != un->un_totalcolumncnt) {
		/* more columns remain: issue a read of the next one */
		cs->cs_frags = 1;
		raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1));
		return;
	}
	/* reached the end of the loop; regeneration is complete */
	ps = cs->cs_ps;
	/* copy regenerated data (past the pre-write header) to the caller */
	bcopy(cs->cs_dbuffer + DEV_BSIZE, cs->cs_addr, cs->cs_bcount);
	raid_free_child(cs, 1);

	/* decrement readfrags */
	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
}
2642*0Sstevel@tonic-gate 
/*
 * NAME:	raid_read_io
 * DESCRIPTION: RAID metadevice read I/O routine.  Reads directly from the
 *		data column when it is usable; otherwise regenerates the
 *		data from the remaining columns via raid_readregenloop().
 * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
 *		md_raidcs_t *cs - pointer to a child structure
 */
static void
raid_read_io(mr_unit_t *un, md_raidcs_t *cs)
{
	int	flag;
	void	*private;
	buf_t	*bp;
	buf_t	*pb = cs->cs_ps->ps_bp;
	mr_column_t	*column;

	flag = cs->cs_strategy_flag;
	private = cs->cs_strategy_private;
	column = &un->un_column[cs->cs_dcolumn];

	/*
	 * The component to be read is good, simply set up bp structure
	 * and call low level md routine doing the read.
	 */

	if (COLUMN_ISOKAY(un, cs->cs_dcolumn) ||
	    (COLUMN_ISLASTERR(un, cs->cs_dcolumn) &&
		    (cs->cs_flags & MD_RCS_RECOVERY) == 0)) {
		dev_t ddi_dev; /* needed for bioclone, so not md_dev64_t */
		ddi_dev = md_dev64_to_dev(column->un_dev);

		bp = &cs->cs_dbuf;
		/* clone the parent buf for this column's block range */
		bp = md_bioclone(pb, cs->cs_offset, cs->cs_bcount, ddi_dev,
				column->un_devstart + cs->cs_blkno,
				(int (*)())raid_done, bp, KM_NOSLEEP);
		/* back-link so raid_done() can locate this child */
		bp->b_chain = (buf_t *)cs;

		cs->cs_frags = 1;
		cs->cs_error_call = raid_read_error;
		cs->cs_retry_call = raid_read_retry;
		cs->cs_flags |= MD_RCS_ISCALL;
		cs->cs_stage = RAID_READ_DONE;
		cs->cs_call = raid_stage;

		ASSERT(bp->b_edev != 0);

		md_call_strategy(bp, flag, private);
		return;
	}

	/*
	 * The component to be read is bad, have to go through
	 * raid specific method to read data from other members.
	 */
	cs->cs_loop = 0;
	/*
	 * NOTE: always get dbuffer before pbuffer
	 *	 and get both buffers before pwslot
	 *	 otherwise a deadlock could be introduced.
	 */
	raid_mapin_buf(cs);
	getdbuffer(cs);
	getpbuffer(cs);
	/* start the regeneration loop at column 0, skipping the bad column */
	if (cs->cs_loop == cs->cs_dcolumn)
		cs->cs_loop++;

	/* zero out data buffer for use as a data sink */
	bzero(cs->cs_dbuffer + DEV_BSIZE, cs->cs_bcount);
	cs->cs_stage = RAID_NONE;
	cs->cs_call = raid_readregenloop;
	cs->cs_error_call = raid_read_error;
	cs->cs_retry_call = raid_read_no_retry;
	cs->cs_frags = 1;

	/* use parity buffer to read other columns */
	raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1));
}
2720*0Sstevel@tonic-gate 
/*
 * NAME:	raid_read
 * DESCRIPTION: RAID metadevice read routine.  Acquires line and unit
 *		reader locks, range-checks the request against the column
 *		size, regenerates parity first if the unit is mid-regen,
 *		then hands off to raid_read_io().
 * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
 *		md_raidcs_t *cs - pointer to a child structure
 * RETURNS:	0 always; failures are reported to the parent via
 *		raid_error_parent().
 */
static int
raid_read(mr_unit_t *un, md_raidcs_t *cs)
{
	int		error = 0;
	md_raidps_t	*ps;
	mdi_unit_t	*ui;
	minor_t		mnum;

	ASSERT(IO_READER_HELD(un));
	ps = cs->cs_ps;
	ui = ps->ps_ui;
	/* line lock first, then unit readerlock (lock ordering) */
	raid_line_reader_lock(cs, 0);
	un = (mr_unit_t *)md_unit_readerlock(ui);
	ASSERT(UNIT_STATE(un) != RUS_INIT);
	mnum = MD_SID(un);
	cs->cs_un = un;

	/* make sure the read doesn't go beyond the end of the column */
	if (cs->cs_blkno + cs->cs_blkcnt >
	    un->un_segsize * un->un_segsincolumn) {
		error = ENXIO;
	}
	if (error)
		goto rerror;

	if (un->un_state & RUS_REGEN) {
		/*
		 * Parity regeneration may change the unit; re-fetch the
		 * unit pointer afterwards and re-cache it in the child.
		 */
		raid_regen_parity(cs);
		un = MD_UNIT(mnum);
		cs->cs_un = un;
	}

	raid_read_io(un, cs);
	return (0);

rerror:
	raid_error_parent(ps, error);
	raid_free_child(cs, 1);
	/* decrement readfrags */
	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
	return (0);
}
2768*0Sstevel@tonic-gate 
2769*0Sstevel@tonic-gate /*
2770*0Sstevel@tonic-gate  * NAME:	raid_write_err_retry
2771*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice write retry routine
2772*0Sstevel@tonic-gate  *		write was for parity or data only;
2773*0Sstevel@tonic-gate  *		complete write with error, no recovery possible
2774*0Sstevel@tonic-gate  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
2775*0Sstevel@tonic-gate  *		md_raidcs_t *cs - pointer to a child structure
2776*0Sstevel@tonic-gate  */
2777*0Sstevel@tonic-gate /*ARGSUSED*/
2778*0Sstevel@tonic-gate static void
2779*0Sstevel@tonic-gate raid_write_err_retry(mr_unit_t *un, md_raidcs_t *cs)
2780*0Sstevel@tonic-gate {
2781*0Sstevel@tonic-gate 	md_raidps_t	*ps = cs->cs_ps;
2782*0Sstevel@tonic-gate 	int		flags = RFP_DECR_FRAGS | RFP_RLS_LOCK;
2783*0Sstevel@tonic-gate 
2784*0Sstevel@tonic-gate 	/* decrement pwfrags if needed, and frags */
2785*0Sstevel@tonic-gate 	if (!(cs->cs_flags & MD_RCS_PWDONE))
2786*0Sstevel@tonic-gate 		flags |= RFP_DECR_PWFRAGS;
2787*0Sstevel@tonic-gate 	raid_error_parent(ps, EIO);
2788*0Sstevel@tonic-gate 	raid_free_child(cs, 1);
2789*0Sstevel@tonic-gate 	raid_free_parent(ps, flags);
2790*0Sstevel@tonic-gate }
2791*0Sstevel@tonic-gate 
2792*0Sstevel@tonic-gate /*
2793*0Sstevel@tonic-gate  * NAME:	raid_write_err_retry
2794*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice write retry routine
2795*0Sstevel@tonic-gate  *		 write is too far along to retry and parent
2796*0Sstevel@tonic-gate  *		 has already been signaled with iodone.
2797*0Sstevel@tonic-gate  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
2798*0Sstevel@tonic-gate  *		md_raidcs_t *cs - pointer to a child structure
2799*0Sstevel@tonic-gate  */
2800*0Sstevel@tonic-gate /*ARGSUSED*/
2801*0Sstevel@tonic-gate static void
2802*0Sstevel@tonic-gate raid_write_no_retry(mr_unit_t *un, md_raidcs_t *cs)
2803*0Sstevel@tonic-gate {
2804*0Sstevel@tonic-gate 	md_raidps_t	*ps = cs->cs_ps;
2805*0Sstevel@tonic-gate 	int		flags = RFP_DECR_FRAGS | RFP_RLS_LOCK;
2806*0Sstevel@tonic-gate 
2807*0Sstevel@tonic-gate 	/* decrement pwfrags if needed, and frags */
2808*0Sstevel@tonic-gate 	if (!(cs->cs_flags & MD_RCS_PWDONE))
2809*0Sstevel@tonic-gate 		flags |= RFP_DECR_PWFRAGS;
2810*0Sstevel@tonic-gate 	raid_free_child(cs, 1);
2811*0Sstevel@tonic-gate 	raid_free_parent(ps, flags);
2812*0Sstevel@tonic-gate }
2813*0Sstevel@tonic-gate 
2814*0Sstevel@tonic-gate /*
2815*0Sstevel@tonic-gate  * NAME:	raid_write_retry
2816*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice write retry routine
2817*0Sstevel@tonic-gate  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
2818*0Sstevel@tonic-gate  *		md_raidcs_t *cs - pointer to a child structure
2819*0Sstevel@tonic-gate  */
2820*0Sstevel@tonic-gate static void
2821*0Sstevel@tonic-gate raid_write_retry(mr_unit_t *un, md_raidcs_t *cs)
2822*0Sstevel@tonic-gate {
2823*0Sstevel@tonic-gate 	md_raidps_t	*ps;
2824*0Sstevel@tonic-gate 
2825*0Sstevel@tonic-gate 	ps = cs->cs_ps;
2826*0Sstevel@tonic-gate 
2827*0Sstevel@tonic-gate 	/* re-initialize the buf_t structure for raid_write() */
2828*0Sstevel@tonic-gate 	cs->cs_dbuf.b_chain = (struct buf *)cs;
2829*0Sstevel@tonic-gate 	cs->cs_dbuf.b_back = &cs->cs_dbuf;
2830*0Sstevel@tonic-gate 	cs->cs_dbuf.b_forw = &cs->cs_dbuf;
2831*0Sstevel@tonic-gate 	cs->cs_dbuf.b_flags = B_BUSY;	/* initialize flags */
2832*0Sstevel@tonic-gate 	cs->cs_dbuf.b_error = 0;	/* initialize error */
2833*0Sstevel@tonic-gate 	cs->cs_dbuf.b_offset = -1;
2834*0Sstevel@tonic-gate 	/* Initialize semaphores */
2835*0Sstevel@tonic-gate 	sema_init(&cs->cs_dbuf.b_io, 0, NULL,
2836*0Sstevel@tonic-gate 	    SEMA_DEFAULT, NULL);
2837*0Sstevel@tonic-gate 	sema_init(&cs->cs_dbuf.b_sem, 0, NULL,
2838*0Sstevel@tonic-gate 	    SEMA_DEFAULT, NULL);
2839*0Sstevel@tonic-gate 
2840*0Sstevel@tonic-gate 	cs->cs_pbuf.b_chain = (struct buf *)cs;
2841*0Sstevel@tonic-gate 	cs->cs_pbuf.b_back = &cs->cs_pbuf;
2842*0Sstevel@tonic-gate 	cs->cs_pbuf.b_forw = &cs->cs_pbuf;
2843*0Sstevel@tonic-gate 	cs->cs_pbuf.b_flags = B_BUSY;	/* initialize flags */
2844*0Sstevel@tonic-gate 	cs->cs_pbuf.b_error = 0;	/* initialize error */
2845*0Sstevel@tonic-gate 	cs->cs_pbuf.b_offset = -1;
2846*0Sstevel@tonic-gate 	sema_init(&cs->cs_pbuf.b_io, 0, NULL,
2847*0Sstevel@tonic-gate 	    SEMA_DEFAULT, NULL);
2848*0Sstevel@tonic-gate 	sema_init(&cs->cs_pbuf.b_sem, 0, NULL,
2849*0Sstevel@tonic-gate 	    SEMA_DEFAULT, NULL);
2850*0Sstevel@tonic-gate 
2851*0Sstevel@tonic-gate 	cs->cs_hbuf.b_chain = (struct buf *)cs;
2852*0Sstevel@tonic-gate 	cs->cs_hbuf.b_back = &cs->cs_hbuf;
2853*0Sstevel@tonic-gate 	cs->cs_hbuf.b_forw = &cs->cs_hbuf;
2854*0Sstevel@tonic-gate 	cs->cs_hbuf.b_flags = B_BUSY;	/* initialize flags */
2855*0Sstevel@tonic-gate 	cs->cs_hbuf.b_error = 0;	/* initialize error */
2856*0Sstevel@tonic-gate 	cs->cs_hbuf.b_offset = -1;
2857*0Sstevel@tonic-gate 	sema_init(&cs->cs_hbuf.b_io, 0, NULL,
2858*0Sstevel@tonic-gate 	    SEMA_DEFAULT, NULL);
2859*0Sstevel@tonic-gate 	sema_init(&cs->cs_hbuf.b_sem, 0, NULL,
2860*0Sstevel@tonic-gate 	    SEMA_DEFAULT, NULL);
2861*0Sstevel@tonic-gate 
2862*0Sstevel@tonic-gate 	cs->cs_flags &= ~(MD_RCS_ERROR);
2863*0Sstevel@tonic-gate 	/*
2864*0Sstevel@tonic-gate 	 * If we have already done'ed the i/o but have done prewrite
2865*0Sstevel@tonic-gate 	 * on this child, then reset PWDONE flag and bump pwfrags before
2866*0Sstevel@tonic-gate 	 * restarting i/o.
2867*0Sstevel@tonic-gate 	 * If pwfrags is zero, we have already 'iodone'd the i/o so
2868*0Sstevel@tonic-gate 	 * leave things alone.  We don't want to re-'done' it.
2869*0Sstevel@tonic-gate 	 */
2870*0Sstevel@tonic-gate 	mutex_enter(&ps->ps_mx);
2871*0Sstevel@tonic-gate 	if (cs->cs_flags & MD_RCS_PWDONE) {
2872*0Sstevel@tonic-gate 		cs->cs_flags &= ~MD_RCS_PWDONE;
2873*0Sstevel@tonic-gate 		ps->ps_pwfrags++;
2874*0Sstevel@tonic-gate 	}
2875*0Sstevel@tonic-gate 	mutex_exit(&ps->ps_mx);
2876*0Sstevel@tonic-gate 	raid_write_io(un, cs);
2877*0Sstevel@tonic-gate }
2878*0Sstevel@tonic-gate 
/*
 * NAME:	raid_wrerr
 * DESCRIPTION: RAID metadevice write error daemon routine: transitions
 *		unit/column state for every errored buf of this child,
 *		then re-attempts the i/o via the child's retry callback.
 * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
 * LOCKS:	must obtain unit writer lock while calling raid_error_state
 *		since a unit or column state transition may take place.
 *		must obtain unit reader lock to retry I/O.
 */
static void
raid_wrerr(md_raidcs_t *cs)
{
	md_raidps_t	*ps;
	mdi_unit_t	*ui;
	mr_unit_t	*un;
	md_raidcbuf_t	*cbuf;

	ps = cs->cs_ps;
	ui = ps->ps_ui;

	un = (mr_unit_t *)md_unit_writerlock(ui);
	ASSERT(un != 0);

	/* run state transitions for every buf that took an error */
	if (cs->cs_dbuf.b_flags & B_ERROR)
		(void) raid_error_state(un, &cs->cs_dbuf);
	if (cs->cs_pbuf.b_flags & B_ERROR)
		(void) raid_error_state(un, &cs->cs_pbuf);
	if (cs->cs_hbuf.b_flags & B_ERROR)
		(void) raid_error_state(un, &cs->cs_hbuf);
	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
		if (cbuf->cbuf_bp.b_flags & B_ERROR)
			(void) raid_error_state(un, &cbuf->cbuf_bp);

	md_unit_writerexit(ui);

	/* request hotspare handling for the parent i/o */
	ps->ps_flags |= MD_RPS_HSREQ;

	un = (mr_unit_t *)md_unit_readerlock(ui);

	/* now attempt the appropriate retry routine */
	(*(cs->cs_retry_call))(un, cs);
}
/*
 * NAMES:	raid_write_error
 * DESCRIPTION: I/O error handling routine for a RAID metadevice write.
 *		Logs a warning for each errored buf whose column has not
 *		already transitioned to an erred state, then queues
 *		raid_wrerr() on the master daemon for state processing.
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 */
/*ARGSUSED*/
static void
raid_write_error(md_raidcs_t *cs)
{
	md_raidps_t	*ps;
	mdi_unit_t	*ui;
	mr_unit_t	*un;
	md_raidcbuf_t	*cbuf;
	set_t		setno;

	ps = cs->cs_ps;
	ui = ps->ps_ui;
	un = cs->cs_un;

	setno = MD_UN2SET(un);

	/*
	 * locate each buf that is in error on this io and then
	 * output an error message
	 */
	if ((cs->cs_dbuf.b_flags & B_ERROR) &&
	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) &&
	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED))
		cmn_err(CE_WARN, "md %s: write error on %s",
		    md_shortname(MD_SID(un)),
		    md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0));

	if ((cs->cs_pbuf.b_flags & B_ERROR) &&
	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) &&
	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED))
		cmn_err(CE_WARN, "md %s: write error on %s",
		    md_shortname(MD_SID(un)),
		    md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0));

	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
		if ((cbuf->cbuf_bp.b_flags & B_ERROR) &&
		    (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_ERRED) &&
		    (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_LAST_ERRED))
			cmn_err(CE_WARN, "md %s: write error on %s",
			    md_shortname(MD_SID(un)),
			    md_devname(setno, md_expldev(cbuf->cbuf_bp.b_edev),
					NULL, 0));

	md_unit_readerexit(ui);

	ASSERT(cs->cs_frags == 0);

	/* now schedule processing for possible state change */
	daemon_request(&md_mstr_daemon, raid_wrerr,
		(daemon_queue_t *)cs, REQ_OLD);

}
2977*0Sstevel@tonic-gate 
2978*0Sstevel@tonic-gate /*
2979*0Sstevel@tonic-gate  * NAME:	raid_write_ponly
2980*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice write routine
2981*0Sstevel@tonic-gate  *		in the case where only the parity column can be written
2982*0Sstevel@tonic-gate  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
2983*0Sstevel@tonic-gate  */
2984*0Sstevel@tonic-gate static void
2985*0Sstevel@tonic-gate raid_write_ponly(md_raidcs_t *cs)
2986*0Sstevel@tonic-gate {
2987*0Sstevel@tonic-gate 	md_raidps_t	*ps;
2988*0Sstevel@tonic-gate 	mr_unit_t	*un = cs->cs_un;
2989*0Sstevel@tonic-gate 
2990*0Sstevel@tonic-gate 	ps = cs->cs_ps;
2991*0Sstevel@tonic-gate 	/* decrement pwfrags if needed, but not frags */
2992*0Sstevel@tonic-gate 	ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
2993*0Sstevel@tonic-gate 	raid_free_parent(ps, RFP_DECR_PWFRAGS);
2994*0Sstevel@tonic-gate 	cs->cs_flags |= MD_RCS_PWDONE;
2995*0Sstevel@tonic-gate 	cs->cs_frags = 1;
2996*0Sstevel@tonic-gate 	cs->cs_stage = RAID_WRITE_PONLY_DONE;
2997*0Sstevel@tonic-gate 	cs->cs_call = raid_stage;
2998*0Sstevel@tonic-gate 	cs->cs_error_call = raid_write_error;
2999*0Sstevel@tonic-gate 	cs->cs_retry_call = raid_write_no_retry;
3000*0Sstevel@tonic-gate 	if (WRITE_ALT(un, cs->cs_pcolumn)) {
3001*0Sstevel@tonic-gate 		cs->cs_frags++;
3002*0Sstevel@tonic-gate 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_WRITE);
3003*0Sstevel@tonic-gate 	}
3004*0Sstevel@tonic-gate 	raidio(cs, RIO_PARITY | RIO_WRITE);
3005*0Sstevel@tonic-gate }
3006*0Sstevel@tonic-gate 
/*
 * NAME:	raid_write_ploop
 * DESCRIPTION: RAID metadevice write routine, constructs parity from
 *		data in other columns.  Invoked (as cs_call) once per
 *		column read; when all columns except the data and parity
 *		columns have been XORed in, writes the new parity to the
 *		pre-write area via raid_write_ponly().
 * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
 */
static void
raid_write_ploop(md_raidcs_t *cs)
{
	mr_unit_t *un = cs->cs_un;
	uint_t *dbuf;
	uint_t *pbuf;
	size_t wordcnt;
	uint_t psum = 0;

	/* fold the just-read column (in cs_dbuffer) into the parity */
	wordcnt = cs->cs_bcount / sizeof (uint_t);
	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
	while (wordcnt--)
		*pbuf++ ^= *dbuf++;
	cs->cs_loop++;

	/*
	 * build parity from scratch using new data,
	 * skip reading the data and parity columns.
	 */
	while (cs->cs_loop == cs->cs_dcolumn || cs->cs_loop == cs->cs_pcolumn)
		cs->cs_loop++;

	if (cs->cs_loop != un->un_totalcolumncnt) {
		/* more columns remain: read the next one */
		cs->cs_frags = 1;
		raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1));
		return;
	}

	/* construct checksum for parity buffer */
	wordcnt = cs->cs_bcount / sizeof (uint_t);
	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
	while (wordcnt--) {
		psum ^= *pbuf;
		pbuf++;
	}
	/* fill in the pre-write header for the parity buffer */
	RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, -1,
			cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
			1, cs->cs_pcolumn, RAID_PWMAGIC);

	cs->cs_stage = RAID_NONE;
	cs->cs_call = raid_write_ponly;
	cs->cs_error_call = raid_write_error;
	cs->cs_retry_call = raid_write_err_retry;
	cs->cs_frags = 1;
	if (WRITE_ALT(un, cs->cs_pcolumn)) {
		cs->cs_frags++;
		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE);
	}
	raidio(cs, RIO_PARITY | RIO_PREWRITE);
}
3064*0Sstevel@tonic-gate 
3065*0Sstevel@tonic-gate /*
3066*0Sstevel@tonic-gate  * NAME:	raid_write_donly
3067*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice write routine
3068*0Sstevel@tonic-gate  *		Completed writing data to prewrite entry
3069*0Sstevel@tonic-gate  *		in the case where only the data column can be written
3070*0Sstevel@tonic-gate  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
3071*0Sstevel@tonic-gate  */
3072*0Sstevel@tonic-gate static void
3073*0Sstevel@tonic-gate raid_write_donly(md_raidcs_t *cs)
3074*0Sstevel@tonic-gate {
3075*0Sstevel@tonic-gate 	md_raidps_t	*ps;
3076*0Sstevel@tonic-gate 	mr_unit_t	*un = cs->cs_un;
3077*0Sstevel@tonic-gate 
3078*0Sstevel@tonic-gate 	ps = cs->cs_ps;
3079*0Sstevel@tonic-gate 	/* WARNING: don't release unit reader lock here... */
3080*0Sstevel@tonic-gate 	/* decrement pwfrags if needed, but not frags */
3081*0Sstevel@tonic-gate 	ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
3082*0Sstevel@tonic-gate 	raid_free_parent(ps, RFP_DECR_PWFRAGS);
3083*0Sstevel@tonic-gate 	cs->cs_flags |= MD_RCS_PWDONE;
3084*0Sstevel@tonic-gate 	cs->cs_frags = 1;
3085*0Sstevel@tonic-gate 	cs->cs_stage = RAID_WRITE_DONLY_DONE;
3086*0Sstevel@tonic-gate 	cs->cs_call = raid_stage;
3087*0Sstevel@tonic-gate 	cs->cs_error_call = raid_write_error;
3088*0Sstevel@tonic-gate 	cs->cs_retry_call = raid_write_err_retry;
3089*0Sstevel@tonic-gate 	if (WRITE_ALT(un, cs->cs_dcolumn)) {
3090*0Sstevel@tonic-gate 		cs->cs_frags++;
3091*0Sstevel@tonic-gate 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE);
3092*0Sstevel@tonic-gate 	}
3093*0Sstevel@tonic-gate 	raidio(cs, RIO_DATA | RIO_WRITE);
3094*0Sstevel@tonic-gate }
3095*0Sstevel@tonic-gate 
3096*0Sstevel@tonic-gate /*
3097*0Sstevel@tonic-gate  * NAME:	raid_write_got_old
3098*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice write routine
3099*0Sstevel@tonic-gate  *		completed read of old data and old parity
3100*0Sstevel@tonic-gate  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
3101*0Sstevel@tonic-gate  */
3102*0Sstevel@tonic-gate static void
3103*0Sstevel@tonic-gate raid_write_got_old(md_raidcs_t *cs)
3104*0Sstevel@tonic-gate {
3105*0Sstevel@tonic-gate 	mr_unit_t *un = cs->cs_un;
3106*0Sstevel@tonic-gate 
3107*0Sstevel@tonic-gate 	ASSERT(IO_READER_HELD(cs->cs_un));
3108*0Sstevel@tonic-gate 	ASSERT(UNIT_READER_HELD(cs->cs_un));
3109*0Sstevel@tonic-gate 
3110*0Sstevel@tonic-gate 	raid_mapin_buf(cs);
3111*0Sstevel@tonic-gate 	genstandardparity(cs);
3112*0Sstevel@tonic-gate 	cs->cs_frags = 2;
3113*0Sstevel@tonic-gate 	cs->cs_call = raid_stage;
3114*0Sstevel@tonic-gate 	cs->cs_stage = RAID_PREWRITE_DONE;
3115*0Sstevel@tonic-gate 	cs->cs_error_call = raid_write_error;
3116*0Sstevel@tonic-gate 	cs->cs_retry_call = raid_write_retry;
3117*0Sstevel@tonic-gate 
3118*0Sstevel@tonic-gate 	if (WRITE_ALT(un, cs->cs_dcolumn)) {
3119*0Sstevel@tonic-gate 		cs->cs_frags++;
3120*0Sstevel@tonic-gate 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_PREWRITE);
3121*0Sstevel@tonic-gate 	}
3122*0Sstevel@tonic-gate 
3123*0Sstevel@tonic-gate 	if (WRITE_ALT(un, cs->cs_pcolumn)) {
3124*0Sstevel@tonic-gate 		cs->cs_frags++;
3125*0Sstevel@tonic-gate 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE);
3126*0Sstevel@tonic-gate 	}
3127*0Sstevel@tonic-gate 	ASSERT(cs->cs_frags < 4);
3128*0Sstevel@tonic-gate 	raidio(cs,  RIO_DATA | RIO_PREWRITE);
3129*0Sstevel@tonic-gate 	raidio(cs,  RIO_PARITY | RIO_PREWRITE);
3130*0Sstevel@tonic-gate }
3131*0Sstevel@tonic-gate 
3132*0Sstevel@tonic-gate /*
3133*0Sstevel@tonic-gate  * NAME:	raid_write_io
3134*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice write I/O routine
3135*0Sstevel@tonic-gate  * PARAMETERS:	mr_unit_t *un -  pointer to a unit structure
3136*0Sstevel@tonic-gate  *		md_raidcs_t *cs - pointer to a child structure
3137*0Sstevel@tonic-gate  */
3138*0Sstevel@tonic-gate 
3139*0Sstevel@tonic-gate /*ARGSUSED*/
3140*0Sstevel@tonic-gate static void
3141*0Sstevel@tonic-gate raid_write_io(mr_unit_t *un, md_raidcs_t *cs)
3142*0Sstevel@tonic-gate {
3143*0Sstevel@tonic-gate 	md_raidps_t	*ps = cs->cs_ps;
3144*0Sstevel@tonic-gate 	uint_t		*dbuf;
3145*0Sstevel@tonic-gate 	uint_t		*ubuf;
3146*0Sstevel@tonic-gate 	size_t		wordcnt;
3147*0Sstevel@tonic-gate 	uint_t		dsum = 0;
3148*0Sstevel@tonic-gate 	int		pcheck;
3149*0Sstevel@tonic-gate 	int		dcheck;
3150*0Sstevel@tonic-gate 
3151*0Sstevel@tonic-gate 	ASSERT((un->un_column[cs->cs_pcolumn].un_devstate &
3152*0Sstevel@tonic-gate 	    RCS_INIT) == 0);
3153*0Sstevel@tonic-gate 	ASSERT((un->un_column[cs->cs_dcolumn].un_devstate &
3154*0Sstevel@tonic-gate 	    RCS_INIT) == 0);
3155*0Sstevel@tonic-gate 	ASSERT(IO_READER_HELD(un));
3156*0Sstevel@tonic-gate 	ASSERT(UNIT_READER_HELD(un));
3157*0Sstevel@tonic-gate 	ASSERT(cs->cs_flags & MD_RCS_HAVE_PW_SLOTS);
3158*0Sstevel@tonic-gate 	if (cs->cs_flags & MD_RCS_LINE) {
3159*0Sstevel@tonic-gate 
3160*0Sstevel@tonic-gate 		mr_unit_t	*un = cs->cs_un;
3161*0Sstevel@tonic-gate 
3162*0Sstevel@tonic-gate 		ASSERT(un->un_origcolumncnt == un->un_totalcolumncnt);
3163*0Sstevel@tonic-gate 		raid_mapin_buf(cs);
3164*0Sstevel@tonic-gate 		cs->cs_frags = un->un_origcolumncnt;
3165*0Sstevel@tonic-gate 		cs->cs_call = raid_stage;
3166*0Sstevel@tonic-gate 		cs->cs_error_call = raid_write_error;
3167*0Sstevel@tonic-gate 		cs->cs_retry_call = raid_write_no_retry;
3168*0Sstevel@tonic-gate 		cs->cs_stage = RAID_LINE_PWDONE;
3169*0Sstevel@tonic-gate 		genlineparity(cs);
3170*0Sstevel@tonic-gate 		return;
3171*0Sstevel@tonic-gate 	}
3172*0Sstevel@tonic-gate 
3173*0Sstevel@tonic-gate 	pcheck = erred_check_line(un, cs, &un->un_column[cs->cs_pcolumn]);
3174*0Sstevel@tonic-gate 	dcheck = erred_check_line(un, cs, &un->un_column[cs->cs_dcolumn]);
3175*0Sstevel@tonic-gate 	cs->cs_resync_check = pcheck << RCL_PARITY_OFFSET || dcheck;
3176*0Sstevel@tonic-gate 
3177*0Sstevel@tonic-gate 	if (pcheck == RCL_ERRED && dcheck == RCL_ERRED) {
3178*0Sstevel@tonic-gate 		int err = EIO;
3179*0Sstevel@tonic-gate 
3180*0Sstevel@tonic-gate 		if ((un->un_column[cs->cs_pcolumn].un_devstate ==
3181*0Sstevel@tonic-gate 		    RCS_LAST_ERRED) ||
3182*0Sstevel@tonic-gate 		    (un->un_column[cs->cs_dcolumn].un_devstate ==
3183*0Sstevel@tonic-gate 		    RCS_LAST_ERRED))
3184*0Sstevel@tonic-gate 			err = ENXIO;
3185*0Sstevel@tonic-gate 		raid_error_parent(ps, err);
3186*0Sstevel@tonic-gate 		ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
3187*0Sstevel@tonic-gate 		raid_free_child(cs, 1);
3188*0Sstevel@tonic-gate 		raid_free_parent(ps,  RFP_DECR_FRAGS
3189*0Sstevel@tonic-gate 		    | RFP_RLS_LOCK | RFP_DECR_PWFRAGS);
3190*0Sstevel@tonic-gate 		return;
3191*0Sstevel@tonic-gate 	}
3192*0Sstevel@tonic-gate 
3193*0Sstevel@tonic-gate 	if (pcheck & RCL_ERRED) {
3194*0Sstevel@tonic-gate 		/*
3195*0Sstevel@tonic-gate 		 * handle case of only having data drive
3196*0Sstevel@tonic-gate 		 */
3197*0Sstevel@tonic-gate 		raid_mapin_buf(cs);
3198*0Sstevel@tonic-gate 		wordcnt = cs->cs_bcount / sizeof (uint_t);
3199*0Sstevel@tonic-gate 
3200*0Sstevel@tonic-gate 		dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
3201*0Sstevel@tonic-gate 		ubuf = (uint_t *)(void *)(cs->cs_addr);
3202*0Sstevel@tonic-gate 
3203*0Sstevel@tonic-gate 		while (wordcnt--) {
3204*0Sstevel@tonic-gate 			*dbuf = *ubuf;
3205*0Sstevel@tonic-gate 			dsum ^= *ubuf;
3206*0Sstevel@tonic-gate 			dbuf++;
3207*0Sstevel@tonic-gate 			ubuf++;
3208*0Sstevel@tonic-gate 		}
3209*0Sstevel@tonic-gate 		RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, -1,
3210*0Sstevel@tonic-gate 				cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
3211*0Sstevel@tonic-gate 				1, cs->cs_dcolumn, RAID_PWMAGIC);
3212*0Sstevel@tonic-gate 		cs->cs_frags = 1;
3213*0Sstevel@tonic-gate 		cs->cs_stage = RAID_NONE;
3214*0Sstevel@tonic-gate 		cs->cs_call = raid_write_donly;
3215*0Sstevel@tonic-gate 		cs->cs_error_call = raid_write_error;
3216*0Sstevel@tonic-gate 		cs->cs_retry_call = raid_write_err_retry;
3217*0Sstevel@tonic-gate 		if (WRITE_ALT(un, cs->cs_dcolumn)) {
3218*0Sstevel@tonic-gate 			cs->cs_frags++;
3219*0Sstevel@tonic-gate 			raidio(cs, RIO_DATA | RIO_ALT | RIO_EXTRA |
3220*0Sstevel@tonic-gate 			    RIO_PREWRITE);
3221*0Sstevel@tonic-gate 		}
3222*0Sstevel@tonic-gate 		raidio(cs, RIO_DATA | RIO_PREWRITE);
3223*0Sstevel@tonic-gate 		return;
3224*0Sstevel@tonic-gate 	}
3225*0Sstevel@tonic-gate 
3226*0Sstevel@tonic-gate 	if (dcheck & RCL_ERRED) {
3227*0Sstevel@tonic-gate 		/*
3228*0Sstevel@tonic-gate 		 * handle case of only having parity drive
3229*0Sstevel@tonic-gate 		 * build parity from scratch using new data,
3230*0Sstevel@tonic-gate 		 * skip reading the data and parity columns.
3231*0Sstevel@tonic-gate 		 */
3232*0Sstevel@tonic-gate 		raid_mapin_buf(cs);
3233*0Sstevel@tonic-gate 		cs->cs_loop = 0;
3234*0Sstevel@tonic-gate 		while (cs->cs_loop == cs->cs_dcolumn ||
3235*0Sstevel@tonic-gate 		    cs->cs_loop == cs->cs_pcolumn)
3236*0Sstevel@tonic-gate 			cs->cs_loop++;
3237*0Sstevel@tonic-gate 
3238*0Sstevel@tonic-gate 		/* copy new data in to begin building parity */
3239*0Sstevel@tonic-gate 		bcopy(cs->cs_addr, cs->cs_pbuffer + DEV_BSIZE, cs->cs_bcount);
3240*0Sstevel@tonic-gate 		cs->cs_stage = RAID_NONE;
3241*0Sstevel@tonic-gate 		cs->cs_call = raid_write_ploop;
3242*0Sstevel@tonic-gate 		cs->cs_error_call = raid_write_error;
3243*0Sstevel@tonic-gate 		cs->cs_retry_call = raid_write_err_retry;
3244*0Sstevel@tonic-gate 		cs->cs_frags = 1;
3245*0Sstevel@tonic-gate 		raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1));
3246*0Sstevel@tonic-gate 		return;
3247*0Sstevel@tonic-gate 	}
3248*0Sstevel@tonic-gate 	/*
3249*0Sstevel@tonic-gate 	 * handle normal cases
3250*0Sstevel@tonic-gate 	 * read old data and old parity
3251*0Sstevel@tonic-gate 	 */
3252*0Sstevel@tonic-gate 	cs->cs_frags = 2;
3253*0Sstevel@tonic-gate 	cs->cs_stage = RAID_NONE;
3254*0Sstevel@tonic-gate 	cs->cs_call = raid_write_got_old;
3255*0Sstevel@tonic-gate 	cs->cs_error_call = raid_write_error;
3256*0Sstevel@tonic-gate 	cs->cs_retry_call = raid_write_retry;
3257*0Sstevel@tonic-gate 	ASSERT(ps->ps_magic == RAID_PSMAGIC);
3258*0Sstevel@tonic-gate 	raidio(cs, RIO_DATA | RIO_READ);
3259*0Sstevel@tonic-gate 	raidio(cs, RIO_PARITY | RIO_READ);
3260*0Sstevel@tonic-gate }
3261*0Sstevel@tonic-gate 
3262*0Sstevel@tonic-gate static void
3263*0Sstevel@tonic-gate raid_enqueue(md_raidcs_t *cs)
3264*0Sstevel@tonic-gate {
3265*0Sstevel@tonic-gate 	mdi_unit_t	*ui = cs->cs_ps->ps_ui;
3266*0Sstevel@tonic-gate 	kmutex_t	*io_list_mutex = &ui->ui_io_lock->io_list_mutex;
3267*0Sstevel@tonic-gate 	md_raidcs_t	*cs1;
3268*0Sstevel@tonic-gate 
3269*0Sstevel@tonic-gate 	mutex_enter(io_list_mutex);
3270*0Sstevel@tonic-gate 	ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD));
3271*0Sstevel@tonic-gate 	if (ui->ui_io_lock->io_list_front == NULL) {
3272*0Sstevel@tonic-gate 		ui->ui_io_lock->io_list_front = cs;
3273*0Sstevel@tonic-gate 		ui->ui_io_lock->io_list_back = cs;
3274*0Sstevel@tonic-gate 	} else {
3275*0Sstevel@tonic-gate 		cs1 = ui->ui_io_lock->io_list_back;
3276*0Sstevel@tonic-gate 		cs1->cs_linlck_next = cs;
3277*0Sstevel@tonic-gate 		ui->ui_io_lock->io_list_back = cs;
3278*0Sstevel@tonic-gate 	}
3279*0Sstevel@tonic-gate 	STAT_INC(raid_write_waits);
3280*0Sstevel@tonic-gate 	STAT_MAX(raid_max_write_q_length, raid_write_queue_length);
3281*0Sstevel@tonic-gate 	cs->cs_linlck_next = NULL;
3282*0Sstevel@tonic-gate 	mutex_exit(io_list_mutex);
3283*0Sstevel@tonic-gate }
3284*0Sstevel@tonic-gate 
3285*0Sstevel@tonic-gate /*
3286*0Sstevel@tonic-gate  * NAME:	raid_write
3287*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice write routine
3288*0Sstevel@tonic-gate  * PARAMETERS:	mr_unit_t *un -  pointer to a unit structure
3289*0Sstevel@tonic-gate  *		md_raidcs_t *cs - pointer to a child structure
3290*0Sstevel@tonic-gate  */
3291*0Sstevel@tonic-gate 
3292*0Sstevel@tonic-gate /*ARGSUSED*/
3293*0Sstevel@tonic-gate static int
3294*0Sstevel@tonic-gate raid_write(mr_unit_t *un, md_raidcs_t *cs)
3295*0Sstevel@tonic-gate {
3296*0Sstevel@tonic-gate 	int		error = 0;
3297*0Sstevel@tonic-gate 	md_raidps_t	*ps;
3298*0Sstevel@tonic-gate 	mdi_unit_t	*ui;
3299*0Sstevel@tonic-gate 	minor_t		mnum;
3300*0Sstevel@tonic-gate 	clock_t		timeout;
3301*0Sstevel@tonic-gate 
3302*0Sstevel@tonic-gate 	ASSERT(IO_READER_HELD(un));
3303*0Sstevel@tonic-gate 	ps = cs->cs_ps;
3304*0Sstevel@tonic-gate 	ui = ps->ps_ui;
3305*0Sstevel@tonic-gate 
3306*0Sstevel@tonic-gate 	ASSERT(UNIT_STATE(un) != RUS_INIT);
3307*0Sstevel@tonic-gate 	if (UNIT_STATE(un) == RUS_LAST_ERRED)
3308*0Sstevel@tonic-gate 		error = EIO;
3309*0Sstevel@tonic-gate 
3310*0Sstevel@tonic-gate 	/* make sure the write doesn't go beyond the column */
3311*0Sstevel@tonic-gate 	if (cs->cs_blkno + cs->cs_blkcnt > un->un_segsize * un->un_segsincolumn)
3312*0Sstevel@tonic-gate 		error = ENXIO;
3313*0Sstevel@tonic-gate 	if (error)
3314*0Sstevel@tonic-gate 		goto werror;
3315*0Sstevel@tonic-gate 
3316*0Sstevel@tonic-gate 	getresources(cs);
3317*0Sstevel@tonic-gate 
3318*0Sstevel@tonic-gate 	/*
3319*0Sstevel@tonic-gate 	 * this is an advisory loop that keeps the waiting lists short
3320*0Sstevel@tonic-gate 	 * to reduce cpu time.  Since there is a race introduced by not
3321*0Sstevel@tonic-gate 	 * aquiring all the correct mutexes, use a cv_timedwait to be
3322*0Sstevel@tonic-gate 	 * sure the write always will wake up and start.
3323*0Sstevel@tonic-gate 	 */
3324*0Sstevel@tonic-gate 	while (raid_check_pw(cs)) {
3325*0Sstevel@tonic-gate 		mutex_enter(&un->un_mx);
3326*0Sstevel@tonic-gate 		(void) drv_getparm(LBOLT, &timeout);
3327*0Sstevel@tonic-gate 		timeout += md_wr_wait;
3328*0Sstevel@tonic-gate 		un->un_rflags |= MD_RFLAG_NEEDPW;
3329*0Sstevel@tonic-gate 		STAT_INC(raid_prewrite_waits);
3330*0Sstevel@tonic-gate 		(void) cv_timedwait(&un->un_cv, &un->un_mx, timeout);
3331*0Sstevel@tonic-gate 		un->un_rflags &= ~MD_RFLAG_NEEDPW;
3332*0Sstevel@tonic-gate 		mutex_exit(&un->un_mx);
3333*0Sstevel@tonic-gate 	}
3334*0Sstevel@tonic-gate 
3335*0Sstevel@tonic-gate 	if (raid_line_writer_lock(cs, 1))
3336*0Sstevel@tonic-gate 		return (0);
3337*0Sstevel@tonic-gate 
3338*0Sstevel@tonic-gate 	un = (mr_unit_t *)md_unit_readerlock(ui);
3339*0Sstevel@tonic-gate 	cs->cs_un = un;
3340*0Sstevel@tonic-gate 	mnum = MD_SID(un);
3341*0Sstevel@tonic-gate 
3342*0Sstevel@tonic-gate 	if (un->un_state & RUS_REGEN) {
3343*0Sstevel@tonic-gate 		raid_regen_parity(cs);
3344*0Sstevel@tonic-gate 		un = MD_UNIT(mnum);
3345*0Sstevel@tonic-gate 		cs->cs_un = un;
3346*0Sstevel@tonic-gate 	}
3347*0Sstevel@tonic-gate 
3348*0Sstevel@tonic-gate 	raid_write_io(un, cs);
3349*0Sstevel@tonic-gate 	return (0);
3350*0Sstevel@tonic-gate werror:
3351*0Sstevel@tonic-gate 	/* aquire unit reader lock sinc raid_free_child always drops it */
3352*0Sstevel@tonic-gate 	raid_error_parent(ps, error);
3353*0Sstevel@tonic-gate 	raid_free_child(cs, 0);
3354*0Sstevel@tonic-gate 	/* decrement both pwfrags and frags */
3355*0Sstevel@tonic-gate 	raid_free_parent(ps, RFP_DECR_PWFRAGS | RFP_DECR_FRAGS | RFP_RLS_LOCK);
3356*0Sstevel@tonic-gate 	return (0);
3357*0Sstevel@tonic-gate }
3358*0Sstevel@tonic-gate 
3359*0Sstevel@tonic-gate 
3360*0Sstevel@tonic-gate /*
3361*0Sstevel@tonic-gate  * NAMES:	raid_stage
3362*0Sstevel@tonic-gate  * DESCRIPTION: post-processing routine for a RAID metadevice
3363*0Sstevel@tonic-gate  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
3364*0Sstevel@tonic-gate  */
3365*0Sstevel@tonic-gate static void
3366*0Sstevel@tonic-gate raid_stage(md_raidcs_t *cs)
3367*0Sstevel@tonic-gate {
3368*0Sstevel@tonic-gate 	md_raidps_t	*ps = cs->cs_ps;
3369*0Sstevel@tonic-gate 	mr_unit_t	*un = cs->cs_un;
3370*0Sstevel@tonic-gate 	md_raidcbuf_t	*cbuf;
3371*0Sstevel@tonic-gate 	buf_t		*bp;
3372*0Sstevel@tonic-gate 	void		*private;
3373*0Sstevel@tonic-gate 	int		flag;
3374*0Sstevel@tonic-gate 
3375*0Sstevel@tonic-gate 	switch (cs->cs_stage) {
3376*0Sstevel@tonic-gate 	    case RAID_READ_DONE:
3377*0Sstevel@tonic-gate 		raid_free_child(cs, 1);
3378*0Sstevel@tonic-gate 		/* decrement readfrags */
3379*0Sstevel@tonic-gate 		raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
3380*0Sstevel@tonic-gate 		return;
3381*0Sstevel@tonic-gate 
3382*0Sstevel@tonic-gate 	    case RAID_WRITE_DONE:
3383*0Sstevel@tonic-gate 	    case RAID_WRITE_PONLY_DONE:
3384*0Sstevel@tonic-gate 	    case RAID_WRITE_DONLY_DONE:
3385*0Sstevel@tonic-gate 		/*
3386*0Sstevel@tonic-gate 		 *  Completed writing real parity and/or data.
3387*0Sstevel@tonic-gate 		 */
3388*0Sstevel@tonic-gate 		ASSERT(cs->cs_flags & MD_RCS_PWDONE);
3389*0Sstevel@tonic-gate 		raid_free_child(cs, 1);
3390*0Sstevel@tonic-gate 		/* decrement frags but not pwfrags */
3391*0Sstevel@tonic-gate 		raid_free_parent(ps, RFP_DECR_FRAGS | RFP_RLS_LOCK);
3392*0Sstevel@tonic-gate 		return;
3393*0Sstevel@tonic-gate 
3394*0Sstevel@tonic-gate 	    case RAID_PREWRITE_DONE:
3395*0Sstevel@tonic-gate 		/*
3396*0Sstevel@tonic-gate 		 * completed writing data and parity to prewrite entries
3397*0Sstevel@tonic-gate 		 */
3398*0Sstevel@tonic-gate 		/*
3399*0Sstevel@tonic-gate 		 * WARNING: don't release unit reader lock here..
3400*0Sstevel@tonic-gate 		 * decrement pwfrags but not frags
3401*0Sstevel@tonic-gate 		 */
3402*0Sstevel@tonic-gate 		raid_free_parent(ps, RFP_DECR_PWFRAGS);
3403*0Sstevel@tonic-gate 		cs->cs_flags |= MD_RCS_PWDONE;
3404*0Sstevel@tonic-gate 		cs->cs_frags = 2;
3405*0Sstevel@tonic-gate 		cs->cs_stage = RAID_WRITE_DONE;
3406*0Sstevel@tonic-gate 		cs->cs_call = raid_stage;
3407*0Sstevel@tonic-gate 		cs->cs_error_call = raid_write_error;
3408*0Sstevel@tonic-gate 		cs->cs_retry_call = raid_write_no_retry;
3409*0Sstevel@tonic-gate 		if (WRITE_ALT(un, cs->cs_pcolumn)) {
3410*0Sstevel@tonic-gate 			cs->cs_frags++;
3411*0Sstevel@tonic-gate 			raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY |
3412*0Sstevel@tonic-gate 			    RIO_WRITE);
3413*0Sstevel@tonic-gate 		}
3414*0Sstevel@tonic-gate 		if (WRITE_ALT(un, cs->cs_dcolumn)) {
3415*0Sstevel@tonic-gate 			cs->cs_frags++;
3416*0Sstevel@tonic-gate 			raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE);
3417*0Sstevel@tonic-gate 		}
3418*0Sstevel@tonic-gate 		ASSERT(cs->cs_frags < 4);
3419*0Sstevel@tonic-gate 		raidio(cs, RIO_DATA | RIO_WRITE);
3420*0Sstevel@tonic-gate 		raidio(cs, RIO_PARITY | RIO_WRITE);
3421*0Sstevel@tonic-gate 		if (cs->cs_pw_inval_list) {
3422*0Sstevel@tonic-gate 			raid_free_pwinvalidate(cs);
3423*0Sstevel@tonic-gate 		}
3424*0Sstevel@tonic-gate 		return;
3425*0Sstevel@tonic-gate 
3426*0Sstevel@tonic-gate 	    case RAID_LINE_PWDONE:
3427*0Sstevel@tonic-gate 		ASSERT(cs->cs_frags == 0);
3428*0Sstevel@tonic-gate 		raid_free_parent(ps, RFP_DECR_PWFRAGS);
3429*0Sstevel@tonic-gate 		cs->cs_flags |= MD_RCS_PWDONE;
3430*0Sstevel@tonic-gate 		cs->cs_frags = un->un_origcolumncnt;
3431*0Sstevel@tonic-gate 		cs->cs_call = raid_stage;
3432*0Sstevel@tonic-gate 		cs->cs_error_call = raid_write_error;
3433*0Sstevel@tonic-gate 		cs->cs_retry_call = raid_write_no_retry;
3434*0Sstevel@tonic-gate 		cs->cs_stage = RAID_WRITE_DONE;
3435*0Sstevel@tonic-gate 		for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
3436*0Sstevel@tonic-gate 			/*
3437*0Sstevel@tonic-gate 			 * fill in buffer for write to prewrite area
3438*0Sstevel@tonic-gate 			 */
3439*0Sstevel@tonic-gate 			bp = &cbuf->cbuf_bp;
3440*0Sstevel@tonic-gate 			bp->b_back = bp;
3441*0Sstevel@tonic-gate 			bp->b_forw = bp;
3442*0Sstevel@tonic-gate 			bp->b_un.b_addr = cbuf->cbuf_buffer + DEV_BSIZE;
3443*0Sstevel@tonic-gate 			bp->b_bcount = cbuf->cbuf_bcount;
3444*0Sstevel@tonic-gate 			bp->b_bufsize = cbuf->cbuf_bcount;
3445*0Sstevel@tonic-gate 			bp->b_lblkno =
3446*0Sstevel@tonic-gate 			    un->un_column[cbuf->cbuf_column].un_devstart +
3447*0Sstevel@tonic-gate 			    cs->cs_blkno;
3448*0Sstevel@tonic-gate 			bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR);
3449*0Sstevel@tonic-gate 			bp->b_flags &= ~nv_available;
3450*0Sstevel@tonic-gate 			bp->b_flags |= B_WRITE | B_BUSY;
3451*0Sstevel@tonic-gate 			bp->b_iodone = (int (*)())raid_done;
3452*0Sstevel@tonic-gate 			bp->b_edev = md_dev64_to_dev(
3453*0Sstevel@tonic-gate 				un->un_column[cbuf->cbuf_column].un_dev);
3454*0Sstevel@tonic-gate 			bp->b_chain = (struct buf *)cs;
3455*0Sstevel@tonic-gate 			private = cs->cs_strategy_private;
3456*0Sstevel@tonic-gate 			flag = cs->cs_strategy_flag;
3457*0Sstevel@tonic-gate 			md_call_strategy(bp, flag, private);
3458*0Sstevel@tonic-gate 		}
3459*0Sstevel@tonic-gate 		raidio(cs, RIO_DATA | RIO_WRITE);
3460*0Sstevel@tonic-gate 		raidio(cs, RIO_PARITY | RIO_WRITE);
3461*0Sstevel@tonic-gate 		if (cs->cs_pw_inval_list) {
3462*0Sstevel@tonic-gate 			raid_free_pwinvalidate(cs);
3463*0Sstevel@tonic-gate 		}
3464*0Sstevel@tonic-gate 		return;
3465*0Sstevel@tonic-gate 
3466*0Sstevel@tonic-gate 	    default:
3467*0Sstevel@tonic-gate 		ASSERT(0);
3468*0Sstevel@tonic-gate 		break;
3469*0Sstevel@tonic-gate 	}
3470*0Sstevel@tonic-gate }
3471*0Sstevel@tonic-gate /*
3472*0Sstevel@tonic-gate  * NAME:	md_raid_strategy
3473*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice I/O operations entry point.
3474*0Sstevel@tonic-gate  * PARAMETERS:	buf_t	  *pb - pointer to a user I/O buffer
3475*0Sstevel@tonic-gate  *		int	 flag - metadevice specific flag
3476*0Sstevel@tonic-gate  *		void *private - carry over flag ??
3477*0Sstevel@tonic-gate  *
3478*0Sstevel@tonic-gate  */
3479*0Sstevel@tonic-gate 
3480*0Sstevel@tonic-gate void
3481*0Sstevel@tonic-gate md_raid_strategy(buf_t *pb, int flag, void *private)
3482*0Sstevel@tonic-gate {
3483*0Sstevel@tonic-gate 	md_raidps_t	*ps;
3484*0Sstevel@tonic-gate 	md_raidcs_t	*cs;
3485*0Sstevel@tonic-gate 	int		doing_writes;
3486*0Sstevel@tonic-gate 	int		err;
3487*0Sstevel@tonic-gate 	mr_unit_t	*un;
3488*0Sstevel@tonic-gate 	mdi_unit_t	*ui;
3489*0Sstevel@tonic-gate 	size_t		count;
3490*0Sstevel@tonic-gate 	diskaddr_t	blkno;
3491*0Sstevel@tonic-gate 	caddr_t		addr;
3492*0Sstevel@tonic-gate 	off_t		offset;
3493*0Sstevel@tonic-gate 	int		colcnt;
3494*0Sstevel@tonic-gate 	minor_t		mnum;
3495*0Sstevel@tonic-gate 	set_t		setno;
3496*0Sstevel@tonic-gate 
3497*0Sstevel@tonic-gate 	ui = MDI_UNIT(getminor(pb->b_edev));
3498*0Sstevel@tonic-gate 	md_kstat_waitq_enter(ui);
3499*0Sstevel@tonic-gate 	un = (mr_unit_t *)md_io_readerlock(ui);
3500*0Sstevel@tonic-gate 	setno = MD_MIN2SET(getminor(pb->b_edev));
3501*0Sstevel@tonic-gate 
3502*0Sstevel@tonic-gate 	if ((flag & MD_NOBLOCK) == 0) {
3503*0Sstevel@tonic-gate 		if (md_inc_iocount(setno) != 0) {
3504*0Sstevel@tonic-gate 			pb->b_flags |= B_ERROR;
3505*0Sstevel@tonic-gate 			pb->b_error = ENXIO;
3506*0Sstevel@tonic-gate 			pb->b_resid = pb->b_bcount;
3507*0Sstevel@tonic-gate 			md_io_readerexit(ui);
3508*0Sstevel@tonic-gate 			biodone(pb);
3509*0Sstevel@tonic-gate 			return;
3510*0Sstevel@tonic-gate 		}
3511*0Sstevel@tonic-gate 	} else {
3512*0Sstevel@tonic-gate 		md_inc_iocount_noblock(setno);
3513*0Sstevel@tonic-gate 	}
3514*0Sstevel@tonic-gate 
3515*0Sstevel@tonic-gate 	mnum = MD_SID(un);
3516*0Sstevel@tonic-gate 	colcnt = un->un_totalcolumncnt - 1;
3517*0Sstevel@tonic-gate 	count = pb->b_bcount;
3518*0Sstevel@tonic-gate 
3519*0Sstevel@tonic-gate 	STAT_CHECK(raid_512, count == 512);
3520*0Sstevel@tonic-gate 	STAT_CHECK(raid_1024, count == 1024);
3521*0Sstevel@tonic-gate 	STAT_CHECK(raid_1024_8192, count > 1024 && count < 8192);
3522*0Sstevel@tonic-gate 	STAT_CHECK(raid_8192, count == 8192);
3523*0Sstevel@tonic-gate 	STAT_CHECK(raid_8192_bigger, count > 8192);
3524*0Sstevel@tonic-gate 
3525*0Sstevel@tonic-gate 	(void *) md_unit_readerlock(ui);
3526*0Sstevel@tonic-gate 	if (!(flag & MD_STR_NOTTOP)) {
3527*0Sstevel@tonic-gate 		err = md_checkbuf(ui, (md_unit_t *)un, pb); /* check and map */
3528*0Sstevel@tonic-gate 		if (err != 0) {
3529*0Sstevel@tonic-gate 			md_kstat_waitq_exit(ui);
3530*0Sstevel@tonic-gate 			md_io_readerexit(ui);
3531*0Sstevel@tonic-gate 			return;
3532*0Sstevel@tonic-gate 		}
3533*0Sstevel@tonic-gate 	}
3534*0Sstevel@tonic-gate 	md_unit_readerexit(ui);
3535*0Sstevel@tonic-gate 
3536*0Sstevel@tonic-gate 	STAT_INC(raid_total_io);
3537*0Sstevel@tonic-gate 
3538*0Sstevel@tonic-gate 	/* allocate a parent structure for the user I/O */
3539*0Sstevel@tonic-gate 	ps = kmem_cache_alloc(raid_parent_cache, MD_ALLOCFLAGS);
3540*0Sstevel@tonic-gate 	raid_parent_init(ps);
3541*0Sstevel@tonic-gate 
3542*0Sstevel@tonic-gate 	/*
3543*0Sstevel@tonic-gate 	 * Save essential information from the original buffhdr
3544*0Sstevel@tonic-gate 	 * in the md_save structure.
3545*0Sstevel@tonic-gate 	 */
3546*0Sstevel@tonic-gate 	ps->ps_un = un;
3547*0Sstevel@tonic-gate 	ps->ps_ui = ui;
3548*0Sstevel@tonic-gate 	ps->ps_bp = pb;
3549*0Sstevel@tonic-gate 	ps->ps_addr = pb->b_un.b_addr;
3550*0Sstevel@tonic-gate 
3551*0Sstevel@tonic-gate 	if ((pb->b_flags & B_READ) == 0) {
3552*0Sstevel@tonic-gate 		ps->ps_flags |= MD_RPS_WRITE;
3553*0Sstevel@tonic-gate 		doing_writes = 1;
3554*0Sstevel@tonic-gate 		STAT_INC(raid_writes);
3555*0Sstevel@tonic-gate 	} else {
3556*0Sstevel@tonic-gate 		ps->ps_flags |= MD_RPS_READ;
3557*0Sstevel@tonic-gate 		doing_writes = 0;
3558*0Sstevel@tonic-gate 		STAT_INC(raid_reads);
3559*0Sstevel@tonic-gate 	}
3560*0Sstevel@tonic-gate 
3561*0Sstevel@tonic-gate 	count = lbtodb(pb->b_bcount);	/* transfer count (in blocks) */
3562*0Sstevel@tonic-gate 	blkno = pb->b_lblkno;		/* block number on device */
3563*0Sstevel@tonic-gate 	addr  = 0;
3564*0Sstevel@tonic-gate 	offset = 0;
3565*0Sstevel@tonic-gate 	ps->ps_pwfrags = 1;
3566*0Sstevel@tonic-gate 	ps->ps_frags = 1;
3567*0Sstevel@tonic-gate 	md_kstat_waitq_to_runq(ui);
3568*0Sstevel@tonic-gate 
3569*0Sstevel@tonic-gate 	do {
3570*0Sstevel@tonic-gate 		cs = kmem_cache_alloc(raid_child_cache, MD_ALLOCFLAGS);
3571*0Sstevel@tonic-gate 		raid_child_init(cs);
3572*0Sstevel@tonic-gate 		cs->cs_ps = ps;
3573*0Sstevel@tonic-gate 		cs->cs_un = un;
3574*0Sstevel@tonic-gate 		cs->cs_mdunit = mnum;
3575*0Sstevel@tonic-gate 		cs->cs_strategy_flag = flag;
3576*0Sstevel@tonic-gate 		cs->cs_strategy_private = private;
3577*0Sstevel@tonic-gate 		cs->cs_addr = addr;
3578*0Sstevel@tonic-gate 		cs->cs_offset = offset;
3579*0Sstevel@tonic-gate 		count = raid_iosetup(un, blkno, count, cs);
3580*0Sstevel@tonic-gate 		if (cs->cs_flags & MD_RCS_LINE) {
3581*0Sstevel@tonic-gate 			blkno += (cs->cs_blkcnt * colcnt);
3582*0Sstevel@tonic-gate 			offset += (cs->cs_bcount * colcnt);
3583*0Sstevel@tonic-gate 		} else {
3584*0Sstevel@tonic-gate 			blkno +=  cs->cs_blkcnt;
3585*0Sstevel@tonic-gate 			offset += cs->cs_bcount;
3586*0Sstevel@tonic-gate 		}
3587*0Sstevel@tonic-gate 		/* for each cs bump up the ps_pwfrags and ps_frags fields */
3588*0Sstevel@tonic-gate 		if (count) {
3589*0Sstevel@tonic-gate 			mutex_enter(&ps->ps_mx);
3590*0Sstevel@tonic-gate 			ps->ps_pwfrags++;
3591*0Sstevel@tonic-gate 			ps->ps_frags++;
3592*0Sstevel@tonic-gate 			mutex_exit(&ps->ps_mx);
3593*0Sstevel@tonic-gate 			if (doing_writes)
3594*0Sstevel@tonic-gate 				(void) raid_write(un, cs);
3595*0Sstevel@tonic-gate 			else
3596*0Sstevel@tonic-gate 				(void) raid_read(un, cs);
3597*0Sstevel@tonic-gate 		}
3598*0Sstevel@tonic-gate 	} while (count);
3599*0Sstevel@tonic-gate 	if (doing_writes) {
3600*0Sstevel@tonic-gate 		(void) raid_write(un, cs);
3601*0Sstevel@tonic-gate 	} else
3602*0Sstevel@tonic-gate 		(void) raid_read(un, cs);
3603*0Sstevel@tonic-gate 
3604*0Sstevel@tonic-gate 	if (! (flag & MD_STR_NOTTOP) && panicstr) {
3605*0Sstevel@tonic-gate 		while (! (ps->ps_flags & MD_RPS_DONE)) {
3606*0Sstevel@tonic-gate 			md_daemon(1, &md_done_daemon);
3607*0Sstevel@tonic-gate 			drv_usecwait(10);
3608*0Sstevel@tonic-gate 		}
3609*0Sstevel@tonic-gate 		kmem_cache_free(raid_parent_cache, ps);
3610*0Sstevel@tonic-gate 	}
3611*0Sstevel@tonic-gate }
3612*0Sstevel@tonic-gate 
3613*0Sstevel@tonic-gate /*
3614*0Sstevel@tonic-gate  * NAMES:	raid_snarf
3615*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice SNARF entry point
3616*0Sstevel@tonic-gate  * PARAMETERS:	md_snarfcmd_t cmd,
3617*0Sstevel@tonic-gate  *		set_t setno
3618*0Sstevel@tonic-gate  * RETURNS:
3619*0Sstevel@tonic-gate  */
3620*0Sstevel@tonic-gate static int
3621*0Sstevel@tonic-gate raid_snarf(md_snarfcmd_t cmd, set_t setno)
3622*0Sstevel@tonic-gate {
3623*0Sstevel@tonic-gate 	mr_unit_t	*un;
3624*0Sstevel@tonic-gate 	mddb_recid_t	recid;
3625*0Sstevel@tonic-gate 	int		gotsomething;
3626*0Sstevel@tonic-gate 	int		all_raid_gotten;
3627*0Sstevel@tonic-gate 	mddb_type_t	typ1;
3628*0Sstevel@tonic-gate 	uint_t		ncol;
3629*0Sstevel@tonic-gate 	mddb_de_ic_t	*dep;
3630*0Sstevel@tonic-gate 	mddb_rb32_t	*rbp;
3631*0Sstevel@tonic-gate 	size_t		newreqsize;
3632*0Sstevel@tonic-gate 	mr_unit_t	*big_un;
3633*0Sstevel@tonic-gate 	mr_unit32_od_t	*small_un;
3634*0Sstevel@tonic-gate 
3635*0Sstevel@tonic-gate 
3636*0Sstevel@tonic-gate 	if (cmd == MD_SNARF_CLEANUP)
3637*0Sstevel@tonic-gate 		return (0);
3638*0Sstevel@tonic-gate 
3639*0Sstevel@tonic-gate 	all_raid_gotten = 1;
3640*0Sstevel@tonic-gate 	gotsomething = 0;
3641*0Sstevel@tonic-gate 	typ1 = (mddb_type_t)md_getshared_key(setno,
3642*0Sstevel@tonic-gate 	    raid_md_ops.md_driver.md_drivername);
3643*0Sstevel@tonic-gate 	recid = mddb_makerecid(setno, 0);
3644*0Sstevel@tonic-gate 
3645*0Sstevel@tonic-gate 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
3646*0Sstevel@tonic-gate 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) {
3647*0Sstevel@tonic-gate 			continue;
3648*0Sstevel@tonic-gate 		}
3649*0Sstevel@tonic-gate 
3650*0Sstevel@tonic-gate 		dep = mddb_getrecdep(recid);
3651*0Sstevel@tonic-gate 		dep->de_flags = MDDB_F_RAID;
3652*0Sstevel@tonic-gate 		rbp = dep->de_rb;
3653*0Sstevel@tonic-gate 		if ((rbp->rb_revision == MDDB_REV_RB) &&
3654*0Sstevel@tonic-gate 		    ((rbp->rb_private & MD_PRV_CONVD) == 0)) {
3655*0Sstevel@tonic-gate 			/*
3656*0Sstevel@tonic-gate 			 * This means, we have an old and small record
3657*0Sstevel@tonic-gate 			 * and this record hasn't already been converted.
3658*0Sstevel@tonic-gate 			 * Before we create an incore metadevice from this
3659*0Sstevel@tonic-gate 			 * we have to convert it to a big record.
3660*0Sstevel@tonic-gate 			 */
3661*0Sstevel@tonic-gate 			small_un = (mr_unit32_od_t *)mddb_getrecaddr(recid);
3662*0Sstevel@tonic-gate 			ncol = small_un->un_totalcolumncnt;
3663*0Sstevel@tonic-gate 			newreqsize = sizeof (mr_unit_t) +
3664*0Sstevel@tonic-gate 				((ncol - 1) * sizeof (mr_column_t));
3665*0Sstevel@tonic-gate 			big_un = (mr_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP);
3666*0Sstevel@tonic-gate 			raid_convert((caddr_t)small_un, (caddr_t)big_un,
3667*0Sstevel@tonic-gate 				SMALL_2_BIG);
3668*0Sstevel@tonic-gate 			kmem_free(small_un, dep->de_reqsize);
3669*0Sstevel@tonic-gate 			dep->de_rb_userdata = big_un;
3670*0Sstevel@tonic-gate 			dep->de_reqsize = newreqsize;
3671*0Sstevel@tonic-gate 			un = big_un;
3672*0Sstevel@tonic-gate 			rbp->rb_private |= MD_PRV_CONVD;
3673*0Sstevel@tonic-gate 		} else {
3674*0Sstevel@tonic-gate 			/* Big device */
3675*0Sstevel@tonic-gate 			un = (mr_unit_t *)mddb_getrecaddr(recid);
3676*0Sstevel@tonic-gate 		}
3677*0Sstevel@tonic-gate 
3678*0Sstevel@tonic-gate 		/* Set revision and flag accordingly */
3679*0Sstevel@tonic-gate 		if (rbp->rb_revision == MDDB_REV_RB) {
3680*0Sstevel@tonic-gate 			un->c.un_revision = MD_32BIT_META_DEV;
3681*0Sstevel@tonic-gate 		} else {
3682*0Sstevel@tonic-gate 			un->c.un_revision = MD_64BIT_META_DEV;
3683*0Sstevel@tonic-gate 			un->c.un_flag |= MD_EFILABEL;
3684*0Sstevel@tonic-gate 		}
3685*0Sstevel@tonic-gate 
3686*0Sstevel@tonic-gate 		/*
3687*0Sstevel@tonic-gate 		 * Create minor device node for snarfed entry.
3688*0Sstevel@tonic-gate 		 */
3689*0Sstevel@tonic-gate 		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));
3690*0Sstevel@tonic-gate 
3691*0Sstevel@tonic-gate 		if (MD_UNIT(MD_SID(un)) != NULL) {
3692*0Sstevel@tonic-gate 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
3693*0Sstevel@tonic-gate 			continue;
3694*0Sstevel@tonic-gate 		}
3695*0Sstevel@tonic-gate 		all_raid_gotten = 0;
3696*0Sstevel@tonic-gate 		if (raid_build_incore((void *)un, 1) == 0) {
3697*0Sstevel@tonic-gate 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
3698*0Sstevel@tonic-gate 			md_create_unit_incore(MD_SID(un), &raid_md_ops,
3699*0Sstevel@tonic-gate 			    1);
3700*0Sstevel@tonic-gate 			gotsomething = 1;
3701*0Sstevel@tonic-gate 		} else if (un->mr_ic) {
3702*0Sstevel@tonic-gate 			kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
3703*0Sstevel@tonic-gate 				un->un_totalcolumncnt);
3704*0Sstevel@tonic-gate 			kmem_free(un->mr_ic, sizeof (*un->mr_ic));
3705*0Sstevel@tonic-gate 		}
3706*0Sstevel@tonic-gate 	}
3707*0Sstevel@tonic-gate 
3708*0Sstevel@tonic-gate 	if (!all_raid_gotten) {
3709*0Sstevel@tonic-gate 		return (gotsomething);
3710*0Sstevel@tonic-gate 	}
3711*0Sstevel@tonic-gate 
3712*0Sstevel@tonic-gate 	recid = mddb_makerecid(setno, 0);
3713*0Sstevel@tonic-gate 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0)
3714*0Sstevel@tonic-gate 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
3715*0Sstevel@tonic-gate 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
3716*0Sstevel@tonic-gate 
3717*0Sstevel@tonic-gate 	return (0);
3718*0Sstevel@tonic-gate }
3719*0Sstevel@tonic-gate 
3720*0Sstevel@tonic-gate /*
3721*0Sstevel@tonic-gate  * NAMES:	raid_halt
3722*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice HALT entry point
3723*0Sstevel@tonic-gate  * PARAMETERS:	md_haltcmd_t cmd -
3724*0Sstevel@tonic-gate  *		set_t	setno -
3725*0Sstevel@tonic-gate  * RETURNS:
3726*0Sstevel@tonic-gate  */
3727*0Sstevel@tonic-gate static int
3728*0Sstevel@tonic-gate raid_halt(md_haltcmd_t cmd, set_t setno)
3729*0Sstevel@tonic-gate {
3730*0Sstevel@tonic-gate 	set_t		i;
3731*0Sstevel@tonic-gate 	mdi_unit_t	*ui;
3732*0Sstevel@tonic-gate 	minor_t		mnum;
3733*0Sstevel@tonic-gate 
3734*0Sstevel@tonic-gate 	if (cmd == MD_HALT_CLOSE)
3735*0Sstevel@tonic-gate 		return (0);
3736*0Sstevel@tonic-gate 
3737*0Sstevel@tonic-gate 	if (cmd == MD_HALT_OPEN)
3738*0Sstevel@tonic-gate 		return (0);
3739*0Sstevel@tonic-gate 
3740*0Sstevel@tonic-gate 	if (cmd == MD_HALT_UNLOAD)
3741*0Sstevel@tonic-gate 		return (0);
3742*0Sstevel@tonic-gate 
3743*0Sstevel@tonic-gate 	if (cmd == MD_HALT_CHECK) {
3744*0Sstevel@tonic-gate 		for (i = 0; i < md_nunits; i++) {
3745*0Sstevel@tonic-gate 			mnum = MD_MKMIN(setno, i);
3746*0Sstevel@tonic-gate 			if ((ui = MDI_UNIT(mnum)) == NULL)
3747*0Sstevel@tonic-gate 				continue;
3748*0Sstevel@tonic-gate 			if (ui->ui_opsindex != raid_md_ops.md_selfindex)
3749*0Sstevel@tonic-gate 				continue;
3750*0Sstevel@tonic-gate 			if (md_unit_isopen(ui))
3751*0Sstevel@tonic-gate 				return (1);
3752*0Sstevel@tonic-gate 		}
3753*0Sstevel@tonic-gate 		return (0);
3754*0Sstevel@tonic-gate 	}
3755*0Sstevel@tonic-gate 
3756*0Sstevel@tonic-gate 	if (cmd != MD_HALT_DOIT)
3757*0Sstevel@tonic-gate 		return (1);
3758*0Sstevel@tonic-gate 
3759*0Sstevel@tonic-gate 	for (i = 0; i < md_nunits; i++) {
3760*0Sstevel@tonic-gate 		mnum = MD_MKMIN(setno, i);
3761*0Sstevel@tonic-gate 		if ((ui = MDI_UNIT(mnum)) == NULL)
3762*0Sstevel@tonic-gate 			continue;
3763*0Sstevel@tonic-gate 		if (ui->ui_opsindex != raid_md_ops.md_selfindex)
3764*0Sstevel@tonic-gate 			continue;
3765*0Sstevel@tonic-gate 		reset_raid((mr_unit_t *)MD_UNIT(mnum), mnum, 0);
3766*0Sstevel@tonic-gate 	}
3767*0Sstevel@tonic-gate 	return (0);
3768*0Sstevel@tonic-gate }
3769*0Sstevel@tonic-gate 
3770*0Sstevel@tonic-gate /*
3771*0Sstevel@tonic-gate  * NAMES:	raid_close_all_devs
3772*0Sstevel@tonic-gate  * DESCRIPTION: Close all the devices of the unit.
3773*0Sstevel@tonic-gate  * PARAMETERS:	mr_unit_t *un - pointer to unit structure
3774*0Sstevel@tonic-gate  * RETURNS:
3775*0Sstevel@tonic-gate  */
3776*0Sstevel@tonic-gate void
3777*0Sstevel@tonic-gate raid_close_all_devs(mr_unit_t *un, int init_pw, int md_cflags)
3778*0Sstevel@tonic-gate {
3779*0Sstevel@tonic-gate 	int		i;
3780*0Sstevel@tonic-gate 	mr_column_t	*device;
3781*0Sstevel@tonic-gate 
3782*0Sstevel@tonic-gate 	for (i = 0; i < un->un_totalcolumncnt; i++) {
3783*0Sstevel@tonic-gate 		device = &un->un_column[i];
3784*0Sstevel@tonic-gate 		if (device->un_devflags & MD_RAID_DEV_ISOPEN) {
3785*0Sstevel@tonic-gate 			ASSERT((device->un_dev != (md_dev64_t)0) &&
3786*0Sstevel@tonic-gate 			    (device->un_dev != NODEV64));
3787*0Sstevel@tonic-gate 			if ((device->un_devstate & RCS_OKAY) && init_pw)
3788*0Sstevel@tonic-gate 				(void) init_pw_area(un, device->un_dev,
3789*0Sstevel@tonic-gate 							device->un_pwstart, i);
3790*0Sstevel@tonic-gate 			md_layered_close(device->un_dev, md_cflags);
3791*0Sstevel@tonic-gate 			device->un_devflags &= ~MD_RAID_DEV_ISOPEN;
3792*0Sstevel@tonic-gate 		}
3793*0Sstevel@tonic-gate 	}
3794*0Sstevel@tonic-gate }
3795*0Sstevel@tonic-gate 
3796*0Sstevel@tonic-gate /*
3797*0Sstevel@tonic-gate  * NAMES:	raid_open_all_devs
3798*0Sstevel@tonic-gate  * DESCRIPTION: Open all the components (columns) of the device unit.
3799*0Sstevel@tonic-gate  * PARAMETERS:	mr_unit_t *un - pointer to unit structure
3800*0Sstevel@tonic-gate  * RETURNS:
3801*0Sstevel@tonic-gate  */
3802*0Sstevel@tonic-gate static int
3803*0Sstevel@tonic-gate raid_open_all_devs(mr_unit_t *un, int md_oflags)
3804*0Sstevel@tonic-gate {
3805*0Sstevel@tonic-gate 	minor_t		mnum = MD_SID(un);
3806*0Sstevel@tonic-gate 	int		i;
3807*0Sstevel@tonic-gate 	int		not_opened = 0;
3808*0Sstevel@tonic-gate 	int		commit = 0;
3809*0Sstevel@tonic-gate 	int		col = -1;
3810*0Sstevel@tonic-gate 	mr_column_t	*device;
3811*0Sstevel@tonic-gate 	set_t		setno = MD_MIN2SET(MD_SID(un));
3812*0Sstevel@tonic-gate 	side_t		side = mddb_getsidenum(setno);
3813*0Sstevel@tonic-gate 	mdkey_t		key;
3814*0Sstevel@tonic-gate 	mdi_unit_t	*ui = MDI_UNIT(mnum);
3815*0Sstevel@tonic-gate 
3816*0Sstevel@tonic-gate 	ui->ui_tstate &= ~MD_INACCESSIBLE;
3817*0Sstevel@tonic-gate 
3818*0Sstevel@tonic-gate 	for (i = 0; i < un->un_totalcolumncnt; i++) {
3819*0Sstevel@tonic-gate 		md_dev64_t tmpdev;
3820*0Sstevel@tonic-gate 
3821*0Sstevel@tonic-gate 		device = &un->un_column[i];
3822*0Sstevel@tonic-gate 
3823*0Sstevel@tonic-gate 		if (COLUMN_STATE(un, i) & RCS_ERRED) {
3824*0Sstevel@tonic-gate 			not_opened++;
3825*0Sstevel@tonic-gate 			continue;
3826*0Sstevel@tonic-gate 		}
3827*0Sstevel@tonic-gate 
3828*0Sstevel@tonic-gate 		if (device->un_devflags & MD_RAID_DEV_ISOPEN)
3829*0Sstevel@tonic-gate 			continue;
3830*0Sstevel@tonic-gate 
3831*0Sstevel@tonic-gate 		tmpdev = device->un_dev;
3832*0Sstevel@tonic-gate 		/*
3833*0Sstevel@tonic-gate 		 * Open by device id
3834*0Sstevel@tonic-gate 		 */
3835*0Sstevel@tonic-gate 		key = HOTSPARED(un, i) ?
3836*0Sstevel@tonic-gate 			device->un_hs_key : device->un_orig_key;
3837*0Sstevel@tonic-gate 		if ((md_getmajor(tmpdev) != md_major) &&
3838*0Sstevel@tonic-gate 			md_devid_found(setno, side, key) == 1) {
3839*0Sstevel@tonic-gate 			tmpdev = md_resolve_bydevid(mnum, tmpdev, key);
3840*0Sstevel@tonic-gate 		}
3841*0Sstevel@tonic-gate 		if (md_layered_open(mnum, &tmpdev, md_oflags)) {
3842*0Sstevel@tonic-gate 			device->un_dev = tmpdev;
3843*0Sstevel@tonic-gate 			not_opened++;
3844*0Sstevel@tonic-gate 			continue;
3845*0Sstevel@tonic-gate 		}
3846*0Sstevel@tonic-gate 		device->un_dev = tmpdev;
3847*0Sstevel@tonic-gate 		device->un_devflags |= MD_RAID_DEV_ISOPEN;
3848*0Sstevel@tonic-gate 	}
3849*0Sstevel@tonic-gate 
3850*0Sstevel@tonic-gate 	/* if open errors and errored devices are 1 then device can run */
3851*0Sstevel@tonic-gate 	if (not_opened > 1) {
3852*0Sstevel@tonic-gate 		cmn_err(CE_WARN,
3853*0Sstevel@tonic-gate 		"md: %s failed to open. open error on %s\n",
3854*0Sstevel@tonic-gate 			md_shortname(MD_SID(un)),
3855*0Sstevel@tonic-gate 			md_devname(MD_UN2SET(un), device->un_orig_dev,
3856*0Sstevel@tonic-gate 					NULL, 0));
3857*0Sstevel@tonic-gate 
3858*0Sstevel@tonic-gate 		ui->ui_tstate |= MD_INACCESSIBLE;
3859*0Sstevel@tonic-gate 
3860*0Sstevel@tonic-gate 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
3861*0Sstevel@tonic-gate 		    MD_UN2SET(un), MD_SID(un));
3862*0Sstevel@tonic-gate 
3863*0Sstevel@tonic-gate 		return (not_opened > 1);
3864*0Sstevel@tonic-gate 	}
3865*0Sstevel@tonic-gate 
3866*0Sstevel@tonic-gate 	for (i = 0; i < un->un_totalcolumncnt; i++) {
3867*0Sstevel@tonic-gate 		device = &un->un_column[i];
3868*0Sstevel@tonic-gate 		if (device->un_devflags & MD_RAID_DEV_ISOPEN) {
3869*0Sstevel@tonic-gate 			if (device->un_devstate & RCS_LAST_ERRED) {
3870*0Sstevel@tonic-gate 			/*
3871*0Sstevel@tonic-gate 			 * At this point in time there is a possibility
3872*0Sstevel@tonic-gate 			 * that errors were the result of a controller
3873*0Sstevel@tonic-gate 			 * failure with more than a single column on it
3874*0Sstevel@tonic-gate 			 * so clear out last errored columns and let errors
3875*0Sstevel@tonic-gate 			 * re-occur is necessary.
3876*0Sstevel@tonic-gate 			 */
3877*0Sstevel@tonic-gate 				raid_set_state(un, i, RCS_OKAY, 0);
3878*0Sstevel@tonic-gate 				commit++;
3879*0Sstevel@tonic-gate 			}
3880*0Sstevel@tonic-gate 			continue;
3881*0Sstevel@tonic-gate 		}
3882*0Sstevel@tonic-gate 		ASSERT(col == -1);
3883*0Sstevel@tonic-gate 		col = i;
3884*0Sstevel@tonic-gate 	}
3885*0Sstevel@tonic-gate 
3886*0Sstevel@tonic-gate 	if (col != -1) {
3887*0Sstevel@tonic-gate 		raid_set_state(un, col, RCS_ERRED, 0);
3888*0Sstevel@tonic-gate 		commit++;
3889*0Sstevel@tonic-gate 	}
3890*0Sstevel@tonic-gate 
3891*0Sstevel@tonic-gate 	if (commit)
3892*0Sstevel@tonic-gate 		raid_commit(un, NULL);
3893*0Sstevel@tonic-gate 
3894*0Sstevel@tonic-gate 	if (col != -1) {
3895*0Sstevel@tonic-gate 		if (COLUMN_STATE(un, col) & RCS_ERRED) {
3896*0Sstevel@tonic-gate 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
3897*0Sstevel@tonic-gate 			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
3898*0Sstevel@tonic-gate 		} else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) {
3899*0Sstevel@tonic-gate 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
3900*0Sstevel@tonic-gate 			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
3901*0Sstevel@tonic-gate 		}
3902*0Sstevel@tonic-gate 	}
3903*0Sstevel@tonic-gate 
3904*0Sstevel@tonic-gate 	return (0);
3905*0Sstevel@tonic-gate }
3906*0Sstevel@tonic-gate 
/*
 * NAMES:	raid_internal_open
 * DESCRIPTION: Do the actual RAID open
 * PARAMETERS:	minor_t mnum - minor number of the RAID device
 *		int flag - open flags (FREAD | FWRITE)
 *		int otyp - open type, passed through to md_unit_incopen()
 *		int md_oflags - RAID open flags
 * RETURNS:	0 if successful, nonzero otherwise
 *		(EAGAIN while the unit is initializing, ENXIO when the
 *		components cannot be opened)
 */
int
raid_internal_open(minor_t mnum, int flag, int otyp, int md_oflags)
{
	mr_unit_t	*un;
	mdi_unit_t	*ui;
	int		err = 0;
	int		replay_error = 0;

	ui = MDI_UNIT(mnum);
	ASSERT(ui != NULL);

	/* single-thread against other opens/closes of this unit */
	un = (mr_unit_t *)md_unit_openclose_enter(ui);
	/*
	 * this MUST be checked before md_unit_isopen is checked.
	 * raid_init_columns sets md_unit_isopen to block reset, halt.
	 */
	if ((UNIT_STATE(un) & (RUS_INIT | RUS_DOI)) &&
			!(md_oflags & MD_OFLG_ISINIT)) {
		/* unit is initializing; caller may retry later */
		md_unit_openclose_exit(ui);
		return (EAGAIN);
	}

	/* already open (or the init open itself): just bump the count */
	if ((md_oflags & MD_OFLG_ISINIT) || md_unit_isopen(ui)) {
		err = md_unit_incopen(mnum, flag, otyp);
		goto out;
	}

	/* first open: upgrade from the reader lock to the writer lock */
	md_unit_readerexit(ui);

	un = (mr_unit_t *)md_unit_writerlock(ui);
	if (raid_open_all_devs(un, md_oflags) == 0) {
		if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) {
			/* open count failed; undo the component opens */
			md_unit_writerexit(ui);
			un = (mr_unit_t *)md_unit_readerlock(ui);
			raid_close_all_devs(un, 0, md_oflags);
			goto out;
		}
	} else {
		/*
		 * if this unit has more than one errored/unopenable
		 * component it cannot run: return an error and close
		 * all devices that were opened
		 */

		md_unit_writerexit(ui);
		un = (mr_unit_t *)md_unit_readerlock(ui);
		raid_close_all_devs(un, 0, md_oflags);
		md_unit_openclose_exit(ui);
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
		    MD_UN2SET(un), MD_SID(un));
		return (ENXIO);
	}

	/* run raid_replay() at most once per unit (MD_UN_REPLAYED latches) */
	if (!(MD_STATUS(un) & MD_UN_REPLAYED)) {
		replay_error = raid_replay(un);
		MD_STATUS(un) |= MD_UN_REPLAYED;
	}

	md_unit_writerexit(ui);
	un = (mr_unit_t *)md_unit_readerlock(ui);

	/* a read-only replay result is acceptable for a read-only open */
	if ((replay_error == RAID_RPLY_READONLY) &&
	    ((flag & (FREAD | FWRITE)) == FREAD)) {
		md_unit_openclose_exit(ui);
		return (0);
	}

	/* allocate hotspare if possible */
	(void) raid_hotspares();


out:
	md_unit_openclose_exit(ui);
	return (err);
}
3990*0Sstevel@tonic-gate /*
3991*0Sstevel@tonic-gate  * NAMES:	raid_open
3992*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice OPEN entry point
3993*0Sstevel@tonic-gate  * PARAMETERS:	dev_t dev -
3994*0Sstevel@tonic-gate  *		int flag -
3995*0Sstevel@tonic-gate  *		int otyp -
3996*0Sstevel@tonic-gate  *		cred_t * cred_p -
3997*0Sstevel@tonic-gate  *		int md_oflags -
3998*0Sstevel@tonic-gate  * RETURNS:
3999*0Sstevel@tonic-gate  */
4000*0Sstevel@tonic-gate /*ARGSUSED1*/
4001*0Sstevel@tonic-gate static int
4002*0Sstevel@tonic-gate raid_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
4003*0Sstevel@tonic-gate {
4004*0Sstevel@tonic-gate 	int		error = 0;
4005*0Sstevel@tonic-gate 
4006*0Sstevel@tonic-gate 	if (error = raid_internal_open(getminor(*dev), flag, otyp, md_oflags)) {
4007*0Sstevel@tonic-gate 		return (error);
4008*0Sstevel@tonic-gate 	}
4009*0Sstevel@tonic-gate 	return (0);
4010*0Sstevel@tonic-gate }
4011*0Sstevel@tonic-gate 
4012*0Sstevel@tonic-gate /*
4013*0Sstevel@tonic-gate  * NAMES:	raid_internal_close
4014*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice CLOSE actual implementation
4015*0Sstevel@tonic-gate  * PARAMETERS:	minor_t - minor number of the RAID device
4016*0Sstevel@tonic-gate  *		int otyp -
4017*0Sstevel@tonic-gate  *		int init_pw -
4018*0Sstevel@tonic-gate  *		int md_cflags - RAID close flags
4019*0Sstevel@tonic-gate  * RETURNS:	0 if successful, nonzero otherwise
4020*0Sstevel@tonic-gate  */
4021*0Sstevel@tonic-gate /*ARGSUSED*/
4022*0Sstevel@tonic-gate int
4023*0Sstevel@tonic-gate raid_internal_close(minor_t mnum, int otyp, int init_pw, int md_cflags)
4024*0Sstevel@tonic-gate {
4025*0Sstevel@tonic-gate 	mdi_unit_t	*ui = MDI_UNIT(mnum);
4026*0Sstevel@tonic-gate 	mr_unit_t	*un;
4027*0Sstevel@tonic-gate 	int		err = 0;
4028*0Sstevel@tonic-gate 
4029*0Sstevel@tonic-gate 	/* single thread */
4030*0Sstevel@tonic-gate 	un = (mr_unit_t *)md_unit_openclose_enter(ui);
4031*0Sstevel@tonic-gate 
4032*0Sstevel@tonic-gate 	/* count closed */
4033*0Sstevel@tonic-gate 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
4034*0Sstevel@tonic-gate 		goto out;
4035*0Sstevel@tonic-gate 	/* close devices, if necessary */
4036*0Sstevel@tonic-gate 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
4037*0Sstevel@tonic-gate 		raid_close_all_devs(un, init_pw, md_cflags);
4038*0Sstevel@tonic-gate 	}
4039*0Sstevel@tonic-gate 
4040*0Sstevel@tonic-gate 	/* unlock, return success */
4041*0Sstevel@tonic-gate out:
4042*0Sstevel@tonic-gate 	md_unit_openclose_exit(ui);
4043*0Sstevel@tonic-gate 	return (err);
4044*0Sstevel@tonic-gate }
4045*0Sstevel@tonic-gate 
4046*0Sstevel@tonic-gate /*
4047*0Sstevel@tonic-gate  * NAMES:	raid_close
4048*0Sstevel@tonic-gate  * DESCRIPTION: RAID metadevice close entry point
4049*0Sstevel@tonic-gate  * PARAMETERS:	dev_t dev -
4050*0Sstevel@tonic-gate  *		int flag -
4051*0Sstevel@tonic-gate  *		int otyp -
4052*0Sstevel@tonic-gate  *		cred_t * cred_p -
4053*0Sstevel@tonic-gate  *		int md_oflags -
4054*0Sstevel@tonic-gate  * RETURNS:
4055*0Sstevel@tonic-gate  */
4056*0Sstevel@tonic-gate /*ARGSUSED1*/
4057*0Sstevel@tonic-gate static int
4058*0Sstevel@tonic-gate raid_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
4059*0Sstevel@tonic-gate {
4060*0Sstevel@tonic-gate 	int retval;
4061*0Sstevel@tonic-gate 
4062*0Sstevel@tonic-gate 	(void) md_io_writerlock(MDI_UNIT(getminor(dev)));
4063*0Sstevel@tonic-gate 	retval = raid_internal_close(getminor(dev), otyp, 1, md_cflags);
4064*0Sstevel@tonic-gate 	(void) md_io_writerexit(MDI_UNIT(getminor(dev)));
4065*0Sstevel@tonic-gate 	return (retval);
4066*0Sstevel@tonic-gate }
4067*0Sstevel@tonic-gate 
4068*0Sstevel@tonic-gate /*
4069*0Sstevel@tonic-gate  * raid_probe_close_all_devs
4070*0Sstevel@tonic-gate  */
4071*0Sstevel@tonic-gate void
4072*0Sstevel@tonic-gate raid_probe_close_all_devs(mr_unit_t *un)
4073*0Sstevel@tonic-gate {
4074*0Sstevel@tonic-gate 	int		i;
4075*0Sstevel@tonic-gate 	mr_column_t	*device;
4076*0Sstevel@tonic-gate 
4077*0Sstevel@tonic-gate 	for (i = 0; i < un->un_totalcolumncnt; i++) {
4078*0Sstevel@tonic-gate 		device = &un->un_column[i];
4079*0Sstevel@tonic-gate 
4080*0Sstevel@tonic-gate 		if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) {
4081*0Sstevel@tonic-gate 			md_layered_close(device->un_dev,
4082*0Sstevel@tonic-gate 				MD_OFLG_PROBEDEV);
4083*0Sstevel@tonic-gate 			device->un_devflags &= ~MD_RAID_DEV_PROBEOPEN;
4084*0Sstevel@tonic-gate 		}
4085*0Sstevel@tonic-gate 	}
4086*0Sstevel@tonic-gate }
/*
 * NAMES:	raid_probe_dev
 * DESCRIPTION: Probe-open every non-errored column of the unit to test
 *		component health, update column state accordingly, then
 *		close the probe opens again.
 * PARAMETERS:	mdi_unit_t *ui - incore unit
 *		minor_t mnum - minor number of the RAID device
 * RETURNS:	0, or nonzero when more than one column failed to open
 *		while the metadevice itself is not open.
 *
 * On entry the unit writerlock is held
 */
static int
raid_probe_dev(mdi_unit_t *ui, minor_t mnum)
{
	mr_unit_t	*un;
	int		i;
	int		not_opened = 0;
	int		commit = 0;
	int		col = -1;
	mr_column_t	*device;
	int		md_devopen = 0;	/* nonzero if the metadevice is open */

	if (md_unit_isopen(ui))
		md_devopen++;

	un = MD_UNIT(mnum);
	/*
	 * If the state has been set to LAST_ERRED because
	 * of an error when the raid device was open at some
	 * point in the past, don't probe. We really don't want
	 * to reset the state in this case.
	 */
	if (UNIT_STATE(un) == RUS_LAST_ERRED)
		return (0);

	ui->ui_tstate &= ~MD_INACCESSIBLE;

	for (i = 0; i < un->un_totalcolumncnt; i++) {
		md_dev64_t tmpdev;

		device = &un->un_column[i];
		/* columns already in error are counted but not probed */
		if (COLUMN_STATE(un, i) & RCS_ERRED) {
			not_opened++;
			continue;
		}

		tmpdev = device->un_dev;
		/*
		 * Currently the flags passed are not needed since
		 * there cannot be an underlying metadevice. However
		 * they are kept here for consistency.
		 *
		 * Open by device id
		 */
		tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i)?
			device->un_hs_key : device->un_orig_key);
		if (md_layered_open(mnum, &tmpdev,
				MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV)) {
			device->un_dev = tmpdev;
			not_opened++;
			continue;
		}
		device->un_dev = tmpdev;

		device->un_devflags |= MD_RAID_DEV_PROBEOPEN;
	}

	/*
	 * The code below is careful on setting the LAST_ERRED state.
	 *
	 * If open errors and exactly one device has failed we can run.
	 * If more than one device fails we have to figure out when to set
	 * LAST_ERRED state.  The rationale is to avoid unnecessary resyncs
	 * since they are painful and time consuming.
	 *
	 * When more than one component/column fails there are 2 scenarios.
	 *
	 * 1. Metadevice has NOT been opened: In this case, the behavior
	 *    mimics the open semantics. ie. Only the first failed device
	 *    is ERRED and LAST_ERRED is not set.
	 *
	 * 2. Metadevice has been opened: Here the read/write semantics are
	 *    followed. The first failed device is ERRED and on the next
	 *    failed device LAST_ERRED is set.
	 */

	if (not_opened > 1 && !md_devopen) {
		/*
		 * NOTE(review): 'device' here is the last column the loop
		 * visited, not necessarily one that failed to open.
		 */
		cmn_err(CE_WARN,
			"md: %s failed to open. open error on %s\n",
				md_shortname(MD_SID(un)),
				md_devname(MD_UN2SET(un), device->un_orig_dev,
						NULL, 0));
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
		    MD_UN2SET(un), MD_SID(un));
		raid_probe_close_all_devs(un);
		ui->ui_tstate |= MD_INACCESSIBLE;
		return (not_opened > 1);
	}

	if (!md_devopen) {
		for (i = 0; i < un->un_totalcolumncnt; i++) {
			device = &un->un_column[i];
			if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) {
				if (device->un_devstate & RCS_LAST_ERRED) {
					/*
					 * At this point in time there is a
					 * possibility that errors were the
					 * result of a controller failure with
					 * more than a single column on it so
					 * clear out last errored columns and
					 * let errors re-occur if necessary.
					 */
					raid_set_state(un, i, RCS_OKAY, 0);
					commit++;
					}
				continue;
			}
			ASSERT(col == -1);
			/*
			 * note if multiple devices are failing then only
			 * the last one is marked as error
			 */
			col = i;
		}

		if (col != -1) {
			raid_set_state(un, col, RCS_ERRED, 0);
			commit++;
		}

	} else {
		for (i = 0; i < un->un_totalcolumncnt; i++) {
			device = &un->un_column[i];

			/* if we have LAST_ERRED go ahead and commit. */
			if (un->un_state & RUS_LAST_ERRED)
				break;
			/*
			 * could not open the component
			 */

			if (!(device->un_devflags & MD_RAID_DEV_PROBEOPEN)) {
				col = i;
				raid_set_state(un, col, RCS_ERRED, 0);
				commit++;
			}
		}
	}

	/* persist any state changes made above */
	if (commit)
		raid_commit(un, NULL);

	if (col != -1) {
		if (COLUMN_STATE(un, col) & RCS_ERRED) {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
		} else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
		}
	}

	raid_probe_close_all_devs(un);
	return (0);
}
4246*0Sstevel@tonic-gate 
/*
 * NAMES:	raid_imp_set
 * DESCRIPTION: Update RAID unit records when a diskset is imported:
 *		rewrite the self/parent minor numbers, hotspare pool id,
 *		hotspare record ids and unit record id so they belong to
 *		the importing set.
 * PARAMETERS:	set_t setno - set number of the imported set
 * RETURNS:	1 if any RAID record was updated, 0 otherwise
 */
static int
raid_imp_set(
	set_t	setno
)
{
	mddb_recid_t    recid;
	int		i, gotsomething;
	mddb_type_t	typ1;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	mr_unit_t	*un64;
	mr_unit32_od_t	*un32;
	minor_t		*self_id;	/* minor needs to be updated */
	md_parent_t	*parent_id;	/* parent needs to be updated */
	mddb_recid_t	*record_id;	 /* record id needs to be updated */
	hsp_t		*hsp_id;	/* hotspare pool id to be updated */

	gotsomething = 0;

	/* walk all RAID records of this set */
	typ1 = (mddb_type_t)md_getshared_key(setno,
	    raid_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
		/* skip records that were already processed */
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		rbp = dep->de_rb;

		if (rbp->rb_revision == MDDB_REV_RB) {
			/*
			 * Small device: 32-bit on-disk unit format.
			 */
			un32 = (mr_unit32_od_t *)mddb_getrecaddr(recid);
			self_id = &(un32->c.un_self_id);
			parent_id = &(un32->c.un_parent);
			record_id = &(un32->c.un_record_id);
			hsp_id = &(un32->un_hsp_id);

			/* update each column's minor and hotspare rec id */
			for (i = 0; i < un32->un_totalcolumncnt; i++) {
			    mr_column32_od_t *device;

			    device = &un32->un_column[i];
			    if (!md_update_minor(setno, mddb_getsidenum
				(setno), device->un_orig_key))
				goto out;

			    if (device->un_hs_id != 0)
				device->un_hs_id = MAKERECID(
				setno, device->un_hs_id);
			}
		} else {
			/* 64-bit unit format */
			un64 = (mr_unit_t *)mddb_getrecaddr(recid);
			self_id = &(un64->c.un_self_id);
			parent_id = &(un64->c.un_parent);
			record_id = &(un64->c.un_record_id);
			hsp_id = &(un64->un_hsp_id);

			/* update each column's minor and hotspare rec id */
			for (i = 0; i < un64->un_totalcolumncnt; i++) {
			    mr_column_t	*device;

			    device = &un64->un_column[i];
			    if (!md_update_minor(setno, mddb_getsidenum
				(setno), device->un_orig_key))
				goto out;

			    if (device->un_hs_id != 0)
				device->un_hs_id = MAKERECID(
				setno, device->un_hs_id);
			}
		}

		/*
		 * Update unit with the imported setno
		 */
		mddb_setrecprivate(recid, MD_PRV_GOTIT);

		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));

		/* -1 means no hotspare pool is associated */
		if (*hsp_id != -1)
			*hsp_id = MAKERECID(setno, DBID(*hsp_id));

		if (*parent_id != MD_NO_PARENT)
			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
		*record_id = MAKERECID(setno, DBID(*record_id));
		gotsomething = 1;
	}

out:
	return (gotsomething);
}
4339*0Sstevel@tonic-gate 
/*
 * Named services exported by the RAID driver; each entry pairs a
 * service routine with the name it is looked up by.  The list is
 * terminated by a NULL entry.
 */
static md_named_services_t raid_named_services[] = {
	{raid_hotspares,			"poke hotspares"	},
	{raid_rename_check,			MDRNM_CHECK		},
	{raid_rename_lock,			MDRNM_LOCK		},
	{(intptr_t (*)()) raid_rename_unlock,	MDRNM_UNLOCK		},
	{(intptr_t (*)()) raid_probe_dev,	"probe open test"	},
	{NULL,					0			}
};
4348*0Sstevel@tonic-gate 
/*
 * Operations vector registered with the md driver framework for RAID
 * metadevices.  Unsupported entry points are NULL.
 */
md_ops_t raid_md_ops = {
	raid_open,		/* open */
	raid_close,		/* close */
	md_raid_strategy,	/* strategy */
	NULL,			/* print */
	NULL,			/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_raid_ioctl,		/* ioctl, */
	raid_snarf,		/* raid_snarf */
	raid_halt,		/* raid_halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	raid_imp_set,		/* import set */
	raid_named_services
};
4365*0Sstevel@tonic-gate 
4366*0Sstevel@tonic-gate static void
4367*0Sstevel@tonic-gate init_init()
4368*0Sstevel@tonic-gate {
4369*0Sstevel@tonic-gate 	/* default to a second */
4370*0Sstevel@tonic-gate 	if (md_wr_wait == 0)
4371*0Sstevel@tonic-gate 		md_wr_wait = md_hz >> 1;
4372*0Sstevel@tonic-gate 
4373*0Sstevel@tonic-gate 	raid_parent_cache = kmem_cache_create("md_raid_parent",
4374*0Sstevel@tonic-gate 	    sizeof (md_raidps_t), 0, raid_parent_constructor,
4375*0Sstevel@tonic-gate 	    raid_parent_destructor, raid_run_queue, NULL, NULL, 0);
4376*0Sstevel@tonic-gate 	raid_child_cache = kmem_cache_create("md_raid_child",
4377*0Sstevel@tonic-gate 	    sizeof (md_raidcs_t) - sizeof (buf_t) + biosize(), 0,
4378*0Sstevel@tonic-gate 	    raid_child_constructor, raid_child_destructor,
4379*0Sstevel@tonic-gate 	    raid_run_queue, NULL, NULL, 0);
4380*0Sstevel@tonic-gate 	raid_cbuf_cache = kmem_cache_create("md_raid_cbufs",
4381*0Sstevel@tonic-gate 	    sizeof (md_raidcbuf_t), 0, raid_cbuf_constructor,
4382*0Sstevel@tonic-gate 	    raid_cbuf_destructor, raid_run_queue, NULL, NULL, 0);
4383*0Sstevel@tonic-gate }
4384*0Sstevel@tonic-gate 
4385*0Sstevel@tonic-gate static void
4386*0Sstevel@tonic-gate fini_uninit()
4387*0Sstevel@tonic-gate {
4388*0Sstevel@tonic-gate 	kmem_cache_destroy(raid_parent_cache);
4389*0Sstevel@tonic-gate 	kmem_cache_destroy(raid_child_cache);
4390*0Sstevel@tonic-gate 	kmem_cache_destroy(raid_cbuf_cache);
4391*0Sstevel@tonic-gate 	raid_parent_cache = raid_child_cache = raid_cbuf_cache = NULL;
4392*0Sstevel@tonic-gate }
4393*0Sstevel@tonic-gate 
/*
 * Define the module linkage: register init_init()/fini_uninit() as the
 * load/unload hooks for this md misc module.
 */
MD_PLUGIN_MISC_MODULE("raid module %I%", init_init(), fini_uninit())
4396