xref: /netbsd-src/sys/dev/raidframe/rf_netbsdkintf.c (revision d909946ca08dceb44d7d0f22ec9488679695d976)
1 /*	$NetBSD: rf_netbsdkintf.c,v 1.345 2016/04/27 02:47:39 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Greg Oster; Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1988 University of Utah.
34  * Copyright (c) 1990, 1993
35  *      The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed to Berkeley by
38  * the Systems Programming Group of the University of Utah Computer
39  * Science Department.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
66  *
67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
68  */
69 
70 /*
71  * Copyright (c) 1995 Carnegie-Mellon University.
72  * All rights reserved.
73  *
74  * Authors: Mark Holland, Jim Zelenka
75  *
76  * Permission to use, copy, modify and distribute this software and
77  * its documentation is hereby granted, provided that both the copyright
78  * notice and this permission notice appear in all copies of the
79  * software, derivative works or modified versions, and any portions
80  * thereof, and that both notices appear in supporting documentation.
81  *
82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85  *
86  * Carnegie Mellon requests users of this software to return to
87  *
88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
89  *  School of Computer Science
90  *  Carnegie Mellon University
91  *  Pittsburgh PA 15213-3890
92  *
93  * any improvements or extensions that they make and grant Carnegie the
94  * rights to redistribute these changes.
95  */
96 
97 /***********************************************************
98  *
99  * rf_kintf.c -- the kernel interface routines for RAIDframe
100  *
101  ***********************************************************/
102 
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.345 2016/04/27 02:47:39 christos Exp $");
105 
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #endif
110 
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130 
131 #include <prop/proplib.h>
132 
133 #include <dev/raidframe/raidframevar.h>
134 #include <dev/raidframe/raidframeio.h>
135 #include <dev/raidframe/rf_paritymap.h>
136 
137 #include "rf_raid.h"
138 #include "rf_copyback.h"
139 #include "rf_dag.h"
140 #include "rf_dagflags.h"
141 #include "rf_desc.h"
142 #include "rf_diskqueue.h"
143 #include "rf_etimer.h"
144 #include "rf_general.h"
145 #include "rf_kintf.h"
146 #include "rf_options.h"
147 #include "rf_driver.h"
148 #include "rf_parityscan.h"
149 #include "rf_threadstuff.h"
150 
151 #ifdef COMPAT_50
152 #include "rf_compat50.h"
153 #endif
154 
155 #include "ioconf.h"
156 
/*
 * db1_printf((fmt, args...)) -- level-1 debug output.  The argument is a
 * complete, parenthesized printf() argument list.  With DEBUG defined the
 * message is printed only while rf_kdebug_level is positive; without
 * DEBUG the macro expands to an empty statement.
 *
 * Both variants are wrapped in do { } while (0) so that an invocation
 * followed by a semicolon is exactly one statement: it can be used as
 * the unbraced body of an if that has an else, and it cannot capture a
 * following else.
 */
#ifdef DEBUG
int     rf_kdebug_level = 0;
#define db1_printf(a) do { if (rf_kdebug_level > 0) printf a; } while (0)
#else				/* DEBUG */
#define db1_printf(a) do { } while (0)
#endif				/* DEBUG */

/*
 * DPRINTF(fmt, ...) -- forwards to printf() when DEBUG_ROOT is defined and
 * expands to nothing otherwise (so its arguments are not evaluated then).
 * At least one argument after the format string is required.
 */
#ifdef DEBUG_ROOT
#define DPRINTF(a, ...) printf(a, __VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif
169 
170 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
171 static rf_declare_mutex2(rf_sparet_wait_mutex);
172 static rf_declare_cond2(rf_sparet_wait_cv);
173 static rf_declare_cond2(rf_sparet_resp_cv);
174 
175 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
176 						 * spare table */
177 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
178 						 * installation process */
179 #endif
180 
181 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
182 
183 /* prototypes */
184 static void KernelWakeupFunc(struct buf *);
185 static void InitBP(struct buf *, struct vnode *, unsigned,
186     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
187     void *, int, struct proc *);
188 struct raid_softc;
189 static void raidinit(struct raid_softc *);
190 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
191 
192 static int raid_match(device_t, cfdata_t, void *);
193 static void raid_attach(device_t, device_t, void *);
194 static int raid_detach(device_t, int);
195 
196 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
197     daddr_t, daddr_t);
198 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
199     daddr_t, daddr_t, int);
200 
201 static int raidwrite_component_label(unsigned,
202     dev_t, struct vnode *, RF_ComponentLabel_t *);
203 static int raidread_component_label(unsigned,
204     dev_t, struct vnode *, RF_ComponentLabel_t *);
205 
206 static int raid_diskstart(device_t, struct buf *bp);
207 static int raid_dumpblocks(device_t, void *, daddr_t, int);
208 static int raid_lastclose(device_t);
209 
210 static dev_type_open(raidopen);
211 static dev_type_close(raidclose);
212 static dev_type_read(raidread);
213 static dev_type_write(raidwrite);
214 static dev_type_ioctl(raidioctl);
215 static dev_type_strategy(raidstrategy);
216 static dev_type_dump(raiddump);
217 static dev_type_size(raidsize);
218 
/*
 * Block-device switch for the raid driver.  Every entry point is one of
 * the raid* functions declared above, except discard, which is wired to
 * nodiscard.
 */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
229 
/*
 * Character-device switch for the raid driver.  Open, close and ioctl are
 * shared with the block device; read and write go through raidread() and
 * raidwrite().  The remaining slots use the no* stubs.
 */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
244 
/*
 * Disk-driver callback table for the raid driver: the device entry points
 * plus the raid_diskstart(), raid_dumpblocks() and raid_lastclose() hooks
 * defined in this file.
 */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
254 
/*
 * Per-unit driver state.  Instances are allocated by raidcreate(), kept
 * on the file-scope `raids' list through sc_link, and looked up by unit
 * number in raidget().
 */
struct raid_softc {
	struct dk_softc sc_dksc;	/* state handed to the dk_*() helpers */
	int	sc_unit;	/* unit number, set by raidcreate() */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;		/* RAIDframe state for this unit */
	LIST_ENTRY(raid_softc) sc_link;	/* linkage on the raids list */
};
/* sc_flags */
#define RAIDF_INITED		0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
#define RAIDF_DETACH  		0x04	/* detach after final close */
#define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED		0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */

/* raidunit(x): unit number encoded in dev_t x (DISKUNIT). */
#define	raidunit(x)	DISKUNIT(x)
/*
 * raidsoftc(dev): for a device_t, the softc pointer stored in the
 * sc_r.softc member of the device's private data.
 */
#define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
277 
278 extern struct cfdriver raid_cd;
279 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
280     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
281     DVF_DETACH_SHUTDOWN);
282 
283 /*
284  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
285  * Be aware that large numbers can allow the driver to consume a lot of
286  * kernel memory, especially on writes, and in degraded mode reads.
287  *
288  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
289  * a single 64K write will typically require 64K for the old data,
290  * 64K for the old parity, and 64K for the new parity, for a total
291  * of 192K (if the parity buffer is not re-used immediately).
292  * Even if it is used immediately, that's still 128K, which when multiplied
293  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
294  *
295  * Now in degraded mode, for example, a 64K read on the above setup may
296  * require data reconstruction, which will require *all* of the 4 remaining
297  * disks to participate -- 4 * 32K/disk == 128K again.
298  */
299 
300 #ifndef RAIDOUTSTANDING
301 #define RAIDOUTSTANDING   6
302 #endif
303 
304 #define RAIDLABELDEV(dev)	\
305 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
306 
307 /* declared here, and made public, for the benefit of KVM stuff.. */
308 
309 static int raidlock(struct raid_softc *);
310 static void raidunlock(struct raid_softc *);
311 
312 static int raid_detach_unlocked(struct raid_softc *);
313 
314 static void rf_markalldirty(RF_Raid_t *);
315 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
316 
317 void rf_ReconThread(struct rf_recon_req *);
318 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
319 void rf_CopybackThread(RF_Raid_t *raidPtr);
320 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
321 int rf_autoconfig(device_t);
322 void rf_buildroothack(RF_ConfigSet_t *);
323 
324 RF_AutoConfig_t *rf_find_raid_components(void);
325 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
326 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
327 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
328 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
329 int rf_set_autoconfig(RF_Raid_t *, int);
330 int rf_set_rootpartition(RF_Raid_t *, int);
331 void rf_release_all_vps(RF_ConfigSet_t *);
332 void rf_cleanup_config_set(RF_ConfigSet_t *);
333 int rf_have_enough_components(RF_ConfigSet_t *);
334 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
335 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
336 
337 /*
338  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
339  * Note that this is overridden by having RAID_AUTOCONFIG as an option
340  * in the kernel config file.
341  */
342 #ifdef RAID_AUTOCONFIG
343 int raidautoconfig = 1;
344 #else
345 int raidautoconfig = 0;
346 #endif
347 static bool raidautoconfigdone = false;
348 
349 struct RF_Pools_s rf_pools;
350 
351 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
352 static kmutex_t raid_lock;
353 
354 static struct raid_softc *
355 raidcreate(int unit) {
356 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
357 	if (sc == NULL) {
358 #ifdef DIAGNOSTIC
359 		printf("%s: out of memory\n", __func__);
360 #endif
361 		return NULL;
362 	}
363 	sc->sc_unit = unit;
364 	cv_init(&sc->sc_cv, "raidunit");
365 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
366 	return sc;
367 }
368 
369 static void
370 raiddestroy(struct raid_softc *sc) {
371 	cv_destroy(&sc->sc_cv);
372 	mutex_destroy(&sc->sc_mutex);
373 	kmem_free(sc, sizeof(*sc));
374 }
375 
376 static struct raid_softc *
377 raidget(int unit, bool create) {
378 	struct raid_softc *sc;
379 	if (unit < 0) {
380 #ifdef DIAGNOSTIC
381 		panic("%s: unit %d!", __func__, unit);
382 #endif
383 		return NULL;
384 	}
385 	mutex_enter(&raid_lock);
386 	LIST_FOREACH(sc, &raids, sc_link) {
387 		if (sc->sc_unit == unit) {
388 			mutex_exit(&raid_lock);
389 			return sc;
390 		}
391 	}
392 	mutex_exit(&raid_lock);
393 	if (!create)
394 		return NULL;
395 	if ((sc = raidcreate(unit)) == NULL)
396 		return NULL;
397 	mutex_enter(&raid_lock);
398 	LIST_INSERT_HEAD(&raids, sc, sc_link);
399 	mutex_exit(&raid_lock);
400 	return sc;
401 }
402 
403 static void
404 raidput(struct raid_softc *sc) {
405 	mutex_enter(&raid_lock);
406 	LIST_REMOVE(sc, sc_link);
407 	mutex_exit(&raid_lock);
408 	raiddestroy(sc);
409 }
410 
/*
 * Pseudo-device attach hook.  Deliberately empty: device attachment and
 * the associated initialization now happen as part of the module
 * initialization, so `num' is ignored.
 */
void
raidattach(int num)
{
	(void)num;
}
420 
421 int
422 rf_autoconfig(device_t self)
423 {
424 	RF_AutoConfig_t *ac_list;
425 	RF_ConfigSet_t *config_sets;
426 
427 	if (!raidautoconfig || raidautoconfigdone == true)
428 		return (0);
429 
430 	/* XXX This code can only be run once. */
431 	raidautoconfigdone = true;
432 
433 #ifdef __HAVE_CPU_BOOTCONF
434 	/*
435 	 * 0. find the boot device if needed first so we can use it later
436 	 * this needs to be done before we autoconfigure any raid sets,
437 	 * because if we use wedges we are not going to be able to open
438 	 * the boot device later
439 	 */
440 	if (booted_device == NULL)
441 		cpu_bootconf();
442 #endif
443 	/* 1. locate all RAID components on the system */
444 	aprint_debug("Searching for RAID components...\n");
445 	ac_list = rf_find_raid_components();
446 
447 	/* 2. Sort them into their respective sets. */
448 	config_sets = rf_create_auto_sets(ac_list);
449 
450 	/*
451 	 * 3. Evaluate each set and configure the valid ones.
452 	 * This gets done in rf_buildroothack().
453 	 */
454 	rf_buildroothack(config_sets);
455 
456 	return 1;
457 }
458 
459 static int
460 rf_containsboot(RF_Raid_t *r, device_t bdv) {
461 	const char *bootname = device_xname(bdv);
462 	size_t len = strlen(bootname);
463 
464 	for (int col = 0; col < r->numCol; col++) {
465 		const char *devname = r->Disks[col].devname;
466 		devname += sizeof("/dev/") - 1;
467 		if (strncmp(devname, "dk", 2) == 0) {
468 			const char *parent =
469 			    dkwedge_get_parent_name(r->Disks[col].dev);
470 			if (parent != NULL)
471 				devname = parent;
472 		}
473 		if (strncmp(devname, bootname, len) == 0) {
474 			struct raid_softc *sc = r->softc;
475 			aprint_debug("raid%d includes boot device %s\n",
476 			    sc->sc_unit, devname);
477 			return 1;
478 		}
479 	}
480 	return 0;
481 }
482 
483 void
484 rf_buildroothack(RF_ConfigSet_t *config_sets)
485 {
486 	RF_ConfigSet_t *cset;
487 	RF_ConfigSet_t *next_cset;
488 	int num_root;
489 	struct raid_softc *sc, *rsc;
490 	struct dk_softc *dksc;
491 
492 	sc = rsc = NULL;
493 	num_root = 0;
494 	cset = config_sets;
495 	while (cset != NULL) {
496 		next_cset = cset->next;
497 		if (rf_have_enough_components(cset) &&
498 		    cset->ac->clabel->autoconfigure == 1) {
499 			sc = rf_auto_config_set(cset);
500 			if (sc != NULL) {
501 				aprint_debug("raid%d: configured ok\n",
502 				    sc->sc_unit);
503 				if (cset->rootable) {
504 					rsc = sc;
505 					num_root++;
506 				}
507 			} else {
508 				/* The autoconfig didn't work :( */
509 				aprint_debug("Autoconfig failed\n");
510 				rf_release_all_vps(cset);
511 			}
512 		} else {
513 			/* we're not autoconfiguring this set...
514 			   release the associated resources */
515 			rf_release_all_vps(cset);
516 		}
517 		/* cleanup */
518 		rf_cleanup_config_set(cset);
519 		cset = next_cset;
520 	}
521 	dksc = &rsc->sc_dksc;
522 
523 	/* if the user has specified what the root device should be
524 	   then we don't touch booted_device or boothowto... */
525 
526 	if (rootspec != NULL)
527 		return;
528 
529 	/* we found something bootable... */
530 
531 	/*
532 	 * XXX: The following code assumes that the root raid
533 	 * is the first ('a') partition. This is about the best
534 	 * we can do with a BSD disklabel, but we might be able
535 	 * to do better with a GPT label, by setting a specified
536 	 * attribute to indicate the root partition. We can then
537 	 * stash the partition number in the r->root_partition
538 	 * high bits (the bottom 2 bits are already used). For
539 	 * now we just set booted_partition to 0 when we override
540 	 * root.
541 	 */
542 	if (num_root == 1) {
543 		device_t candidate_root;
544 		if (dksc->sc_dkdev.dk_nwedges != 0) {
545 			char cname[sizeof(cset->ac->devname)];
546 			/* XXX: assume partition 'a' first */
547 			snprintf(cname, sizeof(cname), "%s%c",
548 			    device_xname(dksc->sc_dev), 'a');
549 			candidate_root = dkwedge_find_by_wname(cname);
550 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
551 			    cname);
552 			if (candidate_root == NULL) {
553 				/*
554 				 * If that is not found, because we don't use
555 				 * disklabel, return the first dk child
556 				 * XXX: we can skip the 'a' check above
557 				 * and always do this...
558 				 */
559 				size_t i = 0;
560 				candidate_root = dkwedge_find_by_parent(
561 				    device_xname(dksc->sc_dev), &i);
562 			}
563 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
564 			    candidate_root);
565 		} else
566 			candidate_root = dksc->sc_dev;
567 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
568 		DPRINTF("%s: booted_device=%p root_partition=%d "
569 		   "contains_boot=%d\n", __func__, booted_device,
570 		   rsc->sc_r.root_partition,
571 		   rf_containsboot(&rsc->sc_r, booted_device));
572 		if (booted_device == NULL ||
573 		    rsc->sc_r.root_partition == 1 ||
574 		    rf_containsboot(&rsc->sc_r, booted_device)) {
575 			booted_device = candidate_root;
576 			booted_partition = 0;	/* XXX assume 'a' */
577 		}
578 	} else if (num_root > 1) {
579 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
580 		    booted_device);
581 
582 		/*
583 		 * Maybe the MD code can help. If it cannot, then
584 		 * setroot() will discover that we have no
585 		 * booted_device and will ask the user if nothing was
586 		 * hardwired in the kernel config file
587 		 */
588 		if (booted_device == NULL)
589 			return;
590 
591 		num_root = 0;
592 		mutex_enter(&raid_lock);
593 		LIST_FOREACH(sc, &raids, sc_link) {
594 			RF_Raid_t *r = &sc->sc_r;
595 			if (r->valid == 0)
596 				continue;
597 
598 			if (r->root_partition == 0)
599 				continue;
600 
601 			if (rf_containsboot(r, booted_device)) {
602 				num_root++;
603 				rsc = sc;
604 				dksc = &rsc->sc_dksc;
605 			}
606 		}
607 		mutex_exit(&raid_lock);
608 
609 		if (num_root == 1) {
610 			booted_device = dksc->sc_dev;
611 			booted_partition = 0;	/* XXX assume 'a' */
612 		} else {
613 			/* we can't guess.. require the user to answer... */
614 			boothowto |= RB_ASKNAME;
615 		}
616 	}
617 }
618 
619 static int
620 raidsize(dev_t dev)
621 {
622 	struct raid_softc *rs;
623 	struct dk_softc *dksc;
624 	unsigned int unit;
625 
626 	unit = raidunit(dev);
627 	if ((rs = raidget(unit, false)) == NULL)
628 		return -1;
629 	dksc = &rs->sc_dksc;
630 
631 	if ((rs->sc_flags & RAIDF_INITED) == 0)
632 		return -1;
633 
634 	return dk_size(dksc, dev);
635 }
636 
637 static int
638 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
639 {
640 	unsigned int unit;
641 	struct raid_softc *rs;
642 	struct dk_softc *dksc;
643 
644 	unit = raidunit(dev);
645 	if ((rs = raidget(unit, false)) == NULL)
646 		return ENXIO;
647 	dksc = &rs->sc_dksc;
648 
649 	if ((rs->sc_flags & RAIDF_INITED) == 0)
650 		return ENODEV;
651 
652         /*
653            Note that blkno is relative to this particular partition.
654            By adding adding RF_PROTECTED_SECTORS, we get a value that
655 	   is relative to the partition used for the underlying component.
656         */
657 	blkno += RF_PROTECTED_SECTORS;
658 
659 	return dk_dump(dksc, dev, blkno, va, size);
660 }
661 
662 static int
663 raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
664 {
665 	struct raid_softc *rs = raidsoftc(dev);
666 	const struct bdevsw *bdev;
667 	RF_Raid_t *raidPtr;
668 	int     c, sparecol, j, scol, dumpto;
669 	int     error = 0;
670 
671 	raidPtr = &rs->sc_r;
672 
673 	/* we only support dumping to RAID 1 sets */
674 	if (raidPtr->Layout.numDataCol != 1 ||
675 	    raidPtr->Layout.numParityCol != 1)
676 		return EINVAL;
677 
678 	if ((error = raidlock(rs)) != 0)
679 		return error;
680 
681 	/* figure out what device is alive.. */
682 
683 	/*
684 	   Look for a component to dump to.  The preference for the
685 	   component to dump to is as follows:
686 	   1) the master
687 	   2) a used_spare of the master
688 	   3) the slave
689 	   4) a used_spare of the slave
690 	*/
691 
692 	dumpto = -1;
693 	for (c = 0; c < raidPtr->numCol; c++) {
694 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
695 			/* this might be the one */
696 			dumpto = c;
697 			break;
698 		}
699 	}
700 
701 	/*
702 	   At this point we have possibly selected a live master or a
703 	   live slave.  We now check to see if there is a spared
704 	   master (or a spared slave), if we didn't find a live master
705 	   or a live slave.
706 	*/
707 
708 	for (c = 0; c < raidPtr->numSpare; c++) {
709 		sparecol = raidPtr->numCol + c;
710 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
711 			/* How about this one? */
712 			scol = -1;
713 			for(j=0;j<raidPtr->numCol;j++) {
714 				if (raidPtr->Disks[j].spareCol == sparecol) {
715 					scol = j;
716 					break;
717 				}
718 			}
719 			if (scol == 0) {
720 				/*
721 				   We must have found a spared master!
722 				   We'll take that over anything else
723 				   found so far.  (We couldn't have
724 				   found a real master before, since
725 				   this is a used spare, and it's
726 				   saying that it's replacing the
727 				   master.)  On reboot (with
728 				   autoconfiguration turned on)
729 				   sparecol will become the 1st
730 				   component (component0) of this set.
731 				*/
732 				dumpto = sparecol;
733 				break;
734 			} else if (scol != -1) {
735 				/*
736 				   Must be a spared slave.  We'll dump
737 				   to that if we havn't found anything
738 				   else so far.
739 				*/
740 				if (dumpto == -1)
741 					dumpto = sparecol;
742 			}
743 		}
744 	}
745 
746 	if (dumpto == -1) {
747 		/* we couldn't find any live components to dump to!?!?
748 		 */
749 		error = EINVAL;
750 		goto out;
751 	}
752 
753 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
754 	if (bdev == NULL) {
755 		error = ENXIO;
756 		goto out;
757 	}
758 
759 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
760 				blkno, va, nblk * raidPtr->bytesPerSector);
761 
762 out:
763 	raidunlock(rs);
764 
765 	return error;
766 }
767 
768 /* ARGSUSED */
769 static int
770 raidopen(dev_t dev, int flags, int fmt,
771     struct lwp *l)
772 {
773 	int     unit = raidunit(dev);
774 	struct raid_softc *rs;
775 	struct dk_softc *dksc;
776 	int     error = 0;
777 	int     part, pmask;
778 
779 	if ((rs = raidget(unit, true)) == NULL)
780 		return ENXIO;
781 	if ((error = raidlock(rs)) != 0)
782 		return (error);
783 
784 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
785 		error = EBUSY;
786 		goto bad;
787 	}
788 
789 	dksc = &rs->sc_dksc;
790 
791 	part = DISKPART(dev);
792 	pmask = (1 << part);
793 
794 	if (!DK_BUSY(dksc, pmask) &&
795 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
796 		/* First one... mark things as dirty... Note that we *MUST*
797 		 have done a configure before this.  I DO NOT WANT TO BE
798 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
799 		 THAT THEY BELONG TOGETHER!!!!! */
800 		/* XXX should check to see if we're only open for reading
801 		   here... If so, we needn't do this, but then need some
802 		   other way of keeping track of what's happened.. */
803 
804 		rf_markalldirty(&rs->sc_r);
805 	}
806 
807 	if ((rs->sc_flags & RAIDF_INITED) != 0)
808 		error = dk_open(dksc, dev, flags, fmt, l);
809 
810 bad:
811 	raidunlock(rs);
812 
813 	return (error);
814 
815 
816 }
817 
818 static int
819 raid_lastclose(device_t self)
820 {
821 	struct raid_softc *rs = raidsoftc(self);
822 
823 	/* Last one... device is not unconfigured yet.
824 	   Device shutdown has taken care of setting the
825 	   clean bits if RAIDF_INITED is not set
826 	   mark things as clean... */
827 
828 	rf_update_component_labels(&rs->sc_r,
829 	    RF_FINAL_COMPONENT_UPDATE);
830 
831 	/* pass to unlocked code */
832 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
833 		rs->sc_flags |= RAIDF_DETACH;
834 
835 	return 0;
836 }
837 
838 /* ARGSUSED */
839 static int
840 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
841 {
842 	int     unit = raidunit(dev);
843 	struct raid_softc *rs;
844 	struct dk_softc *dksc;
845 	cfdata_t cf;
846 	int     error = 0, do_detach = 0, do_put = 0;
847 
848 	if ((rs = raidget(unit, false)) == NULL)
849 		return ENXIO;
850 	dksc = &rs->sc_dksc;
851 
852 	if ((error = raidlock(rs)) != 0)
853 		return (error);
854 
855 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
856 		error = dk_close(dksc, dev, flags, fmt, l);
857 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
858 			do_detach = 1;
859 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
860 		do_put = 1;
861 
862 	raidunlock(rs);
863 
864 	if (do_detach) {
865 		/* free the pseudo device attach bits */
866 		cf = device_cfdata(dksc->sc_dev);
867 		error = config_detach(dksc->sc_dev, 0);
868 		if (error == 0)
869 			free(cf, M_RAIDFRAME);
870 	} else if (do_put) {
871 		raidput(rs);
872 	}
873 
874 	return (error);
875 
876 }
877 
878 static void
879 raid_wakeup(RF_Raid_t *raidPtr)
880 {
881 	rf_lock_mutex2(raidPtr->iodone_lock);
882 	rf_signal_cond2(raidPtr->iodone_cv);
883 	rf_unlock_mutex2(raidPtr->iodone_lock);
884 }
885 
886 static void
887 raidstrategy(struct buf *bp)
888 {
889 	unsigned int unit;
890 	struct raid_softc *rs;
891 	struct dk_softc *dksc;
892 	RF_Raid_t *raidPtr;
893 
894 	unit = raidunit(bp->b_dev);
895 	if ((rs = raidget(unit, false)) == NULL) {
896 		bp->b_error = ENXIO;
897 		goto fail;
898 	}
899 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
900 		bp->b_error = ENXIO;
901 		goto fail;
902 	}
903 	dksc = &rs->sc_dksc;
904 	raidPtr = &rs->sc_r;
905 
906 	/* Queue IO only */
907 	if (dk_strategy_defer(dksc, bp))
908 		goto done;
909 
910 	/* schedule the IO to happen at the next convenient time */
911 	raid_wakeup(raidPtr);
912 
913 done:
914 	return;
915 
916 fail:
917 	bp->b_resid = bp->b_bcount;
918 	biodone(bp);
919 }
920 
921 static int
922 raid_diskstart(device_t dev, struct buf *bp)
923 {
924 	struct raid_softc *rs = raidsoftc(dev);
925 	RF_Raid_t *raidPtr;
926 
927 	raidPtr = &rs->sc_r;
928 	if (!raidPtr->valid) {
929 		db1_printf(("raid is not valid..\n"));
930 		return ENODEV;
931 	}
932 
933 	/* XXX */
934 	bp->b_resid = 0;
935 
936 	return raiddoaccess(raidPtr, bp);
937 }
938 
939 void
940 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
941 {
942 	struct raid_softc *rs;
943 	struct dk_softc *dksc;
944 
945 	rs = raidPtr->softc;
946 	dksc = &rs->sc_dksc;
947 
948 	dk_done(dksc, bp);
949 
950 	rf_lock_mutex2(raidPtr->mutex);
951 	raidPtr->openings++;
952 	rf_unlock_mutex2(raidPtr->mutex);
953 
954 	/* schedule more IO */
955 	raid_wakeup(raidPtr);
956 }
957 
958 /* ARGSUSED */
959 static int
960 raidread(dev_t dev, struct uio *uio, int flags)
961 {
962 	int     unit = raidunit(dev);
963 	struct raid_softc *rs;
964 
965 	if ((rs = raidget(unit, false)) == NULL)
966 		return ENXIO;
967 
968 	if ((rs->sc_flags & RAIDF_INITED) == 0)
969 		return (ENXIO);
970 
971 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
972 
973 }
974 
975 /* ARGSUSED */
976 static int
977 raidwrite(dev_t dev, struct uio *uio, int flags)
978 {
979 	int     unit = raidunit(dev);
980 	struct raid_softc *rs;
981 
982 	if ((rs = raidget(unit, false)) == NULL)
983 		return ENXIO;
984 
985 	if ((rs->sc_flags & RAIDF_INITED) == 0)
986 		return (ENXIO);
987 
988 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
989 
990 }
991 
992 static int
993 raid_detach_unlocked(struct raid_softc *rs)
994 {
995 	struct dk_softc *dksc = &rs->sc_dksc;
996 	RF_Raid_t *raidPtr;
997 	int error;
998 
999 	raidPtr = &rs->sc_r;
1000 
1001 	if (DK_BUSY(dksc, 0) ||
1002 	    raidPtr->recon_in_progress != 0 ||
1003 	    raidPtr->parity_rewrite_in_progress != 0 ||
1004 	    raidPtr->copyback_in_progress != 0)
1005 		return EBUSY;
1006 
1007 	if ((rs->sc_flags & RAIDF_INITED) == 0)
1008 		return 0;
1009 
1010 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
1011 
1012 	if ((error = rf_Shutdown(raidPtr)) != 0)
1013 		return error;
1014 
1015 	rs->sc_flags &= ~RAIDF_INITED;
1016 
1017 	/* Kill off any queued buffers */
1018 	dk_drain(dksc);
1019 	bufq_free(dksc->sc_bufq);
1020 
1021 	/* Detach the disk. */
1022 	dkwedge_delall(&dksc->sc_dkdev);
1023 	disk_detach(&dksc->sc_dkdev);
1024 	disk_destroy(&dksc->sc_dkdev);
1025 	dk_detach(dksc);
1026 
1027 	return 0;
1028 }
1029 
1030 static int
1031 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1032 {
1033 	int     unit = raidunit(dev);
1034 	int     error = 0;
1035 	int     part, pmask;
1036 	struct raid_softc *rs;
1037 	struct dk_softc *dksc;
1038 	RF_Config_t *k_cfg, *u_cfg;
1039 	RF_Raid_t *raidPtr;
1040 	RF_RaidDisk_t *diskPtr;
1041 	RF_AccTotals_t *totals;
1042 	RF_DeviceConfig_t *d_cfg, **ucfgp;
1043 	u_char *specific_buf;
1044 	int retcode = 0;
1045 	int column;
1046 /*	int raidid; */
1047 	struct rf_recon_req *rrcopy, *rr;
1048 	RF_ComponentLabel_t *clabel;
1049 	RF_ComponentLabel_t *ci_label;
1050 	RF_ComponentLabel_t **clabel_ptr;
1051 	RF_SingleComponent_t *sparePtr,*componentPtr;
1052 	RF_SingleComponent_t component;
1053 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
1054 	int i, j, d;
1055 
1056 	if ((rs = raidget(unit, false)) == NULL)
1057 		return ENXIO;
1058 	dksc = &rs->sc_dksc;
1059 	raidPtr = &rs->sc_r;
1060 
1061 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1062 		(int) DISKPART(dev), (int) unit, cmd));
1063 
1064 	/* Must be initialized for these... */
1065 	switch (cmd) {
1066 	case RAIDFRAME_REWRITEPARITY:
1067 	case RAIDFRAME_GET_INFO:
1068 	case RAIDFRAME_RESET_ACCTOTALS:
1069 	case RAIDFRAME_GET_ACCTOTALS:
1070 	case RAIDFRAME_KEEP_ACCTOTALS:
1071 	case RAIDFRAME_GET_SIZE:
1072 	case RAIDFRAME_FAIL_DISK:
1073 	case RAIDFRAME_COPYBACK:
1074 	case RAIDFRAME_CHECK_RECON_STATUS:
1075 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1076 	case RAIDFRAME_GET_COMPONENT_LABEL:
1077 	case RAIDFRAME_SET_COMPONENT_LABEL:
1078 	case RAIDFRAME_ADD_HOT_SPARE:
1079 	case RAIDFRAME_REMOVE_HOT_SPARE:
1080 	case RAIDFRAME_INIT_LABELS:
1081 	case RAIDFRAME_REBUILD_IN_PLACE:
1082 	case RAIDFRAME_CHECK_PARITY:
1083 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1084 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1085 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
1086 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1087 	case RAIDFRAME_SET_AUTOCONFIG:
1088 	case RAIDFRAME_SET_ROOT:
1089 	case RAIDFRAME_DELETE_COMPONENT:
1090 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
1091 	case RAIDFRAME_PARITYMAP_STATUS:
1092 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
1093 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
1094 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
1095 		if ((rs->sc_flags & RAIDF_INITED) == 0)
1096 			return (ENXIO);
1097 	}
1098 
1099 	switch (cmd) {
1100 #ifdef COMPAT_50
1101 	case RAIDFRAME_GET_INFO50:
1102 		return rf_get_info50(raidPtr, data);
1103 
1104 	case RAIDFRAME_CONFIGURE50:
1105 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1106 			return retcode;
1107 		goto config;
1108 #endif
1109 		/* configure the system */
1110 	case RAIDFRAME_CONFIGURE:
1111 
1112 		if (raidPtr->valid) {
1113 			/* There is a valid RAID set running on this unit! */
1114 			printf("raid%d: Device already configured!\n",unit);
1115 			return(EINVAL);
1116 		}
1117 
1118 		/* copy-in the configuration information */
1119 		/* data points to a pointer to the configuration structure */
1120 
1121 		u_cfg = *((RF_Config_t **) data);
1122 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1123 		if (k_cfg == NULL) {
1124 			return (ENOMEM);
1125 		}
1126 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1127 		if (retcode) {
1128 			RF_Free(k_cfg, sizeof(RF_Config_t));
1129 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1130 				retcode));
1131 			goto no_config;
1132 		}
1133 		goto config;
1134 	config:
1135 		rs->sc_flags &= ~RAIDF_SHUTDOWN;
1136 
1137 		/* allocate a buffer for the layout-specific data, and copy it
1138 		 * in */
1139 		if (k_cfg->layoutSpecificSize) {
1140 			if (k_cfg->layoutSpecificSize > 10000) {
1141 				/* sanity check */
1142 				RF_Free(k_cfg, sizeof(RF_Config_t));
1143 				retcode = EINVAL;
1144 				goto no_config;
1145 			}
1146 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1147 			    (u_char *));
1148 			if (specific_buf == NULL) {
1149 				RF_Free(k_cfg, sizeof(RF_Config_t));
1150 				retcode = ENOMEM;
1151 				goto no_config;
1152 			}
1153 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1154 			    k_cfg->layoutSpecificSize);
1155 			if (retcode) {
1156 				RF_Free(k_cfg, sizeof(RF_Config_t));
1157 				RF_Free(specific_buf,
1158 					k_cfg->layoutSpecificSize);
1159 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1160 					retcode));
1161 				goto no_config;
1162 			}
1163 		} else
1164 			specific_buf = NULL;
1165 		k_cfg->layoutSpecific = specific_buf;
1166 
1167 		/* should do some kind of sanity check on the configuration.
1168 		 * Store the sum of all the bytes in the last byte? */
1169 
1170 		/* configure the system */
1171 
1172 		/*
1173 		 * Clear the entire RAID descriptor, just to make sure
1174 		 *  there is no stale data left in the case of a
1175 		 *  reconfiguration
1176 		 */
1177 		memset(raidPtr, 0, sizeof(*raidPtr));
1178 		raidPtr->softc = rs;
1179 		raidPtr->raidid = unit;
1180 
1181 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
1182 
1183 		if (retcode == 0) {
1184 
1185 			/* allow this many simultaneous IO's to
1186 			   this RAID device */
1187 			raidPtr->openings = RAIDOUTSTANDING;
1188 
1189 			raidinit(rs);
1190 			raid_wakeup(raidPtr);
1191 			rf_markalldirty(raidPtr);
1192 		}
1193 		/* free the buffers.  No return code here. */
1194 		if (k_cfg->layoutSpecificSize) {
1195 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1196 		}
1197 		RF_Free(k_cfg, sizeof(RF_Config_t));
1198 
1199 	no_config:
1200 		/*
1201 		 * If configuration failed, set sc_flags so that we
1202 		 * will detach the device when we close it.
1203 		 */
1204 		if (retcode != 0)
1205 			rs->sc_flags |= RAIDF_SHUTDOWN;
1206 		return (retcode);
1207 
1208 		/* shutdown the system */
1209 	case RAIDFRAME_SHUTDOWN:
1210 
1211 		part = DISKPART(dev);
1212 		pmask = (1 << part);
1213 
1214 		if ((error = raidlock(rs)) != 0)
1215 			return (error);
1216 
1217 		if (DK_BUSY(dksc, pmask) ||
1218 		    raidPtr->recon_in_progress != 0 ||
1219 		    raidPtr->parity_rewrite_in_progress != 0 ||
1220 		    raidPtr->copyback_in_progress != 0)
1221 			retcode = EBUSY;
1222 		else {
1223 			/* detach and free on close */
1224 			rs->sc_flags |= RAIDF_SHUTDOWN;
1225 			retcode = 0;
1226 		}
1227 
1228 		raidunlock(rs);
1229 
1230 		return (retcode);
1231 	case RAIDFRAME_GET_COMPONENT_LABEL:
1232 		clabel_ptr = (RF_ComponentLabel_t **) data;
1233 		/* need to read the component label for the disk indicated
1234 		   by row,column in clabel */
1235 
1236 		/*
1237 		 * Perhaps there should be an option to skip the in-core
1238 		 * copy and hit the disk, as with disklabel(8).
1239 		 */
1240 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1241 
1242 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1243 
1244 		if (retcode) {
1245 			RF_Free(clabel, sizeof(*clabel));
1246 			return retcode;
1247 		}
1248 
1249 		clabel->row = 0; /* Don't allow looking at anything else.*/
1250 
1251 		column = clabel->column;
1252 
1253 		if ((column < 0) || (column >= raidPtr->numCol +
1254 		    raidPtr->numSpare)) {
1255 			RF_Free(clabel, sizeof(*clabel));
1256 			return EINVAL;
1257 		}
1258 
1259 		RF_Free(clabel, sizeof(*clabel));
1260 
1261 		clabel = raidget_component_label(raidPtr, column);
1262 
1263 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1264 
1265 #if 0
1266 	case RAIDFRAME_SET_COMPONENT_LABEL:
1267 		clabel = (RF_ComponentLabel_t *) data;
1268 
1269 		/* XXX check the label for valid stuff... */
1270 		/* Note that some things *should not* get modified --
1271 		   the user should be re-initing the labels instead of
1272 		   trying to patch things.
1273 		   */
1274 
1275 		raidid = raidPtr->raidid;
1276 #ifdef DEBUG
1277 		printf("raid%d: Got component label:\n", raidid);
1278 		printf("raid%d: Version: %d\n", raidid, clabel->version);
1279 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1280 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1281 		printf("raid%d: Column: %d\n", raidid, clabel->column);
1282 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1283 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1284 		printf("raid%d: Status: %d\n", raidid, clabel->status);
1285 #endif
1286 		clabel->row = 0;
1287 		column = clabel->column;
1288 
1289 		if ((column < 0) || (column >= raidPtr->numCol)) {
1290 			return(EINVAL);
1291 		}
1292 
1293 		/* XXX this isn't allowed to do anything for now :-) */
1294 
1295 		/* XXX and before it is, we need to fill in the rest
1296 		   of the fields!?!?!?! */
1297 		memcpy(raidget_component_label(raidPtr, column),
1298 		    clabel, sizeof(*clabel));
1299 		raidflush_component_label(raidPtr, column);
1300 		return (0);
1301 #endif
1302 
1303 	case RAIDFRAME_INIT_LABELS:
1304 		clabel = (RF_ComponentLabel_t *) data;
1305 		/*
1306 		   we only want the serial number from
1307 		   the above.  We get all the rest of the information
1308 		   from the config that was used to create this RAID
1309 		   set.
1310 		   */
1311 
1312 		raidPtr->serial_number = clabel->serial_number;
1313 
1314 		for(column=0;column<raidPtr->numCol;column++) {
1315 			diskPtr = &raidPtr->Disks[column];
1316 			if (!RF_DEAD_DISK(diskPtr->status)) {
1317 				ci_label = raidget_component_label(raidPtr,
1318 				    column);
1319 				/* Zeroing this is important. */
1320 				memset(ci_label, 0, sizeof(*ci_label));
1321 				raid_init_component_label(raidPtr, ci_label);
1322 				ci_label->serial_number =
1323 				    raidPtr->serial_number;
1324 				ci_label->row = 0; /* we dont' pretend to support more */
1325 				rf_component_label_set_partitionsize(ci_label,
1326 				    diskPtr->partitionSize);
1327 				ci_label->column = column;
1328 				raidflush_component_label(raidPtr, column);
1329 			}
1330 			/* XXXjld what about the spares? */
1331 		}
1332 
1333 		return (retcode);
1334 	case RAIDFRAME_SET_AUTOCONFIG:
1335 		d = rf_set_autoconfig(raidPtr, *(int *) data);
1336 		printf("raid%d: New autoconfig value is: %d\n",
1337 		       raidPtr->raidid, d);
1338 		*(int *) data = d;
1339 		return (retcode);
1340 
1341 	case RAIDFRAME_SET_ROOT:
1342 		d = rf_set_rootpartition(raidPtr, *(int *) data);
1343 		printf("raid%d: New rootpartition value is: %d\n",
1344 		       raidPtr->raidid, d);
1345 		*(int *) data = d;
1346 		return (retcode);
1347 
1348 		/* initialize all parity */
1349 	case RAIDFRAME_REWRITEPARITY:
1350 
1351 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1352 			/* Parity for RAID 0 is trivially correct */
1353 			raidPtr->parity_good = RF_RAID_CLEAN;
1354 			return(0);
1355 		}
1356 
1357 		if (raidPtr->parity_rewrite_in_progress == 1) {
1358 			/* Re-write is already in progress! */
1359 			return(EINVAL);
1360 		}
1361 
1362 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1363 					   rf_RewriteParityThread,
1364 					   raidPtr,"raid_parity");
1365 		return (retcode);
1366 
1367 
1368 	case RAIDFRAME_ADD_HOT_SPARE:
1369 		sparePtr = (RF_SingleComponent_t *) data;
1370 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1371 		retcode = rf_add_hot_spare(raidPtr, &component);
1372 		return(retcode);
1373 
1374 	case RAIDFRAME_REMOVE_HOT_SPARE:
1375 		return(retcode);
1376 
1377 	case RAIDFRAME_DELETE_COMPONENT:
1378 		componentPtr = (RF_SingleComponent_t *)data;
1379 		memcpy( &component, componentPtr,
1380 			sizeof(RF_SingleComponent_t));
1381 		retcode = rf_delete_component(raidPtr, &component);
1382 		return(retcode);
1383 
1384 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
1385 		componentPtr = (RF_SingleComponent_t *)data;
1386 		memcpy( &component, componentPtr,
1387 			sizeof(RF_SingleComponent_t));
1388 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
1389 		return(retcode);
1390 
1391 	case RAIDFRAME_REBUILD_IN_PLACE:
1392 
1393 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1394 			/* Can't do this on a RAID 0!! */
1395 			return(EINVAL);
1396 		}
1397 
1398 		if (raidPtr->recon_in_progress == 1) {
1399 			/* a reconstruct is already in progress! */
1400 			return(EINVAL);
1401 		}
1402 
1403 		componentPtr = (RF_SingleComponent_t *) data;
1404 		memcpy( &component, componentPtr,
1405 			sizeof(RF_SingleComponent_t));
1406 		component.row = 0; /* we don't support any more */
1407 		column = component.column;
1408 
1409 		if ((column < 0) || (column >= raidPtr->numCol)) {
1410 			return(EINVAL);
1411 		}
1412 
1413 		rf_lock_mutex2(raidPtr->mutex);
1414 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1415 		    (raidPtr->numFailures > 0)) {
1416 			/* XXX 0 above shouldn't be constant!!! */
1417 			/* some component other than this has failed.
1418 			   Let's not make things worse than they already
1419 			   are... */
1420 			printf("raid%d: Unable to reconstruct to disk at:\n",
1421 			       raidPtr->raidid);
1422 			printf("raid%d:     Col: %d   Too many failures.\n",
1423 			       raidPtr->raidid, column);
1424 			rf_unlock_mutex2(raidPtr->mutex);
1425 			return (EINVAL);
1426 		}
1427 		if (raidPtr->Disks[column].status ==
1428 		    rf_ds_reconstructing) {
1429 			printf("raid%d: Unable to reconstruct to disk at:\n",
1430 			       raidPtr->raidid);
1431 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
1432 
1433 			rf_unlock_mutex2(raidPtr->mutex);
1434 			return (EINVAL);
1435 		}
1436 		if (raidPtr->Disks[column].status == rf_ds_spared) {
1437 			rf_unlock_mutex2(raidPtr->mutex);
1438 			return (EINVAL);
1439 		}
1440 		rf_unlock_mutex2(raidPtr->mutex);
1441 
1442 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1443 		if (rrcopy == NULL)
1444 			return(ENOMEM);
1445 
1446 		rrcopy->raidPtr = (void *) raidPtr;
1447 		rrcopy->col = column;
1448 
1449 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1450 					   rf_ReconstructInPlaceThread,
1451 					   rrcopy,"raid_reconip");
1452 		return(retcode);
1453 
1454 	case RAIDFRAME_GET_INFO:
1455 		if (!raidPtr->valid)
1456 			return (ENODEV);
1457 		ucfgp = (RF_DeviceConfig_t **) data;
1458 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1459 			  (RF_DeviceConfig_t *));
1460 		if (d_cfg == NULL)
1461 			return (ENOMEM);
1462 		d_cfg->rows = 1; /* there is only 1 row now */
1463 		d_cfg->cols = raidPtr->numCol;
1464 		d_cfg->ndevs = raidPtr->numCol;
1465 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
1466 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1467 			return (ENOMEM);
1468 		}
1469 		d_cfg->nspares = raidPtr->numSpare;
1470 		if (d_cfg->nspares >= RF_MAX_DISKS) {
1471 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1472 			return (ENOMEM);
1473 		}
1474 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1475 		d = 0;
1476 		for (j = 0; j < d_cfg->cols; j++) {
1477 			d_cfg->devs[d] = raidPtr->Disks[j];
1478 			d++;
1479 		}
1480 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1481 			d_cfg->spares[i] = raidPtr->Disks[j];
1482 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
1483 				/* XXX: raidctl(8) expects to see this as a used spare */
1484 				d_cfg->spares[i].status = rf_ds_used_spare;
1485 			}
1486 		}
1487 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1488 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1489 
1490 		return (retcode);
1491 
1492 	case RAIDFRAME_CHECK_PARITY:
1493 		*(int *) data = raidPtr->parity_good;
1494 		return (0);
1495 
1496 	case RAIDFRAME_PARITYMAP_STATUS:
1497 		if (rf_paritymap_ineligible(raidPtr))
1498 			return EINVAL;
1499 		rf_paritymap_status(raidPtr->parity_map,
1500 		    (struct rf_pmstat *)data);
1501 		return 0;
1502 
1503 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
1504 		if (rf_paritymap_ineligible(raidPtr))
1505 			return EINVAL;
1506 		if (raidPtr->parity_map == NULL)
1507 			return ENOENT; /* ??? */
1508 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1509 			(struct rf_pmparams *)data, 1))
1510 			return EINVAL;
1511 		return 0;
1512 
1513 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
1514 		if (rf_paritymap_ineligible(raidPtr))
1515 			return EINVAL;
1516 		*(int *) data = rf_paritymap_get_disable(raidPtr);
1517 		return 0;
1518 
1519 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
1520 		if (rf_paritymap_ineligible(raidPtr))
1521 			return EINVAL;
1522 		rf_paritymap_set_disable(raidPtr, *(int *)data);
1523 		/* XXX should errors be passed up? */
1524 		return 0;
1525 
1526 	case RAIDFRAME_RESET_ACCTOTALS:
1527 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1528 		return (0);
1529 
1530 	case RAIDFRAME_GET_ACCTOTALS:
1531 		totals = (RF_AccTotals_t *) data;
1532 		*totals = raidPtr->acc_totals;
1533 		return (0);
1534 
1535 	case RAIDFRAME_KEEP_ACCTOTALS:
1536 		raidPtr->keep_acc_totals = *(int *)data;
1537 		return (0);
1538 
1539 	case RAIDFRAME_GET_SIZE:
1540 		*(int *) data = raidPtr->totalSectors;
1541 		return (0);
1542 
1543 		/* fail a disk & optionally start reconstruction */
1544 	case RAIDFRAME_FAIL_DISK:
1545 
1546 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1547 			/* Can't do this on a RAID 0!! */
1548 			return(EINVAL);
1549 		}
1550 
1551 		rr = (struct rf_recon_req *) data;
1552 		rr->row = 0;
1553 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
1554 			return (EINVAL);
1555 
1556 
1557 		rf_lock_mutex2(raidPtr->mutex);
1558 		if (raidPtr->status == rf_rs_reconstructing) {
1559 			/* you can't fail a disk while we're reconstructing! */
1560 			/* XXX wrong for RAID6 */
1561 			rf_unlock_mutex2(raidPtr->mutex);
1562 			return (EINVAL);
1563 		}
1564 		if ((raidPtr->Disks[rr->col].status ==
1565 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1566 			/* some other component has failed.  Let's not make
1567 			   things worse. XXX wrong for RAID6 */
1568 			rf_unlock_mutex2(raidPtr->mutex);
1569 			return (EINVAL);
1570 		}
1571 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1572 			/* Can't fail a spared disk! */
1573 			rf_unlock_mutex2(raidPtr->mutex);
1574 			return (EINVAL);
1575 		}
1576 		rf_unlock_mutex2(raidPtr->mutex);
1577 
1578 		/* make a copy of the recon request so that we don't rely on
1579 		 * the user's buffer */
1580 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1581 		if (rrcopy == NULL)
1582 			return(ENOMEM);
1583 		memcpy(rrcopy, rr, sizeof(*rr));
1584 		rrcopy->raidPtr = (void *) raidPtr;
1585 
1586 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1587 					   rf_ReconThread,
1588 					   rrcopy,"raid_recon");
1589 		return (0);
1590 
1591 		/* invoke a copyback operation after recon on whatever disk
1592 		 * needs it, if any */
1593 	case RAIDFRAME_COPYBACK:
1594 
1595 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1596 			/* This makes no sense on a RAID 0!! */
1597 			return(EINVAL);
1598 		}
1599 
1600 		if (raidPtr->copyback_in_progress == 1) {
1601 			/* Copyback is already in progress! */
1602 			return(EINVAL);
1603 		}
1604 
1605 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1606 					   rf_CopybackThread,
1607 					   raidPtr,"raid_copyback");
1608 		return (retcode);
1609 
1610 		/* return the percentage completion of reconstruction */
1611 	case RAIDFRAME_CHECK_RECON_STATUS:
1612 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1613 			/* This makes no sense on a RAID 0, so tell the
1614 			   user it's done. */
1615 			*(int *) data = 100;
1616 			return(0);
1617 		}
1618 		if (raidPtr->status != rf_rs_reconstructing)
1619 			*(int *) data = 100;
1620 		else {
1621 			if (raidPtr->reconControl->numRUsTotal > 0) {
1622 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1623 			} else {
1624 				*(int *) data = 0;
1625 			}
1626 		}
1627 		return (0);
1628 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1629 		progressInfoPtr = (RF_ProgressInfo_t **) data;
1630 		if (raidPtr->status != rf_rs_reconstructing) {
1631 			progressInfo.remaining = 0;
1632 			progressInfo.completed = 100;
1633 			progressInfo.total = 100;
1634 		} else {
1635 			progressInfo.total =
1636 				raidPtr->reconControl->numRUsTotal;
1637 			progressInfo.completed =
1638 				raidPtr->reconControl->numRUsComplete;
1639 			progressInfo.remaining = progressInfo.total -
1640 				progressInfo.completed;
1641 		}
1642 		retcode = copyout(&progressInfo, *progressInfoPtr,
1643 				  sizeof(RF_ProgressInfo_t));
1644 		return (retcode);
1645 
1646 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1647 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1648 			/* This makes no sense on a RAID 0, so tell the
1649 			   user it's done. */
1650 			*(int *) data = 100;
1651 			return(0);
1652 		}
1653 		if (raidPtr->parity_rewrite_in_progress == 1) {
1654 			*(int *) data = 100 *
1655 				raidPtr->parity_rewrite_stripes_done /
1656 				raidPtr->Layout.numStripe;
1657 		} else {
1658 			*(int *) data = 100;
1659 		}
1660 		return (0);
1661 
1662 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1663 		progressInfoPtr = (RF_ProgressInfo_t **) data;
1664 		if (raidPtr->parity_rewrite_in_progress == 1) {
1665 			progressInfo.total = raidPtr->Layout.numStripe;
1666 			progressInfo.completed =
1667 				raidPtr->parity_rewrite_stripes_done;
1668 			progressInfo.remaining = progressInfo.total -
1669 				progressInfo.completed;
1670 		} else {
1671 			progressInfo.remaining = 0;
1672 			progressInfo.completed = 100;
1673 			progressInfo.total = 100;
1674 		}
1675 		retcode = copyout(&progressInfo, *progressInfoPtr,
1676 				  sizeof(RF_ProgressInfo_t));
1677 		return (retcode);
1678 
1679 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
1680 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1681 			/* This makes no sense on a RAID 0 */
1682 			*(int *) data = 100;
1683 			return(0);
1684 		}
1685 		if (raidPtr->copyback_in_progress == 1) {
1686 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
1687 				raidPtr->Layout.numStripe;
1688 		} else {
1689 			*(int *) data = 100;
1690 		}
1691 		return (0);
1692 
1693 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1694 		progressInfoPtr = (RF_ProgressInfo_t **) data;
1695 		if (raidPtr->copyback_in_progress == 1) {
1696 			progressInfo.total = raidPtr->Layout.numStripe;
1697 			progressInfo.completed =
1698 				raidPtr->copyback_stripes_done;
1699 			progressInfo.remaining = progressInfo.total -
1700 				progressInfo.completed;
1701 		} else {
1702 			progressInfo.remaining = 0;
1703 			progressInfo.completed = 100;
1704 			progressInfo.total = 100;
1705 		}
1706 		retcode = copyout(&progressInfo, *progressInfoPtr,
1707 				  sizeof(RF_ProgressInfo_t));
1708 		return (retcode);
1709 
1710 	case RAIDFRAME_SET_LAST_UNIT:
1711 		for (column = 0; column < raidPtr->numCol; column++)
1712 			if (raidPtr->Disks[column].status != rf_ds_optimal)
1713 				return EBUSY;
1714 
1715 		for (column = 0; column < raidPtr->numCol; column++) {
1716 			clabel = raidget_component_label(raidPtr, column);
1717 			clabel->last_unit = *(int *)data;
1718 			raidflush_component_label(raidPtr, column);
1719 		}
1720 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1721 		return 0;
1722 
1723 		/* the sparetable daemon calls this to wait for the kernel to
1724 		 * need a spare table. this ioctl does not return until a
1725 		 * spare table is needed. XXX -- calling mpsleep here in the
1726 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1727 		 * -- I should either compute the spare table in the kernel,
1728 		 * or have a different -- XXX XXX -- interface (a different
1729 		 * character device) for delivering the table     -- XXX */
1730 #if 0
1731 	case RAIDFRAME_SPARET_WAIT:
1732 		rf_lock_mutex2(rf_sparet_wait_mutex);
1733 		while (!rf_sparet_wait_queue)
1734 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1735 		waitreq = rf_sparet_wait_queue;
1736 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1737 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1738 
1739 		/* structure assignment */
1740 		*((RF_SparetWait_t *) data) = *waitreq;
1741 
1742 		RF_Free(waitreq, sizeof(*waitreq));
1743 		return (0);
1744 
1745 		/* wakes up a process waiting on SPARET_WAIT and puts an error
1746 		 * code in it that will cause the dameon to exit */
1747 	case RAIDFRAME_ABORT_SPARET_WAIT:
1748 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1749 		waitreq->fcol = -1;
1750 		rf_lock_mutex2(rf_sparet_wait_mutex);
1751 		waitreq->next = rf_sparet_wait_queue;
1752 		rf_sparet_wait_queue = waitreq;
1753 		rf_broadcast_conf2(rf_sparet_wait_cv);
1754 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1755 		return (0);
1756 
1757 		/* used by the spare table daemon to deliver a spare table
1758 		 * into the kernel */
1759 	case RAIDFRAME_SEND_SPARET:
1760 
1761 		/* install the spare table */
1762 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1763 
1764 		/* respond to the requestor.  the return status of the spare
1765 		 * table installation is passed in the "fcol" field */
1766 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1767 		waitreq->fcol = retcode;
1768 		rf_lock_mutex2(rf_sparet_wait_mutex);
1769 		waitreq->next = rf_sparet_resp_queue;
1770 		rf_sparet_resp_queue = waitreq;
1771 		rf_broadcast_cond2(rf_sparet_resp_cv);
1772 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1773 
1774 		return (retcode);
1775 #endif
1776 
1777 	default:
1778 		break; /* fall through to the os-specific code below */
1779 
1780 	}
1781 
1782 	if (!raidPtr->valid)
1783 		return (EINVAL);
1784 
1785 	/*
1786 	 * Add support for "regular" device ioctls here.
1787 	 */
1788 
1789 	error = dk_ioctl(dksc, dev, cmd, data, flag, l);
1790 	if (error != EPASSTHROUGH)
1791 		return (error);
1792 
1793 	switch (cmd) {
1794 	case DIOCCACHESYNC:
1795 		return rf_sync_component_caches(raidPtr);
1796 
1797 	default:
1798 		retcode = ENOTTY;
1799 	}
1800 	return (retcode);
1801 
1802 }
1803 
1804 
1805 /* raidinit -- complete the rest of the initialization for the
1806    RAIDframe device.  */
1807 
1808 
1809 static void
1810 raidinit(struct raid_softc *rs)
1811 {
1812 	cfdata_t cf;
1813 	unsigned int unit;
1814 	struct dk_softc *dksc = &rs->sc_dksc;
1815 	RF_Raid_t *raidPtr = &rs->sc_r;
1816 	device_t dev;
1817 
1818 	unit = raidPtr->raidid;
1819 
1820 	/* XXX doesn't check bounds. */
1821 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);
1822 
1823 	/* attach the pseudo device */
1824 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1825 	cf->cf_name = raid_cd.cd_name;
1826 	cf->cf_atname = raid_cd.cd_name;
1827 	cf->cf_unit = unit;
1828 	cf->cf_fstate = FSTATE_STAR;
1829 
1830 	dev = config_attach_pseudo(cf);
1831 	if (dev == NULL) {
1832 		printf("raid%d: config_attach_pseudo failed\n",
1833 		    raidPtr->raidid);
1834 		free(cf, M_RAIDFRAME);
1835 		return;
1836 	}
1837 
1838 	/* provide a backpointer to the real softc */
1839 	raidsoftc(dev) = rs;
1840 
1841 	/* disk_attach actually creates space for the CPU disklabel, among
1842 	 * other things, so it's critical to call this *BEFORE* we try putzing
1843 	 * with disklabels. */
1844 	dk_init(dksc, dev, DKTYPE_RAID);
1845 	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1846 
1847 	/* XXX There may be a weird interaction here between this, and
1848 	 * protectedSectors, as used in RAIDframe.  */
1849 
1850 	rs->sc_size = raidPtr->totalSectors;
1851 
1852 	/* Attach dk and disk subsystems */
1853 	dk_attach(dksc);
1854 	disk_attach(&dksc->sc_dkdev);
1855 	rf_set_geometry(rs, raidPtr);
1856 
1857 	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
1858 
1859 	/* mark unit as usuable */
1860 	rs->sc_flags |= RAIDF_INITED;
1861 
1862 	dkwedge_discover(&dksc->sc_dkdev);
1863 }
1864 
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/*
 * rf_GetSpareTableFromDaemon -- ask the spare-table daemon for a table.
 *
 * Pushes req onto rf_sparet_wait_queue, broadcasts on rf_sparet_wait_cv,
 * then waits on rf_sparet_resp_cv until rf_sparet_resp_queue is
 * non-empty.  The first response is dequeued, its fcol field is taken as
 * the return value, and the response structure is freed.
 *
 * XXX Queue entries should be tagged with the raidPtr so that, in the
 * very rare case of two simultaneous reconstructions, we know which
 * device a spare table is being requested for.
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	RF_SparetWait_t *resp;
	int     status;

	rf_lock_mutex2(rf_sparet_wait_mutex);

	/* Post the request at the head of the wait queue. */
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* Wait, holding rf_sparet_wait_mutex, for a response to appear. */
	while (rf_sparet_resp_queue == NULL)
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);

	resp = rf_sparet_resp_queue;
	rf_sparet_resp_queue = resp->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	status = resp->fcol;
	/* resp is the dequeued response, not the request we were handed */
	RF_Free(resp, sizeof(*resp));
	return (status);
}
#endif
1899 
1900 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1901  * bp & passes it down.
1902  * any calls originating in the kernel must use non-blocking I/O
1903  * do some extra sanity checking to return "appropriate" error values for
1904  * certain conditions (to make some standard utilities work)
1905  *
1906  * Formerly known as: rf_DoAccessKernel
1907  */
1908 void
1909 raidstart(RF_Raid_t *raidPtr)
1910 {
1911 	struct raid_softc *rs;
1912 	struct dk_softc *dksc;
1913 
1914 	rs = raidPtr->softc;
1915 	dksc = &rs->sc_dksc;
1916 	/* quick check to see if anything has died recently */
1917 	rf_lock_mutex2(raidPtr->mutex);
1918 	if (raidPtr->numNewFailures > 0) {
1919 		rf_unlock_mutex2(raidPtr->mutex);
1920 		rf_update_component_labels(raidPtr,
1921 					   RF_NORMAL_COMPONENT_UPDATE);
1922 		rf_lock_mutex2(raidPtr->mutex);
1923 		raidPtr->numNewFailures--;
1924 	}
1925 	rf_unlock_mutex2(raidPtr->mutex);
1926 
1927 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
1928 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
1929 		return;
1930 	}
1931 
1932 	dk_start(dksc, NULL);
1933 }
1934 
1935 static int
1936 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
1937 {
1938 	RF_SectorCount_t num_blocks, pb, sum;
1939 	RF_RaidAddr_t raid_addr;
1940 	daddr_t blocknum;
1941 	int     do_async;
1942 	int rc;
1943 
1944 	rf_lock_mutex2(raidPtr->mutex);
1945 	if (raidPtr->openings == 0) {
1946 		rf_unlock_mutex2(raidPtr->mutex);
1947 		return EAGAIN;
1948 	}
1949 	rf_unlock_mutex2(raidPtr->mutex);
1950 
1951 	blocknum = bp->b_rawblkno;
1952 
1953 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
1954 		    (int) blocknum));
1955 
1956 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
1957 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
1958 
1959 	/* *THIS* is where we adjust what block we're going to...
1960 	 * but DO NOT TOUCH bp->b_blkno!!! */
1961 	raid_addr = blocknum;
1962 
1963 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
1964 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
1965 	sum = raid_addr + num_blocks + pb;
1966 	if (1 || rf_debugKernelAccess) {
1967 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
1968 			    (int) raid_addr, (int) sum, (int) num_blocks,
1969 			    (int) pb, (int) bp->b_resid));
1970 	}
1971 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
1972 	    || (sum < num_blocks) || (sum < pb)) {
1973 		rc = ENOSPC;
1974 		goto done;
1975 	}
1976 	/*
1977 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
1978 	 */
1979 
1980 	if (bp->b_bcount & raidPtr->sectorMask) {
1981 		rc = ENOSPC;
1982 		goto done;
1983 	}
1984 	db1_printf(("Calling DoAccess..\n"));
1985 
1986 
1987 	rf_lock_mutex2(raidPtr->mutex);
1988 	raidPtr->openings--;
1989 	rf_unlock_mutex2(raidPtr->mutex);
1990 
1991 	/*
1992 	 * Everything is async.
1993 	 */
1994 	do_async = 1;
1995 
1996 	/* don't ever condition on bp->b_flags & B_WRITE.
1997 	 * always condition on B_READ instead */
1998 
1999 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
2000 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
2001 			 do_async, raid_addr, num_blocks,
2002 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
2003 
2004 done:
2005 	return rc;
2006 }
2007 
2008 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
2009 
2010 int
2011 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
2012 {
2013 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
2014 	struct buf *bp;
2015 
2016 	req->queue = queue;
2017 	bp = req->bp;
2018 
2019 	switch (req->type) {
2020 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
2021 		/* XXX need to do something extra here.. */
2022 		/* I'm leaving this in, as I've never actually seen it used,
2023 		 * and I'd like folks to report it... GO */
2024 		printf(("WAKEUP CALLED\n"));
2025 		queue->numOutstanding++;
2026 
2027 		bp->b_flags = 0;
2028 		bp->b_private = req;
2029 
2030 		KernelWakeupFunc(bp);
2031 		break;
2032 
2033 	case RF_IO_TYPE_READ:
2034 	case RF_IO_TYPE_WRITE:
2035 #if RF_ACC_TRACE > 0
2036 		if (req->tracerec) {
2037 			RF_ETIMER_START(req->tracerec->timer);
2038 		}
2039 #endif
2040 		InitBP(bp, queue->rf_cinfo->ci_vp,
2041 		    op, queue->rf_cinfo->ci_dev,
2042 		    req->sectorOffset, req->numSector,
2043 		    req->buf, KernelWakeupFunc, (void *) req,
2044 		    queue->raidPtr->logBytesPerSector, req->b_proc);
2045 
2046 		if (rf_debugKernelAccess) {
2047 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
2048 				(long) bp->b_blkno));
2049 		}
2050 		queue->numOutstanding++;
2051 		queue->last_deq_sector = req->sectorOffset;
2052 		/* acc wouldn't have been let in if there were any pending
2053 		 * reqs at any other priority */
2054 		queue->curPriority = req->priority;
2055 
2056 		db1_printf(("Going for %c to unit %d col %d\n",
2057 			    req->type, queue->raidPtr->raidid,
2058 			    queue->col));
2059 		db1_printf(("sector %d count %d (%d bytes) %d\n",
2060 			(int) req->sectorOffset, (int) req->numSector,
2061 			(int) (req->numSector <<
2062 			    queue->raidPtr->logBytesPerSector),
2063 			(int) queue->raidPtr->logBytesPerSector));
2064 
2065 		/*
2066 		 * XXX: drop lock here since this can block at
2067 		 * least with backing SCSI devices.  Retake it
2068 		 * to minimize fuss with calling interfaces.
2069 		 */
2070 
2071 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2072 		bdev_strategy(bp);
2073 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2074 		break;
2075 
2076 	default:
2077 		panic("bad req->type in rf_DispatchKernelIO");
2078 	}
2079 	db1_printf(("Exiting from DispatchKernelIO\n"));
2080 
2081 	return (0);
2082 }
2083 /* this is the callback function associated with a I/O invoked from
2084    kernel code.
2085  */
2086 static void
2087 KernelWakeupFunc(struct buf *bp)
2088 {
2089 	RF_DiskQueueData_t *req = NULL;
2090 	RF_DiskQueue_t *queue;
2091 
2092 	db1_printf(("recovering the request queue:\n"));
2093 
2094 	req = bp->b_private;
2095 
2096 	queue = (RF_DiskQueue_t *) req->queue;
2097 
2098 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
2099 
2100 #if RF_ACC_TRACE > 0
2101 	if (req->tracerec) {
2102 		RF_ETIMER_STOP(req->tracerec->timer);
2103 		RF_ETIMER_EVAL(req->tracerec->timer);
2104 		rf_lock_mutex2(rf_tracing_mutex);
2105 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2106 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2107 		req->tracerec->num_phys_ios++;
2108 		rf_unlock_mutex2(rf_tracing_mutex);
2109 	}
2110 #endif
2111 
2112 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
2113 	 * ballistic, and mark the component as hosed... */
2114 
2115 	if (bp->b_error != 0) {
2116 		/* Mark the disk as dead */
2117 		/* but only mark it once... */
2118 		/* and only if it wouldn't leave this RAID set
2119 		   completely broken */
2120 		if (((queue->raidPtr->Disks[queue->col].status ==
2121 		      rf_ds_optimal) ||
2122 		     (queue->raidPtr->Disks[queue->col].status ==
2123 		      rf_ds_used_spare)) &&
2124 		     (queue->raidPtr->numFailures <
2125 		      queue->raidPtr->Layout.map->faultsTolerated)) {
2126 			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
2127 			       queue->raidPtr->raidid,
2128 			       bp->b_error,
2129 			       queue->raidPtr->Disks[queue->col].devname);
2130 			queue->raidPtr->Disks[queue->col].status =
2131 			    rf_ds_failed;
2132 			queue->raidPtr->status = rf_rs_degraded;
2133 			queue->raidPtr->numFailures++;
2134 			queue->raidPtr->numNewFailures++;
2135 		} else {	/* Disk is already dead... */
2136 			/* printf("Disk already marked as dead!\n"); */
2137 		}
2138 
2139 	}
2140 
2141 	/* Fill in the error value */
2142 	req->error = bp->b_error;
2143 
2144 	/* Drop this one on the "finished" queue... */
2145 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
2146 
2147 	/* Let the raidio thread know there is work to be done. */
2148 	rf_signal_cond2(queue->raidPtr->iodone_cv);
2149 
2150 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
2151 }
2152 
2153 
2154 /*
2155  * initialize a buf structure for doing an I/O in the kernel.
2156  */
2157 static void
2158 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2159        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2160        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2161        struct proc *b_proc)
2162 {
2163 	/* bp->b_flags       = B_PHYS | rw_flag; */
2164 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
2165 	bp->b_oflags = 0;
2166 	bp->b_cflags = 0;
2167 	bp->b_bcount = numSect << logBytesPerSector;
2168 	bp->b_bufsize = bp->b_bcount;
2169 	bp->b_error = 0;
2170 	bp->b_dev = dev;
2171 	bp->b_data = bf;
2172 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2173 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
2174 	if (bp->b_bcount == 0) {
2175 		panic("bp->b_bcount is zero in InitBP!!");
2176 	}
2177 	bp->b_proc = b_proc;
2178 	bp->b_iodone = cbFunc;
2179 	bp->b_private = cbArg;
2180 }
2181 
2182 /*
2183  * Wait interruptibly for an exclusive lock.
2184  *
2185  * XXX
2186  * Several drivers do this; it should be abstracted and made MP-safe.
2187  * (Hmm... where have we seen this warning before :->  GO )
2188  */
2189 static int
2190 raidlock(struct raid_softc *rs)
2191 {
2192 	int     error;
2193 
2194 	error = 0;
2195 	mutex_enter(&rs->sc_mutex);
2196 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2197 		rs->sc_flags |= RAIDF_WANTED;
2198 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2199 		if (error != 0)
2200 			goto done;
2201 	}
2202 	rs->sc_flags |= RAIDF_LOCKED;
2203 done:
2204 	mutex_exit(&rs->sc_mutex);
2205 	return (error);
2206 }
2207 /*
2208  * Unlock and wake up any waiters.
2209  */
2210 static void
2211 raidunlock(struct raid_softc *rs)
2212 {
2213 
2214 	mutex_enter(&rs->sc_mutex);
2215 	rs->sc_flags &= ~RAIDF_LOCKED;
2216 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2217 		rs->sc_flags &= ~RAIDF_WANTED;
2218 		cv_broadcast(&rs->sc_cv);
2219 	}
2220 	mutex_exit(&rs->sc_mutex);
2221 }
2222 
2223 
2224 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
2225 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
2226 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
2227 
2228 static daddr_t
2229 rf_component_info_offset(void)
2230 {
2231 
2232 	return RF_COMPONENT_INFO_OFFSET;
2233 }
2234 
2235 static daddr_t
2236 rf_component_info_size(unsigned secsize)
2237 {
2238 	daddr_t info_size;
2239 
2240 	KASSERT(secsize);
2241 	if (secsize > RF_COMPONENT_INFO_SIZE)
2242 		info_size = secsize;
2243 	else
2244 		info_size = RF_COMPONENT_INFO_SIZE;
2245 
2246 	return info_size;
2247 }
2248 
2249 static daddr_t
2250 rf_parity_map_offset(RF_Raid_t *raidPtr)
2251 {
2252 	daddr_t map_offset;
2253 
2254 	KASSERT(raidPtr->bytesPerSector);
2255 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2256 		map_offset = raidPtr->bytesPerSector;
2257 	else
2258 		map_offset = RF_COMPONENT_INFO_SIZE;
2259 	map_offset += rf_component_info_offset();
2260 
2261 	return map_offset;
2262 }
2263 
2264 static daddr_t
2265 rf_parity_map_size(RF_Raid_t *raidPtr)
2266 {
2267 	daddr_t map_size;
2268 
2269 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2270 		map_size = raidPtr->bytesPerSector;
2271 	else
2272 		map_size = RF_PARITY_MAP_SIZE;
2273 
2274 	return map_size;
2275 }
2276 
2277 int
2278 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2279 {
2280 	RF_ComponentLabel_t *clabel;
2281 
2282 	clabel = raidget_component_label(raidPtr, col);
2283 	clabel->clean = RF_RAID_CLEAN;
2284 	raidflush_component_label(raidPtr, col);
2285 	return(0);
2286 }
2287 
2288 
2289 int
2290 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2291 {
2292 	RF_ComponentLabel_t *clabel;
2293 
2294 	clabel = raidget_component_label(raidPtr, col);
2295 	clabel->clean = RF_RAID_DIRTY;
2296 	raidflush_component_label(raidPtr, col);
2297 	return(0);
2298 }
2299 
2300 int
2301 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2302 {
2303 	KASSERT(raidPtr->bytesPerSector);
2304 	return raidread_component_label(raidPtr->bytesPerSector,
2305 	    raidPtr->Disks[col].dev,
2306 	    raidPtr->raid_cinfo[col].ci_vp,
2307 	    &raidPtr->raid_cinfo[col].ci_label);
2308 }
2309 
2310 RF_ComponentLabel_t *
2311 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2312 {
2313 	return &raidPtr->raid_cinfo[col].ci_label;
2314 }
2315 
2316 int
2317 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2318 {
2319 	RF_ComponentLabel_t *label;
2320 
2321 	label = &raidPtr->raid_cinfo[col].ci_label;
2322 	label->mod_counter = raidPtr->mod_counter;
2323 #ifndef RF_NO_PARITY_MAP
2324 	label->parity_map_modcount = label->mod_counter;
2325 #endif
2326 	return raidwrite_component_label(raidPtr->bytesPerSector,
2327 	    raidPtr->Disks[col].dev,
2328 	    raidPtr->raid_cinfo[col].ci_vp, label);
2329 }
2330 
2331 
2332 static int
2333 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2334     RF_ComponentLabel_t *clabel)
2335 {
2336 	return raidread_component_area(dev, b_vp, clabel,
2337 	    sizeof(RF_ComponentLabel_t),
2338 	    rf_component_info_offset(),
2339 	    rf_component_info_size(secsize));
2340 }
2341 
2342 /* ARGSUSED */
2343 static int
2344 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2345     size_t msize, daddr_t offset, daddr_t dsize)
2346 {
2347 	struct buf *bp;
2348 	int error;
2349 
2350 	/* XXX should probably ensure that we don't try to do this if
2351 	   someone has changed rf_protected_sectors. */
2352 
2353 	if (b_vp == NULL) {
2354 		/* For whatever reason, this component is not valid.
2355 		   Don't try to read a component label from it. */
2356 		return(EINVAL);
2357 	}
2358 
2359 	/* get a block of the appropriate size... */
2360 	bp = geteblk((int)dsize);
2361 	bp->b_dev = dev;
2362 
2363 	/* get our ducks in a row for the read */
2364 	bp->b_blkno = offset / DEV_BSIZE;
2365 	bp->b_bcount = dsize;
2366 	bp->b_flags |= B_READ;
2367  	bp->b_resid = dsize;
2368 
2369 	bdev_strategy(bp);
2370 	error = biowait(bp);
2371 
2372 	if (!error) {
2373 		memcpy(data, bp->b_data, msize);
2374 	}
2375 
2376 	brelse(bp, 0);
2377 	return(error);
2378 }
2379 
2380 
2381 static int
2382 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2383     RF_ComponentLabel_t *clabel)
2384 {
2385 	return raidwrite_component_area(dev, b_vp, clabel,
2386 	    sizeof(RF_ComponentLabel_t),
2387 	    rf_component_info_offset(),
2388 	    rf_component_info_size(secsize), 0);
2389 }
2390 
2391 /* ARGSUSED */
2392 static int
2393 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2394     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2395 {
2396 	struct buf *bp;
2397 	int error;
2398 
2399 	/* get a block of the appropriate size... */
2400 	bp = geteblk((int)dsize);
2401 	bp->b_dev = dev;
2402 
2403 	/* get our ducks in a row for the write */
2404 	bp->b_blkno = offset / DEV_BSIZE;
2405 	bp->b_bcount = dsize;
2406 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2407  	bp->b_resid = dsize;
2408 
2409 	memset(bp->b_data, 0, dsize);
2410 	memcpy(bp->b_data, data, msize);
2411 
2412 	bdev_strategy(bp);
2413 	if (asyncp)
2414 		return 0;
2415 	error = biowait(bp);
2416 	brelse(bp, 0);
2417 	if (error) {
2418 #if 1
2419 		printf("Failed to write RAID component info!\n");
2420 #endif
2421 	}
2422 
2423 	return(error);
2424 }
2425 
2426 void
2427 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2428 {
2429 	int c;
2430 
2431 	for (c = 0; c < raidPtr->numCol; c++) {
2432 		/* Skip dead disks. */
2433 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2434 			continue;
2435 		/* XXXjld: what if an error occurs here? */
2436 		raidwrite_component_area(raidPtr->Disks[c].dev,
2437 		    raidPtr->raid_cinfo[c].ci_vp, map,
2438 		    RF_PARITYMAP_NBYTE,
2439 		    rf_parity_map_offset(raidPtr),
2440 		    rf_parity_map_size(raidPtr), 0);
2441 	}
2442 }
2443 
2444 void
2445 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2446 {
2447 	struct rf_paritymap_ondisk tmp;
2448 	int c,first;
2449 
2450 	first=1;
2451 	for (c = 0; c < raidPtr->numCol; c++) {
2452 		/* Skip dead disks. */
2453 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2454 			continue;
2455 		raidread_component_area(raidPtr->Disks[c].dev,
2456 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
2457 		    RF_PARITYMAP_NBYTE,
2458 		    rf_parity_map_offset(raidPtr),
2459 		    rf_parity_map_size(raidPtr));
2460 		if (first) {
2461 			memcpy(map, &tmp, sizeof(*map));
2462 			first = 0;
2463 		} else {
2464 			rf_paritymap_merge(map, &tmp);
2465 		}
2466 	}
2467 }
2468 
2469 void
2470 rf_markalldirty(RF_Raid_t *raidPtr)
2471 {
2472 	RF_ComponentLabel_t *clabel;
2473 	int sparecol;
2474 	int c;
2475 	int j;
2476 	int scol = -1;
2477 
2478 	raidPtr->mod_counter++;
2479 	for (c = 0; c < raidPtr->numCol; c++) {
2480 		/* we don't want to touch (at all) a disk that has
2481 		   failed */
2482 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
2483 			clabel = raidget_component_label(raidPtr, c);
2484 			if (clabel->status == rf_ds_spared) {
2485 				/* XXX do something special...
2486 				   but whatever you do, don't
2487 				   try to access it!! */
2488 			} else {
2489 				raidmarkdirty(raidPtr, c);
2490 			}
2491 		}
2492 	}
2493 
2494 	for( c = 0; c < raidPtr->numSpare ; c++) {
2495 		sparecol = raidPtr->numCol + c;
2496 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2497 			/*
2498 
2499 			   we claim this disk is "optimal" if it's
2500 			   rf_ds_used_spare, as that means it should be
2501 			   directly substitutable for the disk it replaced.
2502 			   We note that too...
2503 
2504 			 */
2505 
2506 			for(j=0;j<raidPtr->numCol;j++) {
2507 				if (raidPtr->Disks[j].spareCol == sparecol) {
2508 					scol = j;
2509 					break;
2510 				}
2511 			}
2512 
2513 			clabel = raidget_component_label(raidPtr, sparecol);
2514 			/* make sure status is noted */
2515 
2516 			raid_init_component_label(raidPtr, clabel);
2517 
2518 			clabel->row = 0;
2519 			clabel->column = scol;
2520 			/* Note: we *don't* change status from rf_ds_used_spare
2521 			   to rf_ds_optimal */
2522 			/* clabel.status = rf_ds_optimal; */
2523 
2524 			raidmarkdirty(raidPtr, sparecol);
2525 		}
2526 	}
2527 }
2528 
2529 
2530 void
2531 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
2532 {
2533 	RF_ComponentLabel_t *clabel;
2534 	int sparecol;
2535 	int c;
2536 	int j;
2537 	int scol;
2538 	struct raid_softc *rs = raidPtr->softc;
2539 
2540 	scol = -1;
2541 
2542 	/* XXX should do extra checks to make sure things really are clean,
2543 	   rather than blindly setting the clean bit... */
2544 
2545 	raidPtr->mod_counter++;
2546 
2547 	for (c = 0; c < raidPtr->numCol; c++) {
2548 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
2549 			clabel = raidget_component_label(raidPtr, c);
2550 			/* make sure status is noted */
2551 			clabel->status = rf_ds_optimal;
2552 
2553 			/* note what unit we are configured as */
2554 			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2555 				clabel->last_unit = raidPtr->raidid;
2556 
2557 			raidflush_component_label(raidPtr, c);
2558 			if (final == RF_FINAL_COMPONENT_UPDATE) {
2559 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
2560 					raidmarkclean(raidPtr, c);
2561 				}
2562 			}
2563 		}
2564 		/* else we don't touch it.. */
2565 	}
2566 
2567 	for( c = 0; c < raidPtr->numSpare ; c++) {
2568 		sparecol = raidPtr->numCol + c;
2569 		/* Need to ensure that the reconstruct actually completed! */
2570 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2571 			/*
2572 
2573 			   we claim this disk is "optimal" if it's
2574 			   rf_ds_used_spare, as that means it should be
2575 			   directly substitutable for the disk it replaced.
2576 			   We note that too...
2577 
2578 			 */
2579 
2580 			for(j=0;j<raidPtr->numCol;j++) {
2581 				if (raidPtr->Disks[j].spareCol == sparecol) {
2582 					scol = j;
2583 					break;
2584 				}
2585 			}
2586 
2587 			/* XXX shouldn't *really* need this... */
2588 			clabel = raidget_component_label(raidPtr, sparecol);
2589 			/* make sure status is noted */
2590 
2591 			raid_init_component_label(raidPtr, clabel);
2592 
2593 			clabel->column = scol;
2594 			clabel->status = rf_ds_optimal;
2595 			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2596 				clabel->last_unit = raidPtr->raidid;
2597 
2598 			raidflush_component_label(raidPtr, sparecol);
2599 			if (final == RF_FINAL_COMPONENT_UPDATE) {
2600 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
2601 					raidmarkclean(raidPtr, sparecol);
2602 				}
2603 			}
2604 		}
2605 	}
2606 }
2607 
2608 void
2609 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2610 {
2611 
2612 	if (vp != NULL) {
2613 		if (auto_configured == 1) {
2614 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2615 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2616 			vput(vp);
2617 
2618 		} else {
2619 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2620 		}
2621 	}
2622 }
2623 
2624 
2625 void
2626 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2627 {
2628 	int r,c;
2629 	struct vnode *vp;
2630 	int acd;
2631 
2632 
2633 	/* We take this opportunity to close the vnodes like we should.. */
2634 
2635 	for (c = 0; c < raidPtr->numCol; c++) {
2636 		vp = raidPtr->raid_cinfo[c].ci_vp;
2637 		acd = raidPtr->Disks[c].auto_configured;
2638 		rf_close_component(raidPtr, vp, acd);
2639 		raidPtr->raid_cinfo[c].ci_vp = NULL;
2640 		raidPtr->Disks[c].auto_configured = 0;
2641 	}
2642 
2643 	for (r = 0; r < raidPtr->numSpare; r++) {
2644 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2645 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2646 		rf_close_component(raidPtr, vp, acd);
2647 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2648 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2649 	}
2650 }
2651 
2652 
2653 void
2654 rf_ReconThread(struct rf_recon_req *req)
2655 {
2656 	int     s;
2657 	RF_Raid_t *raidPtr;
2658 
2659 	s = splbio();
2660 	raidPtr = (RF_Raid_t *) req->raidPtr;
2661 	raidPtr->recon_in_progress = 1;
2662 
2663 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2664 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2665 
2666 	RF_Free(req, sizeof(*req));
2667 
2668 	raidPtr->recon_in_progress = 0;
2669 	splx(s);
2670 
2671 	/* That's all... */
2672 	kthread_exit(0);	/* does not return */
2673 }
2674 
2675 void
2676 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2677 {
2678 	int retcode;
2679 	int s;
2680 
2681 	raidPtr->parity_rewrite_stripes_done = 0;
2682 	raidPtr->parity_rewrite_in_progress = 1;
2683 	s = splbio();
2684 	retcode = rf_RewriteParity(raidPtr);
2685 	splx(s);
2686 	if (retcode) {
2687 		printf("raid%d: Error re-writing parity (%d)!\n",
2688 		    raidPtr->raidid, retcode);
2689 	} else {
2690 		/* set the clean bit!  If we shutdown correctly,
2691 		   the clean bit on each component label will get
2692 		   set */
2693 		raidPtr->parity_good = RF_RAID_CLEAN;
2694 	}
2695 	raidPtr->parity_rewrite_in_progress = 0;
2696 
2697 	/* Anyone waiting for us to stop?  If so, inform them... */
2698 	if (raidPtr->waitShutdown) {
2699 		wakeup(&raidPtr->parity_rewrite_in_progress);
2700 	}
2701 
2702 	/* That's all... */
2703 	kthread_exit(0);	/* does not return */
2704 }
2705 
2706 
2707 void
2708 rf_CopybackThread(RF_Raid_t *raidPtr)
2709 {
2710 	int s;
2711 
2712 	raidPtr->copyback_in_progress = 1;
2713 	s = splbio();
2714 	rf_CopybackReconstructedData(raidPtr);
2715 	splx(s);
2716 	raidPtr->copyback_in_progress = 0;
2717 
2718 	/* That's all... */
2719 	kthread_exit(0);	/* does not return */
2720 }
2721 
2722 
2723 void
2724 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
2725 {
2726 	int s;
2727 	RF_Raid_t *raidPtr;
2728 
2729 	s = splbio();
2730 	raidPtr = req->raidPtr;
2731 	raidPtr->recon_in_progress = 1;
2732 	rf_ReconstructInPlace(raidPtr, req->col);
2733 	RF_Free(req, sizeof(*req));
2734 	raidPtr->recon_in_progress = 0;
2735 	splx(s);
2736 
2737 	/* That's all... */
2738 	kthread_exit(0);	/* does not return */
2739 }
2740 
2741 static RF_AutoConfig_t *
2742 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
2743     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
2744     unsigned secsize)
2745 {
2746 	int good_one = 0;
2747 	RF_ComponentLabel_t *clabel;
2748 	RF_AutoConfig_t *ac;
2749 
2750 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
2751 	if (clabel == NULL) {
2752 oomem:
2753 		    while(ac_list) {
2754 			    ac = ac_list;
2755 			    if (ac->clabel)
2756 				    free(ac->clabel, M_RAIDFRAME);
2757 			    ac_list = ac_list->next;
2758 			    free(ac, M_RAIDFRAME);
2759 		    }
2760 		    printf("RAID auto config: out of memory!\n");
2761 		    return NULL; /* XXX probably should panic? */
2762 	}
2763 
2764 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
2765 		/* Got the label.  Does it look reasonable? */
2766 		if (rf_reasonable_label(clabel, numsecs) &&
2767 		    (rf_component_label_partitionsize(clabel) <= size)) {
2768 #ifdef DEBUG
2769 			printf("Component on: %s: %llu\n",
2770 				cname, (unsigned long long)size);
2771 			rf_print_component_label(clabel);
2772 #endif
2773 			/* if it's reasonable, add it, else ignore it. */
2774 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
2775 				M_NOWAIT);
2776 			if (ac == NULL) {
2777 				free(clabel, M_RAIDFRAME);
2778 				goto oomem;
2779 			}
2780 			strlcpy(ac->devname, cname, sizeof(ac->devname));
2781 			ac->dev = dev;
2782 			ac->vp = vp;
2783 			ac->clabel = clabel;
2784 			ac->next = ac_list;
2785 			ac_list = ac;
2786 			good_one = 1;
2787 		}
2788 	}
2789 	if (!good_one) {
2790 		/* cleanup */
2791 		free(clabel, M_RAIDFRAME);
2792 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2793 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2794 		vput(vp);
2795 	}
2796 	return ac_list;
2797 }
2798 
2799 RF_AutoConfig_t *
2800 rf_find_raid_components(void)
2801 {
2802 	struct vnode *vp;
2803 	struct disklabel label;
2804 	device_t dv;
2805 	deviter_t di;
2806 	dev_t dev;
2807 	int bmajor, bminor, wedge, rf_part_found;
2808 	int error;
2809 	int i;
2810 	RF_AutoConfig_t *ac_list;
2811 	uint64_t numsecs;
2812 	unsigned secsize;
2813 	int dowedges;
2814 
2815 	/* initialize the AutoConfig list */
2816 	ac_list = NULL;
2817 
2818 	/*
2819 	 * we begin by trolling through *all* the devices on the system *twice*
2820 	 * first we scan for wedges, second for other devices. This avoids
2821 	 * using a raw partition instead of a wedge that covers the whole disk
2822 	 */
2823 
2824 	for (dowedges=1; dowedges>=0; --dowedges) {
2825 		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
2826 		     dv = deviter_next(&di)) {
2827 
2828 			/* we are only interested in disks... */
2829 			if (device_class(dv) != DV_DISK)
2830 				continue;
2831 
2832 			/* we don't care about floppies... */
2833 			if (device_is_a(dv, "fd")) {
2834 				continue;
2835 			}
2836 
2837 			/* we don't care about CD's... */
2838 			if (device_is_a(dv, "cd")) {
2839 				continue;
2840 			}
2841 
2842 			/* we don't care about md's... */
2843 			if (device_is_a(dv, "md")) {
2844 				continue;
2845 			}
2846 
2847 			/* hdfd is the Atari/Hades floppy driver */
2848 			if (device_is_a(dv, "hdfd")) {
2849 				continue;
2850 			}
2851 
2852 			/* fdisa is the Atari/Milan floppy driver */
2853 			if (device_is_a(dv, "fdisa")) {
2854 				continue;
2855 			}
2856 
2857 			/* are we in the wedges pass ? */
2858 			wedge = device_is_a(dv, "dk");
2859 			if (wedge != dowedges) {
2860 				continue;
2861 			}
2862 
2863 			/* need to find the device_name_to_block_device_major stuff */
2864 			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
2865 
2866 			rf_part_found = 0; /*No raid partition as yet*/
2867 
2868 			/* get a vnode for the raw partition of this disk */
2869 			bminor = minor(device_unit(dv));
2870 			dev = wedge ? makedev(bmajor, bminor) :
2871 			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
2872 			if (bdevvp(dev, &vp))
2873 				panic("RAID can't alloc vnode");
2874 
2875 			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
2876 
2877 			if (error) {
2878 				/* "Who cares."  Continue looking
2879 				   for something that exists*/
2880 				vput(vp);
2881 				continue;
2882 			}
2883 
2884 			error = getdisksize(vp, &numsecs, &secsize);
2885 			if (error) {
2886 				/*
2887 				 * Pseudo devices like vnd and cgd can be
2888 				 * opened but may still need some configuration.
2889 				 * Ignore these quietly.
2890 				 */
2891 				if (error != ENXIO)
2892 					printf("RAIDframe: can't get disk size"
2893 					    " for dev %s (%d)\n",
2894 					    device_xname(dv), error);
2895 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2896 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2897 				vput(vp);
2898 				continue;
2899 			}
2900 			if (wedge) {
2901 				struct dkwedge_info dkw;
2902 				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
2903 				    NOCRED);
2904 				if (error) {
2905 					printf("RAIDframe: can't get wedge info for "
2906 					    "dev %s (%d)\n", device_xname(dv), error);
2907 					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2908 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2909 					vput(vp);
2910 					continue;
2911 				}
2912 
2913 				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
2914 					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2915 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2916 					vput(vp);
2917 					continue;
2918 				}
2919 
2920 				ac_list = rf_get_component(ac_list, dev, vp,
2921 				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
2922 				rf_part_found = 1; /*There is a raid component on this disk*/
2923 				continue;
2924 			}
2925 
2926 			/* Ok, the disk exists.  Go get the disklabel. */
2927 			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
2928 			if (error) {
2929 				/*
2930 				 * XXX can't happen - open() would
2931 				 * have errored out (or faked up one)
2932 				 */
2933 				if (error != ENOTTY)
2934 					printf("RAIDframe: can't get label for dev "
2935 					    "%s (%d)\n", device_xname(dv), error);
2936 			}
2937 
2938 			/* don't need this any more.  We'll allocate it again
2939 			   a little later if we really do... */
2940 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2941 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2942 			vput(vp);
2943 
2944 			if (error)
2945 				continue;
2946 
2947 			rf_part_found = 0; /*No raid partitions yet*/
2948 			for (i = 0; i < label.d_npartitions; i++) {
2949 				char cname[sizeof(ac_list->devname)];
2950 
2951 				/* We only support partitions marked as RAID */
2952 				if (label.d_partitions[i].p_fstype != FS_RAID)
2953 					continue;
2954 
2955 				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
2956 				if (bdevvp(dev, &vp))
2957 					panic("RAID can't alloc vnode");
2958 
2959 				error = VOP_OPEN(vp, FREAD, NOCRED);
2960 				if (error) {
2961 					/* Whatever... */
2962 					vput(vp);
2963 					continue;
2964 				}
2965 				snprintf(cname, sizeof(cname), "%s%c",
2966 				    device_xname(dv), 'a' + i);
2967 				ac_list = rf_get_component(ac_list, dev, vp, cname,
2968 					label.d_partitions[i].p_size, numsecs, secsize);
2969 				rf_part_found = 1; /*There is at least one raid partition on this disk*/
2970 			}
2971 
2972 			/*
2973 			 *If there is no raid component on this disk, either in a
2974 			 *disklabel or inside a wedge, check the raw partition as well,
2975 			 *as it is possible to configure raid components on raw disk
2976 			 *devices.
2977 			 */
2978 
2979 			if (!rf_part_found) {
2980 				char cname[sizeof(ac_list->devname)];
2981 
2982 				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
2983 				if (bdevvp(dev, &vp))
2984 					panic("RAID can't alloc vnode");
2985 
2986 				error = VOP_OPEN(vp, FREAD, NOCRED);
2987 				if (error) {
2988 					/* Whatever... */
2989 					vput(vp);
2990 					continue;
2991 				}
2992 				snprintf(cname, sizeof(cname), "%s%c",
2993 				    device_xname(dv), 'a' + RAW_PART);
2994 				ac_list = rf_get_component(ac_list, dev, vp, cname,
2995 					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
2996 			}
2997 		}
2998 		deviter_release(&di);
2999 	}
3000 	return ac_list;
3001 }
3002 
3003 
3004 int
3005 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3006 {
3007 
3008 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3009 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3010 	    ((clabel->clean == RF_RAID_CLEAN) ||
3011 	     (clabel->clean == RF_RAID_DIRTY)) &&
3012 	    clabel->row >=0 &&
3013 	    clabel->column >= 0 &&
3014 	    clabel->num_rows > 0 &&
3015 	    clabel->num_columns > 0 &&
3016 	    clabel->row < clabel->num_rows &&
3017 	    clabel->column < clabel->num_columns &&
3018 	    clabel->blockSize > 0 &&
3019 	    /*
3020 	     * numBlocksHi may contain garbage, but it is ok since
3021 	     * the type is unsigned.  If it is really garbage,
3022 	     * rf_fix_old_label_size() will fix it.
3023 	     */
3024 	    rf_component_label_numblocks(clabel) > 0) {
3025 		/*
3026 		 * label looks reasonable enough...
3027 		 * let's make sure it has no old garbage.
3028 		 */
3029 		if (numsecs)
3030 			rf_fix_old_label_size(clabel, numsecs);
3031 		return(1);
3032 	}
3033 	return(0);
3034 }
3035 
3036 
3037 /*
3038  * For reasons yet unknown, some old component labels have garbage in
3039  * the newer numBlocksHi region, and this causes lossage.  Since those
3040  * disks will also have numsecs set to less than 32 bits of sectors,
3041  * we can determine when this corruption has occurred, and fix it.
3042  *
3043  * The exact same problem, with the same unknown reason, happens to
3044  * the partitionSizeHi member as well.
3045  */
3046 static void
3047 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3048 {
3049 
3050 	if (numsecs < ((uint64_t)1 << 32)) {
3051 		if (clabel->numBlocksHi) {
3052 			printf("WARNING: total sectors < 32 bits, yet "
3053 			       "numBlocksHi set\n"
3054 			       "WARNING: resetting numBlocksHi to zero.\n");
3055 			clabel->numBlocksHi = 0;
3056 		}
3057 
3058 		if (clabel->partitionSizeHi) {
3059 			printf("WARNING: total sectors < 32 bits, yet "
3060 			       "partitionSizeHi set\n"
3061 			       "WARNING: resetting partitionSizeHi to zero.\n");
3062 			clabel->partitionSizeHi = 0;
3063 		}
3064 	}
3065 }
3066 
3067 
#ifdef DEBUG
/*
 * Print the contents of a component label.  Only compiled when DEBUG is
 * defined.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	/* indexed by the low two bits of root_partition */
	static const char *root_names[] = {
	    "No", "Force", "Soft", "*invalid*"
	};
	uint64_t nblocks;

	nblocks = rf_component_label_numblocks(clabel);

	/* geometry and identity */
	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);

	/* state */
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);

	/* layout */
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, nblocks);

	/* configuration hints */
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n",
	       root_names[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
3101 
3102 RF_ConfigSet_t *
3103 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3104 {
3105 	RF_AutoConfig_t *ac;
3106 	RF_ConfigSet_t *config_sets;
3107 	RF_ConfigSet_t *cset;
3108 	RF_AutoConfig_t *ac_next;
3109 
3110 
3111 	config_sets = NULL;
3112 
3113 	/* Go through the AutoConfig list, and figure out which components
3114 	   belong to what sets.  */
3115 	ac = ac_list;
3116 	while(ac!=NULL) {
3117 		/* we're going to putz with ac->next, so save it here
3118 		   for use at the end of the loop */
3119 		ac_next = ac->next;
3120 
3121 		if (config_sets == NULL) {
3122 			/* will need at least this one... */
3123 			config_sets = (RF_ConfigSet_t *)
3124 				malloc(sizeof(RF_ConfigSet_t),
3125 				       M_RAIDFRAME, M_NOWAIT);
3126 			if (config_sets == NULL) {
3127 				panic("rf_create_auto_sets: No memory!");
3128 			}
3129 			/* this one is easy :) */
3130 			config_sets->ac = ac;
3131 			config_sets->next = NULL;
3132 			config_sets->rootable = 0;
3133 			ac->next = NULL;
3134 		} else {
3135 			/* which set does this component fit into? */
3136 			cset = config_sets;
3137 			while(cset!=NULL) {
3138 				if (rf_does_it_fit(cset, ac)) {
3139 					/* looks like it matches... */
3140 					ac->next = cset->ac;
3141 					cset->ac = ac;
3142 					break;
3143 				}
3144 				cset = cset->next;
3145 			}
3146 			if (cset==NULL) {
3147 				/* didn't find a match above... new set..*/
3148 				cset = (RF_ConfigSet_t *)
3149 					malloc(sizeof(RF_ConfigSet_t),
3150 					       M_RAIDFRAME, M_NOWAIT);
3151 				if (cset == NULL) {
3152 					panic("rf_create_auto_sets: No memory!");
3153 				}
3154 				cset->ac = ac;
3155 				ac->next = NULL;
3156 				cset->next = config_sets;
3157 				cset->rootable = 0;
3158 				config_sets = cset;
3159 			}
3160 		}
3161 		ac = ac_next;
3162 	}
3163 
3164 
3165 	return(config_sets);
3166 }
3167 
3168 static int
3169 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3170 {
3171 	RF_ComponentLabel_t *clabel1, *clabel2;
3172 
3173 	/* If this one matches the *first* one in the set, that's good
3174 	   enough, since the other members of the set would have been
3175 	   through here too... */
3176 	/* note that we are not checking partitionSize here..
3177 
3178 	   Note that we are also not checking the mod_counters here.
3179 	   If everything else matches except the mod_counter, that's
3180 	   good enough for this test.  We will deal with the mod_counters
3181 	   a little later in the autoconfiguration process.
3182 
3183 	    (clabel1->mod_counter == clabel2->mod_counter) &&
3184 
3185 	   The reason we don't check for this is that failed disks
3186 	   will have lower modification counts.  If those disks are
3187 	   not added to the set they used to belong to, then they will
3188 	   form their own set, which may result in 2 different sets,
3189 	   for example, competing to be configured at raid0, and
3190 	   perhaps competing to be the root filesystem set.  If the
3191 	   wrong ones get configured, or both attempt to become /,
3192 	   weird behaviour and or serious lossage will occur.  Thus we
3193 	   need to bring them into the fold here, and kick them out at
3194 	   a later point.
3195 
3196 	*/
3197 
3198 	clabel1 = cset->ac->clabel;
3199 	clabel2 = ac->clabel;
3200 	if ((clabel1->version == clabel2->version) &&
3201 	    (clabel1->serial_number == clabel2->serial_number) &&
3202 	    (clabel1->num_rows == clabel2->num_rows) &&
3203 	    (clabel1->num_columns == clabel2->num_columns) &&
3204 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
3205 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3206 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3207 	    (clabel1->parityConfig == clabel2->parityConfig) &&
3208 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3209 	    (clabel1->blockSize == clabel2->blockSize) &&
3210 	    rf_component_label_numblocks(clabel1) ==
3211 	    rf_component_label_numblocks(clabel2) &&
3212 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
3213 	    (clabel1->root_partition == clabel2->root_partition) &&
3214 	    (clabel1->last_unit == clabel2->last_unit) &&
3215 	    (clabel1->config_order == clabel2->config_order)) {
3216 		/* if it get's here, it almost *has* to be a match */
3217 	} else {
3218 		/* it's not consistent with somebody in the set..
3219 		   punt */
3220 		return(0);
3221 	}
3222 	/* all was fine.. it must fit... */
3223 	return(1);
3224 }
3225 
3226 int
3227 rf_have_enough_components(RF_ConfigSet_t *cset)
3228 {
3229 	RF_AutoConfig_t *ac;
3230 	RF_AutoConfig_t *auto_config;
3231 	RF_ComponentLabel_t *clabel;
3232 	int c;
3233 	int num_cols;
3234 	int num_missing;
3235 	int mod_counter;
3236 	int mod_counter_found;
3237 	int even_pair_failed;
3238 	char parity_type;
3239 
3240 
3241 	/* check to see that we have enough 'live' components
3242 	   of this set.  If so, we can configure it if necessary */
3243 
3244 	num_cols = cset->ac->clabel->num_columns;
3245 	parity_type = cset->ac->clabel->parityConfig;
3246 
3247 	/* XXX Check for duplicate components!?!?!? */
3248 
3249 	/* Determine what the mod_counter is supposed to be for this set. */
3250 
3251 	mod_counter_found = 0;
3252 	mod_counter = 0;
3253 	ac = cset->ac;
3254 	while(ac!=NULL) {
3255 		if (mod_counter_found==0) {
3256 			mod_counter = ac->clabel->mod_counter;
3257 			mod_counter_found = 1;
3258 		} else {
3259 			if (ac->clabel->mod_counter > mod_counter) {
3260 				mod_counter = ac->clabel->mod_counter;
3261 			}
3262 		}
3263 		ac = ac->next;
3264 	}
3265 
3266 	num_missing = 0;
3267 	auto_config = cset->ac;
3268 
3269 	even_pair_failed = 0;
3270 	for(c=0; c<num_cols; c++) {
3271 		ac = auto_config;
3272 		while(ac!=NULL) {
3273 			if ((ac->clabel->column == c) &&
3274 			    (ac->clabel->mod_counter == mod_counter)) {
3275 				/* it's this one... */
3276 #ifdef DEBUG
3277 				printf("Found: %s at %d\n",
3278 				       ac->devname,c);
3279 #endif
3280 				break;
3281 			}
3282 			ac=ac->next;
3283 		}
3284 		if (ac==NULL) {
3285 				/* Didn't find one here! */
3286 				/* special case for RAID 1, especially
3287 				   where there are more than 2
3288 				   components (where RAIDframe treats
3289 				   things a little differently :( ) */
3290 			if (parity_type == '1') {
3291 				if (c%2 == 0) { /* even component */
3292 					even_pair_failed = 1;
3293 				} else { /* odd component.  If
3294 					    we're failed, and
3295 					    so is the even
3296 					    component, it's
3297 					    "Good Night, Charlie" */
3298 					if (even_pair_failed == 1) {
3299 						return(0);
3300 					}
3301 				}
3302 			} else {
3303 				/* normal accounting */
3304 				num_missing++;
3305 			}
3306 		}
3307 		if ((parity_type == '1') && (c%2 == 1)) {
3308 				/* Just did an even component, and we didn't
3309 				   bail.. reset the even_pair_failed flag,
3310 				   and go on to the next component.... */
3311 			even_pair_failed = 0;
3312 		}
3313 	}
3314 
3315 	clabel = cset->ac->clabel;
3316 
3317 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
3318 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
3319 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
3320 		/* XXX this needs to be made *much* more general */
3321 		/* Too many failures */
3322 		return(0);
3323 	}
3324 	/* otherwise, all is well, and we've got enough to take a kick
3325 	   at autoconfiguring this set */
3326 	return(1);
3327 }
3328 
3329 void
3330 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3331 			RF_Raid_t *raidPtr)
3332 {
3333 	RF_ComponentLabel_t *clabel;
3334 	int i;
3335 
3336 	clabel = ac->clabel;
3337 
3338 	/* 1. Fill in the common stuff */
3339 	config->numRow = clabel->num_rows = 1;
3340 	config->numCol = clabel->num_columns;
3341 	config->numSpare = 0; /* XXX should this be set here? */
3342 	config->sectPerSU = clabel->sectPerSU;
3343 	config->SUsPerPU = clabel->SUsPerPU;
3344 	config->SUsPerRU = clabel->SUsPerRU;
3345 	config->parityConfig = clabel->parityConfig;
3346 	/* XXX... */
3347 	strcpy(config->diskQueueType,"fifo");
3348 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3349 	config->layoutSpecificSize = 0; /* XXX ?? */
3350 
3351 	while(ac!=NULL) {
3352 		/* row/col values will be in range due to the checks
3353 		   in reasonable_label() */
3354 		strcpy(config->devnames[0][ac->clabel->column],
3355 		       ac->devname);
3356 		ac = ac->next;
3357 	}
3358 
3359 	for(i=0;i<RF_MAXDBGV;i++) {
3360 		config->debugVars[i][0] = 0;
3361 	}
3362 }
3363 
3364 int
3365 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3366 {
3367 	RF_ComponentLabel_t *clabel;
3368 	int column;
3369 	int sparecol;
3370 
3371 	raidPtr->autoconfigure = new_value;
3372 
3373 	for(column=0; column<raidPtr->numCol; column++) {
3374 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3375 			clabel = raidget_component_label(raidPtr, column);
3376 			clabel->autoconfigure = new_value;
3377 			raidflush_component_label(raidPtr, column);
3378 		}
3379 	}
3380 	for(column = 0; column < raidPtr->numSpare ; column++) {
3381 		sparecol = raidPtr->numCol + column;
3382 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3383 			clabel = raidget_component_label(raidPtr, sparecol);
3384 			clabel->autoconfigure = new_value;
3385 			raidflush_component_label(raidPtr, sparecol);
3386 		}
3387 	}
3388 	return(new_value);
3389 }
3390 
3391 int
3392 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3393 {
3394 	RF_ComponentLabel_t *clabel;
3395 	int column;
3396 	int sparecol;
3397 
3398 	raidPtr->root_partition = new_value;
3399 	for(column=0; column<raidPtr->numCol; column++) {
3400 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3401 			clabel = raidget_component_label(raidPtr, column);
3402 			clabel->root_partition = new_value;
3403 			raidflush_component_label(raidPtr, column);
3404 		}
3405 	}
3406 	for(column = 0; column < raidPtr->numSpare ; column++) {
3407 		sparecol = raidPtr->numCol + column;
3408 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3409 			clabel = raidget_component_label(raidPtr, sparecol);
3410 			clabel->root_partition = new_value;
3411 			raidflush_component_label(raidPtr, sparecol);
3412 		}
3413 	}
3414 	return(new_value);
3415 }
3416 
3417 void
3418 rf_release_all_vps(RF_ConfigSet_t *cset)
3419 {
3420 	RF_AutoConfig_t *ac;
3421 
3422 	ac = cset->ac;
3423 	while(ac!=NULL) {
3424 		/* Close the vp, and give it back */
3425 		if (ac->vp) {
3426 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3427 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3428 			vput(ac->vp);
3429 			ac->vp = NULL;
3430 		}
3431 		ac = ac->next;
3432 	}
3433 }
3434 
3435 
3436 void
3437 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3438 {
3439 	RF_AutoConfig_t *ac;
3440 	RF_AutoConfig_t *next_ac;
3441 
3442 	ac = cset->ac;
3443 	while(ac!=NULL) {
3444 		next_ac = ac->next;
3445 		/* nuke the label */
3446 		free(ac->clabel, M_RAIDFRAME);
3447 		/* cleanup the config structure */
3448 		free(ac, M_RAIDFRAME);
3449 		/* "next.." */
3450 		ac = next_ac;
3451 	}
3452 	/* and, finally, nuke the config set */
3453 	free(cset, M_RAIDFRAME);
3454 }
3455 
3456 
3457 void
3458 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3459 {
3460 	/* current version number */
3461 	clabel->version = RF_COMPONENT_LABEL_VERSION;
3462 	clabel->serial_number = raidPtr->serial_number;
3463 	clabel->mod_counter = raidPtr->mod_counter;
3464 
3465 	clabel->num_rows = 1;
3466 	clabel->num_columns = raidPtr->numCol;
3467 	clabel->clean = RF_RAID_DIRTY; /* not clean */
3468 	clabel->status = rf_ds_optimal; /* "It's good!" */
3469 
3470 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3471 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3472 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3473 
3474 	clabel->blockSize = raidPtr->bytesPerSector;
3475 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
3476 
3477 	/* XXX not portable */
3478 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3479 	clabel->maxOutstanding = raidPtr->maxOutstanding;
3480 	clabel->autoconfigure = raidPtr->autoconfigure;
3481 	clabel->root_partition = raidPtr->root_partition;
3482 	clabel->last_unit = raidPtr->raidid;
3483 	clabel->config_order = raidPtr->config_order;
3484 
3485 #ifndef RF_NO_PARITY_MAP
3486 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
3487 #endif
3488 }
3489 
3490 struct raid_softc *
3491 rf_auto_config_set(RF_ConfigSet_t *cset)
3492 {
3493 	RF_Raid_t *raidPtr;
3494 	RF_Config_t *config;
3495 	int raidID;
3496 	struct raid_softc *sc;
3497 
3498 #ifdef DEBUG
3499 	printf("RAID autoconfigure\n");
3500 #endif
3501 
3502 	/* 1. Create a config structure */
3503 	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
3504 	if (config == NULL) {
3505 		printf("%s: Out of mem - config!?!?\n", __func__);
3506 				/* XXX do something more intelligent here. */
3507 		return NULL;
3508 	}
3509 
3510 	/*
3511 	   2. Figure out what RAID ID this one is supposed to live at
3512 	   See if we can get the same RAID dev that it was configured
3513 	   on last time..
3514 	*/
3515 
3516 	raidID = cset->ac->clabel->last_unit;
3517 	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
3518 	     sc = raidget(++raidID, false))
3519 		continue;
3520 #ifdef DEBUG
3521 	printf("Configuring raid%d:\n",raidID);
3522 #endif
3523 
3524 	if (sc == NULL)
3525 		sc = raidget(raidID, true);
3526 	if (sc == NULL) {
3527 		printf("%s: Out of mem - softc!?!?\n", __func__);
3528 				/* XXX do something more intelligent here. */
3529 		free(config, M_RAIDFRAME);
3530 		return NULL;
3531 	}
3532 
3533 	raidPtr = &sc->sc_r;
3534 
3535 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
3536 	raidPtr->softc = sc;
3537 	raidPtr->raidid = raidID;
3538 	raidPtr->openings = RAIDOUTSTANDING;
3539 
3540 	/* 3. Build the configuration structure */
3541 	rf_create_configuration(cset->ac, config, raidPtr);
3542 
3543 	/* 4. Do the configuration */
3544 	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
3545 		raidinit(sc);
3546 
3547 		rf_markalldirty(raidPtr);
3548 		raidPtr->autoconfigure = 1; /* XXX do this here? */
3549 		switch (cset->ac->clabel->root_partition) {
3550 		case 1:	/* Force Root */
3551 		case 2:	/* Soft Root: root when boot partition part of raid */
3552 			/*
3553 			 * everything configured just fine.  Make a note
3554 			 * that this set is eligible to be root,
3555 			 * or forced to be root
3556 			 */
3557 			cset->rootable = cset->ac->clabel->root_partition;
3558 			/* XXX do this here? */
3559 			raidPtr->root_partition = cset->rootable;
3560 			break;
3561 		default:
3562 			break;
3563 		}
3564 	} else {
3565 		raidput(sc);
3566 		sc = NULL;
3567 	}
3568 
3569 	/* 5. Cleanup */
3570 	free(config, M_RAIDFRAME);
3571 	return sc;
3572 }
3573 
3574 void
3575 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
3576 	     size_t xmin, size_t xmax)
3577 {
3578 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
3579 	pool_sethiwat(p, xmax);
3580 	pool_prime(p, xmin);
3581 	pool_setlowat(p, xmin);
3582 }
3583 
3584 /*
3585  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3586  * to see if there is IO pending and if that IO could possibly be done
3587  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
3588  * otherwise.
3589  *
3590  */
3591 int
3592 rf_buf_queue_check(RF_Raid_t *raidPtr)
3593 {
3594 	struct raid_softc *rs;
3595 	struct dk_softc *dksc;
3596 
3597 	rs = raidPtr->softc;
3598 	dksc = &rs->sc_dksc;
3599 
3600 	if ((rs->sc_flags & RAIDF_INITED) == 0)
3601 		return 1;
3602 
3603 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3604 		/* there is work to do */
3605 		return 0;
3606 	}
3607 	/* default is nothing to do */
3608 	return 1;
3609 }
3610 
3611 int
3612 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3613 {
3614 	uint64_t numsecs;
3615 	unsigned secsize;
3616 	int error;
3617 
3618 	error = getdisksize(vp, &numsecs, &secsize);
3619 	if (error == 0) {
3620 		diskPtr->blockSize = secsize;
3621 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
3622 		diskPtr->partitionSize = numsecs;
3623 		return 0;
3624 	}
3625 	return error;
3626 }
3627 
3628 static int
3629 raid_match(device_t self, cfdata_t cfdata, void *aux)
3630 {
3631 	return 1;
3632 }
3633 
3634 static void
3635 raid_attach(device_t parent, device_t self, void *aux)
3636 {
3637 }
3638 
3639 
3640 static int
3641 raid_detach(device_t self, int flags)
3642 {
3643 	int error;
3644 	struct raid_softc *rs = raidsoftc(self);
3645 
3646 	if (rs == NULL)
3647 		return ENXIO;
3648 
3649 	if ((error = raidlock(rs)) != 0)
3650 		return (error);
3651 
3652 	error = raid_detach_unlocked(rs);
3653 
3654 	raidunlock(rs);
3655 
3656 	/* XXX raid can be referenced here */
3657 
3658 	if (error)
3659 		return error;
3660 
3661 	/* Free the softc */
3662 	raidput(rs);
3663 
3664 	return 0;
3665 }
3666 
3667 static void
3668 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
3669 {
3670 	struct dk_softc *dksc = &rs->sc_dksc;
3671 	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
3672 
3673 	memset(dg, 0, sizeof(*dg));
3674 
3675 	dg->dg_secperunit = raidPtr->totalSectors;
3676 	dg->dg_secsize = raidPtr->bytesPerSector;
3677 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
3678 	dg->dg_ntracks = 4 * raidPtr->numCol;
3679 
3680 	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
3681 }
3682 
3683 /*
3684  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3685  * We end up returning whatever error was returned by the first cache flush
3686  * that fails.
3687  */
3688 
3689 int
3690 rf_sync_component_caches(RF_Raid_t *raidPtr)
3691 {
3692 	int c, sparecol;
3693 	int e,error;
3694 	int force = 1;
3695 
3696 	error = 0;
3697 	for (c = 0; c < raidPtr->numCol; c++) {
3698 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
3699 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3700 					  &force, FWRITE, NOCRED);
3701 			if (e) {
3702 				if (e != ENODEV)
3703 					printf("raid%d: cache flush to component %s failed.\n",
3704 					       raidPtr->raidid, raidPtr->Disks[c].devname);
3705 				if (error == 0) {
3706 					error = e;
3707 				}
3708 			}
3709 		}
3710 	}
3711 
3712 	for( c = 0; c < raidPtr->numSpare ; c++) {
3713 		sparecol = raidPtr->numCol + c;
3714 		/* Need to ensure that the reconstruct actually completed! */
3715 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3716 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3717 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
3718 			if (e) {
3719 				if (e != ENODEV)
3720 					printf("raid%d: cache flush to component %s failed.\n",
3721 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3722 				if (error == 0) {
3723 					error = e;
3724 				}
3725 			}
3726 		}
3727 	}
3728 	return error;
3729 }
3730 
3731 /*
3732  * Module interface
3733  */
3734 
3735 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");
3736 
3737 #ifdef _MODULE
3738 CFDRIVER_DECL(raid, DV_DISK, NULL);
3739 #endif
3740 
3741 static int raid_modcmd(modcmd_t, void *);
3742 static int raid_modcmd_init(void);
3743 static int raid_modcmd_fini(void);
3744 
3745 static int
3746 raid_modcmd(modcmd_t cmd, void *data)
3747 {
3748 	int error;
3749 
3750 	error = 0;
3751 	switch (cmd) {
3752 	case MODULE_CMD_INIT:
3753 		error = raid_modcmd_init();
3754 		break;
3755 	case MODULE_CMD_FINI:
3756 		error = raid_modcmd_fini();
3757 		break;
3758 	default:
3759 		error = ENOTTY;
3760 		break;
3761 	}
3762 	return error;
3763 }
3764 
3765 static int
3766 raid_modcmd_init(void)
3767 {
3768 	int error;
3769 	int bmajor, cmajor;
3770 
3771 	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
3772 	mutex_enter(&raid_lock);
3773 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
3774 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
3775 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
3776 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
3777 
3778 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
3779 #endif
3780 
3781 	bmajor = cmajor = -1;
3782 	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
3783 	    &raid_cdevsw, &cmajor);
3784 	if (error != 0 && error != EEXIST) {
3785 		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
3786 		mutex_exit(&raid_lock);
3787 		return error;
3788 	}
3789 #ifdef _MODULE
3790 	error = config_cfdriver_attach(&raid_cd);
3791 	if (error != 0) {
3792 		aprint_error("%s: config_cfdriver_attach failed %d\n",
3793 		    __func__, error);
3794 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
3795 		mutex_exit(&raid_lock);
3796 		return error;
3797 	}
3798 #endif
3799 	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3800 	if (error != 0) {
3801 		aprint_error("%s: config_cfattach_attach failed %d\n",
3802 		    __func__, error);
3803 #ifdef _MODULE
3804 		config_cfdriver_detach(&raid_cd);
3805 #endif
3806 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
3807 		mutex_exit(&raid_lock);
3808 		return error;
3809 	}
3810 
3811 	raidautoconfigdone = false;
3812 
3813 	mutex_exit(&raid_lock);
3814 
3815 	if (error == 0) {
3816 		if (rf_BootRaidframe(true) == 0)
3817 			aprint_verbose("Kernelized RAIDframe activated\n");
3818 		else
3819 			panic("Serious error activating RAID!!");
3820 	}
3821 
3822 	/*
3823 	 * Register a finalizer which will be used to auto-config RAID
3824 	 * sets once all real hardware devices have been found.
3825 	 */
3826 	error = config_finalize_register(NULL, rf_autoconfig);
3827 	if (error != 0) {
3828 		aprint_error("WARNING: unable to register RAIDframe "
3829 		    "finalizer\n");
3830 		error = 0;
3831 	}
3832 
3833 	return error;
3834 }
3835 
3836 static int
3837 raid_modcmd_fini(void)
3838 {
3839 	int error;
3840 
3841 	mutex_enter(&raid_lock);
3842 
3843 	/* Don't allow unload if raid device(s) exist.  */
3844 	if (!LIST_EMPTY(&raids)) {
3845 		mutex_exit(&raid_lock);
3846 		return EBUSY;
3847 	}
3848 
3849 	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
3850 	if (error != 0) {
3851 		aprint_error("%s: cannot detach cfattach\n",__func__);
3852 		mutex_exit(&raid_lock);
3853 		return error;
3854 	}
3855 #ifdef _MODULE
3856 	error = config_cfdriver_detach(&raid_cd);
3857 	if (error != 0) {
3858 		aprint_error("%s: cannot detach cfdriver\n",__func__);
3859 		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3860 		mutex_exit(&raid_lock);
3861 		return error;
3862 	}
3863 #endif
3864 	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
3865 	if (error != 0) {
3866 		aprint_error("%s: cannot detach devsw\n",__func__);
3867 #ifdef _MODULE
3868 		config_cfdriver_attach(&raid_cd);
3869 #endif
3870 		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
3871 		mutex_exit(&raid_lock);
3872 		return error;
3873 	}
3874 	rf_BootRaidframe(false);
3875 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
3876 	rf_destroy_mutex2(rf_sparet_wait_mutex);
3877 	rf_destroy_cond2(rf_sparet_wait_cv);
3878 	rf_destroy_cond2(rf_sparet_resp_cv);
3879 #endif
3880 	mutex_exit(&raid_lock);
3881 	mutex_destroy(&raid_lock);
3882 
3883 	return error;
3884 }
3885