xref: /netbsd-src/sys/dev/raidframe/rf_netbsdkintf.c (revision 181254a7b1bdde6873432bffef2d2decc4b5c22f)
1 /*	$NetBSD: rf_netbsdkintf.c,v 1.389 2020/08/25 13:50:00 skrll Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Greg Oster; Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1988 University of Utah.
34  * Copyright (c) 1990, 1993
35  *      The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed to Berkeley by
38  * the Systems Programming Group of the University of Utah Computer
39  * Science Department.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
66  *
67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
68  */
69 
70 /*
71  * Copyright (c) 1995 Carnegie-Mellon University.
72  * All rights reserved.
73  *
74  * Authors: Mark Holland, Jim Zelenka
75  *
76  * Permission to use, copy, modify and distribute this software and
77  * its documentation is hereby granted, provided that both the copyright
78  * notice and this permission notice appear in all copies of the
79  * software, derivative works or modified versions, and any portions
80  * thereof, and that both notices appear in supporting documentation.
81  *
82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85  *
86  * Carnegie Mellon requests users of this software to return to
87  *
88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
89  *  School of Computer Science
90  *  Carnegie Mellon University
91  *  Pittsburgh PA 15213-3890
92  *
93  * any improvements or extensions that they make and grant Carnegie the
94  * rights to redistribute these changes.
95  */
96 
97 /***********************************************************
98  *
99  * rf_kintf.c -- the kernel interface routines for RAIDframe
100  *
101  ***********************************************************/
102 
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.389 2020/08/25 13:50:00 skrll Exp $");
105 
106 #ifdef _KERNEL_OPT
107 #include "opt_raid_autoconfig.h"
108 #include "opt_compat_netbsd32.h"
109 #endif
110 
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130 #include <sys/compat_stub.h>
131 
132 #include <prop/proplib.h>
133 
134 #include <dev/raidframe/raidframevar.h>
135 #include <dev/raidframe/raidframeio.h>
136 #include <dev/raidframe/rf_paritymap.h>
137 
138 #include "rf_raid.h"
139 #include "rf_copyback.h"
140 #include "rf_dag.h"
141 #include "rf_dagflags.h"
142 #include "rf_desc.h"
143 #include "rf_diskqueue.h"
144 #include "rf_etimer.h"
145 #include "rf_general.h"
146 #include "rf_kintf.h"
147 #include "rf_options.h"
148 #include "rf_driver.h"
149 #include "rf_parityscan.h"
150 #include "rf_threadstuff.h"
151 
152 #include "ioconf.h"
153 
154 #ifdef DEBUG
155 int     rf_kdebug_level = 0;
156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
157 #else				/* DEBUG */
158 #define db1_printf(a) { }
159 #endif				/* DEBUG */
160 
161 #ifdef DEBUG_ROOT
162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
163 #else
164 #define DPRINTF(a, ...)
165 #endif
166 
167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
168 static rf_declare_mutex2(rf_sparet_wait_mutex);
169 static rf_declare_cond2(rf_sparet_wait_cv);
170 static rf_declare_cond2(rf_sparet_resp_cv);
171 
172 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
173 						 * spare table */
174 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
175 						 * installation process */
176 #endif
177 
178 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
179 
180 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
181 
182 /* prototypes */
183 static void KernelWakeupFunc(struct buf *);
184 static void InitBP(struct buf *, struct vnode *, unsigned,
185     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
186     void *, int);
187 static void raidinit(struct raid_softc *);
188 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
189 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
190 
191 static int raid_match(device_t, cfdata_t, void *);
192 static void raid_attach(device_t, device_t, void *);
193 static int raid_detach(device_t, int);
194 
195 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
196     daddr_t, daddr_t);
197 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
198     daddr_t, daddr_t, int);
199 
200 static int raidwrite_component_label(unsigned,
201     dev_t, struct vnode *, RF_ComponentLabel_t *);
202 static int raidread_component_label(unsigned,
203     dev_t, struct vnode *, RF_ComponentLabel_t *);
204 
205 static int raid_diskstart(device_t, struct buf *bp);
206 static int raid_dumpblocks(device_t, void *, daddr_t, int);
207 static int raid_lastclose(device_t);
208 
209 static dev_type_open(raidopen);
210 static dev_type_close(raidclose);
211 static dev_type_read(raidread);
212 static dev_type_write(raidwrite);
213 static dev_type_ioctl(raidioctl);
214 static dev_type_strategy(raidstrategy);
215 static dev_type_dump(raiddump);
216 static dev_type_size(raidsize);
217 
/*
 * Block-device switch entry points for raid(4).  D_DISK marks this as
 * a disk-class device so the buffer cache and disklabel code apply.
 */
218 const struct bdevsw raid_bdevsw = {
219 	.d_open = raidopen,
220 	.d_close = raidclose,
221 	.d_strategy = raidstrategy,
222 	.d_ioctl = raidioctl,
223 	.d_dump = raiddump,
224 	.d_psize = raidsize,
225 	.d_discard = nodiscard,
226 	.d_flag = D_DISK
227 };
228 
/*
 * Character-device (raw) switch entry points for raid(4).  Raw reads
 * and writes go through physio() via raidread()/raidwrite(); tty,
 * poll, mmap etc. are stubbed out since they make no sense for a disk.
 */
229 const struct cdevsw raid_cdevsw = {
230 	.d_open = raidopen,
231 	.d_close = raidclose,
232 	.d_read = raidread,
233 	.d_write = raidwrite,
234 	.d_ioctl = raidioctl,
235 	.d_stop = nostop,
236 	.d_tty = notty,
237 	.d_poll = nopoll,
238 	.d_mmap = nommap,
239 	.d_kqfilter = nokqfilter,
240 	.d_discard = nodiscard,
241 	.d_flag = D_DISK
242 };
243 
/*
 * Hooks handed to the common dk(9) disk framework: it calls back into
 * raid_diskstart() to issue queued I/O, raid_dumpblocks() for crash
 * dumps, and raid_lastclose() when the last partition is closed.
 */
244 static struct dkdriver rf_dkdriver = {
245 	.d_open = raidopen,
246 	.d_close = raidclose,
247 	.d_strategy = raidstrategy,
248 	.d_diskstart = raid_diskstart,
249 	.d_dumpblocks = raid_dumpblocks,
250 	.d_lastclose = raid_lastclose,
251 	.d_minphys = minphys
252 };
253 
254 #define	raidunit(x)	DISKUNIT(x)
255 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
256 
257 extern struct cfdriver raid_cd;
258 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
259     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
260     DVF_DETACH_SHUTDOWN);
261 
262 /* Internal representation of a rf_recon_req */
263 struct rf_recon_req_internal {
264 	RF_RowCol_t col;
265 	RF_ReconReqFlags_t flags;
266 	void   *raidPtr;
267 };
268 
269 /*
270  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
271  * Be aware that large numbers can allow the driver to consume a lot of
272  * kernel memory, especially on writes, and in degraded mode reads.
273  *
274  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
275  * a single 64K write will typically require 64K for the old data,
276  * 64K for the old parity, and 64K for the new parity, for a total
277  * of 192K (if the parity buffer is not re-used immediately).
278  * Even it if is used immediately, that's still 128K, which when multiplied
279  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
280  *
281  * Now in degraded mode, for example, a 64K read on the above setup may
282  * require data reconstruction, which will require *all* of the 4 remaining
283  * disks to participate -- 4 * 32K/disk == 128K again.
284  */
285 
286 #ifndef RAIDOUTSTANDING
287 #define RAIDOUTSTANDING   6
288 #endif
289 
290 #define RAIDLABELDEV(dev)	\
291 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
292 
293 /* declared here, and made public, for the benefit of KVM stuff.. */
294 
295 static int raidlock(struct raid_softc *);
296 static void raidunlock(struct raid_softc *);
297 
298 static int raid_detach_unlocked(struct raid_softc *);
299 
300 static void rf_markalldirty(RF_Raid_t *);
301 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
302 
303 void rf_ReconThread(struct rf_recon_req_internal *);
304 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
305 void rf_CopybackThread(RF_Raid_t *raidPtr);
306 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
307 int rf_autoconfig(device_t);
308 void rf_buildroothack(RF_ConfigSet_t *);
309 
310 RF_AutoConfig_t *rf_find_raid_components(void);
311 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
312 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
313 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
314 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
315 int rf_set_autoconfig(RF_Raid_t *, int);
316 int rf_set_rootpartition(RF_Raid_t *, int);
317 void rf_release_all_vps(RF_ConfigSet_t *);
318 void rf_cleanup_config_set(RF_ConfigSet_t *);
319 int rf_have_enough_components(RF_ConfigSet_t *);
320 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
321 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
322 
323 /*
324  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
325  * Note that this is overridden by having RAID_AUTOCONFIG as an option
326  * in the kernel config file.
327  */
328 #ifdef RAID_AUTOCONFIG
329 int raidautoconfig = 1;
330 #else
331 int raidautoconfig = 0;
332 #endif
333 static bool raidautoconfigdone = false;
334 
335 struct RF_Pools_s rf_pools;
336 
337 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
338 static kmutex_t raid_lock;
339 
340 static struct raid_softc *
341 raidcreate(int unit) {
342 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
343 	sc->sc_unit = unit;
344 	cv_init(&sc->sc_cv, "raidunit");
345 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
346 	return sc;
347 }
348 
349 static void
350 raiddestroy(struct raid_softc *sc) {
351 	cv_destroy(&sc->sc_cv);
352 	mutex_destroy(&sc->sc_mutex);
353 	kmem_free(sc, sizeof(*sc));
354 }
355 
356 static struct raid_softc *
357 raidget(int unit, bool create) {
358 	struct raid_softc *sc;
359 	if (unit < 0) {
360 #ifdef DIAGNOSTIC
361 		panic("%s: unit %d!", __func__, unit);
362 #endif
363 		return NULL;
364 	}
365 	mutex_enter(&raid_lock);
366 	LIST_FOREACH(sc, &raids, sc_link) {
367 		if (sc->sc_unit == unit) {
368 			mutex_exit(&raid_lock);
369 			return sc;
370 		}
371 	}
372 	mutex_exit(&raid_lock);
373 	if (!create)
374 		return NULL;
375 	sc = raidcreate(unit);
376 	mutex_enter(&raid_lock);
377 	LIST_INSERT_HEAD(&raids, sc, sc_link);
378 	mutex_exit(&raid_lock);
379 	return sc;
380 }
381 
382 static void
383 raidput(struct raid_softc *sc) {
	/* Unlink the softc from the global list, then free it. */
384 	mutex_enter(&raid_lock);
385 	LIST_REMOVE(sc, sc_link);
386 	mutex_exit(&raid_lock);
387 	raiddestroy(sc);
388 }
389 
/*
 * Historical pseudo-device attach hook; kept only so the config glue
 * that references it still links.  'num' is intentionally unused.
 */
390 void
391 raidattach(int num)
392 {
393 
394 	/*
395 	 * Device attachment and associated initialization now occurs
396 	 * as part of the module initialization.
397 	 */
398 }
399 
/*
 * Autoconfigure RAID sets at boot: scan all disks for RAIDframe
 * component labels, group them into sets, and configure the valid
 * ones (done inside rf_buildroothack()).  Returns 1 if a scan was
 * performed, 0 if autoconfig is disabled or already done.
 */
400 int
401 rf_autoconfig(device_t self)
402 {
403 	RF_AutoConfig_t *ac_list;
404 	RF_ConfigSet_t *config_sets;
405 
406 	if (!raidautoconfig || raidautoconfigdone == true)
407 		return 0;
408 
409 	/* XXX This code can only be run once. */
410 	raidautoconfigdone = true;
411 
412 #ifdef __HAVE_CPU_BOOTCONF
413 	/*
414 	 * 0. find the boot device if needed first so we can use it later
415 	 * this needs to be done before we autoconfigure any raid sets,
416 	 * because if we use wedges we are not going to be able to open
417 	 * the boot device later
418 	 */
419 	if (booted_device == NULL)
420 		cpu_bootconf();
421 #endif
422 	/* 1. locate all RAID components on the system */
423 	aprint_debug("Searching for RAID components...\n");
424 	ac_list = rf_find_raid_components();
425 
426 	/* 2. Sort them into their respective sets. */
427 	config_sets = rf_create_auto_sets(ac_list);
428 
429 	/*
430 	 * 3. Evaluate each set and configure the valid ones.
431 	 * This gets done in rf_buildroothack().
432 	 */
433 	rf_buildroothack(config_sets);
434 
435 	return 1;
436 }
437 
438 int
439 rf_inited(const struct raid_softc *rs) {
440 	return (rs->sc_flags & RAIDF_INITED) != 0;
441 }
442 
443 RF_Raid_t *
444 rf_get_raid(struct raid_softc *rs) {
	/* Accessor: the RAIDframe state embedded in the softc. */
445 	return &rs->sc_r;
446 }
447 
448 int
449 rf_get_unit(const struct raid_softc *rs) {
	/* Accessor: the raid(4) unit number of this softc. */
450 	return rs->sc_unit;
451 }
452 
/*
 * Return nonzero if RAID set 'r' contains the boot device 'bdv'.
 * Comparison is by device name prefix, so e.g. boot device "wd0"
 * matches a component named "/dev/wd0a".  For dk(4) wedge components
 * the wedge's parent disk name is compared instead.
 */
453 static int
454 rf_containsboot(RF_Raid_t *r, device_t bdv) {
455 	const char *bootname;
456 	size_t len;
457 
458 	/* if bdv is NULL, the set can't contain it. exit early. */
459 	if (bdv == NULL)
460 		return 0;
461 
462 	bootname = device_xname(bdv);
463 	len = strlen(bootname);
464 
465 	for (int col = 0; col < r->numCol; col++) {
466 		const char *devname = r->Disks[col].devname;
		/* skip the "/dev/" prefix stored in the component name */
467 		devname += sizeof("/dev/") - 1;
468 		if (strncmp(devname, "dk", 2) == 0) {
469 			const char *parent =
470 			    dkwedge_get_parent_name(r->Disks[col].dev);
471 			if (parent != NULL)
472 				devname = parent;
473 		}
474 		if (strncmp(devname, bootname, len) == 0) {
475 			struct raid_softc *sc = r->softc;
476 			aprint_debug("raid%d includes boot device %s\n",
477 			    sc->sc_unit, devname);
478 			return 1;
479 		}
480 	}
481 	return 0;
482 }
483 
/*
 * Configure all autoconfigurable RAID sets, then decide whether one
 * of them should become the root device.  If exactly one rootable set
 * exists, it is taken as root when it contains (or forcibly overrides)
 * the firmware's booted_device.  With several candidates, the set that
 * contains booted_device wins; otherwise the user is asked (RB_ASKNAME).
 * A user-supplied rootspec always takes precedence and disables all of
 * this.  Frees/releases every config set before returning.
 */
484 void
485 rf_buildroothack(RF_ConfigSet_t *config_sets)
486 {
487 	RF_ConfigSet_t *cset;
488 	RF_ConfigSet_t *next_cset;
489 	int num_root;
490 	struct raid_softc *sc, *rsc;
491 	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */
492 
493 	sc = rsc = NULL;
494 	num_root = 0;
495 	cset = config_sets;
496 	while (cset != NULL) {
497 		next_cset = cset->next;
498 		if (rf_have_enough_components(cset) &&
499 		    cset->ac->clabel->autoconfigure == 1) {
500 			sc = rf_auto_config_set(cset);
501 			if (sc != NULL) {
502 				aprint_debug("raid%d: configured ok, rootable %d\n",
503 				    sc->sc_unit, cset->rootable);
				/* remember the last rootable set we saw */
504 				if (cset->rootable) {
505 					rsc = sc;
506 					num_root++;
507 				}
508 			} else {
509 				/* The autoconfig didn't work :( */
510 				aprint_debug("Autoconfig failed\n");
511 				rf_release_all_vps(cset);
512 			}
513 		} else {
514 			/* we're not autoconfiguring this set...
515 			   release the associated resources */
516 			rf_release_all_vps(cset);
517 		}
518 		/* cleanup */
519 		rf_cleanup_config_set(cset);
520 		cset = next_cset;
521 	}
522 
523 	/* if the user has specified what the root device should be
524 	   then we don't touch booted_device or boothowto... */
525 
526 	if (rootspec != NULL) {
527 		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
528 		return;
529 	}
530 
531 	/* we found something bootable... */
532 
533 	/*
534 	 * XXX: The following code assumes that the root raid
535 	 * is the first ('a') partition. This is about the best
536 	 * we can do with a BSD disklabel, but we might be able
537 	 * to do better with a GPT label, by setting a specified
538 	 * attribute to indicate the root partition. We can then
539 	 * stash the partition number in the r->root_partition
540 	 * high bits (the bottom 2 bits are already used). For
541 	 * now we just set booted_partition to 0 when we override
542 	 * root.
543 	 */
544 	if (num_root == 1) {
545 		device_t candidate_root;
546 		dksc = &rsc->sc_dksc;
547 		if (dksc->sc_dkdev.dk_nwedges != 0) {
548 			char cname[sizeof(cset->ac->devname)];
549 			/* XXX: assume partition 'a' first */
550 			snprintf(cname, sizeof(cname), "%s%c",
551 			    device_xname(dksc->sc_dev), 'a');
552 			candidate_root = dkwedge_find_by_wname(cname);
553 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
554 			    cname);
555 			if (candidate_root == NULL) {
556 				/*
557 				 * If that is not found, because we don't use
558 				 * disklabel, return the first dk child
559 				 * XXX: we can skip the 'a' check above
560 				 * and always do this...
561 				 */
562 				size_t i = 0;
563 				candidate_root = dkwedge_find_by_parent(
564 				    device_xname(dksc->sc_dev), &i);
565 			}
566 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
567 			    candidate_root);
568 		} else
569 			candidate_root = dksc->sc_dev;
570 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
571 		DPRINTF("%s: booted_device=%p root_partition=%d "
572 			"contains_boot=%d",
573 		    __func__, booted_device, rsc->sc_r.root_partition,
574 			   rf_containsboot(&rsc->sc_r, booted_device));
575 		/* XXX the check for booted_device == NULL can probably be
576 		 * dropped, now that rf_containsboot handles that case.
577 		 */
		/* root_partition == 1 means "force root onto this set" */
578 		if (booted_device == NULL ||
579 		    rsc->sc_r.root_partition == 1 ||
580 		    rf_containsboot(&rsc->sc_r, booted_device)) {
581 			booted_device = candidate_root;
582 			booted_method = "raidframe/single";
583 			booted_partition = 0;	/* XXX assume 'a' */
584 		}
585 	} else if (num_root > 1) {
586 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
587 		    booted_device);
588 
589 		/*
590 		 * Maybe the MD code can help. If it cannot, then
591 		 * setroot() will discover that we have no
592 		 * booted_device and will ask the user if nothing was
593 		 * hardwired in the kernel config file
594 		 */
595 		if (booted_device == NULL)
596 			return;
597 
		/* narrow the candidates down to sets containing the boot disk */
598 		num_root = 0;
599 		mutex_enter(&raid_lock);
600 		LIST_FOREACH(sc, &raids, sc_link) {
601 			RF_Raid_t *r = &sc->sc_r;
602 			if (r->valid == 0)
603 				continue;
604 
605 			if (r->root_partition == 0)
606 				continue;
607 
608 			if (rf_containsboot(r, booted_device)) {
609 				num_root++;
610 				rsc = sc;
611 				dksc = &rsc->sc_dksc;
612 			}
613 		}
614 		mutex_exit(&raid_lock);
615 
616 		if (num_root == 1) {
617 			booted_device = dksc->sc_dev;
618 			booted_method = "raidframe/multi";
619 			booted_partition = 0;	/* XXX assume 'a' */
620 		} else {
621 			/* we can't guess.. require the user to answer... */
622 			boothowto |= RB_ASKNAME;
623 		}
624 	}
625 }
626 
627 static int
628 raidsize(dev_t dev)
629 {
630 	struct raid_softc *rs;
631 	struct dk_softc *dksc;
632 	unsigned int unit;
633 
634 	unit = raidunit(dev);
635 	if ((rs = raidget(unit, false)) == NULL)
636 		return -1;
637 	dksc = &rs->sc_dksc;
638 
639 	if ((rs->sc_flags & RAIDF_INITED) == 0)
640 		return -1;
641 
642 	return dk_size(dksc, dev);
643 }
644 
/*
 * d_dump entry point: crash-dump 'size' bytes from 'va' starting at
 * partition-relative block 'blkno'.  The real work is done in
 * raid_dumpblocks() via the dk(9) framework.
 */
645 static int
646 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
647 {
648 	unsigned int unit;
649 	struct raid_softc *rs;
650 	struct dk_softc *dksc;
651 
652 	unit = raidunit(dev);
653 	if ((rs = raidget(unit, false)) == NULL)
654 		return ENXIO;
655 	dksc = &rs->sc_dksc;
656 
657 	if ((rs->sc_flags & RAIDF_INITED) == 0)
658 		return ENODEV;
659 
660         /*
661            Note that blkno is relative to this particular partition.
662            By adding RF_PROTECTED_SECTORS, we get a value that
663 	   is relative to the partition used for the underlying component.
664         */
665 	blkno += RF_PROTECTED_SECTORS;
666 
667 	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
668 }
669 
/*
 * Write 'nblk' blocks from 'va' at block 'blkno' during a crash dump.
 * Only RAID 1 sets are supported: pick a single live (or used-spare)
 * component and dump straight to its block device, preferring the
 * first component so autoconfiguration finds the dump on reboot.
 */
670 static int
671 raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
672 {
673 	struct raid_softc *rs = raidsoftc(dev);
674 	const struct bdevsw *bdev;
675 	RF_Raid_t *raidPtr;
676 	int     c, sparecol, j, scol, dumpto;
677 	int     error = 0;
678 
679 	raidPtr = &rs->sc_r;
680 
681 	/* we only support dumping to RAID 1 sets */
682 	if (raidPtr->Layout.numDataCol != 1 ||
683 	    raidPtr->Layout.numParityCol != 1)
684 		return EINVAL;
685 
686 	if ((error = raidlock(rs)) != 0)
687 		return error;
688 
689 	/* figure out what device is alive.. */
690 
691 	/*
692 	   Look for a component to dump to.  The preference for the
693 	   component to dump to is as follows:
694 	   1) the first component
695 	   2) a used_spare of the first component
696 	   3) the second component
697 	   4) a used_spare of the second component
698 	*/
699 
700 	dumpto = -1;
701 	for (c = 0; c < raidPtr->numCol; c++) {
702 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
703 			/* this might be the one */
704 			dumpto = c;
705 			break;
706 		}
707 	}
708 
709 	/*
710 	   At this point we have possibly selected a live component.
711 	   If we didn't find a live component, we now check to see
712 	   if there is a relevant spared component.
713 	*/
714 
715 	for (c = 0; c < raidPtr->numSpare; c++) {
716 		sparecol = raidPtr->numCol + c;
717 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
718 			/* How about this one? */
			/* find which column this spare is standing in for */
719 			scol = -1;
720 			for(j=0;j<raidPtr->numCol;j++) {
721 				if (raidPtr->Disks[j].spareCol == sparecol) {
722 					scol = j;
723 					break;
724 				}
725 			}
726 			if (scol == 0) {
727 				/*
728 				   We must have found a spared first
729 				   component!  We'll take that over
730 				   anything else found so far.  (We
731 				   couldn't have found a real first
732 				   component before, since this is a
733 				   used spare, and it's saying that
734 				   it's replacing the first
735 				   component.)  On reboot (with
736 				   autoconfiguration turned on)
737 				   sparecol will become the first
738 				   component (component0) of this set.
739 				*/
740 				dumpto = sparecol;
741 				break;
742 			} else if (scol != -1) {
743 				/*
744 				   Must be a spared second component.
745 				   We'll dump to that if we haven't found
746 				   anything else so far.
747 				*/
748 				if (dumpto == -1)
749 					dumpto = sparecol;
750 			}
751 		}
752 	}
753 
754 	if (dumpto == -1) {
755 		/* we couldn't find any live components to dump to!?!?
756 		 */
757 		error = EINVAL;
758 		goto out;
759 	}
760 
761 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
762 	if (bdev == NULL) {
763 		error = ENXIO;
764 		goto out;
765 	}
766 
	/* dump directly through the component's own d_dump routine */
767 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
768 				blkno, va, nblk * raidPtr->bytesPerSector);
769 
770 out:
771 	raidunlock(rs);
772 
773 	return error;
774 }
775 
/*
 * Open entry point for both the block and character devices.  Creates
 * the softc on first reference, refuses opens on a unit being shut
 * down, marks components dirty on the first open of a configured set,
 * and defers the rest to dk_open().
 */
776 /* ARGSUSED */
777 static int
778 raidopen(dev_t dev, int flags, int fmt,
779     struct lwp *l)
780 {
781 	int     unit = raidunit(dev);
782 	struct raid_softc *rs;
783 	struct dk_softc *dksc;
784 	int     error = 0;
785 	int     part, pmask;
786 
787 	if ((rs = raidget(unit, true)) == NULL)
788 		return ENXIO;
789 	if ((error = raidlock(rs)) != 0)
790 		return error;
791 
792 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
793 		error = EBUSY;
794 		goto bad;
795 	}
796 
797 	dksc = &rs->sc_dksc;
798 
799 	part = DISKPART(dev);
800 	pmask = (1 << part);
801 
802 	if (!DK_BUSY(dksc, pmask) &&
803 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
804 		/* First one... mark things as dirty... Note that we *MUST*
805 		 have done a configure before this.  I DO NOT WANT TO BE
806 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
807 		 THAT THEY BELONG TOGETHER!!!!! */
808 		/* XXX should check to see if we're only open for reading
809 		   here... If so, we needn't do this, but then need some
810 		   other way of keeping track of what's happened.. */
811 
812 		rf_markalldirty(&rs->sc_r);
813 	}
814 
815 	if ((rs->sc_flags & RAIDF_INITED) != 0)
816 		error = dk_open(dksc, dev, flags, fmt, l);
817 
818 bad:
819 	raidunlock(rs);
820 
821 	return error;
822 
823 
824 }
825 
/*
 * dk(9) d_lastclose hook: called when the final partition of the unit
 * is closed.  Writes out final component labels and, if a shutdown was
 * requested, arranges for raidclose() to detach the device.
 */
826 static int
827 raid_lastclose(device_t self)
828 {
829 	struct raid_softc *rs = raidsoftc(self);
830 
831 	/* Last one... device is not unconfigured yet.
832 	   Device shutdown has taken care of setting the
833 	   clean bits if RAIDF_INITED is not set
834 	   mark things as clean... */
835 
836 	rf_update_component_labels(&rs->sc_r,
837 	    RF_FINAL_COMPONENT_UPDATE);
838 
839 	/* pass to unlocked code */
840 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
841 		rs->sc_flags |= RAIDF_DETACH;
842 
843 	return 0;
844 }
845 
/*
 * Close entry point.  Decides, while holding the softc lock, whether
 * this close should also detach the autoconf device (RAIDF_DETACH set
 * by raid_lastclose()) or just free an unconfigured softc
 * (RAIDF_SHUTDOWN); the actual detach/free happens after unlocking.
 */
846 /* ARGSUSED */
847 static int
848 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
849 {
850 	int     unit = raidunit(dev);
851 	struct raid_softc *rs;
852 	struct dk_softc *dksc;
853 	cfdata_t cf;
854 	int     error = 0, do_detach = 0, do_put = 0;
855 
856 	if ((rs = raidget(unit, false)) == NULL)
857 		return ENXIO;
858 	dksc = &rs->sc_dksc;
859 
860 	if ((error = raidlock(rs)) != 0)
861 		return error;
862 
863 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
864 		error = dk_close(dksc, dev, flags, fmt, l);
865 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
866 			do_detach = 1;
867 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
868 		do_put = 1;
869 
870 	raidunlock(rs);
871 
872 	if (do_detach) {
873 		/* free the pseudo device attach bits */
874 		cf = device_cfdata(dksc->sc_dev);
875 		error = config_detach(dksc->sc_dev, 0);
876 		if (error == 0)
877 			free(cf, M_RAIDFRAME);
878 	} else if (do_put) {
879 		raidput(rs);
880 	}
881 
882 	return error;
883 
884 }
885 
886 static void
887 raid_wakeup(RF_Raid_t *raidPtr)
888 {
	/* Poke the iodone thread: new work may be queued for it. */
889 	rf_lock_mutex2(raidPtr->iodone_lock);
890 	rf_signal_cond2(raidPtr->iodone_cv);
891 	rf_unlock_mutex2(raidPtr->iodone_lock);
892 }
893 
/*
 * Strategy entry point: validate the unit, hand the buffer to the
 * dk(9) queue, and wake the worker thread to process it.  On error
 * the buffer is completed immediately with b_error set.
 */
894 static void
895 raidstrategy(struct buf *bp)
896 {
897 	unsigned int unit;
898 	struct raid_softc *rs;
899 	struct dk_softc *dksc;
900 	RF_Raid_t *raidPtr;
901 
902 	unit = raidunit(bp->b_dev);
903 	if ((rs = raidget(unit, false)) == NULL) {
904 		bp->b_error = ENXIO;
905 		goto fail;
906 	}
907 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
908 		bp->b_error = ENXIO;
909 		goto fail;
910 	}
911 	dksc = &rs->sc_dksc;
912 	raidPtr = &rs->sc_r;
913 
914 	/* Queue IO only */
915 	if (dk_strategy_defer(dksc, bp))
916 		goto done;
917 
918 	/* schedule the IO to happen at the next convenient time */
919 	raid_wakeup(raidPtr);
920 
921 done:
922 	return;
923 
924 fail:
925 	bp->b_resid = bp->b_bcount;
926 	biodone(bp);
927 }
928 
929 static int
930 raid_diskstart(device_t dev, struct buf *bp)
931 {
932 	struct raid_softc *rs = raidsoftc(dev);
933 	RF_Raid_t *raidPtr;
934 
935 	raidPtr = &rs->sc_r;
936 	if (!raidPtr->valid) {
937 		db1_printf(("raid is not valid..\n"));
938 		return ENODEV;
939 	}
940 
941 	/* XXX */
942 	bp->b_resid = 0;
943 
944 	return raiddoaccess(raidPtr, bp);
945 }
946 
947 void
948 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
949 {
950 	struct raid_softc *rs;
951 	struct dk_softc *dksc;
952 
953 	rs = raidPtr->softc;
954 	dksc = &rs->sc_dksc;
955 
956 	dk_done(dksc, bp);
957 
958 	rf_lock_mutex2(raidPtr->mutex);
959 	raidPtr->openings++;
960 	rf_unlock_mutex2(raidPtr->mutex);
961 
962 	/* schedule more IO */
963 	raid_wakeup(raidPtr);
964 }
965 
966 /* ARGSUSED */
967 static int
968 raidread(dev_t dev, struct uio *uio, int flags)
969 {
970 	int     unit = raidunit(dev);
971 	struct raid_softc *rs;
972 
973 	if ((rs = raidget(unit, false)) == NULL)
974 		return ENXIO;
975 
976 	if ((rs->sc_flags & RAIDF_INITED) == 0)
977 		return ENXIO;
978 
979 	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
980 
981 }
982 
983 /* ARGSUSED */
984 static int
985 raidwrite(dev_t dev, struct uio *uio, int flags)
986 {
987 	int     unit = raidunit(dev);
988 	struct raid_softc *rs;
989 
990 	if ((rs = raidget(unit, false)) == NULL)
991 		return ENXIO;
992 
993 	if ((rs->sc_flags & RAIDF_INITED) == 0)
994 		return ENXIO;
995 
996 	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
997 
998 }
999 
/*
 * Tear down a configured set (caller holds the softc lock): refuse if
 * the device is busy or a recon/parity/copyback thread is running,
 * shut down the RAIDframe engine, drain and free the buffer queue,
 * and detach the disk from the dk(9)/disk(9) frameworks.
 */
1000 static int
1001 raid_detach_unlocked(struct raid_softc *rs)
1002 {
1003 	struct dk_softc *dksc = &rs->sc_dksc;
1004 	RF_Raid_t *raidPtr;
1005 	int error;
1006 
1007 	raidPtr = &rs->sc_r;
1008 
1009 	if (DK_BUSY(dksc, 0) ||
1010 	    raidPtr->recon_in_progress != 0 ||
1011 	    raidPtr->parity_rewrite_in_progress != 0 ||
1012 	    raidPtr->copyback_in_progress != 0)
1013 		return EBUSY;
1014 
1015 	if ((rs->sc_flags & RAIDF_INITED) == 0)
1016 		return 0;
1017 
	/* clear SHUTDOWN so a failed rf_Shutdown() leaves a usable unit */
1018 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
1019 
1020 	if ((error = rf_Shutdown(raidPtr)) != 0)
1021 		return error;
1022 
1023 	rs->sc_flags &= ~RAIDF_INITED;
1024 
1025 	/* Kill off any queued buffers */
1026 	dk_drain(dksc);
1027 	bufq_free(dksc->sc_bufq);
1028 
1029 	/* Detach the disk. */
1030 	dkwedge_delall(&dksc->sc_dkdev);
1031 	disk_detach(&dksc->sc_dkdev);
1032 	disk_destroy(&dksc->sc_dkdev);
1033 	dk_detach(dksc);
1034 
1035 	return 0;
1036 }
1037 
/*
 * Return true when 'cmd' is an ioctl that requires a configured set
 * but the set is NOT initialized -- i.e. the precondition is violated
 * and the caller should reject the ioctl.  Commands not listed here
 * (e.g. CONFIGURE) are allowed on an unconfigured unit.
 */
1038 static bool
1039 rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
1040 {
1041 	switch (cmd) {
1042 	case RAIDFRAME_ADD_HOT_SPARE:
1043 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
1044 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1045 	case RAIDFRAME_CHECK_PARITY:
1046 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1047 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1048 	case RAIDFRAME_CHECK_RECON_STATUS:
1049 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1050 	case RAIDFRAME_COPYBACK:
1051 	case RAIDFRAME_DELETE_COMPONENT:
1052 	case RAIDFRAME_FAIL_DISK:
1053 	case RAIDFRAME_GET_ACCTOTALS:
1054 	case RAIDFRAME_GET_COMPONENT_LABEL:
1055 	case RAIDFRAME_GET_INFO:
1056 	case RAIDFRAME_GET_SIZE:
1057 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
1058 	case RAIDFRAME_INIT_LABELS:
1059 	case RAIDFRAME_KEEP_ACCTOTALS:
1060 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
1061 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
1062 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
1063 	case RAIDFRAME_PARITYMAP_STATUS:
1064 	case RAIDFRAME_REBUILD_IN_PLACE:
1065 	case RAIDFRAME_REMOVE_HOT_SPARE:
1066 	case RAIDFRAME_RESET_ACCTOTALS:
1067 	case RAIDFRAME_REWRITEPARITY:
1068 	case RAIDFRAME_SET_AUTOCONFIG:
1069 	case RAIDFRAME_SET_COMPONENT_LABEL:
1070 	case RAIDFRAME_SET_ROOT:
1071 		return (rs->sc_flags & RAIDF_INITED) == 0;
1072 	}
1073 	return false;
1074 }
1075 
/*
 * Administratively fail the component in column rr->col and start a
 * reconstruction thread for it.
 *
 * Validation is done in two phases: unlocked sanity checks (RAID
 * level, column range), then status checks under raidPtr->mutex.
 * The recon request is copied into kernel memory so the thread does
 * not depend on the caller's buffer; ownership of that copy passes
 * to rf_ReconThread (presumably freed there -- confirm in that
 * function, which is outside this chunk).
 *
 * Returns 0/thread-creation status on success, EINVAL on any
 * rejected request, ENOMEM if the request copy cannot be allocated.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	/* Rejected under the lock: release it and report EINVAL. */
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
1124 
1125 static int
1126 rf_copyinspecificbuf(RF_Config_t *k_cfg)
1127 {
1128 	/* allocate a buffer for the layout-specific data, and copy it in */
1129 	if (k_cfg->layoutSpecificSize == 0)
1130 		return 0;
1131 
1132 	if (k_cfg->layoutSpecificSize > 10000) {
1133 	    /* sanity check */
1134 	    return EINVAL;
1135 	}
1136 
1137 	u_char *specific_buf;
1138 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
1139 	if (specific_buf == NULL)
1140 		return ENOMEM;
1141 
1142 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1143 	    k_cfg->layoutSpecificSize);
1144 	if (retcode) {
1145 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1146 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
1147 		return retcode;
1148 	}
1149 
1150 	k_cfg->layoutSpecific = specific_buf;
1151 	return 0;
1152 }
1153 
1154 static int
1155 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
1156 {
1157 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
1158 
1159 	if (rs->sc_r.valid) {
1160 		/* There is a valid RAID set running on this unit! */
1161 		printf("raid%d: Device already configured!\n", rs->sc_unit);
1162 		return EINVAL;
1163 	}
1164 
1165 	/* copy-in the configuration information */
1166 	/* data points to a pointer to the configuration structure */
1167 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
1168 	if (*k_cfg == NULL) {
1169 		return ENOMEM;
1170 	}
1171 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
1172 	if (retcode == 0)
1173 		return 0;
1174 	RF_Free(*k_cfg, sizeof(RF_Config_t));
1175 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
1176 	rs->sc_flags |= RAIDF_SHUTDOWN;
1177 	return retcode;
1178 }
1179 
/*
 * Configure a RAID set from a kernel copy of the user's RF_Config_t
 * (as produced by rf_getConfiguration()).  Consumes k_cfg: both the
 * layout-specific buffer and the config structure itself are freed
 * here on every path.  On failure the unit is flagged RAIDF_SHUTDOWN
 * so it is detached on close.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/* Pull in the layout-specific data before configuring. */
	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 *  there is no stale data left in the case of a
	 *  reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		/* Attach the pseudo-device and mark the unit usable. */
		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
1232 
#if RF_DISABLED
/*
 * Handler for RAIDFRAME_SET_COMPONENT_LABEL: overwrite the in-core
 * component label for clabel->column and flush it to disk.
 * Currently compiled out (RF_DISABLED); the validation below is
 * acknowledged as incomplete -- see the XXX comments.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	/* Only a single row is supported; force row 0. */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif
1271 
1272 static int
1273 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
1274 {
1275 	/*
1276 	   we only want the serial number from
1277 	   the above.  We get all the rest of the information
1278 	   from the config that was used to create this RAID
1279 	   set.
1280 	   */
1281 
1282 	raidPtr->serial_number = clabel->serial_number;
1283 
1284 	for (int column = 0; column < raidPtr->numCol; column++) {
1285 		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
1286 		if (RF_DEAD_DISK(diskPtr->status))
1287 			continue;
1288 		RF_ComponentLabel_t *ci_label = raidget_component_label(
1289 		    raidPtr, column);
1290 		/* Zeroing this is important. */
1291 		memset(ci_label, 0, sizeof(*ci_label));
1292 		raid_init_component_label(raidPtr, ci_label);
1293 		ci_label->serial_number = raidPtr->serial_number;
1294 		ci_label->row = 0; /* we dont' pretend to support more */
1295 		rf_component_label_set_partitionsize(ci_label,
1296 		    diskPtr->partitionSize);
1297 		ci_label->column = column;
1298 		raidflush_component_label(raidPtr, column);
1299 		/* XXXjld what about the spares? */
1300 	}
1301 
1302 	return 0;
1303 }
1304 
/*
 * Handler for RAIDFRAME_REBUILD_IN_PLACE: reconstruct the component
 * in componentPtr->column back onto its own device.
 *
 * Unlocked sanity checks first (RAID level, recon already running,
 * column range), then status checks under raidPtr->mutex.  The
 * request is copied to a kernel allocation whose ownership passes
 * to rf_ReconstructInPlaceThread (presumably freed there -- that
 * function is outside this chunk).
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* Work on a kernel copy; don't trust the caller's buffer. */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		/* Spared components can't be rebuilt in place. */
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
1372 
1373 static int
1374 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
1375 {
1376 	/*
1377 	 * This makes no sense on a RAID 0, or if we are not reconstructing
1378 	 * so tell the user it's done.
1379 	 */
1380 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
1381 	    raidPtr->status != rf_rs_reconstructing) {
1382 		*data = 100;
1383 		return 0;
1384 	}
1385 	if (raidPtr->reconControl->numRUsTotal == 0) {
1386 		*data = 0;
1387 		return 0;
1388 	}
1389 	*data = (raidPtr->reconControl->numRUsComplete * 100
1390 	    / raidPtr->reconControl->numRUsTotal);
1391 	return 0;
1392 }
1393 
1394 static int
1395 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1396 {
1397 	int     unit = raidunit(dev);
1398 	int     part, pmask;
1399 	struct raid_softc *rs;
1400 	struct dk_softc *dksc;
1401 	RF_Config_t *k_cfg;
1402 	RF_Raid_t *raidPtr;
1403 	RF_AccTotals_t *totals;
1404 	RF_SingleComponent_t component;
1405 	RF_DeviceConfig_t *d_cfg, *ucfgp;
1406 	int retcode = 0;
1407 	int column;
1408 	RF_ComponentLabel_t *clabel;
1409 	RF_SingleComponent_t *sparePtr,*componentPtr;
1410 	int d;
1411 
1412 	if ((rs = raidget(unit, false)) == NULL)
1413 		return ENXIO;
1414 
1415 	dksc = &rs->sc_dksc;
1416 	raidPtr = &rs->sc_r;
1417 
1418 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1419 	    (int) DISKPART(dev), (int) unit, cmd));
1420 
1421 	/* Must be initialized for these... */
1422 	if (rf_must_be_initialized(rs, cmd))
1423 		return ENXIO;
1424 
1425 	switch (cmd) {
1426 		/* configure the system */
1427 	case RAIDFRAME_CONFIGURE:
1428 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
1429 			return retcode;
1430 		return rf_construct(rs, k_cfg);
1431 
1432 		/* shutdown the system */
1433 	case RAIDFRAME_SHUTDOWN:
1434 
1435 		part = DISKPART(dev);
1436 		pmask = (1 << part);
1437 
1438 		if ((retcode = raidlock(rs)) != 0)
1439 			return retcode;
1440 
1441 		if (DK_BUSY(dksc, pmask) ||
1442 		    raidPtr->recon_in_progress != 0 ||
1443 		    raidPtr->parity_rewrite_in_progress != 0 ||
1444 		    raidPtr->copyback_in_progress != 0)
1445 			retcode = EBUSY;
1446 		else {
1447 			/* detach and free on close */
1448 			rs->sc_flags |= RAIDF_SHUTDOWN;
1449 			retcode = 0;
1450 		}
1451 
1452 		raidunlock(rs);
1453 
1454 		return retcode;
1455 	case RAIDFRAME_GET_COMPONENT_LABEL:
1456 		return rf_get_component_label(raidPtr, data);
1457 
1458 #if RF_DISABLED
1459 	case RAIDFRAME_SET_COMPONENT_LABEL:
1460 		return rf_set_component_label(raidPtr, data);
1461 #endif
1462 
1463 	case RAIDFRAME_INIT_LABELS:
1464 		return rf_init_component_label(raidPtr, data);
1465 
1466 	case RAIDFRAME_SET_AUTOCONFIG:
1467 		d = rf_set_autoconfig(raidPtr, *(int *) data);
1468 		printf("raid%d: New autoconfig value is: %d\n",
1469 		       raidPtr->raidid, d);
1470 		*(int *) data = d;
1471 		return retcode;
1472 
1473 	case RAIDFRAME_SET_ROOT:
1474 		d = rf_set_rootpartition(raidPtr, *(int *) data);
1475 		printf("raid%d: New rootpartition value is: %d\n",
1476 		       raidPtr->raidid, d);
1477 		*(int *) data = d;
1478 		return retcode;
1479 
1480 		/* initialize all parity */
1481 	case RAIDFRAME_REWRITEPARITY:
1482 
1483 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1484 			/* Parity for RAID 0 is trivially correct */
1485 			raidPtr->parity_good = RF_RAID_CLEAN;
1486 			return 0;
1487 		}
1488 
1489 		if (raidPtr->parity_rewrite_in_progress == 1) {
1490 			/* Re-write is already in progress! */
1491 			return EINVAL;
1492 		}
1493 
1494 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1495 		    rf_RewriteParityThread, raidPtr,"raid_parity");
1496 
1497 	case RAIDFRAME_ADD_HOT_SPARE:
1498 		sparePtr = (RF_SingleComponent_t *) data;
1499 		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
1500 		return rf_add_hot_spare(raidPtr, &component);
1501 
1502 	case RAIDFRAME_REMOVE_HOT_SPARE:
1503 		return retcode;
1504 
1505 	case RAIDFRAME_DELETE_COMPONENT:
1506 		componentPtr = (RF_SingleComponent_t *)data;
1507 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
1508 		return rf_delete_component(raidPtr, &component);
1509 
1510 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
1511 		componentPtr = (RF_SingleComponent_t *)data;
1512 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
1513 		return rf_incorporate_hot_spare(raidPtr, &component);
1514 
1515 	case RAIDFRAME_REBUILD_IN_PLACE:
1516 		return rf_rebuild_in_place(raidPtr, data);
1517 
1518 	case RAIDFRAME_GET_INFO:
1519 		ucfgp = *(RF_DeviceConfig_t **)data;
1520 		d_cfg = RF_Malloc(sizeof(*d_cfg));
1521 		if (d_cfg == NULL)
1522 			return ENOMEM;
1523 		retcode = rf_get_info(raidPtr, d_cfg);
1524 		if (retcode == 0) {
1525 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
1526 		}
1527 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1528 		return retcode;
1529 
1530 	case RAIDFRAME_CHECK_PARITY:
1531 		*(int *) data = raidPtr->parity_good;
1532 		return 0;
1533 
1534 	case RAIDFRAME_PARITYMAP_STATUS:
1535 		if (rf_paritymap_ineligible(raidPtr))
1536 			return EINVAL;
1537 		rf_paritymap_status(raidPtr->parity_map, data);
1538 		return 0;
1539 
1540 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
1541 		if (rf_paritymap_ineligible(raidPtr))
1542 			return EINVAL;
1543 		if (raidPtr->parity_map == NULL)
1544 			return ENOENT; /* ??? */
1545 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
1546 			return EINVAL;
1547 		return 0;
1548 
1549 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
1550 		if (rf_paritymap_ineligible(raidPtr))
1551 			return EINVAL;
1552 		*(int *) data = rf_paritymap_get_disable(raidPtr);
1553 		return 0;
1554 
1555 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
1556 		if (rf_paritymap_ineligible(raidPtr))
1557 			return EINVAL;
1558 		rf_paritymap_set_disable(raidPtr, *(int *)data);
1559 		/* XXX should errors be passed up? */
1560 		return 0;
1561 
1562 	case RAIDFRAME_RESET_ACCTOTALS:
1563 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1564 		return 0;
1565 
1566 	case RAIDFRAME_GET_ACCTOTALS:
1567 		totals = (RF_AccTotals_t *) data;
1568 		*totals = raidPtr->acc_totals;
1569 		return 0;
1570 
1571 	case RAIDFRAME_KEEP_ACCTOTALS:
1572 		raidPtr->keep_acc_totals = *(int *)data;
1573 		return 0;
1574 
1575 	case RAIDFRAME_GET_SIZE:
1576 		*(int *) data = raidPtr->totalSectors;
1577 		return 0;
1578 
1579 	case RAIDFRAME_FAIL_DISK:
1580 		return rf_fail_disk(raidPtr, data);
1581 
1582 		/* invoke a copyback operation after recon on whatever disk
1583 		 * needs it, if any */
1584 	case RAIDFRAME_COPYBACK:
1585 
1586 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1587 			/* This makes no sense on a RAID 0!! */
1588 			return EINVAL;
1589 		}
1590 
1591 		if (raidPtr->copyback_in_progress == 1) {
1592 			/* Copyback is already in progress! */
1593 			return EINVAL;
1594 		}
1595 
1596 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
1597 		    rf_CopybackThread, raidPtr, "raid_copyback");
1598 
1599 		/* return the percentage completion of reconstruction */
1600 	case RAIDFRAME_CHECK_RECON_STATUS:
1601 		return rf_check_recon_status(raidPtr, data);
1602 
1603 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1604 		rf_check_recon_status_ext(raidPtr, data);
1605 		return 0;
1606 
1607 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1608 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1609 			/* This makes no sense on a RAID 0, so tell the
1610 			   user it's done. */
1611 			*(int *) data = 100;
1612 			return 0;
1613 		}
1614 		if (raidPtr->parity_rewrite_in_progress == 1) {
1615 			*(int *) data = 100 *
1616 				raidPtr->parity_rewrite_stripes_done /
1617 				raidPtr->Layout.numStripe;
1618 		} else {
1619 			*(int *) data = 100;
1620 		}
1621 		return 0;
1622 
1623 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1624 		rf_check_parityrewrite_status_ext(raidPtr, data);
1625 		return 0;
1626 
1627 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
1628 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1629 			/* This makes no sense on a RAID 0 */
1630 			*(int *) data = 100;
1631 			return 0;
1632 		}
1633 		if (raidPtr->copyback_in_progress == 1) {
1634 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
1635 				raidPtr->Layout.numStripe;
1636 		} else {
1637 			*(int *) data = 100;
1638 		}
1639 		return 0;
1640 
1641 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1642 		rf_check_copyback_status_ext(raidPtr, data);
1643 		return 0;
1644 
1645 	case RAIDFRAME_SET_LAST_UNIT:
1646 		for (column = 0; column < raidPtr->numCol; column++)
1647 			if (raidPtr->Disks[column].status != rf_ds_optimal)
1648 				return EBUSY;
1649 
1650 		for (column = 0; column < raidPtr->numCol; column++) {
1651 			clabel = raidget_component_label(raidPtr, column);
1652 			clabel->last_unit = *(int *)data;
1653 			raidflush_component_label(raidPtr, column);
1654 		}
1655 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1656 		return 0;
1657 
1658 		/* the sparetable daemon calls this to wait for the kernel to
1659 		 * need a spare table. this ioctl does not return until a
1660 		 * spare table is needed. XXX -- calling mpsleep here in the
1661 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1662 		 * -- I should either compute the spare table in the kernel,
1663 		 * or have a different -- XXX XXX -- interface (a different
1664 		 * character device) for delivering the table     -- XXX */
1665 #if RF_DISABLED
1666 	case RAIDFRAME_SPARET_WAIT:
1667 		rf_lock_mutex2(rf_sparet_wait_mutex);
1668 		while (!rf_sparet_wait_queue)
1669 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1670 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
1671 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1672 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1673 
1674 		/* structure assignment */
1675 		*((RF_SparetWait_t *) data) = *waitreq;
1676 
1677 		RF_Free(waitreq, sizeof(*waitreq));
1678 		return 0;
1679 
1680 		/* wakes up a process waiting on SPARET_WAIT and puts an error
1681 		 * code in it that will cause the dameon to exit */
1682 	case RAIDFRAME_ABORT_SPARET_WAIT:
1683 		waitreq = RF_Malloc(sizeof(*waitreq));
1684 		waitreq->fcol = -1;
1685 		rf_lock_mutex2(rf_sparet_wait_mutex);
1686 		waitreq->next = rf_sparet_wait_queue;
1687 		rf_sparet_wait_queue = waitreq;
1688 		rf_broadcast_cond2(rf_sparet_wait_cv);
1689 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1690 		return 0;
1691 
1692 		/* used by the spare table daemon to deliver a spare table
1693 		 * into the kernel */
1694 	case RAIDFRAME_SEND_SPARET:
1695 
1696 		/* install the spare table */
1697 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1698 
1699 		/* respond to the requestor.  the return status of the spare
1700 		 * table installation is passed in the "fcol" field */
1701 		waitred = RF_Malloc(sizeof(*waitreq));
1702 		waitreq->fcol = retcode;
1703 		rf_lock_mutex2(rf_sparet_wait_mutex);
1704 		waitreq->next = rf_sparet_resp_queue;
1705 		rf_sparet_resp_queue = waitreq;
1706 		rf_broadcast_cond2(rf_sparet_resp_cv);
1707 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1708 
1709 		return retcode;
1710 #endif
1711 	default:
1712 		/*
1713 		 * Don't bother trying to load compat modules
1714 		 * if it is not our ioctl. This is more efficient
1715 		 * and makes rump tests not depend on compat code
1716 		 */
1717 		if (IOCGROUP(cmd) != 'r')
1718 			break;
1719 #ifdef _LP64
1720 		if ((l->l_proc->p_flag & PK_32) != 0) {
1721 			module_autoload("compat_netbsd32_raid",
1722 			    MODULE_CLASS_EXEC);
1723 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
1724 			    (rs, cmd, data), enosys(), retcode);
1725 			if (retcode != EPASSTHROUGH)
1726 				return retcode;
1727 		}
1728 #endif
1729 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
1730 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
1731 		    (rs, cmd, data), enosys(), retcode);
1732 		if (retcode != EPASSTHROUGH)
1733 			return retcode;
1734 
1735 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
1736 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
1737 		    (rs, cmd, data), enosys(), retcode);
1738 		if (retcode != EPASSTHROUGH)
1739 			return retcode;
1740 		break; /* fall through to the os-specific code below */
1741 
1742 	}
1743 
1744 	if (!raidPtr->valid)
1745 		return EINVAL;
1746 
1747 	/*
1748 	 * Add support for "regular" device ioctls here.
1749 	 */
1750 
1751 	switch (cmd) {
1752 	case DIOCGCACHE:
1753 		retcode = rf_get_component_caches(raidPtr, (int *)data);
1754 		break;
1755 
1756 	case DIOCCACHESYNC:
1757 		retcode = rf_sync_component_caches(raidPtr);
1758 		break;
1759 
1760 	default:
1761 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
1762 		break;
1763 	}
1764 
1765 	return retcode;
1766 
1767 }
1768 
1769 
1770 /* raidinit -- complete the rest of the initialization for the
1771    RAIDframe device.  */
1772 
1773 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* NOTE(review): returns with RAIDF_INITED unset; callers
		 * must tolerate the half-initialized softc. */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	/* First-come-first-served buffer queue for incoming I/O. */
	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Scan for wedges now that the disk is visible. */
	dkwedge_discover(&dksc->sc_dkdev);
}
1829 
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device were requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Queue our request and wake the sparetable daemon. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* Block until the daemon posts a response. */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Reuse "req" for the response entry popped off the queue. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* The daemon reports its status in the fcol field. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
#endif
1864 
/* a wrapper around rf_DoAccess that extracts appropriate info from the
 * bp & passes it down.
 * any calls originating in the kernel must use non-blocking I/O
 * do some extra sanity checking to return "appropriate" error values for
 * certain conditions (to make some standard utilities work)
 *
 * Formerly known as: rf_DoAccessKernel
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Drop the mutex across the label update, which takes
		 * it itself (or sleeps); then retake it to adjust the
		 * counter. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Pump the dk queue; dk_start() pulls buffers and hands them
	 * to raiddoaccess() via the dk driver hooks. */
	dk_start(dksc, NULL);
}
1899 
/*
 * Validate one buffer and submit it to RAIDframe as a non-blocking
 * access.  Returns EAGAIN when no openings are available (the dk
 * layer can retry later), ENOSPC for out-of-range or misaligned
 * requests, otherwise the result of rf_DoAccess().
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* Respect the concurrency limit set at configure time. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): "1 ||" forces this debug block on
	 * unconditionally; db1_printf may still compile to nothing. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* Range check; the "sum < ..." comparisons also catch wraparound
	 * in the unsigned addition above. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* Reject transfers that are not a whole number of sectors. */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume one opening; presumably returned when the access
	 * completes (outside this chunk). */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
1972 
/* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */

/*
 * Dispatch one RAIDframe disk-queue request to the underlying
 * component device.  NOPs complete immediately through
 * KernelWakeupFunc(); reads and writes are handed to bdev_strategy()
 * with the queue mutex temporarily dropped.  Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately via the I/O callback. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up the buf to target the component's device; on
		 * completion KernelWakeupFunc() is called with "req". */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
2048 /* this is the callback function associated with a I/O invoked from
2049    kernel code.
2050  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The originating request was stashed in b_private by InitBP(). */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	/* All iodone bookkeeping is done under the array's iodone_lock. */
	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* Account the elapsed disk-wait/physical-I/O time for this request. */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2117 
2118 
2119 /*
2120  * initialize a buf structure for doing an I/O in the kernel.
2121  */
2122 static void
2123 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2124        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2125        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
2126 {
2127 	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
2128 	bp->b_oflags = 0;
2129 	bp->b_cflags = 0;
2130 	bp->b_bcount = numSect << logBytesPerSector;
2131 	bp->b_bufsize = bp->b_bcount;
2132 	bp->b_error = 0;
2133 	bp->b_dev = dev;
2134 	bp->b_data = bf;
2135 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2136 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
2137 	if (bp->b_bcount == 0) {
2138 		panic("bp->b_bcount is zero in InitBP!!");
2139 	}
2140 	bp->b_iodone = cbFunc;
2141 	bp->b_private = cbArg;
2142 }
2143 
2144 /*
2145  * Wait interruptibly for an exclusive lock.
2146  *
2147  * XXX
2148  * Several drivers do this; it should be abstracted and made MP-safe.
2149  * (Hmm... where have we seen this warning before :->  GO )
2150  */
2151 static int
2152 raidlock(struct raid_softc *rs)
2153 {
2154 	int     error;
2155 
2156 	error = 0;
2157 	mutex_enter(&rs->sc_mutex);
2158 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2159 		rs->sc_flags |= RAIDF_WANTED;
2160 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2161 		if (error != 0)
2162 			goto done;
2163 	}
2164 	rs->sc_flags |= RAIDF_LOCKED;
2165 done:
2166 	mutex_exit(&rs->sc_mutex);
2167 	return error;
2168 }
2169 /*
2170  * Unlock and wake up any waiters.
2171  */
2172 static void
2173 raidunlock(struct raid_softc *rs)
2174 {
2175 
2176 	mutex_enter(&rs->sc_mutex);
2177 	rs->sc_flags &= ~RAIDF_LOCKED;
2178 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2179 		rs->sc_flags &= ~RAIDF_WANTED;
2180 		cv_broadcast(&rs->sc_cv);
2181 	}
2182 	mutex_exit(&rs->sc_mutex);
2183 }
2184 
2185 
2186 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
2187 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
2188 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
2189 
static daddr_t
rf_component_info_offset(void)
{

	/* Byte offset of the component label area on each component
	   (fixed at RF_COMPONENT_INFO_OFFSET = 16384 bytes). */
	return RF_COMPONENT_INFO_OFFSET;
}
2196 
2197 static daddr_t
2198 rf_component_info_size(unsigned secsize)
2199 {
2200 	daddr_t info_size;
2201 
2202 	KASSERT(secsize);
2203 	if (secsize > RF_COMPONENT_INFO_SIZE)
2204 		info_size = secsize;
2205 	else
2206 		info_size = RF_COMPONENT_INFO_SIZE;
2207 
2208 	return info_size;
2209 }
2210 
2211 static daddr_t
2212 rf_parity_map_offset(RF_Raid_t *raidPtr)
2213 {
2214 	daddr_t map_offset;
2215 
2216 	KASSERT(raidPtr->bytesPerSector);
2217 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2218 		map_offset = raidPtr->bytesPerSector;
2219 	else
2220 		map_offset = RF_COMPONENT_INFO_SIZE;
2221 	map_offset += rf_component_info_offset();
2222 
2223 	return map_offset;
2224 }
2225 
2226 static daddr_t
2227 rf_parity_map_size(RF_Raid_t *raidPtr)
2228 {
2229 	daddr_t map_size;
2230 
2231 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2232 		map_size = raidPtr->bytesPerSector;
2233 	else
2234 		map_size = RF_PARITY_MAP_SIZE;
2235 
2236 	return map_size;
2237 }
2238 
2239 int
2240 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2241 {
2242 	RF_ComponentLabel_t *clabel;
2243 
2244 	clabel = raidget_component_label(raidPtr, col);
2245 	clabel->clean = RF_RAID_CLEAN;
2246 	raidflush_component_label(raidPtr, col);
2247 	return(0);
2248 }
2249 
2250 
2251 int
2252 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2253 {
2254 	RF_ComponentLabel_t *clabel;
2255 
2256 	clabel = raidget_component_label(raidPtr, col);
2257 	clabel->clean = RF_RAID_DIRTY;
2258 	raidflush_component_label(raidPtr, col);
2259 	return(0);
2260 }
2261 
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	/* Read col's on-disk component label into the in-core copy
	   (raid_cinfo[col].ci_label).  Returns the read status. */
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2271 
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	/* Return a pointer to col's in-core component label; callers
	   modify it in place and then call raidflush_component_label(). */
	return &raidPtr->raid_cinfo[col].ci_label;
}
2277 
2278 int
2279 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2280 {
2281 	RF_ComponentLabel_t *label;
2282 
2283 	label = &raidPtr->raid_cinfo[col].ci_label;
2284 	label->mod_counter = raidPtr->mod_counter;
2285 #ifndef RF_NO_PARITY_MAP
2286 	label->parity_map_modcount = label->mod_counter;
2287 #endif
2288 	return raidwrite_component_label(raidPtr->bytesPerSector,
2289 	    raidPtr->Disks[col].dev,
2290 	    raidPtr->raid_cinfo[col].ci_vp, label);
2291 }
2292 
2293 
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	/* Convenience wrapper: read the component-label area into *clabel.
	   secsize is the component's hardware sector size. */
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
2303 
2304 /* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int dsize2)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* Issue the read synchronously and wait for completion. */
	bdev_strategy(bp);
	error = biowait(bp);

	/* Only the first msize bytes of the dsize-byte area are wanted. */
	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
2341 
2342 
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	/* Convenience wrapper: synchronously (asyncp == 0) write *clabel
	   to the component-label area.  secsize is the hardware sector
	   size of the component. */
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
2352 
2353 /* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* Zero-fill the area, then copy in the msize payload bytes. */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		/* NOTE(review): async writes return immediately; the
		   B_ASYNC buffer is presumably released on completion
		   rather than here -- confirm against biodone(9). */
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
2387 
2388 void
2389 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2390 {
2391 	int c;
2392 
2393 	for (c = 0; c < raidPtr->numCol; c++) {
2394 		/* Skip dead disks. */
2395 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2396 			continue;
2397 		/* XXXjld: what if an error occurs here? */
2398 		raidwrite_component_area(raidPtr->Disks[c].dev,
2399 		    raidPtr->raid_cinfo[c].ci_vp, map,
2400 		    RF_PARITYMAP_NBYTE,
2401 		    rf_parity_map_offset(raidPtr),
2402 		    rf_parity_map_size(raidPtr), 0);
2403 	}
2404 }
2405 
2406 void
2407 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2408 {
2409 	struct rf_paritymap_ondisk tmp;
2410 	int c,first;
2411 
2412 	first=1;
2413 	for (c = 0; c < raidPtr->numCol; c++) {
2414 		/* Skip dead disks. */
2415 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2416 			continue;
2417 		raidread_component_area(raidPtr->Disks[c].dev,
2418 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
2419 		    RF_PARITYMAP_NBYTE,
2420 		    rf_parity_map_offset(raidPtr),
2421 		    rf_parity_map_size(raidPtr));
2422 		if (first) {
2423 			memcpy(map, &tmp, sizeof(*map));
2424 			first = 0;
2425 		} else {
2426 			rf_paritymap_merge(map, &tmp);
2427 		}
2428 	}
2429 }
2430 
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	/* Mark every usable component (and in-use spare) dirty on disk,
	   bumping the modification counter once for the whole pass. */
	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which data column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			/* NOTE(review): if no column references this spare,
			   scol keeps its previous value (-1 initially) --
			   presumably can't happen for rf_ds_used_spare;
			   confirm. */
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2490 
2491 
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* Rewrite the component labels of all optimal components and
	   in-use spares; on a final update with good parity, also mark
	   them clean. */

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which data column this spare replaced. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2569 
2570 void
2571 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2572 {
2573 
2574 	if (vp != NULL) {
2575 		if (auto_configured == 1) {
2576 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2577 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2578 			vput(vp);
2579 
2580 		} else {
2581 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2582 		}
2583 	}
2584 }
2585 
2586 
2587 void
2588 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2589 {
2590 	int r,c;
2591 	struct vnode *vp;
2592 	int acd;
2593 
2594 
2595 	/* We take this opportunity to close the vnodes like we should.. */
2596 
2597 	for (c = 0; c < raidPtr->numCol; c++) {
2598 		vp = raidPtr->raid_cinfo[c].ci_vp;
2599 		acd = raidPtr->Disks[c].auto_configured;
2600 		rf_close_component(raidPtr, vp, acd);
2601 		raidPtr->raid_cinfo[c].ci_vp = NULL;
2602 		raidPtr->Disks[c].auto_configured = 0;
2603 	}
2604 
2605 	for (r = 0; r < raidPtr->numSpare; r++) {
2606 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2607 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2608 		rf_close_component(raidPtr, vp, acd);
2609 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2610 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2611 	}
2612 }
2613 
2614 
void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	/* Kernel thread body: fail the requested component and (if
	   RF_FDFLAGS_RECON is set) reconstruct it onto a spare. */
	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* The request was allocated by our creator; we own and free it. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2636 
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	/* Kernel thread body: rewrite all parity, then mark the array
	   clean on success. */
	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2669 
2670 
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	/* Kernel thread body: copy reconstructed data from the spare
	   back onto a replaced component. */
	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2685 
2686 
void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	/* Kernel thread body: rebuild the named column in place
	   (onto the same component slot). */
	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* We own the request structure; free it when done. */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2704 
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	/*
	 * Try to read a component label from dev/vp; if it looks sane,
	 * prepend a new RF_AutoConfig_t for it to ac_list (which takes
	 * ownership of vp and clabel).  Otherwise close vp and free the
	 * label.  Returns the (possibly extended) list head.
	 */
	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: release the label and close/drop the vnode,
		   since nothing on the list now references them. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2746 
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* only wedges typed as RAID are candidates */
				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component takes ownership of vp */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
2953 
2954 
2955 int
2956 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
2957 {
2958 
2959 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
2960 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
2961 	    ((clabel->clean == RF_RAID_CLEAN) ||
2962 	     (clabel->clean == RF_RAID_DIRTY)) &&
2963 	    clabel->row >=0 &&
2964 	    clabel->column >= 0 &&
2965 	    clabel->num_rows > 0 &&
2966 	    clabel->num_columns > 0 &&
2967 	    clabel->row < clabel->num_rows &&
2968 	    clabel->column < clabel->num_columns &&
2969 	    clabel->blockSize > 0 &&
2970 	    /*
2971 	     * numBlocksHi may contain garbage, but it is ok since
2972 	     * the type is unsigned.  If it is really garbage,
2973 	     * rf_fix_old_label_size() will fix it.
2974 	     */
2975 	    rf_component_label_numblocks(clabel) > 0) {
2976 		/*
2977 		 * label looks reasonable enough...
2978 		 * let's make sure it has no old garbage.
2979 		 */
2980 		if (numsecs)
2981 			rf_fix_old_label_size(clabel, numsecs);
2982 		return(1);
2983 	}
2984 	return(0);
2985 }
2986 
2987 
2988 /*
2989  * For reasons yet unknown, some old component labels have garbage in
2990  * the newer numBlocksHi region, and this causes lossage.  Since those
2991  * disks will also have numsecs set to less than 32 bits of sectors,
2992  * we can determine when this corruption has occurred, and fix it.
2993  *
2994  * The exact same problem, with the same unknown reason, happens to
2995  * the partitionSizeHi member as well.
2996  */
2997 static void
2998 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
2999 {
3000 
3001 	if (numsecs < ((uint64_t)1 << 32)) {
3002 		if (clabel->numBlocksHi) {
3003 			printf("WARNING: total sectors < 32 bits, yet "
3004 			       "numBlocksHi set\n"
3005 			       "WARNING: resetting numBlocksHi to zero.\n");
3006 			clabel->numBlocksHi = 0;
3007 		}
3008 
3009 		if (clabel->partitionSizeHi) {
3010 			printf("WARNING: total sectors < 32 bits, yet "
3011 			       "partitionSizeHi set\n"
3012 			       "WARNING: resetting partitionSizeHi to zero.\n");
3013 			clabel->partitionSizeHi = 0;
3014 		}
3015 	}
3016 }
3017 
3018 
#ifdef DEBUG
/* Dump a component label in human-readable form (debug kernels only). */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Indexed by root_partition & 3; last entry covers bad values. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
3052 
3053 RF_ConfigSet_t *
3054 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3055 {
3056 	RF_AutoConfig_t *ac;
3057 	RF_ConfigSet_t *config_sets;
3058 	RF_ConfigSet_t *cset;
3059 	RF_AutoConfig_t *ac_next;
3060 
3061 
3062 	config_sets = NULL;
3063 
3064 	/* Go through the AutoConfig list, and figure out which components
3065 	   belong to what sets.  */
3066 	ac = ac_list;
3067 	while(ac!=NULL) {
3068 		/* we're going to putz with ac->next, so save it here
3069 		   for use at the end of the loop */
3070 		ac_next = ac->next;
3071 
3072 		if (config_sets == NULL) {
3073 			/* will need at least this one... */
3074 			config_sets = malloc(sizeof(RF_ConfigSet_t),
3075 				       M_RAIDFRAME, M_WAITOK);
3076 			/* this one is easy :) */
3077 			config_sets->ac = ac;
3078 			config_sets->next = NULL;
3079 			config_sets->rootable = 0;
3080 			ac->next = NULL;
3081 		} else {
3082 			/* which set does this component fit into? */
3083 			cset = config_sets;
3084 			while(cset!=NULL) {
3085 				if (rf_does_it_fit(cset, ac)) {
3086 					/* looks like it matches... */
3087 					ac->next = cset->ac;
3088 					cset->ac = ac;
3089 					break;
3090 				}
3091 				cset = cset->next;
3092 			}
3093 			if (cset==NULL) {
3094 				/* didn't find a match above... new set..*/
3095 				cset = malloc(sizeof(RF_ConfigSet_t),
3096 					       M_RAIDFRAME, M_WAITOK);
3097 				cset->ac = ac;
3098 				ac->next = NULL;
3099 				cset->next = config_sets;
3100 				cset->rootable = 0;
3101 				config_sets = cset;
3102 			}
3103 		}
3104 		ac = ac_next;
3105 	}
3106 
3107 
3108 	return(config_sets);
3109 }
3110 
/* Return 1 if component ac is consistent with config set cset
   (compared against the set's first member), 0 otherwise. */
static int
rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
{
	RF_ComponentLabel_t *clabel1, *clabel2;

	/* If this one matches the *first* one in the set, that's good
	   enough, since the other members of the set would have been
	   through here too... */
	/* note that we are not checking partitionSize here..

	   Note that we are also not checking the mod_counters here.
	   If everything else matches except the mod_counter, that's
	   good enough for this test.  We will deal with the mod_counters
	   a little later in the autoconfiguration process.

	    (clabel1->mod_counter == clabel2->mod_counter) &&

	   The reason we don't check for this is that failed disks
	   will have lower modification counts.  If those disks are
	   not added to the set they used to belong to, then they will
	   form their own set, which may result in 2 different sets,
	   for example, competing to be configured at raid0, and
	   perhaps competing to be the root filesystem set.  If the
	   wrong ones get configured, or both attempt to become /,
	   weird behaviour and or serious lossage will occur.  Thus we
	   need to bring them into the fold here, and kick them out at
	   a later point.

	*/

	clabel1 = cset->ac->clabel;
	clabel2 = ac->clabel;
	/* Every identity and geometry field must agree exactly. */
	if ((clabel1->version == clabel2->version) &&
	    (clabel1->serial_number == clabel2->serial_number) &&
	    (clabel1->num_rows == clabel2->num_rows) &&
	    (clabel1->num_columns == clabel2->num_columns) &&
	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
	    (clabel1->parityConfig == clabel2->parityConfig) &&
	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
	    (clabel1->blockSize == clabel2->blockSize) &&
	    rf_component_label_numblocks(clabel1) ==
	    rf_component_label_numblocks(clabel2) &&
	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
	    (clabel1->root_partition == clabel2->root_partition) &&
	    (clabel1->last_unit == clabel2->last_unit) &&
	    (clabel1->config_order == clabel2->config_order)) {
		/* if it get's here, it almost *has* to be a match */
	} else {
		/* it's not consistent with somebody in the set..
		   punt */
		return(0);
	}
	/* all was fine.. it must fit... */
	return(1);
}
3168 
/*
 * Decide whether config set 'cset' has enough live components to be
 * worth configuring.  Returns 1 if so, 0 if too many components are
 * missing or stale for the set's parity type.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* The authoritative mod_counter is the highest one found among
	   the members; components carrying a lower count are stale
	   (e.g. previously failed disks) and are not counted as live
	   in the column scan below. */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for a component whose label carries the
	   set's current mod_counter. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd half of a
				   mirror pair without bailing.. reset
				   the even_pair_failed flag, and go on
				   to the next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4 and 5 tolerate
	   at most one.  (RAID 1 was handled fully in the loop above.) */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3271 
3272 void
3273 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3274 			RF_Raid_t *raidPtr)
3275 {
3276 	RF_ComponentLabel_t *clabel;
3277 	int i;
3278 
3279 	clabel = ac->clabel;
3280 
3281 	/* 1. Fill in the common stuff */
3282 	config->numCol = clabel->num_columns;
3283 	config->numSpare = 0; /* XXX should this be set here? */
3284 	config->sectPerSU = clabel->sectPerSU;
3285 	config->SUsPerPU = clabel->SUsPerPU;
3286 	config->SUsPerRU = clabel->SUsPerRU;
3287 	config->parityConfig = clabel->parityConfig;
3288 	/* XXX... */
3289 	strcpy(config->diskQueueType,"fifo");
3290 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3291 	config->layoutSpecificSize = 0; /* XXX ?? */
3292 
3293 	while(ac!=NULL) {
3294 		/* row/col values will be in range due to the checks
3295 		   in reasonable_label() */
3296 		strcpy(config->devnames[0][ac->clabel->column],
3297 		       ac->devname);
3298 		ac = ac->next;
3299 	}
3300 
3301 	for(i=0;i<RF_MAXDBGV;i++) {
3302 		config->debugVars[i][0] = 0;
3303 	}
3304 }
3305 
3306 int
3307 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3308 {
3309 	RF_ComponentLabel_t *clabel;
3310 	int column;
3311 	int sparecol;
3312 
3313 	raidPtr->autoconfigure = new_value;
3314 
3315 	for(column=0; column<raidPtr->numCol; column++) {
3316 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3317 			clabel = raidget_component_label(raidPtr, column);
3318 			clabel->autoconfigure = new_value;
3319 			raidflush_component_label(raidPtr, column);
3320 		}
3321 	}
3322 	for(column = 0; column < raidPtr->numSpare ; column++) {
3323 		sparecol = raidPtr->numCol + column;
3324 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3325 			clabel = raidget_component_label(raidPtr, sparecol);
3326 			clabel->autoconfigure = new_value;
3327 			raidflush_component_label(raidPtr, sparecol);
3328 		}
3329 	}
3330 	return(new_value);
3331 }
3332 
3333 int
3334 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3335 {
3336 	RF_ComponentLabel_t *clabel;
3337 	int column;
3338 	int sparecol;
3339 
3340 	raidPtr->root_partition = new_value;
3341 	for(column=0; column<raidPtr->numCol; column++) {
3342 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3343 			clabel = raidget_component_label(raidPtr, column);
3344 			clabel->root_partition = new_value;
3345 			raidflush_component_label(raidPtr, column);
3346 		}
3347 	}
3348 	for(column = 0; column < raidPtr->numSpare ; column++) {
3349 		sparecol = raidPtr->numCol + column;
3350 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3351 			clabel = raidget_component_label(raidPtr, sparecol);
3352 			clabel->root_partition = new_value;
3353 			raidflush_component_label(raidPtr, sparecol);
3354 		}
3355 	}
3356 	return(new_value);
3357 }
3358 
3359 void
3360 rf_release_all_vps(RF_ConfigSet_t *cset)
3361 {
3362 	RF_AutoConfig_t *ac;
3363 
3364 	ac = cset->ac;
3365 	while(ac!=NULL) {
3366 		/* Close the vp, and give it back */
3367 		if (ac->vp) {
3368 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3369 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3370 			vput(ac->vp);
3371 			ac->vp = NULL;
3372 		}
3373 		ac = ac->next;
3374 	}
3375 }
3376 
3377 
3378 void
3379 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3380 {
3381 	RF_AutoConfig_t *ac;
3382 	RF_AutoConfig_t *next_ac;
3383 
3384 	ac = cset->ac;
3385 	while(ac!=NULL) {
3386 		next_ac = ac->next;
3387 		/* nuke the label */
3388 		free(ac->clabel, M_RAIDFRAME);
3389 		/* cleanup the config structure */
3390 		free(ac, M_RAIDFRAME);
3391 		/* "next.." */
3392 		ac = next_ac;
3393 	}
3394 	/* and, finally, nuke the config set */
3395 	free(cset, M_RAIDFRAME);
3396 }
3397 
3398 
/*
 * Populate a component label from the current state of the RAID set.
 * The caller supplies the label storage; per-component fields such as
 * the column number are not set here.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* Geometry of the set.  num_rows is always 1 nowadays. */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	/* Fill in the parity-map portion of the label as well. */
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3431 
/*
 * Configure one autodetected RAID set.  Returns the configured softc
 * on success, or NULL if rf_Configure() failed.  The RF_Config_t
 * scratch structure is allocated and freed locally; 'cset' and its
 * component list remain owned by the caller.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Walk upward from the preferred unit until we find one that is
	   unused (sc == NULL) or not yet configured (valid == 0). */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* No softc at the chosen unit yet: allocate one now. */
	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		/* Configuration succeeded; attach the disk device. */
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* Configuration failed: release the softc we claimed. */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3503 
/*
 * Initialize a pool of 'size'-byte objects sleeping on 'w_chan',
 * pre-allocate xmin items, and cap the free-item high-water mark
 * at xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
}
3513 
3514 /*
3515  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3516  * to see if there is IO pending and if that IO could possibly be done
3517  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
3518  * otherwise.
3519  *
3520  */
3521 int
3522 rf_buf_queue_check(RF_Raid_t *raidPtr)
3523 {
3524 	struct raid_softc *rs;
3525 	struct dk_softc *dksc;
3526 
3527 	rs = raidPtr->softc;
3528 	dksc = &rs->sc_dksc;
3529 
3530 	if ((rs->sc_flags & RAIDF_INITED) == 0)
3531 		return 1;
3532 
3533 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3534 		/* there is work to do */
3535 		return 0;
3536 	}
3537 	/* default is nothing to do */
3538 	return 1;
3539 }
3540 
3541 int
3542 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3543 {
3544 	uint64_t numsecs;
3545 	unsigned secsize;
3546 	int error;
3547 
3548 	error = getdisksize(vp, &numsecs, &secsize);
3549 	if (error == 0) {
3550 		diskPtr->blockSize = secsize;
3551 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
3552 		diskPtr->partitionSize = numsecs;
3553 		return 0;
3554 	}
3555 	return error;
3556 }
3557 
/*
 * Autoconfiguration match function: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3563 
/*
 * Attach is deliberately empty; NOTE(review): real setup appears to
 * happen when a set is configured (raidinit() path) — the device_t
 * seems to exist only to anchor the softc.  Confirm against callers.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
3568 
3569 
3570 static int
3571 raid_detach(device_t self, int flags)
3572 {
3573 	int error;
3574 	struct raid_softc *rs = raidsoftc(self);
3575 
3576 	if (rs == NULL)
3577 		return ENXIO;
3578 
3579 	if ((error = raidlock(rs)) != 0)
3580 		return error;
3581 
3582 	error = raid_detach_unlocked(rs);
3583 
3584 	raidunlock(rs);
3585 
3586 	/* XXX raid can be referenced here */
3587 
3588 	if (error)
3589 		return error;
3590 
3591 	/* Free the softc */
3592 	raidput(rs);
3593 
3594 	return 0;
3595 }
3596 
3597 static void
3598 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
3599 {
3600 	struct dk_softc *dksc = &rs->sc_dksc;
3601 	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
3602 
3603 	memset(dg, 0, sizeof(*dg));
3604 
3605 	dg->dg_secperunit = raidPtr->totalSectors;
3606 	dg->dg_secsize = raidPtr->bytesPerSector;
3607 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
3608 	dg->dg_ntracks = 4 * raidPtr->numCol;
3609 
3610 	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
3611 }
3612 
3613 /*
3614  * Get cache info for all the components (including spares).
3615  * Returns intersection of all the cache flags of all disks, or first
3616  * error if any encountered.
3617  * XXXfua feature flags can change as spares are added - lock down somehow
3618  */
3619 static int
3620 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3621 {
3622 	int c;
3623 	int error;
3624 	int dkwhole = 0, dkpart;
3625 
3626 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3627 		/*
3628 		 * Check any non-dead disk, even when currently being
3629 		 * reconstructed.
3630 		 */
3631 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
3632 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
3633 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3634 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
3635 			if (error) {
3636 				if (error != ENODEV) {
3637 					printf("raid%d: get cache for component %s failed\n",
3638 					    raidPtr->raidid,
3639 					    raidPtr->Disks[c].devname);
3640 				}
3641 
3642 				return error;
3643 			}
3644 
3645 			if (c == 0)
3646 				dkwhole = dkpart;
3647 			else
3648 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3649 		}
3650 	}
3651 
3652 	*data = dkwhole;
3653 
3654 	return 0;
3655 }
3656 
3657 /*
3658  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3659  * We end up returning whatever error was returned by the first cache flush
3660  * that fails.
3661  */
3662 
3663 static int
3664 rf_sync_component_cache(RF_Raid_t *raidPtr, int c)
3665 {
3666 	int force = 1;
3667 	int e = 0;
3668 	for (int i = 0; i < 5; i++) {
3669 		e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3670 		    &force, FWRITE, NOCRED);
3671 		if (!e || e == ENODEV)
3672 			return e;
3673 		printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
3674 		    raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
3675 	}
3676 	return e;
3677 }
3678 
3679 int
3680 rf_sync_component_caches(RF_Raid_t *raidPtr)
3681 {
3682 	int c, error;
3683 
3684 	error = 0;
3685 	for (c = 0; c < raidPtr->numCol; c++) {
3686 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
3687 			int e = rf_sync_component_cache(raidPtr, c);
3688 			if (e && !error)
3689 				error = e;
3690 		}
3691 	}
3692 
3693 	for (c = 0; c < raidPtr->numSpare ; c++) {
3694 		int sparecol = raidPtr->numCol + c;
3695 		/* Need to ensure that the reconstruct actually completed! */
3696 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3697 			int e = rf_sync_component_cache(raidPtr, sparecol);
3698 			if (e && !error)
3699 				error = e;
3700 		}
3701 	}
3702 	return error;
3703 }
3704 
3705 /* Fill in info with the current status */
3706 void
3707 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3708 {
3709 
3710 	if (raidPtr->status != rf_rs_reconstructing) {
3711 		info->total = 100;
3712 		info->completed = 100;
3713 	} else {
3714 		info->total = raidPtr->reconControl->numRUsTotal;
3715 		info->completed = raidPtr->reconControl->numRUsComplete;
3716 	}
3717 	info->remaining = info->total - info->completed;
3718 }
3719 
3720 /* Fill in info with the current status */
3721 void
3722 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3723 {
3724 
3725 	if (raidPtr->parity_rewrite_in_progress == 1) {
3726 		info->total = raidPtr->Layout.numStripe;
3727 		info->completed = raidPtr->parity_rewrite_stripes_done;
3728 	} else {
3729 		info->completed = 100;
3730 		info->total = 100;
3731 	}
3732 	info->remaining = info->total - info->completed;
3733 }
3734 
3735 /* Fill in info with the current status */
3736 void
3737 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3738 {
3739 
3740 	if (raidPtr->copyback_in_progress == 1) {
3741 		info->total = raidPtr->Layout.numStripe;
3742 		info->completed = raidPtr->copyback_stripes_done;
3743 		info->remaining = info->total - info->completed;
3744 	} else {
3745 		info->remaining = 0;
3746 		info->completed = 100;
3747 		info->total = 100;
3748 	}
3749 }
3750 
3751 /* Fill in config with the current info */
3752 int
3753 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
3754 {
3755 	int	d, i, j;
3756 
3757 	if (!raidPtr->valid)
3758 		return ENODEV;
3759 	config->cols = raidPtr->numCol;
3760 	config->ndevs = raidPtr->numCol;
3761 	if (config->ndevs >= RF_MAX_DISKS)
3762 		return ENOMEM;
3763 	config->nspares = raidPtr->numSpare;
3764 	if (config->nspares >= RF_MAX_DISKS)
3765 		return ENOMEM;
3766 	config->maxqdepth = raidPtr->maxQueueDepth;
3767 	d = 0;
3768 	for (j = 0; j < config->cols; j++) {
3769 		config->devs[d] = raidPtr->Disks[j];
3770 		d++;
3771 	}
3772 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
3773 		config->spares[i] = raidPtr->Disks[j];
3774 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
3775 			/* XXX: raidctl(8) expects to see this as a used spare */
3776 			config->spares[i].status = rf_ds_used_spare;
3777 		}
3778 	}
3779 	return 0;
3780 }
3781 
3782 int
3783 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
3784 {
3785 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
3786 	RF_ComponentLabel_t *raid_clabel;
3787 	int column = clabel->column;
3788 
3789 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
3790 		return EINVAL;
3791 	raid_clabel = raidget_component_label(raidPtr, column);
3792 	memcpy(clabel, raid_clabel, sizeof *clabel);
3793 
3794 	return 0;
3795 }
3796 
3797 /*
3798  * Module interface
3799  */
3800 
3801 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
3802 
3803 #ifdef _MODULE
3804 CFDRIVER_DECL(raid, DV_DISK, NULL);
3805 #endif
3806 
3807 static int raid_modcmd(modcmd_t, void *);
3808 static int raid_modcmd_init(void);
3809 static int raid_modcmd_fini(void);
3810 
3811 static int
3812 raid_modcmd(modcmd_t cmd, void *data)
3813 {
3814 	int error;
3815 
3816 	error = 0;
3817 	switch (cmd) {
3818 	case MODULE_CMD_INIT:
3819 		error = raid_modcmd_init();
3820 		break;
3821 	case MODULE_CMD_FINI:
3822 		error = raid_modcmd_fini();
3823 		break;
3824 	default:
3825 		error = ENOTTY;
3826 		break;
3827 	}
3828 	return error;
3829 }
3830 
/*
 * Module initialization: attach the devsw, cfdriver and cfattach
 * entries, boot the RAIDframe core, and register a config finalizer
 * that will autoconfigure RAID sets once all real hardware has been
 * found.  Each failure path rolls back the registrations already made,
 * in reverse order.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets the kernel pick the majors.  EEXIST is tolerated --
	   presumably the entries already exist in a static kernel;
	   TODO confirm. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw registration. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back the cfdriver and devsw registrations. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is necessarily 0 here; all failure paths returned above. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Autoconfiguration is lost, but the driver is still
		   usable; do not fail the module load. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
3901 
/*
 * Module finalization: refuse to unload while any raid unit exists,
 * then undo raid_modcmd_init()'s registrations in reverse order.  If a
 * later detach step fails, the steps already undone are re-registered
 * so the module is left in a consistent, usable state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Re-attach what was already detached. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Re-attach what was already detached. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
3951