xref: /netbsd-src/sys/dev/raidframe/rf_netbsdkintf.c (revision e6c7e151de239c49d2e38720a061ed9d1fa99309)
1 /*	$NetBSD: rf_netbsdkintf.c,v 1.381 2020/03/21 06:02:13 riastradh Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Greg Oster; Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1988 University of Utah.
34  * Copyright (c) 1990, 1993
35  *      The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed to Berkeley by
38  * the Systems Programming Group of the University of Utah Computer
39  * Science Department.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
66  *
67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
68  */
69 
70 /*
71  * Copyright (c) 1995 Carnegie-Mellon University.
72  * All rights reserved.
73  *
74  * Authors: Mark Holland, Jim Zelenka
75  *
76  * Permission to use, copy, modify and distribute this software and
77  * its documentation is hereby granted, provided that both the copyright
78  * notice and this permission notice appear in all copies of the
79  * software, derivative works or modified versions, and any portions
80  * thereof, and that both notices appear in supporting documentation.
81  *
82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85  *
86  * Carnegie Mellon requests users of this software to return to
87  *
88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
89  *  School of Computer Science
90  *  Carnegie Mellon University
91  *  Pittsburgh PA 15213-3890
92  *
93  * any improvements or extensions that they make and grant Carnegie the
94  * rights to redistribute these changes.
95  */
96 
97 /***********************************************************
98  *
99  * rf_kintf.c -- the kernel interface routines for RAIDframe
100  *
101  ***********************************************************/
102 
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.381 2020/03/21 06:02:13 riastradh Exp $");
105 
106 #ifdef _KERNEL_OPT
107 #include "opt_raid_autoconfig.h"
108 #include "opt_compat_netbsd32.h"
109 #endif
110 
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130 #include <sys/compat_stub.h>
131 
132 #include <prop/proplib.h>
133 
134 #include <dev/raidframe/raidframevar.h>
135 #include <dev/raidframe/raidframeio.h>
136 #include <dev/raidframe/rf_paritymap.h>
137 
138 #include "rf_raid.h"
139 #include "rf_copyback.h"
140 #include "rf_dag.h"
141 #include "rf_dagflags.h"
142 #include "rf_desc.h"
143 #include "rf_diskqueue.h"
144 #include "rf_etimer.h"
145 #include "rf_general.h"
146 #include "rf_kintf.h"
147 #include "rf_options.h"
148 #include "rf_driver.h"
149 #include "rf_parityscan.h"
150 #include "rf_threadstuff.h"
151 
152 #include "ioconf.h"
153 
154 #ifdef DEBUG
155 int     rf_kdebug_level = 0;
156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
157 #else				/* DEBUG */
158 #define db1_printf(a) { }
159 #endif				/* DEBUG */
160 
161 #ifdef DEBUG_ROOT
162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
163 #else
164 #define DPRINTF(a, ...)
165 #endif
166 
167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
168 static rf_declare_mutex2(rf_sparet_wait_mutex);
169 static rf_declare_cond2(rf_sparet_wait_cv);
170 static rf_declare_cond2(rf_sparet_resp_cv);
171 
172 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
173 						 * spare table */
174 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
175 						 * installation process */
176 #endif
177 
178 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
179 
180 /* prototypes */
181 static void KernelWakeupFunc(struct buf *);
182 static void InitBP(struct buf *, struct vnode *, unsigned,
183     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
184     void *, int, struct proc *);
185 static void raidinit(struct raid_softc *);
186 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
187 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
188 
189 static int raid_match(device_t, cfdata_t, void *);
190 static void raid_attach(device_t, device_t, void *);
191 static int raid_detach(device_t, int);
192 
193 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
194     daddr_t, daddr_t);
195 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
196     daddr_t, daddr_t, int);
197 
198 static int raidwrite_component_label(unsigned,
199     dev_t, struct vnode *, RF_ComponentLabel_t *);
200 static int raidread_component_label(unsigned,
201     dev_t, struct vnode *, RF_ComponentLabel_t *);
202 
203 static int raid_diskstart(device_t, struct buf *bp);
204 static int raid_dumpblocks(device_t, void *, daddr_t, int);
205 static int raid_lastclose(device_t);
206 
207 static dev_type_open(raidopen);
208 static dev_type_close(raidclose);
209 static dev_type_read(raidread);
210 static dev_type_write(raidwrite);
211 static dev_type_ioctl(raidioctl);
212 static dev_type_strategy(raidstrategy);
213 static dev_type_dump(raiddump);
214 static dev_type_size(raidsize);
215 
/* Block-device switch: entry points used when raidN is accessed as a
 * block device.  D_DISK marks it as a disk-class device. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/* Character-device switch: raw (rraidN) access.  Read/write go through
 * physio() via raidread/raidwrite; tty/poll/mmap are not supported. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/* Hooks handed to the dk(4) disk framework; it calls back into these
 * for queue start, dumps and final close. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
251 
252 #define	raidunit(x)	DISKUNIT(x)
253 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
254 
255 extern struct cfdriver raid_cd;
256 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
257     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
258     DVF_DETACH_SHUTDOWN);
259 
260 /* Internal representation of a rf_recon_req */
261 struct rf_recon_req_internal {
262 	RF_RowCol_t col;
263 	RF_ReconReqFlags_t flags;
264 	void   *raidPtr;
265 };
266 
267 /*
268  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
269  * Be aware that large numbers can allow the driver to consume a lot of
270  * kernel memory, especially on writes, and in degraded mode reads.
271  *
272  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
273  * a single 64K write will typically require 64K for the old data,
274  * 64K for the old parity, and 64K for the new parity, for a total
275  * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
277  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
278  *
279  * Now in degraded mode, for example, a 64K read on the above setup may
280  * require data reconstruction, which will require *all* of the 4 remaining
281  * disks to participate -- 4 * 32K/disk == 128K again.
282  */
283 
284 #ifndef RAIDOUTSTANDING
285 #define RAIDOUTSTANDING   6
286 #endif
287 
288 #define RAIDLABELDEV(dev)	\
289 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
290 
291 /* declared here, and made public, for the benefit of KVM stuff.. */
292 
293 static int raidlock(struct raid_softc *);
294 static void raidunlock(struct raid_softc *);
295 
296 static int raid_detach_unlocked(struct raid_softc *);
297 
298 static void rf_markalldirty(RF_Raid_t *);
299 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
300 
301 void rf_ReconThread(struct rf_recon_req_internal *);
302 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
303 void rf_CopybackThread(RF_Raid_t *raidPtr);
304 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
305 int rf_autoconfig(device_t);
306 void rf_buildroothack(RF_ConfigSet_t *);
307 
308 RF_AutoConfig_t *rf_find_raid_components(void);
309 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
310 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
311 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
312 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
313 int rf_set_autoconfig(RF_Raid_t *, int);
314 int rf_set_rootpartition(RF_Raid_t *, int);
315 void rf_release_all_vps(RF_ConfigSet_t *);
316 void rf_cleanup_config_set(RF_ConfigSet_t *);
317 int rf_have_enough_components(RF_ConfigSet_t *);
318 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
319 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
320 
321 /*
322  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
323  * Note that this is overridden by having RAID_AUTOCONFIG as an option
324  * in the kernel config file.
325  */
326 #ifdef RAID_AUTOCONFIG
327 int raidautoconfig = 1;
328 #else
329 int raidautoconfig = 0;
330 #endif
331 static bool raidautoconfigdone = false;
332 
333 struct RF_Pools_s rf_pools;
334 
335 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
336 static kmutex_t raid_lock;
337 
338 static struct raid_softc *
339 raidcreate(int unit) {
340 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
341 	sc->sc_unit = unit;
342 	cv_init(&sc->sc_cv, "raidunit");
343 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
344 	return sc;
345 }
346 
347 static void
348 raiddestroy(struct raid_softc *sc) {
349 	cv_destroy(&sc->sc_cv);
350 	mutex_destroy(&sc->sc_mutex);
351 	kmem_free(sc, sizeof(*sc));
352 }
353 
354 static struct raid_softc *
355 raidget(int unit, bool create) {
356 	struct raid_softc *sc;
357 	if (unit < 0) {
358 #ifdef DIAGNOSTIC
359 		panic("%s: unit %d!", __func__, unit);
360 #endif
361 		return NULL;
362 	}
363 	mutex_enter(&raid_lock);
364 	LIST_FOREACH(sc, &raids, sc_link) {
365 		if (sc->sc_unit == unit) {
366 			mutex_exit(&raid_lock);
367 			return sc;
368 		}
369 	}
370 	mutex_exit(&raid_lock);
371 	if (!create)
372 		return NULL;
373 	sc = raidcreate(unit);
374 	mutex_enter(&raid_lock);
375 	LIST_INSERT_HEAD(&raids, sc, sc_link);
376 	mutex_exit(&raid_lock);
377 	return sc;
378 }
379 
380 static void
381 raidput(struct raid_softc *sc) {
382 	mutex_enter(&raid_lock);
383 	LIST_REMOVE(sc, sc_link);
384 	mutex_exit(&raid_lock);
385 	raiddestroy(sc);
386 }
387 
/*
 * Legacy pseudo-device attach hook; 'num' (the requested unit count)
 * is intentionally ignored.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
397 
398 int
399 rf_autoconfig(device_t self)
400 {
401 	RF_AutoConfig_t *ac_list;
402 	RF_ConfigSet_t *config_sets;
403 
404 	if (!raidautoconfig || raidautoconfigdone == true)
405 		return (0);
406 
407 	/* XXX This code can only be run once. */
408 	raidautoconfigdone = true;
409 
410 #ifdef __HAVE_CPU_BOOTCONF
411 	/*
412 	 * 0. find the boot device if needed first so we can use it later
413 	 * this needs to be done before we autoconfigure any raid sets,
414 	 * because if we use wedges we are not going to be able to open
415 	 * the boot device later
416 	 */
417 	if (booted_device == NULL)
418 		cpu_bootconf();
419 #endif
420 	/* 1. locate all RAID components on the system */
421 	aprint_debug("Searching for RAID components...\n");
422 	ac_list = rf_find_raid_components();
423 
424 	/* 2. Sort them into their respective sets. */
425 	config_sets = rf_create_auto_sets(ac_list);
426 
427 	/*
428 	 * 3. Evaluate each set and configure the valid ones.
429 	 * This gets done in rf_buildroothack().
430 	 */
431 	rf_buildroothack(config_sets);
432 
433 	return 1;
434 }
435 
436 int
437 rf_inited(const struct raid_softc *rs) {
438 	return (rs->sc_flags & RAIDF_INITED) != 0;
439 }
440 
/* Accessor: the RF_Raid_t embedded in a raid_softc. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
445 
/* Accessor: the unit number of a raid_softc. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
450 
451 static int
452 rf_containsboot(RF_Raid_t *r, device_t bdv) {
453 	const char *bootname;
454 	size_t len;
455 
456 	/* if bdv is NULL, the set can't contain it. exit early. */
457 	if (bdv == NULL)
458 		return 0;
459 
460 	bootname = device_xname(bdv);
461 	len = strlen(bootname);
462 
463 	for (int col = 0; col < r->numCol; col++) {
464 		const char *devname = r->Disks[col].devname;
465 		devname += sizeof("/dev/") - 1;
466 		if (strncmp(devname, "dk", 2) == 0) {
467 			const char *parent =
468 			    dkwedge_get_parent_name(r->Disks[col].dev);
469 			if (parent != NULL)
470 				devname = parent;
471 		}
472 		if (strncmp(devname, bootname, len) == 0) {
473 			struct raid_softc *sc = r->softc;
474 			aprint_debug("raid%d includes boot device %s\n",
475 			    sc->sc_unit, devname);
476 			return 1;
477 		}
478 	}
479 	return 0;
480 }
481 
/*
 * Configure every eligible autoconfig set, then try to determine the
 * root device.  If exactly one configured set is marked rootable, it
 * (or one of its wedges) becomes booted_device; if several are, the
 * one containing the boot device wins; otherwise the user is asked
 * (RB_ASKNAME).  An explicit rootspec disables all overriding.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;		/* count of configured, rootable sets */
	struct raid_softc *sc, *rsc;	/* rsc: last rootable set seen */
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	/* Pass 1: configure each complete set that requests autoconfig. */
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok, rootable %d\n",
				    sc->sc_unit, cset->rootable);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		/* Prefer a wedge child if the set carries wedges. */
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
			"contains_boot=%d",
		    __func__, booted_device, rsc->sc_r.root_partition,
			   rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Re-count, keeping only sets that contain the boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
624 
625 static int
626 raidsize(dev_t dev)
627 {
628 	struct raid_softc *rs;
629 	struct dk_softc *dksc;
630 	unsigned int unit;
631 
632 	unit = raidunit(dev);
633 	if ((rs = raidget(unit, false)) == NULL)
634 		return -1;
635 	dksc = &rs->sc_dksc;
636 
637 	if ((rs->sc_flags & RAIDF_INITED) == 0)
638 		return -1;
639 
640 	return dk_size(dksc, dev);
641 }
642 
643 static int
644 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
645 {
646 	unsigned int unit;
647 	struct raid_softc *rs;
648 	struct dk_softc *dksc;
649 
650 	unit = raidunit(dev);
651 	if ((rs = raidget(unit, false)) == NULL)
652 		return ENXIO;
653 	dksc = &rs->sc_dksc;
654 
655 	if ((rs->sc_flags & RAIDF_INITED) == 0)
656 		return ENODEV;
657 
658         /*
659            Note that blkno is relative to this particular partition.
660            By adding adding RF_PROTECTED_SECTORS, we get a value that
661 	   is relative to the partition used for the underlying component.
662         */
663 	blkno += RF_PROTECTED_SECTORS;
664 
665 	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
666 }
667 
/*
 * dkdriver d_dumpblocks hook: write 'nblk' blocks from 'va' starting
 * at component block 'blkno' during a crash dump.  Only RAID 1 sets
 * are supported; a single live (or spared) component is chosen and
 * its bdevsw dump entry is invoked directly.  Called with the set
 * locked for the duration via raidlock().
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	/* dumpto == -1 means "nothing chosen yet" */
	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* scol: which original column this spare replaces */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* dump straight through the chosen component's block device */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
773 
/*
 * Open the raid device.  Creates the softc on first reference, and on
 * the first open of a configured set marks all components dirty so a
 * crash is detectable.  Fails with EBUSY while a shutdown is pending.
 */
/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int     error = 0;
	int     part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* refuse new opens once a shutdown has been requested */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	/* hand the actual open to the dk framework */
	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return (error);


}
823 
824 static int
825 raid_lastclose(device_t self)
826 {
827 	struct raid_softc *rs = raidsoftc(self);
828 
829 	/* Last one... device is not unconfigured yet.
830 	   Device shutdown has taken care of setting the
831 	   clean bits if RAIDF_INITED is not set
832 	   mark things as clean... */
833 
834 	rf_update_component_labels(&rs->sc_r,
835 	    RF_FINAL_COMPONENT_UPDATE);
836 
837 	/* pass to unlocked code */
838 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
839 		rs->sc_flags |= RAIDF_DETACH;
840 
841 	return 0;
842 }
843 
/*
 * Close the raid device.  The dk framework handles reference counting;
 * any detach or teardown requested during the close (RAIDF_DETACH /
 * RAIDF_SHUTDOWN) is performed here, after the unit lock is dropped.
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		/* raid_lastclose() may have requested a detach */
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	/* detach/free must happen without the unit lock held */
	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
883 
/*
 * Poke the RAIDframe I/O-done thread: signal iodone_cv (under its
 * lock) so queued work is picked up again.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
891 
892 static void
893 raidstrategy(struct buf *bp)
894 {
895 	unsigned int unit;
896 	struct raid_softc *rs;
897 	struct dk_softc *dksc;
898 	RF_Raid_t *raidPtr;
899 
900 	unit = raidunit(bp->b_dev);
901 	if ((rs = raidget(unit, false)) == NULL) {
902 		bp->b_error = ENXIO;
903 		goto fail;
904 	}
905 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
906 		bp->b_error = ENXIO;
907 		goto fail;
908 	}
909 	dksc = &rs->sc_dksc;
910 	raidPtr = &rs->sc_r;
911 
912 	/* Queue IO only */
913 	if (dk_strategy_defer(dksc, bp))
914 		goto done;
915 
916 	/* schedule the IO to happen at the next convenient time */
917 	raid_wakeup(raidPtr);
918 
919 done:
920 	return;
921 
922 fail:
923 	bp->b_resid = bp->b_bcount;
924 	biodone(bp);
925 }
926 
927 static int
928 raid_diskstart(device_t dev, struct buf *bp)
929 {
930 	struct raid_softc *rs = raidsoftc(dev);
931 	RF_Raid_t *raidPtr;
932 
933 	raidPtr = &rs->sc_r;
934 	if (!raidPtr->valid) {
935 		db1_printf(("raid is not valid..\n"));
936 		return ENODEV;
937 	}
938 
939 	/* XXX */
940 	bp->b_resid = 0;
941 
942 	return raiddoaccess(raidPtr, bp);
943 }
944 
945 void
946 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
947 {
948 	struct raid_softc *rs;
949 	struct dk_softc *dksc;
950 
951 	rs = raidPtr->softc;
952 	dksc = &rs->sc_dksc;
953 
954 	dk_done(dksc, bp);
955 
956 	rf_lock_mutex2(raidPtr->mutex);
957 	raidPtr->openings++;
958 	rf_unlock_mutex2(raidPtr->mutex);
959 
960 	/* schedule more IO */
961 	raid_wakeup(raidPtr);
962 }
963 
964 /* ARGSUSED */
965 static int
966 raidread(dev_t dev, struct uio *uio, int flags)
967 {
968 	int     unit = raidunit(dev);
969 	struct raid_softc *rs;
970 
971 	if ((rs = raidget(unit, false)) == NULL)
972 		return ENXIO;
973 
974 	if ((rs->sc_flags & RAIDF_INITED) == 0)
975 		return (ENXIO);
976 
977 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
978 
979 }
980 
981 /* ARGSUSED */
982 static int
983 raidwrite(dev_t dev, struct uio *uio, int flags)
984 {
985 	int     unit = raidunit(dev);
986 	struct raid_softc *rs;
987 
988 	if ((rs = raidget(unit, false)) == NULL)
989 		return ENXIO;
990 
991 	if ((rs->sc_flags & RAIDF_INITED) == 0)
992 		return (ENXIO);
993 
994 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
995 
996 }
997 
/*
 * Tear down a configured set: shut down RAIDframe, drain and free the
 * buffer queue, and detach the disk from the dk framework.  Caller
 * holds the unit lock (raidlock).  Fails with EBUSY while the device
 * is open or a reconstruction/parity-rewrite/copyback is running.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* nothing to tear down if never configured */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
1035 
/*
 * Return true when 'cmd' is an ioctl that requires a configured set
 * but RAIDF_INITED is NOT set — i.e. "must be initialized, and isn't".
 * The caller uses a true result to reject the ioctl early.  Commands
 * not listed here are allowed regardless of configuration state.
 */
static bool
rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
{
	switch (cmd) {
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_SET_ROOT:
		/* true only when the set is NOT yet configured */
		return (rs->sc_flags & RAIDF_INITED) == 0;
	}
	return false;
}
1073 
/*
 * rf_fail_disk: administratively fail the component in column rr->col
 * and start a reconstruction thread.  Returns 0 when the thread was
 * created, EINVAL if the set has no redundancy, the column is out of
 * range, or the set's current state forbids failing the disk, and
 * ENOMEM if the request copy cannot be allocated.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	/* State checks below must be made with the array mutex held. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	/* rrint is handed off to rf_ReconThread (presumably freed
	 * there -- confirm in rf_ReconThread). */
	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	/* All rejection paths above arrive here with the mutex held. */
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
1122 
1123 static int
1124 rf_copyinspecificbuf(RF_Config_t *k_cfg)
1125 {
1126 	/* allocate a buffer for the layout-specific data, and copy it in */
1127 	if (k_cfg->layoutSpecificSize == 0)
1128 		return 0;
1129 
1130 	if (k_cfg->layoutSpecificSize > 10000) {
1131 	    /* sanity check */
1132 	    return EINVAL;
1133 	}
1134 
1135 	u_char *specific_buf;
1136 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
1137 	if (specific_buf == NULL)
1138 		return ENOMEM;
1139 
1140 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1141 	    k_cfg->layoutSpecificSize);
1142 	if (retcode) {
1143 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1144 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
1145 		return retcode;
1146 	}
1147 
1148 	k_cfg->layoutSpecific = specific_buf;
1149 	return 0;
1150 }
1151 
1152 static int
1153 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
1154 {
1155 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
1156 
1157 	if (rs->sc_r.valid) {
1158 		/* There is a valid RAID set running on this unit! */
1159 		printf("raid%d: Device already configured!\n", rs->sc_unit);
1160 		return EINVAL;
1161 	}
1162 
1163 	/* copy-in the configuration information */
1164 	/* data points to a pointer to the configuration structure */
1165 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
1166 	if (*k_cfg == NULL) {
1167 		return ENOMEM;
1168 	}
1169 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
1170 	if (retcode == 0)
1171 		return 0;
1172 	RF_Free(*k_cfg, sizeof(RF_Config_t));
1173 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
1174 	rs->sc_flags |= RAIDF_SHUTDOWN;
1175 	return retcode;
1176 }
1177 
/*
 * rf_construct: configure the RAIDframe engine for `rs' from the
 * kernel copy of the user's configuration, `k_cfg'.  Consumes k_cfg
 * (and its layout-specific buffer) on every path.  On failure,
 * RAIDF_SHUTDOWN is set so the device is detached when closed.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/* Bring the layout-specific data in from userland. */
	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 *  there is no stale data left in the case of a
	 *  reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		/* Attach the pseudo-device and disk state. */
		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
1230 
1231 #if RF_DISABLED
/*
 * rf_set_component_label: overwrite the in-core component label for
 * one column with a user-supplied copy, then flush it to disk.
 * Compiled out (RF_DISABLED, see surrounding #if): the actual update
 * is still considered unsafe -- see the XXX comments below.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	/* Only a single row is supported. */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
1268 #endif
1269 
/*
 * rf_init_component_label: (re)write the component labels of every
 * live component.  Only the serial number is taken from the caller's
 * `clabel'; everything else is regenerated from the running
 * configuration.  Always returns 0.
 */
static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	   we only want the serial number from
	   the above.  We get all the rest of the information
	   from the config that was used to create this RAID
	   set.
	   */

	raidPtr->serial_number = clabel->serial_number;

	for (int column = 0; column < raidPtr->numCol; column++) {
		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
		/* Skip failed/absent components. */
		if (RF_DEAD_DISK(diskPtr->status))
			continue;
		RF_ComponentLabel_t *ci_label = raidget_component_label(
		    raidPtr, column);
		/* Zeroing this is important. */
		memset(ci_label, 0, sizeof(*ci_label));
		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = raidPtr->serial_number;
		ci_label->row = 0; /* we dont' pretend to support more */
		rf_component_label_set_partitionsize(ci_label,
		    diskPtr->partitionSize);
		ci_label->column = column;
		/* Push the label out to the component. */
		raidflush_component_label(raidPtr, column);
		/* XXXjld what about the spares? */
	}

	return 0;
}
1302 
/*
 * rf_rebuild_in_place: reconstruct the component named by
 * `componentPtr' onto itself (in place) by spawning a reconstruction
 * thread.  Returns 0 when the thread was created, EINVAL for RAID 0,
 * an in-progress recon, a bad column, or a state that forbids the
 * rebuild, and ENOMEM on allocation failure.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* Work on a private copy of the user's request. */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	/* Component status checks require the array mutex. */
	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		/* Cannot rebuild onto a component already replaced by
		 * a spare. */
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	/* rrint is handed off to the thread (presumably freed there --
	 * confirm in rf_ReconstructInPlaceThread). */
	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
1370 
1371 static int
1372 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
1373 {
1374 	/*
1375 	 * This makes no sense on a RAID 0, or if we are not reconstructing
1376 	 * so tell the user it's done.
1377 	 */
1378 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
1379 	    raidPtr->status != rf_rs_reconstructing) {
1380 		*data = 100;
1381 		return 0;
1382 	}
1383 	if (raidPtr->reconControl->numRUsTotal == 0) {
1384 		*data = 0;
1385 		return 0;
1386 	}
1387 	*data = (raidPtr->reconControl->numRUsComplete * 100
1388 	    / raidPtr->reconControl->numRUsTotal);
1389 	return 0;
1390 }
1391 
1392 static int
1393 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1394 {
1395 	int     unit = raidunit(dev);
1396 	int     part, pmask;
1397 	struct raid_softc *rs;
1398 	struct dk_softc *dksc;
1399 	RF_Config_t *k_cfg;
1400 	RF_Raid_t *raidPtr;
1401 	RF_AccTotals_t *totals;
1402 	RF_SingleComponent_t component;
1403 	RF_DeviceConfig_t *d_cfg, *ucfgp;
1404 	int retcode = 0;
1405 	int column;
1406 	RF_ComponentLabel_t *clabel;
1407 	RF_SingleComponent_t *sparePtr,*componentPtr;
1408 	int d;
1409 
1410 	if ((rs = raidget(unit, false)) == NULL)
1411 		return ENXIO;
1412 
1413 	dksc = &rs->sc_dksc;
1414 	raidPtr = &rs->sc_r;
1415 
1416 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1417 	    (int) DISKPART(dev), (int) unit, cmd));
1418 
1419 	/* Must be initialized for these... */
1420 	if (rf_must_be_initialized(rs, cmd))
1421 		return ENXIO;
1422 
1423 	switch (cmd) {
1424 		/* configure the system */
1425 	case RAIDFRAME_CONFIGURE:
1426 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
1427 			return retcode;
1428 		return rf_construct(rs, k_cfg);
1429 
1430 		/* shutdown the system */
1431 	case RAIDFRAME_SHUTDOWN:
1432 
1433 		part = DISKPART(dev);
1434 		pmask = (1 << part);
1435 
1436 		if ((retcode = raidlock(rs)) != 0)
1437 			return retcode;
1438 
1439 		if (DK_BUSY(dksc, pmask) ||
1440 		    raidPtr->recon_in_progress != 0 ||
1441 		    raidPtr->parity_rewrite_in_progress != 0 ||
1442 		    raidPtr->copyback_in_progress != 0)
1443 			retcode = EBUSY;
1444 		else {
1445 			/* detach and free on close */
1446 			rs->sc_flags |= RAIDF_SHUTDOWN;
1447 			retcode = 0;
1448 		}
1449 
1450 		raidunlock(rs);
1451 
1452 		return retcode;
1453 	case RAIDFRAME_GET_COMPONENT_LABEL:
1454 		return rf_get_component_label(raidPtr, data);
1455 
1456 #if RF_DISABLED
1457 	case RAIDFRAME_SET_COMPONENT_LABEL:
1458 		return rf_set_component_label(raidPtr, data);
1459 #endif
1460 
1461 	case RAIDFRAME_INIT_LABELS:
1462 		return rf_init_component_label(raidPtr, data);
1463 
1464 	case RAIDFRAME_SET_AUTOCONFIG:
1465 		d = rf_set_autoconfig(raidPtr, *(int *) data);
1466 		printf("raid%d: New autoconfig value is: %d\n",
1467 		       raidPtr->raidid, d);
1468 		*(int *) data = d;
1469 		return retcode;
1470 
1471 	case RAIDFRAME_SET_ROOT:
1472 		d = rf_set_rootpartition(raidPtr, *(int *) data);
1473 		printf("raid%d: New rootpartition value is: %d\n",
1474 		       raidPtr->raidid, d);
1475 		*(int *) data = d;
1476 		return retcode;
1477 
1478 		/* initialize all parity */
1479 	case RAIDFRAME_REWRITEPARITY:
1480 
1481 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1482 			/* Parity for RAID 0 is trivially correct */
1483 			raidPtr->parity_good = RF_RAID_CLEAN;
1484 			return 0;
1485 		}
1486 
1487 		if (raidPtr->parity_rewrite_in_progress == 1) {
1488 			/* Re-write is already in progress! */
1489 			return EINVAL;
1490 		}
1491 
1492 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1493 		    rf_RewriteParityThread, raidPtr,"raid_parity");
1494 
1495 	case RAIDFRAME_ADD_HOT_SPARE:
1496 		sparePtr = (RF_SingleComponent_t *) data;
1497 		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
1498 		return rf_add_hot_spare(raidPtr, &component);
1499 
1500 	case RAIDFRAME_REMOVE_HOT_SPARE:
1501 		return retcode;
1502 
1503 	case RAIDFRAME_DELETE_COMPONENT:
1504 		componentPtr = (RF_SingleComponent_t *)data;
1505 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
1506 		return rf_delete_component(raidPtr, &component);
1507 
1508 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
1509 		componentPtr = (RF_SingleComponent_t *)data;
1510 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
1511 		return rf_incorporate_hot_spare(raidPtr, &component);
1512 
1513 	case RAIDFRAME_REBUILD_IN_PLACE:
1514 		return rf_rebuild_in_place(raidPtr, data);
1515 
1516 	case RAIDFRAME_GET_INFO:
1517 		ucfgp = *(RF_DeviceConfig_t **)data;
1518 		d_cfg = RF_Malloc(sizeof(*d_cfg));
1519 		if (d_cfg == NULL)
1520 			return ENOMEM;
1521 		retcode = rf_get_info(raidPtr, d_cfg);
1522 		if (retcode == 0) {
1523 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
1524 		}
1525 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1526 		return retcode;
1527 
1528 	case RAIDFRAME_CHECK_PARITY:
1529 		*(int *) data = raidPtr->parity_good;
1530 		return 0;
1531 
1532 	case RAIDFRAME_PARITYMAP_STATUS:
1533 		if (rf_paritymap_ineligible(raidPtr))
1534 			return EINVAL;
1535 		rf_paritymap_status(raidPtr->parity_map, data);
1536 		return 0;
1537 
1538 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
1539 		if (rf_paritymap_ineligible(raidPtr))
1540 			return EINVAL;
1541 		if (raidPtr->parity_map == NULL)
1542 			return ENOENT; /* ??? */
1543 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
1544 			return EINVAL;
1545 		return 0;
1546 
1547 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
1548 		if (rf_paritymap_ineligible(raidPtr))
1549 			return EINVAL;
1550 		*(int *) data = rf_paritymap_get_disable(raidPtr);
1551 		return 0;
1552 
1553 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
1554 		if (rf_paritymap_ineligible(raidPtr))
1555 			return EINVAL;
1556 		rf_paritymap_set_disable(raidPtr, *(int *)data);
1557 		/* XXX should errors be passed up? */
1558 		return 0;
1559 
1560 	case RAIDFRAME_RESET_ACCTOTALS:
1561 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1562 		return 0;
1563 
1564 	case RAIDFRAME_GET_ACCTOTALS:
1565 		totals = (RF_AccTotals_t *) data;
1566 		*totals = raidPtr->acc_totals;
1567 		return 0;
1568 
1569 	case RAIDFRAME_KEEP_ACCTOTALS:
1570 		raidPtr->keep_acc_totals = *(int *)data;
1571 		return 0;
1572 
1573 	case RAIDFRAME_GET_SIZE:
1574 		*(int *) data = raidPtr->totalSectors;
1575 		return 0;
1576 
1577 	case RAIDFRAME_FAIL_DISK:
1578 		return rf_fail_disk(raidPtr, data);
1579 
1580 		/* invoke a copyback operation after recon on whatever disk
1581 		 * needs it, if any */
1582 	case RAIDFRAME_COPYBACK:
1583 
1584 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1585 			/* This makes no sense on a RAID 0!! */
1586 			return EINVAL;
1587 		}
1588 
1589 		if (raidPtr->copyback_in_progress == 1) {
1590 			/* Copyback is already in progress! */
1591 			return EINVAL;
1592 		}
1593 
1594 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
1595 		    rf_CopybackThread, raidPtr, "raid_copyback");
1596 
1597 		/* return the percentage completion of reconstruction */
1598 	case RAIDFRAME_CHECK_RECON_STATUS:
1599 		return rf_check_recon_status(raidPtr, data);
1600 
1601 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1602 		rf_check_recon_status_ext(raidPtr, data);
1603 		return 0;
1604 
1605 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1606 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1607 			/* This makes no sense on a RAID 0, so tell the
1608 			   user it's done. */
1609 			*(int *) data = 100;
1610 			return 0;
1611 		}
1612 		if (raidPtr->parity_rewrite_in_progress == 1) {
1613 			*(int *) data = 100 *
1614 				raidPtr->parity_rewrite_stripes_done /
1615 				raidPtr->Layout.numStripe;
1616 		} else {
1617 			*(int *) data = 100;
1618 		}
1619 		return 0;
1620 
1621 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1622 		rf_check_parityrewrite_status_ext(raidPtr, data);
1623 		return 0;
1624 
1625 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
1626 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1627 			/* This makes no sense on a RAID 0 */
1628 			*(int *) data = 100;
1629 			return 0;
1630 		}
1631 		if (raidPtr->copyback_in_progress == 1) {
1632 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
1633 				raidPtr->Layout.numStripe;
1634 		} else {
1635 			*(int *) data = 100;
1636 		}
1637 		return 0;
1638 
1639 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1640 		rf_check_copyback_status_ext(raidPtr, data);
1641 		return 0;
1642 
1643 	case RAIDFRAME_SET_LAST_UNIT:
1644 		for (column = 0; column < raidPtr->numCol; column++)
1645 			if (raidPtr->Disks[column].status != rf_ds_optimal)
1646 				return EBUSY;
1647 
1648 		for (column = 0; column < raidPtr->numCol; column++) {
1649 			clabel = raidget_component_label(raidPtr, column);
1650 			clabel->last_unit = *(int *)data;
1651 			raidflush_component_label(raidPtr, column);
1652 		}
1653 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1654 		return 0;
1655 
1656 		/* the sparetable daemon calls this to wait for the kernel to
1657 		 * need a spare table. this ioctl does not return until a
1658 		 * spare table is needed. XXX -- calling mpsleep here in the
1659 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1660 		 * -- I should either compute the spare table in the kernel,
1661 		 * or have a different -- XXX XXX -- interface (a different
1662 		 * character device) for delivering the table     -- XXX */
1663 #if RF_DISABLED
1664 	case RAIDFRAME_SPARET_WAIT:
1665 		rf_lock_mutex2(rf_sparet_wait_mutex);
1666 		while (!rf_sparet_wait_queue)
1667 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1668 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
1669 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1670 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1671 
1672 		/* structure assignment */
1673 		*((RF_SparetWait_t *) data) = *waitreq;
1674 
1675 		RF_Free(waitreq, sizeof(*waitreq));
1676 		return 0;
1677 
1678 		/* wakes up a process waiting on SPARET_WAIT and puts an error
1679 		 * code in it that will cause the dameon to exit */
1680 	case RAIDFRAME_ABORT_SPARET_WAIT:
1681 		waitreq = RF_Malloc(sizeof(*waitreq));
1682 		waitreq->fcol = -1;
1683 		rf_lock_mutex2(rf_sparet_wait_mutex);
1684 		waitreq->next = rf_sparet_wait_queue;
1685 		rf_sparet_wait_queue = waitreq;
1686 		rf_broadcast_cond2(rf_sparet_wait_cv);
1687 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1688 		return 0;
1689 
1690 		/* used by the spare table daemon to deliver a spare table
1691 		 * into the kernel */
1692 	case RAIDFRAME_SEND_SPARET:
1693 
1694 		/* install the spare table */
1695 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1696 
1697 		/* respond to the requestor.  the return status of the spare
1698 		 * table installation is passed in the "fcol" field */
1699 		waitred = RF_Malloc(sizeof(*waitreq));
1700 		waitreq->fcol = retcode;
1701 		rf_lock_mutex2(rf_sparet_wait_mutex);
1702 		waitreq->next = rf_sparet_resp_queue;
1703 		rf_sparet_resp_queue = waitreq;
1704 		rf_broadcast_cond2(rf_sparet_resp_cv);
1705 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1706 
1707 		return retcode;
1708 #endif
1709 	default:
1710 		/*
1711 		 * Don't bother trying to load compat modules
1712 		 * if it is not our ioctl. This is more efficient
1713 		 * and makes rump tests not depend on compat code
1714 		 */
1715 		if (IOCGROUP(cmd) != 'r')
1716 			break;
1717 #ifdef _LP64
1718 		if ((l->l_proc->p_flag & PK_32) != 0) {
1719 			module_autoload("compat_netbsd32_raid",
1720 			    MODULE_CLASS_EXEC);
1721 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
1722 			    (rs, cmd, data), enosys(), retcode);
1723 			if (retcode != EPASSTHROUGH)
1724 				return retcode;
1725 		}
1726 #endif
1727 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
1728 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
1729 		    (rs, cmd, data), enosys(), retcode);
1730 		if (retcode != EPASSTHROUGH)
1731 			return retcode;
1732 
1733 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
1734 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
1735 		    (rs, cmd, data), enosys(), retcode);
1736 		if (retcode != EPASSTHROUGH)
1737 			return retcode;
1738 		break; /* fall through to the os-specific code below */
1739 
1740 	}
1741 
1742 	if (!raidPtr->valid)
1743 		return (EINVAL);
1744 
1745 	/*
1746 	 * Add support for "regular" device ioctls here.
1747 	 */
1748 
1749 	switch (cmd) {
1750 	case DIOCGCACHE:
1751 		retcode = rf_get_component_caches(raidPtr, (int *)data);
1752 		break;
1753 
1754 	case DIOCCACHESYNC:
1755 		retcode = rf_sync_component_caches(raidPtr);
1756 		break;
1757 
1758 	default:
1759 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
1760 		break;
1761 	}
1762 
1763 	return (retcode);
1764 
1765 }
1766 
1767 
1768 /* raidinit -- complete the rest of the initialization for the
1769    RAIDframe device.  */
1770 
1771 
/*
 * raidinit: finish bringing up a freshly configured unit -- attach the
 * raid pseudo-device, initialize dk(9)/disk(9) state, allocate the
 * buffer queue, mark the unit usable, and kick off wedge discovery.
 * On config_attach_pseudo() failure the unit is left un-INITED.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds (snprintf truncates silently). */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	/* First-come-first-served queueing, sorted by raw block. */
	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Look for wedges (GPT partitions etc.) on the new disk. */
	dkwedge_discover(&dksc->sc_dkdev);
}
1827 
1828 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1829 /* wake up the daemon & tell it to get us a spare table
1830  * XXX
1831  * the entries in the queues should be tagged with the raidPtr
1832  * so that in the extremely rare case that two recons happen at once,
1833  * we know for which device were requesting a spare table
1834  * XXX
1835  *
1836  * XXX This code is not currently used. GO
1837  */
/*
 * rf_GetSpareTableFromDaemon: queue `req' for the user-space sparetable
 * daemon, sleep until a response appears on rf_sparet_resp_queue, and
 * return the status from the response's fcol field.  The response
 * element (a different allocation than `req') is freed here.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the response; note `req' is reused as a cursor here. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1861 #endif
1862 
1863 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1864  * bp & passes it down.
1865  * any calls originating in the kernel must use non-blocking I/O
1866  * do some extra sanity checking to return "appropriate" error values for
1867  * certain conditions (to make some standard utilities work)
1868  *
1869  * Formerly known as: rf_DoAccessKernel
1870  */
/*
 * raidstart: pump queued buffers into the RAID engine via dk_start().
 * Before doing so, if any component failed recently
 * (numNewFailures > 0), flush updated component labels first.
 *
 * Formerly known as: rf_DoAccessKernel
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Label update must run unlocked; drop and retake the
		 * mutex around it. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* Nothing to start on an unconfigured unit. */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	dk_start(dksc, NULL);
}
1897 
/*
 * raiddoaccess: translate one struct buf into a non-blocking
 * rf_DoAccess() call.  Returns EAGAIN when no openings are available
 * (caller retries later), ENOSPC when the request falls outside the
 * array or is not sector-aligned, else the rf_DoAccess() result.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* Throttle: bail out if all concurrent-IO slots are in use. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	/* pb accounts for a trailing partial sector, if any. */
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): the "1 ||" forces this debug block on
	 * unconditionally; db1_printf compiles away unless debugging is
	 * enabled, so presumably intentional -- confirm. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* Reject requests past the end of the array; the `sum <' terms
	 * catch unsigned wraparound in the addition above. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* Request must be a whole number of sectors. */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume one opening; released again when the IO completes. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
1970 
1971 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
1972 
/*
 * rf_DispatchKernelIO: issue one RF_DiskQueueData_t request to the
 * underlying component via bdev_strategy().  The queue mutex is held
 * on entry (see comment above) and is dropped/retaken around the
 * strategy call.  Completion is signalled through KernelWakeupFunc.
 * Always returns 0; an unknown request type panics.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		/* (note the doubled parentheses -- plain printf, not
		 * the db1_printf(((...))) pattern) */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up bp for the component device; KernelWakeupFunc
		 * will be called with req on completion. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/*
 * Biodone callback for component I/O issued from kernel code
 * (installed by InitBP() via bp->b_iodone).  Accounts trace timing,
 * marks the component failed on I/O error (unless that would exceed
 * the array's fault tolerance), then puts the request on the
 * raidPtr->iodone queue and wakes the raidio thread to finish it.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* InitBP() stashed the originating request in b_private. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* Close out the disk-wait timer started in rf_DispatchKernelIO(). */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2115 
2116 
2117 /*
2118  * initialize a buf structure for doing an I/O in the kernel.
2119  */
2120 static void
2121 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2122        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2123        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2124        struct proc *b_proc)
2125 {
2126 	/* bp->b_flags       = B_PHYS | rw_flag; */
2127 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
2128 	bp->b_oflags = 0;
2129 	bp->b_cflags = 0;
2130 	bp->b_bcount = numSect << logBytesPerSector;
2131 	bp->b_bufsize = bp->b_bcount;
2132 	bp->b_error = 0;
2133 	bp->b_dev = dev;
2134 	bp->b_data = bf;
2135 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2136 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
2137 	if (bp->b_bcount == 0) {
2138 		panic("bp->b_bcount is zero in InitBP!!");
2139 	}
2140 	bp->b_proc = b_proc;
2141 	bp->b_iodone = cbFunc;
2142 	bp->b_private = cbArg;
2143 }
2144 
2145 /*
2146  * Wait interruptibly for an exclusive lock.
2147  *
2148  * XXX
2149  * Several drivers do this; it should be abstracted and made MP-safe.
2150  * (Hmm... where have we seen this warning before :->  GO )
2151  */
2152 static int
2153 raidlock(struct raid_softc *rs)
2154 {
2155 	int     error;
2156 
2157 	error = 0;
2158 	mutex_enter(&rs->sc_mutex);
2159 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2160 		rs->sc_flags |= RAIDF_WANTED;
2161 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2162 		if (error != 0)
2163 			goto done;
2164 	}
2165 	rs->sc_flags |= RAIDF_LOCKED;
2166 done:
2167 	mutex_exit(&rs->sc_mutex);
2168 	return (error);
2169 }
2170 /*
2171  * Unlock and wake up any waiters.
2172  */
2173 static void
2174 raidunlock(struct raid_softc *rs)
2175 {
2176 
2177 	mutex_enter(&rs->sc_mutex);
2178 	rs->sc_flags &= ~RAIDF_LOCKED;
2179 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2180 		rs->sc_flags &= ~RAIDF_WANTED;
2181 		cv_broadcast(&rs->sc_cv);
2182 	}
2183 	mutex_exit(&rs->sc_mutex);
2184 }
2185 
2186 
2187 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
2188 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
2189 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
2190 
/* Byte offset of the component info (label) area on each component. */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
2197 
2198 static daddr_t
2199 rf_component_info_size(unsigned secsize)
2200 {
2201 	daddr_t info_size;
2202 
2203 	KASSERT(secsize);
2204 	if (secsize > RF_COMPONENT_INFO_SIZE)
2205 		info_size = secsize;
2206 	else
2207 		info_size = RF_COMPONENT_INFO_SIZE;
2208 
2209 	return info_size;
2210 }
2211 
2212 static daddr_t
2213 rf_parity_map_offset(RF_Raid_t *raidPtr)
2214 {
2215 	daddr_t map_offset;
2216 
2217 	KASSERT(raidPtr->bytesPerSector);
2218 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2219 		map_offset = raidPtr->bytesPerSector;
2220 	else
2221 		map_offset = RF_COMPONENT_INFO_SIZE;
2222 	map_offset += rf_component_info_offset();
2223 
2224 	return map_offset;
2225 }
2226 
2227 static daddr_t
2228 rf_parity_map_size(RF_Raid_t *raidPtr)
2229 {
2230 	daddr_t map_size;
2231 
2232 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2233 		map_size = raidPtr->bytesPerSector;
2234 	else
2235 		map_size = RF_PARITY_MAP_SIZE;
2236 
2237 	return map_size;
2238 }
2239 
2240 int
2241 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2242 {
2243 	RF_ComponentLabel_t *clabel;
2244 
2245 	clabel = raidget_component_label(raidPtr, col);
2246 	clabel->clean = RF_RAID_CLEAN;
2247 	raidflush_component_label(raidPtr, col);
2248 	return(0);
2249 }
2250 
2251 
2252 int
2253 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2254 {
2255 	RF_ComponentLabel_t *clabel;
2256 
2257 	clabel = raidget_component_label(raidPtr, col);
2258 	clabel->clean = RF_RAID_DIRTY;
2259 	raidflush_component_label(raidPtr, col);
2260 	return(0);
2261 }
2262 
/*
 * Read the on-disk component label of column `col' into the in-core
 * copy kept in raid_cinfo[col].ci_label.  Returns the error from the
 * underlying read, 0 on success.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2272 
/*
 * Return a pointer to the in-core component label for column `col'.
 * Callers modify it in place and then use raidflush_component_label()
 * to push the change to disk.
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2278 
/*
 * Write the in-core component label for column `col' to the component,
 * first stamping it with the array's current modification counter.
 * Returns the error from the underlying write, 0 on success.
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* Keep the parity map's mod counter in lockstep with the label's. */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
2293 
2294 
/*
 * Read the component label stored on `dev'/`b_vp' into *clabel.
 * The label lives in the component info area, whose location and
 * sector-padded size are derived from `secsize'.
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
2304 
/* ARGSUSED */
/*
 * Synchronously read `dsize' bytes at byte offset `offset' from `dev'
 * and copy the first `msize' bytes of the result into `data'.
 * Returns 0 on success or an errno from the I/O.  `b_vp' is only used
 * as a validity check for the component.
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* issue the read and wait for completion */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	/* release the scratch buffer in all cases */
	brelse(bp, 0);
	return(error);
}
2342 
2343 
/*
 * Synchronously write *clabel to the component info area of
 * `dev'/`b_vp'.  Location and sector-padded size of the area are
 * derived from `secsize'.
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
2353 
/* ARGSUSED */
/*
 * Write `msize' bytes from `data' (zero-padded up to `dsize') at byte
 * offset `offset' on `dev'.  If `asyncp' is nonzero the write is
 * issued B_ASYNC and 0 is returned immediately; otherwise we wait for
 * completion and return the I/O's errno.
 *
 * NOTE(review): in the async case the buffer is not biowait()ed or
 * brelse()d here -- presumably it is reclaimed at biodone time; all
 * callers visible in this file pass asyncp == 0.  Confirm before
 * relying on the async path.  `b_vp' is unused.
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* zero-fill so the tail beyond msize is deterministic on disk */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
2388 
2389 void
2390 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2391 {
2392 	int c;
2393 
2394 	for (c = 0; c < raidPtr->numCol; c++) {
2395 		/* Skip dead disks. */
2396 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2397 			continue;
2398 		/* XXXjld: what if an error occurs here? */
2399 		raidwrite_component_area(raidPtr->Disks[c].dev,
2400 		    raidPtr->raid_cinfo[c].ci_vp, map,
2401 		    RF_PARITYMAP_NBYTE,
2402 		    rf_parity_map_offset(raidPtr),
2403 		    rf_parity_map_size(raidPtr), 0);
2404 	}
2405 }
2406 
2407 void
2408 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2409 {
2410 	struct rf_paritymap_ondisk tmp;
2411 	int c,first;
2412 
2413 	first=1;
2414 	for (c = 0; c < raidPtr->numCol; c++) {
2415 		/* Skip dead disks. */
2416 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2417 			continue;
2418 		raidread_component_area(raidPtr->Disks[c].dev,
2419 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
2420 		    RF_PARITYMAP_NBYTE,
2421 		    rf_parity_map_offset(raidPtr),
2422 		    rf_parity_map_size(raidPtr));
2423 		if (first) {
2424 			memcpy(map, &tmp, sizeof(*map));
2425 			first = 0;
2426 		} else {
2427 			rf_paritymap_merge(map, &tmp);
2428 		}
2429 	}
2430 }
2431 
/*
 * Bump the array's modification counter and mark the component label
 * of every live component dirty; also refresh and mark dirty the
 * labels of all in-use spares.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare stands in for.
			   NOTE(review): if no column references this
			   spare, scol keeps its previous value (-1 for
			   the first spare) -- confirm intended. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2491 
2492 
/*
 * Refresh the component labels of all optimal components and all
 * in-use spares: bump mod_counter, record status and the unit we are
 * configured as, and write each label out.  On a final update
 * (final == RF_FINAL_COMPONENT_UPDATE) of an array whose parity is
 * known good, the labels are additionally marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2570 
2571 void
2572 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2573 {
2574 
2575 	if (vp != NULL) {
2576 		if (auto_configured == 1) {
2577 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2578 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2579 			vput(vp);
2580 
2581 		} else {
2582 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2583 		}
2584 	}
2585 }
2586 
2587 
2588 void
2589 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2590 {
2591 	int r,c;
2592 	struct vnode *vp;
2593 	int acd;
2594 
2595 
2596 	/* We take this opportunity to close the vnodes like we should.. */
2597 
2598 	for (c = 0; c < raidPtr->numCol; c++) {
2599 		vp = raidPtr->raid_cinfo[c].ci_vp;
2600 		acd = raidPtr->Disks[c].auto_configured;
2601 		rf_close_component(raidPtr, vp, acd);
2602 		raidPtr->raid_cinfo[c].ci_vp = NULL;
2603 		raidPtr->Disks[c].auto_configured = 0;
2604 	}
2605 
2606 	for (r = 0; r < raidPtr->numSpare; r++) {
2607 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2608 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2609 		rf_close_component(raidPtr, vp, acd);
2610 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2611 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2612 	}
2613 }
2614 
2615 
/*
 * Kernel thread body: fail the component named in `req' and, when
 * RF_FDFLAGS_RECON is set, reconstruct it onto a spare.  The request
 * is freed here; the thread exits when done.
 */
void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* Second arg selects "fail and reconstruct" vs. "just fail". */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* We own the request; free it now that it has been consumed. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2637 
/*
 * Kernel thread body: rewrite all parity for the array via
 * rf_RewriteParity(), record whether parity is now good, wake anyone
 * waiting on a shutdown, and exit.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2670 
2671 
/*
 * Kernel thread body: run rf_CopybackReconstructedData() for the
 * array, maintaining the copyback_in_progress flag around it, then
 * exit.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2686 
2687 
/*
 * Kernel thread body: reconstruct the component in req->col in place
 * via rf_ReconstructInPlace().  The request is freed here; the thread
 * exits when done.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* We own the request; free it now that it has been consumed. */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2705 
/*
 * Try to read a component label from `vp'/`dev'.  If the label is
 * plausible (rf_reasonable_label()) and its recorded partition size
 * fits the device, prepend a new RF_AutoConfig_t to `ac_list' and
 * return the new list head; the entry takes ownership of `vp' and
 * the label.  Otherwise the label memory is freed, the vnode is
 * closed and released, and the list is returned unchanged.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: we reject the component, so release both the
		   label memory and the vnode the caller handed us. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2747 
/*
 * Scan all disk devices in the system for RAIDframe component labels
 * and return a list of RF_AutoConfig_t entries for the plausible ones
 * (built up via rf_get_component()).  Wedges are scanned before whole
 * disks so a wedge is preferred over the raw partition covering it.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* Wedge pass: accept only wedges whose
				   partition type is raidframe. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component() takes ownership of vp. */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
2954 
2955 
2956 int
2957 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
2958 {
2959 
2960 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
2961 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
2962 	    ((clabel->clean == RF_RAID_CLEAN) ||
2963 	     (clabel->clean == RF_RAID_DIRTY)) &&
2964 	    clabel->row >=0 &&
2965 	    clabel->column >= 0 &&
2966 	    clabel->num_rows > 0 &&
2967 	    clabel->num_columns > 0 &&
2968 	    clabel->row < clabel->num_rows &&
2969 	    clabel->column < clabel->num_columns &&
2970 	    clabel->blockSize > 0 &&
2971 	    /*
2972 	     * numBlocksHi may contain garbage, but it is ok since
2973 	     * the type is unsigned.  If it is really garbage,
2974 	     * rf_fix_old_label_size() will fix it.
2975 	     */
2976 	    rf_component_label_numblocks(clabel) > 0) {
2977 		/*
2978 		 * label looks reasonable enough...
2979 		 * let's make sure it has no old garbage.
2980 		 */
2981 		if (numsecs)
2982 			rf_fix_old_label_size(clabel, numsecs);
2983 		return(1);
2984 	}
2985 	return(0);
2986 }
2987 
2988 
2989 /*
2990  * For reasons yet unknown, some old component labels have garbage in
2991  * the newer numBlocksHi region, and this causes lossage.  Since those
2992  * disks will also have numsecs set to less than 32 bits of sectors,
2993  * we can determine when this corruption has occurred, and fix it.
2994  *
2995  * The exact same problem, with the same unknown reason, happens to
2996  * the partitionSizeHi member as well.
2997  */
2998 static void
2999 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3000 {
3001 
3002 	if (numsecs < ((uint64_t)1 << 32)) {
3003 		if (clabel->numBlocksHi) {
3004 			printf("WARNING: total sectors < 32 bits, yet "
3005 			       "numBlocksHi set\n"
3006 			       "WARNING: resetting numBlocksHi to zero.\n");
3007 			clabel->numBlocksHi = 0;
3008 		}
3009 
3010 		if (clabel->partitionSizeHi) {
3011 			printf("WARNING: total sectors < 32 bits, yet "
3012 			       "partitionSizeHi set\n"
3013 			       "WARNING: resetting partitionSizeHi to zero.\n");
3014 			clabel->partitionSizeHi = 0;
3015 		}
3016 	}
3017 }
3018 
3019 
3020 #ifdef DEBUG
/*
 * Debug helper: pretty-print the fields of a component label to the
 * console.  Compiled only under DEBUG.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Index by (root_partition & 3); last slot catches bad values. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
3052 #endif
3053 
/*
 * Partition an autoconfig component list into configuration sets.
 * Each component is moved (its `next' pointer re-threaded) into the
 * first existing set it matches according to rf_does_it_fit(); a
 * component matching no set starts a new one.  Returns the list of
 * sets; the input list's links are consumed in the process.
 */
RF_ConfigSet_t *
rf_create_auto_sets(RF_AutoConfig_t *ac_list)
{
	RF_AutoConfig_t *ac;
	RF_ConfigSet_t *config_sets;
	RF_ConfigSet_t *cset;
	RF_AutoConfig_t *ac_next;


	config_sets = NULL;

	/* Go through the AutoConfig list, and figure out which components
	   belong to what sets.  */
	ac = ac_list;
	while(ac!=NULL) {
		/* we're going to putz with ac->next, so save it here
		   for use at the end of the loop */
		ac_next = ac->next;

		if (config_sets == NULL) {
			/* will need at least this one... */
			config_sets = malloc(sizeof(RF_ConfigSet_t),
				       M_RAIDFRAME, M_WAITOK);
			/* this one is easy :) */
			config_sets->ac = ac;
			config_sets->next = NULL;
			config_sets->rootable = 0;
			ac->next = NULL;
		} else {
			/* which set does this component fit into? */
			cset = config_sets;
			while(cset!=NULL) {
				if (rf_does_it_fit(cset, ac)) {
					/* looks like it matches... */
					ac->next = cset->ac;
					cset->ac = ac;
					break;
				}
				cset = cset->next;
			}
			if (cset==NULL) {
				/* didn't find a match above... new set..*/
				cset = malloc(sizeof(RF_ConfigSet_t),
					       M_RAIDFRAME, M_WAITOK);
				cset->ac = ac;
				ac->next = NULL;
				cset->next = config_sets;
				cset->rootable = 0;
				config_sets = cset;
			}
		}
		ac = ac_next;
	}


	return(config_sets);
}
3111 
3112 static int
3113 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3114 {
3115 	RF_ComponentLabel_t *clabel1, *clabel2;
3116 
3117 	/* If this one matches the *first* one in the set, that's good
3118 	   enough, since the other members of the set would have been
3119 	   through here too... */
3120 	/* note that we are not checking partitionSize here..
3121 
3122 	   Note that we are also not checking the mod_counters here.
3123 	   If everything else matches except the mod_counter, that's
3124 	   good enough for this test.  We will deal with the mod_counters
3125 	   a little later in the autoconfiguration process.
3126 
3127 	    (clabel1->mod_counter == clabel2->mod_counter) &&
3128 
3129 	   The reason we don't check for this is that failed disks
3130 	   will have lower modification counts.  If those disks are
3131 	   not added to the set they used to belong to, then they will
3132 	   form their own set, which may result in 2 different sets,
3133 	   for example, competing to be configured at raid0, and
3134 	   perhaps competing to be the root filesystem set.  If the
3135 	   wrong ones get configured, or both attempt to become /,
3136 	   weird behaviour and or serious lossage will occur.  Thus we
3137 	   need to bring them into the fold here, and kick them out at
3138 	   a later point.
3139 
3140 	*/
3141 
3142 	clabel1 = cset->ac->clabel;
3143 	clabel2 = ac->clabel;
3144 	if ((clabel1->version == clabel2->version) &&
3145 	    (clabel1->serial_number == clabel2->serial_number) &&
3146 	    (clabel1->num_rows == clabel2->num_rows) &&
3147 	    (clabel1->num_columns == clabel2->num_columns) &&
3148 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
3149 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3150 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3151 	    (clabel1->parityConfig == clabel2->parityConfig) &&
3152 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3153 	    (clabel1->blockSize == clabel2->blockSize) &&
3154 	    rf_component_label_numblocks(clabel1) ==
3155 	    rf_component_label_numblocks(clabel2) &&
3156 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
3157 	    (clabel1->root_partition == clabel2->root_partition) &&
3158 	    (clabel1->last_unit == clabel2->last_unit) &&
3159 	    (clabel1->config_order == clabel2->config_order)) {
3160 		/* if it get's here, it almost *has* to be a match */
3161 	} else {
3162 		/* it's not consistent with somebody in the set..
3163 		   punt */
3164 		return(0);
3165 	}
3166 	/* all was fine.. it must fit... */
3167 	return(1);
3168 }
3169 
/*
 * Check that config set 'cset' still has enough live components to be
 * configured.  The "live" components are those whose mod_counter equals
 * the highest mod_counter seen in the set.  Returns 1 if the set can be
 * configured, 0 if too many components are missing.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;		/* columns the set is supposed to have */
	int num_missing;	/* live components not found */
	int mod_counter;	/* highest mod_counter in the set */
	int mod_counter_found;
	int even_pair_failed;	/* RAID 1: even half of current pair missing */
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* the authoritative mod_counter is the maximum over all members:
	   failed/stale disks carry lower counts */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* for each expected column, look for a member with that column
	   number AND the authoritative mod_counter */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd component of a
				   mirror pair without bailing.. reset the
				   even_pair_failed flag for the next
				   pair, and go on to the next
				   component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3272 
3273 void
3274 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3275 			RF_Raid_t *raidPtr)
3276 {
3277 	RF_ComponentLabel_t *clabel;
3278 	int i;
3279 
3280 	clabel = ac->clabel;
3281 
3282 	/* 1. Fill in the common stuff */
3283 	config->numCol = clabel->num_columns;
3284 	config->numSpare = 0; /* XXX should this be set here? */
3285 	config->sectPerSU = clabel->sectPerSU;
3286 	config->SUsPerPU = clabel->SUsPerPU;
3287 	config->SUsPerRU = clabel->SUsPerRU;
3288 	config->parityConfig = clabel->parityConfig;
3289 	/* XXX... */
3290 	strcpy(config->diskQueueType,"fifo");
3291 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3292 	config->layoutSpecificSize = 0; /* XXX ?? */
3293 
3294 	while(ac!=NULL) {
3295 		/* row/col values will be in range due to the checks
3296 		   in reasonable_label() */
3297 		strcpy(config->devnames[0][ac->clabel->column],
3298 		       ac->devname);
3299 		ac = ac->next;
3300 	}
3301 
3302 	for(i=0;i<RF_MAXDBGV;i++) {
3303 		config->debugVars[i][0] = 0;
3304 	}
3305 }
3306 
3307 int
3308 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3309 {
3310 	RF_ComponentLabel_t *clabel;
3311 	int column;
3312 	int sparecol;
3313 
3314 	raidPtr->autoconfigure = new_value;
3315 
3316 	for(column=0; column<raidPtr->numCol; column++) {
3317 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3318 			clabel = raidget_component_label(raidPtr, column);
3319 			clabel->autoconfigure = new_value;
3320 			raidflush_component_label(raidPtr, column);
3321 		}
3322 	}
3323 	for(column = 0; column < raidPtr->numSpare ; column++) {
3324 		sparecol = raidPtr->numCol + column;
3325 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3326 			clabel = raidget_component_label(raidPtr, sparecol);
3327 			clabel->autoconfigure = new_value;
3328 			raidflush_component_label(raidPtr, sparecol);
3329 		}
3330 	}
3331 	return(new_value);
3332 }
3333 
3334 int
3335 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3336 {
3337 	RF_ComponentLabel_t *clabel;
3338 	int column;
3339 	int sparecol;
3340 
3341 	raidPtr->root_partition = new_value;
3342 	for(column=0; column<raidPtr->numCol; column++) {
3343 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3344 			clabel = raidget_component_label(raidPtr, column);
3345 			clabel->root_partition = new_value;
3346 			raidflush_component_label(raidPtr, column);
3347 		}
3348 	}
3349 	for(column = 0; column < raidPtr->numSpare ; column++) {
3350 		sparecol = raidPtr->numCol + column;
3351 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3352 			clabel = raidget_component_label(raidPtr, sparecol);
3353 			clabel->root_partition = new_value;
3354 			raidflush_component_label(raidPtr, sparecol);
3355 		}
3356 	}
3357 	return(new_value);
3358 }
3359 
3360 void
3361 rf_release_all_vps(RF_ConfigSet_t *cset)
3362 {
3363 	RF_AutoConfig_t *ac;
3364 
3365 	ac = cset->ac;
3366 	while(ac!=NULL) {
3367 		/* Close the vp, and give it back */
3368 		if (ac->vp) {
3369 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3370 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3371 			vput(ac->vp);
3372 			ac->vp = NULL;
3373 		}
3374 		ac = ac->next;
3375 	}
3376 }
3377 
3378 
3379 void
3380 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3381 {
3382 	RF_AutoConfig_t *ac;
3383 	RF_AutoConfig_t *next_ac;
3384 
3385 	ac = cset->ac;
3386 	while(ac!=NULL) {
3387 		next_ac = ac->next;
3388 		/* nuke the label */
3389 		free(ac->clabel, M_RAIDFRAME);
3390 		/* cleanup the config structure */
3391 		free(ac, M_RAIDFRAME);
3392 		/* "next.." */
3393 		ac = next_ac;
3394 	}
3395 	/* and, finally, nuke the config set */
3396 	free(cset, M_RAIDFRAME);
3397 }
3398 
3399 
/*
 * Initialize a component label from the current state of the RAID set.
 * The caller fills in per-component fields (column, partitionSize, ...);
 * this routine fills in everything that is common to the whole set.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	/* parity-map fields are filled in by the parity map code */
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3432 
/*
 * Auto-configure the RAID set described by 'cset': pick a unit number,
 * build a configuration from the component labels, and configure the
 * set.  Returns the softc of the configured set, or NULL if
 * rf_Configure() failed.  Also records the set's root eligibility in
 * cset->rootable.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* start at the unit recorded in the label; walk upward past any
	   units that already hold a valid (configured) set */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* no softc at that unit yet: create one */
	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: release the unit we grabbed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3504 
3505 void
3506 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
3507 	     size_t xmin, size_t xmax)
3508 {
3509 	int error;
3510 
3511 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
3512 	pool_sethiwat(p, xmax);
3513 	if ((error = pool_prime(p, xmin)) != 0)
3514 		panic("%s: failed to prime pool: %d", __func__, error);
3515 	pool_setlowat(p, xmin);
3516 }
3517 
3518 /*
3519  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3520  * to see if there is IO pending and if that IO could possibly be done
3521  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
3522  * otherwise.
3523  *
3524  */
3525 int
3526 rf_buf_queue_check(RF_Raid_t *raidPtr)
3527 {
3528 	struct raid_softc *rs;
3529 	struct dk_softc *dksc;
3530 
3531 	rs = raidPtr->softc;
3532 	dksc = &rs->sc_dksc;
3533 
3534 	if ((rs->sc_flags & RAIDF_INITED) == 0)
3535 		return 1;
3536 
3537 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3538 		/* there is work to do */
3539 		return 0;
3540 	}
3541 	/* default is nothing to do */
3542 	return 1;
3543 }
3544 
3545 int
3546 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3547 {
3548 	uint64_t numsecs;
3549 	unsigned secsize;
3550 	int error;
3551 
3552 	error = getdisksize(vp, &numsecs, &secsize);
3553 	if (error == 0) {
3554 		diskPtr->blockSize = secsize;
3555 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
3556 		diskPtr->partitionSize = numsecs;
3557 		return 0;
3558 	}
3559 	return error;
3560 }
3561 
/*
 * Autoconfiguration match routine: raid(4) pseudo-device instances
 * always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3567 
/*
 * Autoconfiguration attach routine.  Nothing to do here: all real
 * setup happens when the unit is configured (see raidinit()).
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
3572 
3573 
3574 static int
3575 raid_detach(device_t self, int flags)
3576 {
3577 	int error;
3578 	struct raid_softc *rs = raidsoftc(self);
3579 
3580 	if (rs == NULL)
3581 		return ENXIO;
3582 
3583 	if ((error = raidlock(rs)) != 0)
3584 		return (error);
3585 
3586 	error = raid_detach_unlocked(rs);
3587 
3588 	raidunlock(rs);
3589 
3590 	/* XXX raid can be referenced here */
3591 
3592 	if (error)
3593 		return error;
3594 
3595 	/* Free the softc */
3596 	raidput(rs);
3597 
3598 	return 0;
3599 }
3600 
/*
 * Derive a (synthetic) disk geometry for the RAID set and register it
 * with the disk subsystem.  Fields not set here remain zero from the
 * memset; presumably disk_set_info() computes the rest -- verify.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* fabricated track count -- geometry is fake for a RAID set */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
3616 
3617 /*
3618  * Get cache info for all the components (including spares).
3619  * Returns intersection of all the cache flags of all disks, or first
3620  * error if any encountered.
3621  * XXXfua feature flags can change as spares are added - lock down somehow
3622  */
static int
rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
{
	int c;
	int error;
	int dkwhole = 0, dkpart;

	/* walk data columns and spares alike */
	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
		/*
		 * Check any non-dead disk, even when currently being
		 * reconstructed.
		 */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
			if (error) {
				/* ENODEV (no cache ioctl support) is
				   returned but not logged */
				if (error != ENODEV) {
					printf("raid%d: get cache for component %s failed\n",
					    raidPtr->raidid,
					    raidPtr->Disks[c].devname);
				}

				return error;
			}

			/* combine: the set only has a feature if every
			   component has it */
			if (c == 0)
				dkwhole = dkpart;
			else
				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
		}
	}

	*data = dkwhole;

	return 0;
}
3660 
3661 /*
3662  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3663  * We end up returning whatever error was returned by the first cache flush
3664  * that fails.
3665  */
3666 
3667 int
3668 rf_sync_component_caches(RF_Raid_t *raidPtr)
3669 {
3670 	int c, sparecol;
3671 	int e,error;
3672 	int force = 1;
3673 
3674 	error = 0;
3675 	for (c = 0; c < raidPtr->numCol; c++) {
3676 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
3677 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3678 					  &force, FWRITE, NOCRED);
3679 			if (e) {
3680 				if (e != ENODEV)
3681 					printf("raid%d: cache flush to component %s failed.\n",
3682 					       raidPtr->raidid, raidPtr->Disks[c].devname);
3683 				if (error == 0) {
3684 					error = e;
3685 				}
3686 			}
3687 		}
3688 	}
3689 
3690 	for( c = 0; c < raidPtr->numSpare ; c++) {
3691 		sparecol = raidPtr->numCol + c;
3692 		/* Need to ensure that the reconstruct actually completed! */
3693 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3694 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3695 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
3696 			if (e) {
3697 				if (e != ENODEV)
3698 					printf("raid%d: cache flush to component %s failed.\n",
3699 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3700 				if (error == 0) {
3701 					error = e;
3702 				}
3703 			}
3704 		}
3705 	}
3706 	return error;
3707 }
3708 
3709 /* Fill in info with the current status */
3710 void
3711 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3712 {
3713 
3714 	if (raidPtr->status != rf_rs_reconstructing) {
3715 		info->total = 100;
3716 		info->completed = 100;
3717 	} else {
3718 		info->total = raidPtr->reconControl->numRUsTotal;
3719 		info->completed = raidPtr->reconControl->numRUsComplete;
3720 	}
3721 	info->remaining = info->total - info->completed;
3722 }
3723 
3724 /* Fill in info with the current status */
3725 void
3726 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3727 {
3728 
3729 	if (raidPtr->parity_rewrite_in_progress == 1) {
3730 		info->total = raidPtr->Layout.numStripe;
3731 		info->completed = raidPtr->parity_rewrite_stripes_done;
3732 	} else {
3733 		info->completed = 100;
3734 		info->total = 100;
3735 	}
3736 	info->remaining = info->total - info->completed;
3737 }
3738 
3739 /* Fill in info with the current status */
3740 void
3741 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3742 {
3743 
3744 	if (raidPtr->copyback_in_progress == 1) {
3745 		info->total = raidPtr->Layout.numStripe;
3746 		info->completed = raidPtr->copyback_stripes_done;
3747 		info->remaining = info->total - info->completed;
3748 	} else {
3749 		info->remaining = 0;
3750 		info->completed = 100;
3751 		info->total = 100;
3752 	}
3753 }
3754 
3755 /* Fill in config with the current info */
3756 int
3757 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
3758 {
3759 	int	d, i, j;
3760 
3761 	if (!raidPtr->valid)
3762 		return (ENODEV);
3763 	config->cols = raidPtr->numCol;
3764 	config->ndevs = raidPtr->numCol;
3765 	if (config->ndevs >= RF_MAX_DISKS)
3766 		return (ENOMEM);
3767 	config->nspares = raidPtr->numSpare;
3768 	if (config->nspares >= RF_MAX_DISKS)
3769 		return (ENOMEM);
3770 	config->maxqdepth = raidPtr->maxQueueDepth;
3771 	d = 0;
3772 	for (j = 0; j < config->cols; j++) {
3773 		config->devs[d] = raidPtr->Disks[j];
3774 		d++;
3775 	}
3776 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
3777 		config->spares[i] = raidPtr->Disks[j];
3778 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
3779 			/* XXX: raidctl(8) expects to see this as a used spare */
3780 			config->spares[i].status = rf_ds_used_spare;
3781 		}
3782 	}
3783 	return 0;
3784 }
3785 
3786 int
3787 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
3788 {
3789 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
3790 	RF_ComponentLabel_t *raid_clabel;
3791 	int column = clabel->column;
3792 
3793 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
3794 		return EINVAL;
3795 	raid_clabel = raidget_component_label(raidPtr, column);
3796 	memcpy(clabel, raid_clabel, sizeof *clabel);
3797 
3798 	return 0;
3799 }
3800 
3801 /*
3802  * Module interface
3803  */
3804 
3805 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
3806 
3807 #ifdef _MODULE
3808 CFDRIVER_DECL(raid, DV_DISK, NULL);
3809 #endif
3810 
3811 static int raid_modcmd(modcmd_t, void *);
3812 static int raid_modcmd_init(void);
3813 static int raid_modcmd_fini(void);
3814 
3815 static int
3816 raid_modcmd(modcmd_t cmd, void *data)
3817 {
3818 	int error;
3819 
3820 	error = 0;
3821 	switch (cmd) {
3822 	case MODULE_CMD_INIT:
3823 		error = raid_modcmd_init();
3824 		break;
3825 	case MODULE_CMD_FINI:
3826 		error = raid_modcmd_fini();
3827 		break;
3828 	default:
3829 		error = ENOTTY;
3830 		break;
3831 	}
3832 	return error;
3833 }
3834 
/*
 * Module initialization: set up locks, attach the device switch and
 * autoconf glue, boot RAIDframe, and register the autoconfig
 * finalizer.  On failure each step unwinds the ones before it.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach pick the majors; EEXIST is tolerated for
	   the built-in (non-module) case where they are preassigned */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* unwind: detach the devsw attached above */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* unwind: detach cfdriver and devsw in reverse order */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is known to be 0 or EEXIST here; boot RAIDframe proper */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* non-fatal: autoconfig just won't happen automatically */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
3905 
/*
 * Module teardown: refuse to unload while raid devices exist, then
 * detach the autoconf glue and devsw in reverse order of attachment,
 * re-attaching on failure so the module stays usable.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* roll back the cfattach detach above */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* roll back both earlier detaches */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
3955