1 /*	$NetBSD: rf_netbsdkintf.c,v 1.412 2023/06/15 09:15:54 hannken Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Greg Oster; Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1988 University of Utah.
34  * Copyright (c) 1990, 1993
35  *      The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed to Berkeley by
38  * the Systems Programming Group of the University of Utah Computer
39  * Science Department.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
66  *
67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
68  */
69 
70 /*
71  * Copyright (c) 1995 Carnegie-Mellon University.
72  * All rights reserved.
73  *
74  * Authors: Mark Holland, Jim Zelenka
75  *
76  * Permission to use, copy, modify and distribute this software and
77  * its documentation is hereby granted, provided that both the copyright
78  * notice and this permission notice appear in all copies of the
79  * software, derivative works or modified versions, and any portions
80  * thereof, and that both notices appear in supporting documentation.
81  *
82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85  *
86  * Carnegie Mellon requests users of this software to return to
87  *
88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
89  *  School of Computer Science
90  *  Carnegie Mellon University
91  *  Pittsburgh PA 15213-3890
92  *
93  * any improvements or extensions that they make and grant Carnegie the
94  * rights to redistribute these changes.
95  */
96 
97 /***********************************************************
98  *
99  * rf_kintf.c -- the kernel interface routines for RAIDframe
100  *
101  ***********************************************************/
102 
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.412 2023/06/15 09:15:54 hannken Exp $");
105 
106 #ifdef _KERNEL_OPT
107 #include "opt_raid_autoconfig.h"
108 #include "opt_compat_netbsd32.h"
109 #endif
110 
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130 #include <sys/compat_stub.h>
131 
132 #include <prop/proplib.h>
133 
134 #include <dev/raidframe/raidframevar.h>
135 #include <dev/raidframe/raidframeio.h>
136 #include <dev/raidframe/rf_paritymap.h>
137 
138 #include "rf_raid.h"
139 #include "rf_copyback.h"
140 #include "rf_dag.h"
141 #include "rf_dagflags.h"
142 #include "rf_desc.h"
143 #include "rf_diskqueue.h"
144 #include "rf_etimer.h"
145 #include "rf_general.h"
146 #include "rf_kintf.h"
147 #include "rf_options.h"
148 #include "rf_driver.h"
149 #include "rf_parityscan.h"
150 #include "rf_threadstuff.h"
151 
152 #include "ioconf.h"
153 
154 #ifdef DEBUG
155 int     rf_kdebug_level = 0;
156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
157 #else				/* DEBUG */
158 #define db1_printf(a) { }
159 #endif				/* DEBUG */
160 
161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
162 static rf_declare_mutex2(rf_sparet_wait_mutex);
163 static rf_declare_cond2(rf_sparet_wait_cv);
164 static rf_declare_cond2(rf_sparet_resp_cv);
165 
166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
167 						 * spare table */
168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
169 						 * installation process */
170 #endif
171 
172 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
173 
174 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
175 
176 /* prototypes */
177 static void KernelWakeupFunc(struct buf *);
178 static void InitBP(struct buf *, struct vnode *, unsigned,
179     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
180     void *, int);
181 static void raidinit(struct raid_softc *);
182 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
183 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
184 
185 static int raid_match(device_t, cfdata_t, void *);
186 static void raid_attach(device_t, device_t, void *);
187 static int raid_detach(device_t, int);
188 
189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
190     daddr_t, daddr_t);
191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
192     daddr_t, daddr_t);
193 
194 static int raidwrite_component_label(unsigned,
195     dev_t, struct vnode *, RF_ComponentLabel_t *);
196 static int raidread_component_label(unsigned,
197     dev_t, struct vnode *, RF_ComponentLabel_t *);
198 
199 static int raid_diskstart(device_t, struct buf *bp);
200 static int raid_dumpblocks(device_t, void *, daddr_t, int);
201 static int raid_lastclose(device_t);
202 
203 static dev_type_open(raidopen);
204 static dev_type_close(raidclose);
205 static dev_type_read(raidread);
206 static dev_type_write(raidwrite);
207 static dev_type_ioctl(raidioctl);
208 static dev_type_strategy(raidstrategy);
209 static dev_type_dump(raiddump);
210 static dev_type_size(raidsize);
211 
212 const struct bdevsw raid_bdevsw = {
213 	.d_open = raidopen,
214 	.d_close = raidclose,
215 	.d_strategy = raidstrategy,
216 	.d_ioctl = raidioctl,
217 	.d_dump = raiddump,
218 	.d_psize = raidsize,
219 	.d_discard = nodiscard,
220 	.d_flag = D_DISK
221 };
222 
223 const struct cdevsw raid_cdevsw = {
224 	.d_open = raidopen,
225 	.d_close = raidclose,
226 	.d_read = raidread,
227 	.d_write = raidwrite,
228 	.d_ioctl = raidioctl,
229 	.d_stop = nostop,
230 	.d_tty = notty,
231 	.d_poll = nopoll,
232 	.d_mmap = nommap,
233 	.d_kqfilter = nokqfilter,
234 	.d_discard = nodiscard,
235 	.d_flag = D_DISK
236 };
237 
238 static struct dkdriver rf_dkdriver = {
239 	.d_open = raidopen,
240 	.d_close = raidclose,
241 	.d_strategy = raidstrategy,
242 	.d_diskstart = raid_diskstart,
243 	.d_dumpblocks = raid_dumpblocks,
244 	.d_lastclose = raid_lastclose,
245 	.d_minphys = minphys
246 };
247 
248 #define	raidunit(x)	DISKUNIT(x)
249 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
250 
251 extern struct cfdriver raid_cd;
252 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
253     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
254     DVF_DETACH_SHUTDOWN);
255 
256 /* Internal representation of a rf_recon_req */
257 struct rf_recon_req_internal {
258 	RF_RowCol_t col;
259 	RF_ReconReqFlags_t flags;
260 	void   *raidPtr;
261 };
262 
263 /*
264  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
265  * Be aware that large numbers can allow the driver to consume a lot of
266  * kernel memory, especially on writes, and in degraded mode reads.
267  *
268  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
269  * a single 64K write will typically require 64K for the old data,
270  * 64K for the old parity, and 64K for the new parity, for a total
271  * of 192K (if the parity buffer is not re-used immediately).
272  * Even if it is used immediately, that's still 128K, which when multiplied
273  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
274  *
275  * Now in degraded mode, for example, a 64K read on the above setup may
276  * require data reconstruction, which will require *all* of the 4 remaining
277  * disks to participate -- 4 * 32K/disk == 128K again.
278  */
279 
280 #ifndef RAIDOUTSTANDING
281 #define RAIDOUTSTANDING   6
282 #endif
283 
284 #define RAIDLABELDEV(dev)	\
285 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
286 
287 /* declared here, and made public, for the benefit of KVM stuff.. */
288 
289 static int raidlock(struct raid_softc *);
290 static void raidunlock(struct raid_softc *);
291 
292 static int raid_detach_unlocked(struct raid_softc *);
293 
294 static void rf_markalldirty(RF_Raid_t *);
295 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
296 
297 static void rf_ReconThread(struct rf_recon_req_internal *);
298 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
299 static void rf_CopybackThread(RF_Raid_t *raidPtr);
300 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
301 static int rf_autoconfig(device_t);
302 static int rf_rescan(void);
303 static void rf_buildroothack(RF_ConfigSet_t *);
304 
305 static RF_AutoConfig_t *rf_find_raid_components(void);
306 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
307 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
308 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
309 static int rf_set_autoconfig(RF_Raid_t *, int);
310 static int rf_set_rootpartition(RF_Raid_t *, int);
311 static void rf_release_all_vps(RF_ConfigSet_t *);
312 static void rf_cleanup_config_set(RF_ConfigSet_t *);
313 static int rf_have_enough_components(RF_ConfigSet_t *);
314 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
315 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
316 
317 /*
318  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
319  * Note that this is overridden by having RAID_AUTOCONFIG as an option
320  * in the kernel config file.
321  */
322 #ifdef RAID_AUTOCONFIG
323 int raidautoconfig = 1;
324 #else
325 int raidautoconfig = 0;
326 #endif
327 static bool raidautoconfigdone = false;
328 
329 struct pool rf_alloclist_pool;   /* AllocList */
330 
331 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
332 static kmutex_t raid_lock;
333 
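/* Allocate and initialize a fresh softc for RAID unit `unit'. */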
334 static struct raid_softc *
335 raidcreate(int unit) {
336 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
337 	sc->sc_unit = unit;
338 	cv_init(&sc->sc_cv, "raidunit");
339 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
340 	return sc;
341 }
342 
343 static void
344 raiddestroy(struct raid_softc *sc) {
345 	cv_destroy(&sc->sc_cv);
346 	mutex_destroy(&sc->sc_mutex);
347 	kmem_free(sc, sizeof(*sc));
348 }
349 
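/* Look up the softc for `unit' in the global list; if it is absent and
   `create' is set, allocate a new one and insert it. */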
350 static struct raid_softc *
351 raidget(int unit, bool create) {
352 	struct raid_softc *sc;
353 	if (unit < 0) {
354 #ifdef DIAGNOSTIC
355 		panic("%s: unit %d!", __func__, unit);
356 #endif
357 		return NULL;
358 	}
359 	mutex_enter(&raid_lock);
360 	LIST_FOREACH(sc, &raids, sc_link) {
361 		if (sc->sc_unit == unit) {
362 			mutex_exit(&raid_lock);
363 			return sc;
364 		}
365 	}
366 	mutex_exit(&raid_lock);
367 	if (!create)
368 		return NULL;
369 	sc = raidcreate(unit);
370 	mutex_enter(&raid_lock);
371 	LIST_INSERT_HEAD(&raids, sc, sc_link);
372 	mutex_exit(&raid_lock);
373 	return sc;
374 }
375 
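/* Unlink a softc from the global list and free it. */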
376 static void
377 raidput(struct raid_softc *sc) {
378 	mutex_enter(&raid_lock);
379 	LIST_REMOVE(sc, sc_link);
380 	mutex_exit(&raid_lock);
381 	raiddestroy(sc);
382 }
383 
384 void
385 raidattach(int num)
386 {
387 
388 	/*
389 	 * Device attachment and associated initialization now occurs
390 	 * as part of the module initialization.
391 	 */
392 }
393 
394 static int
395 rf_autoconfig(device_t self)
396 {
397 	RF_AutoConfig_t *ac_list;
398 	RF_ConfigSet_t *config_sets;
399 
400 	if (!raidautoconfig || raidautoconfigdone == true)
401 		return 0;
402 
403 	/* XXX This code can only be run once. */
404 	raidautoconfigdone = true;
405 
406 #ifdef __HAVE_CPU_BOOTCONF
407 	/*
408 	 * 0. find the boot device if needed first so we can use it later
409 	 * this needs to be done before we autoconfigure any raid sets,
410 	 * because if we use wedges we are not going to be able to open
411 	 * the boot device later
412 	 */
413 	if (booted_device == NULL)
414 		cpu_bootconf();
415 #endif
416 	/* 1. locate all RAID components on the system */
417 	aprint_debug("Searching for RAID components...\n");
418 	ac_list = rf_find_raid_components();
419 
420 	/* 2. Sort them into their respective sets. */
421 	config_sets = rf_create_auto_sets(ac_list);
422 
423 	/*
424 	 * 3. Evaluate each set and configure the valid ones.
425 	 * This gets done in rf_buildroothack().
426 	 */
427 	rf_buildroothack(config_sets);
428 
429 	return 1;
430 }
431 
432 int
433 rf_inited(const struct raid_softc *rs) {
434 	return (rs->sc_flags & RAIDF_INITED) != 0;
435 }
436 
437 RF_Raid_t *
438 rf_get_raid(struct raid_softc *rs) {
439 	return &rs->sc_r;
440 }
441 
442 int
443 rf_get_unit(const struct raid_softc *rs) {
444 	return rs->sc_unit;
445 }
446 
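/* Return non-zero if the set described by `r' contains the boot device
   `bdv'.  Wedge components (dkN) are matched via their parent device. */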
447 static int
448 rf_containsboot(RF_Raid_t *r, device_t bdv) {
449 	const char *bootname;
450 	size_t len;
451 
452 	/* if bdv is NULL, the set can't contain it. exit early. */
453 	if (bdv == NULL)
454 		return 0;
455 
456 	bootname = device_xname(bdv);
457 	len = strlen(bootname);
458 
459 	for (int col = 0; col < r->numCol; col++) {
460 		const char *devname = r->Disks[col].devname;
461 		devname += sizeof("/dev/") - 1;
462 		if (strncmp(devname, "dk", 2) == 0) {
463 			const char *parent =
464 			    dkwedge_get_parent_name(r->Disks[col].dev);
465 			if (parent != NULL)
466 				devname = parent;
467 		}
468 		if (strncmp(devname, bootname, len) == 0) {
469 			struct raid_softc *sc = r->softc;
470 			aprint_debug("raid%d includes boot device %s\n",
471 			    sc->sc_unit, devname);
472 			return 1;
473 		}
474 	}
475 	return 0;
476 }
477 
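/* Scan for RAID components and autoconfigure every complete set that is
   marked for autoconfiguration.  Iterate until no new set appears, so
   that RAID sets stacked on top of other RAID sets are found as well. */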
478 static int
479 rf_rescan(void)
480 {
481 	RF_AutoConfig_t *ac_list;
482 	RF_ConfigSet_t *config_sets, *cset, *next_cset;
483 	struct raid_softc *sc;
484 	int raid_added;
485 
486 	ac_list = rf_find_raid_components();
487 	config_sets = rf_create_auto_sets(ac_list);
488 
489 	raid_added = 1;
490 	while (raid_added > 0) {
491 		raid_added = 0;
492 		cset = config_sets;
493 		while (cset != NULL) {
494 			next_cset = cset->next;
495 			if (rf_have_enough_components(cset) &&
496 			    cset->ac->clabel->autoconfigure == 1) {
497 				sc = rf_auto_config_set(cset);
498 				if (sc != NULL) {
499 					aprint_debug("raid%d: configured ok, rootable %d\n",
500 						     sc->sc_unit, cset->rootable);
501 					/* We added one RAID set */
502 					raid_added++;
503 				} else {
504 					/* The autoconfig didn't work :( */
505 					aprint_debug("Autoconfig failed\n");
506 					rf_release_all_vps(cset);
507 				}
508 			} else {
509 				/* we're not autoconfiguring this set...
510 				   release the associated resources */
511 				rf_release_all_vps(cset);
512 			}
513 			/* cleanup */
514 			rf_cleanup_config_set(cset);
515 			cset = next_cset;
516 		}
517 		if (raid_added > 0) {
518 			/* We added at least one RAID set, so re-scan for recursive RAID */
519 			ac_list = rf_find_raid_components();
520 			config_sets = rf_create_auto_sets(ac_list);
521 		}
522 	}
523 
524 	return 0;
525 }
526 
527 
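/* Autoconfigure all eligible sets (the same loop as rf_rescan()) and,
   unless the user has hardwired a root device, try to nominate a
   rootable set as booted_device. */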
528 static void
529 rf_buildroothack(RF_ConfigSet_t *config_sets)
530 {
531 	RF_AutoConfig_t *ac_list;
532 	RF_ConfigSet_t *cset;
533 	RF_ConfigSet_t *next_cset;
534 	int num_root;
535 	int raid_added;
536 	struct raid_softc *sc, *rsc;
537 	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */
538 
539 	sc = rsc = NULL;
540 	num_root = 0;
541 
542 	raid_added = 1;
543 	while (raid_added > 0) {
544 		raid_added = 0;
545 		cset = config_sets;
546 		while (cset != NULL) {
547 			next_cset = cset->next;
548 			if (rf_have_enough_components(cset) &&
549 			    cset->ac->clabel->autoconfigure == 1) {
550 				sc = rf_auto_config_set(cset);
551 				if (sc != NULL) {
552 					aprint_debug("raid%d: configured ok, rootable %d\n",
553 						     sc->sc_unit, cset->rootable);
554 					/* We added one RAID set */
555 					raid_added++;
556 					if (cset->rootable) {
557 						rsc = sc;
558 						num_root++;
559 					}
560 				} else {
561 					/* The autoconfig didn't work :( */
562 					aprint_debug("Autoconfig failed\n");
563 					rf_release_all_vps(cset);
564 				}
565 			} else {
566 				/* we're not autoconfiguring this set...
567 				   release the associated resources */
568 				rf_release_all_vps(cset);
569 			}
570 			/* cleanup */
571 			rf_cleanup_config_set(cset);
572 			cset = next_cset;
573 		}
574 		if (raid_added > 0) {
575 			/* We added at least one RAID set, so re-scan for recursive RAID */
576 			ac_list = rf_find_raid_components();
577 			config_sets = rf_create_auto_sets(ac_list);
578 		}
579 	}
580 
581 	/* if the user has specified what the root device should be
582 	   then we don't touch booted_device or boothowto... */
583 
584 	if (rootspec != NULL) {
585 		aprint_debug("%s: rootspec %s\n", __func__, rootspec);
586 		return;
587 	}
588 
589 	/* we found something bootable... */
590 
591 	/*
592 	 * XXX: The following code assumes that the root raid
593 	 * is the first ('a') partition. This is about the best
594 	 * we can do with a BSD disklabel, but we might be able
595 	 * to do better with a GPT label, by setting a specified
596 	 * attribute to indicate the root partition. We can then
597 	 * stash the partition number in the r->root_partition
598 	 * high bits (the bottom 2 bits are already used). For
599 	 * now we just set booted_partition to 0 when we override
600 	 * root.
601 	 */
602 	if (num_root == 1) {
603 		device_t candidate_root;
604 		dksc = &rsc->sc_dksc;
605 		if (dksc->sc_dkdev.dk_nwedges != 0) {
606 			char cname[sizeof(cset->ac->devname)];
607 			/* XXX: assume partition 'a' first */
608 			snprintf(cname, sizeof(cname), "%s%c",
609 			    device_xname(dksc->sc_dev), 'a');
610 			candidate_root = dkwedge_find_by_wname(cname);
611 			aprint_debug("%s: candidate wedge root=%s\n", __func__,
612 			    cname);
613 			if (candidate_root == NULL) {
614 				/*
615 				 * If that is not found, because we don't use
616 				 * disklabel, return the first dk child
617 				 * XXX: we can skip the 'a' check above
618 				 * and always do this...
619 				 */
620 				size_t i = 0;
621 				candidate_root = dkwedge_find_by_parent(
622 				    device_xname(dksc->sc_dev), &i);
623 			}
624 			aprint_debug("%s: candidate wedge root=%p\n", __func__,
625 			    candidate_root);
626 		} else
627 			candidate_root = dksc->sc_dev;
628 		aprint_debug("%s: candidate root=%p booted_device=%p "
629 			     "root_partition=%d contains_boot=%d\n",
630 		    __func__, candidate_root, booted_device,
631 		    rsc->sc_r.root_partition,
632 		    rf_containsboot(&rsc->sc_r, booted_device));
633 		/* XXX the check for booted_device == NULL can probably be
634 		 * dropped, now that rf_containsboot handles that case.
635 		 */
636 		if (booted_device == NULL ||
637 		    rsc->sc_r.root_partition == 1 ||
638 		    rf_containsboot(&rsc->sc_r, booted_device)) {
639 			booted_device = candidate_root;
640 			booted_method = "raidframe/single";
641 			booted_partition = 0;	/* XXX assume 'a' */
642 			aprint_debug("%s: set booted_device=%s(%p)\n", __func__,
643 			    device_xname(booted_device), booted_device);
644 		}
645 	} else if (num_root > 1) {
646 		aprint_debug("%s: many roots=%d, %p\n", __func__, num_root,
647 		    booted_device);
648 
649 		/*
650 		 * Maybe the MD code can help. If it cannot, then
651 		 * setroot() will discover that we have no
652 		 * booted_device and will ask the user if nothing was
653 		 * hardwired in the kernel config file
654 		 */
655 		if (booted_device == NULL)
656 			return;
657 
658 		num_root = 0;
659 		mutex_enter(&raid_lock);
660 		LIST_FOREACH(sc, &raids, sc_link) {
661 			RF_Raid_t *r = &sc->sc_r;
662 			if (r->valid == 0)
663 				continue;
664 
665 			if (r->root_partition == 0)
666 				continue;
667 
668 			if (rf_containsboot(r, booted_device)) {
669 				num_root++;
670 				rsc = sc;
671 				dksc = &rsc->sc_dksc;
672 			}
673 		}
674 		mutex_exit(&raid_lock);
675 
676 		if (num_root == 1) {
677 			booted_device = dksc->sc_dev;
678 			booted_method = "raidframe/multi";
679 			booted_partition = 0;	/* XXX assume 'a' */
680 		} else {
681 			/* we can't guess.. require the user to answer... */
682 			boothowto |= RB_ASKNAME;
683 		}
684 	}
685 }
686 
687 static int
688 raidsize(dev_t dev)
689 {
690 	struct raid_softc *rs;
691 	struct dk_softc *dksc;
692 	unsigned int unit;
693 
694 	unit = raidunit(dev);
695 	if ((rs = raidget(unit, false)) == NULL)
696 		return -1;
697 	dksc = &rs->sc_dksc;
698 
699 	if ((rs->sc_flags & RAIDF_INITED) == 0)
700 		return -1;
701 
702 	return dk_size(dksc, dev);
703 }
704 
705 static int
706 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
707 {
708 	unsigned int unit;
709 	struct raid_softc *rs;
710 	struct dk_softc *dksc;
711 
712 	unit = raidunit(dev);
713 	if ((rs = raidget(unit, false)) == NULL)
714 		return ENXIO;
715 	dksc = &rs->sc_dksc;
716 
717 	if ((rs->sc_flags & RAIDF_INITED) == 0)
718 		return ENODEV;
719 
720         /*
721            Note that blkno is relative to this particular partition.
722            By adding RF_PROTECTED_SECTORS, we get a value that
723 	   is relative to the partition used for the underlying component.
724         */
725 	blkno += RF_PROTECTED_SECTORS;
726 
727 	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
728 }
729 
730 static int
731 raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
732 {
733 	struct raid_softc *rs = raidsoftc(dev);
734 	const struct bdevsw *bdev;
735 	RF_Raid_t *raidPtr;
736 	int     c, sparecol, j, scol, dumpto;
737 	int     error = 0;
738 
739 	raidPtr = &rs->sc_r;
740 
741 	/* we only support dumping to RAID 1 sets */
742 	if (raidPtr->Layout.numDataCol != 1 ||
743 	    raidPtr->Layout.numParityCol != 1)
744 		return EINVAL;
745 
746 	if ((error = raidlock(rs)) != 0)
747 		return error;
748 
749 	/* figure out what device is alive.. */
750 
751 	/*
752 	   Look for a component to dump to.  The preference for the
753 	   component to dump to is as follows:
754 	   1) the first component
755 	   2) a used_spare of the first component
756 	   3) the second component
757 	   4) a used_spare of the second component
758 	*/
759 
760 	dumpto = -1;
761 	for (c = 0; c < raidPtr->numCol; c++) {
762 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
763 			/* this might be the one */
764 			dumpto = c;
765 			break;
766 		}
767 	}
768 
769 	/*
770 	   At this point we have possibly selected a live component.
771 	   If we didn't find a live component, we now check to see
772 	   if there is a relevant spared component.
773 	*/
774 
775 	for (c = 0; c < raidPtr->numSpare; c++) {
776 		sparecol = raidPtr->numCol + c;
777 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
778 			/* How about this one? */
779 			scol = -1;
780 			for(j=0;j<raidPtr->numCol;j++) {
781 				if (raidPtr->Disks[j].spareCol == sparecol) {
782 					scol = j;
783 					break;
784 				}
785 			}
786 			if (scol == 0) {
787 				/*
788 				   We must have found a spared first
789 				   component!  We'll take that over
790 				   anything else found so far.  (We
791 				   couldn't have found a real first
792 				   component before, since this is a
793 				   used spare, and it's saying that
794 				   it's replacing the first
795 				   component.)  On reboot (with
796 				   autoconfiguration turned on)
797 				   sparecol will become the first
798 				   component (component0) of this set.
799 				*/
800 				dumpto = sparecol;
801 				break;
802 			} else if (scol != -1) {
803 				/*
804 				   Must be a spared second component.
805 				   We'll dump to that if we haven't found
806 				   anything else so far.
807 				*/
808 				if (dumpto == -1)
809 					dumpto = sparecol;
810 			}
811 		}
812 	}
813 
814 	if (dumpto == -1) {
815 		/* we couldn't find any live components to dump to!?!?
816 		 */
817 		error = EINVAL;
818 		goto out;
819 	}
820 
821 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
822 	if (bdev == NULL) {
823 		error = ENXIO;
824 		goto out;
825 	}
826 
827 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
828 				blkno, va, nblk * raidPtr->bytesPerSector);
829 
830 out:
831 	raidunlock(rs);
832 
833 	return error;
834 }
835 
836 /* ARGSUSED */
837 static int
838 raidopen(dev_t dev, int flags, int fmt,
839     struct lwp *l)
840 {
841 	int     unit = raidunit(dev);
842 	struct raid_softc *rs;
843 	struct dk_softc *dksc;
844 	int     error = 0;
845 	int     part, pmask;
846 
847 	if ((rs = raidget(unit, true)) == NULL)
848 		return ENXIO;
849 	if ((error = raidlock(rs)) != 0)
850 		return error;
851 
852 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
853 		error = EBUSY;
854 		goto bad;
855 	}
856 
857 	dksc = &rs->sc_dksc;
858 
859 	part = DISKPART(dev);
860 	pmask = (1 << part);
861 
862 	if (!DK_BUSY(dksc, pmask) &&
863 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
864 		/* First one... mark things as dirty... Note that we *MUST*
865 		 have done a configure before this.  I DO NOT WANT TO BE
866 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
867 		 THAT THEY BELONG TOGETHER!!!!! */
868 		/* XXX should check to see if we're only open for reading
869 		   here... If so, we needn't do this, but then need some
870 		   other way of keeping track of what's happened.. */
871 
872 		rf_markalldirty(&rs->sc_r);
873 	}
874 
875 	if ((rs->sc_flags & RAIDF_INITED) != 0)
876 		error = dk_open(dksc, dev, flags, fmt, l);
877 
878 bad:
879 	raidunlock(rs);
880 
881 	return error;
882 
883 
884 }
885 
886 static int
887 raid_lastclose(device_t self)
888 {
889 	struct raid_softc *rs = raidsoftc(self);
890 
891 	/* Last one... device is not unconfigured yet.
892 	   Device shutdown has taken care of setting the
893 	   clean bits if RAIDF_INITED is not set;
894 	   mark things as clean... */
895 
896 	rf_update_component_labels(&rs->sc_r,
897 	    RF_FINAL_COMPONENT_UPDATE);
898 
899 	/* pass to unlocked code */
900 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
901 		rs->sc_flags |= RAIDF_DETACH;
902 
903 	return 0;
904 }
905 
906 /* ARGSUSED */
907 static int
908 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
909 {
910 	int     unit = raidunit(dev);
911 	struct raid_softc *rs;
912 	struct dk_softc *dksc;
913 	cfdata_t cf;
914 	int     error = 0, do_detach = 0, do_put = 0;
915 
916 	if ((rs = raidget(unit, false)) == NULL)
917 		return ENXIO;
918 	dksc = &rs->sc_dksc;
919 
920 	if ((error = raidlock(rs)) != 0)
921 		return error;
922 
923 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
924 		error = dk_close(dksc, dev, flags, fmt, l);
925 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
926 			do_detach = 1;
927 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
928 		do_put = 1;
929 
930 	raidunlock(rs);
931 
932 	if (do_detach) {
933 		/* free the pseudo device attach bits */
934 		cf = device_cfdata(dksc->sc_dev);
935 		error = config_detach(dksc->sc_dev, 0);
936 		if (error == 0)
937 			free(cf, M_RAIDFRAME);
938 	} else if (do_put) {
939 		raidput(rs);
940 	}
941 
942 	return error;
943 
944 }
945 
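/* Signal the RAIDframe iodone thread so that queued work is processed. */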
946 static void
947 raid_wakeup(RF_Raid_t *raidPtr)
948 {
949 	rf_lock_mutex2(raidPtr->iodone_lock);
950 	rf_signal_cond2(raidPtr->iodone_cv);
951 	rf_unlock_mutex2(raidPtr->iodone_lock);
952 }
953 
954 static void
955 raidstrategy(struct buf *bp)
956 {
957 	unsigned int unit;
958 	struct raid_softc *rs;
959 	struct dk_softc *dksc;
960 	RF_Raid_t *raidPtr;
961 
962 	unit = raidunit(bp->b_dev);
963 	if ((rs = raidget(unit, false)) == NULL) {
964 		bp->b_error = ENXIO;
965 		goto fail;
966 	}
967 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
968 		bp->b_error = ENXIO;
969 		goto fail;
970 	}
971 	dksc = &rs->sc_dksc;
972 	raidPtr = &rs->sc_r;
973 
974 	/* Queue IO only */
975 	if (dk_strategy_defer(dksc, bp))
976 		goto done;
977 
978 	/* schedule the IO to happen at the next convenient time */
979 	raid_wakeup(raidPtr);
980 
981 done:
982 	return;
983 
984 fail:
985 	bp->b_resid = bp->b_bcount;
986 	biodone(bp);
987 }
988 
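/* dkdriver d_diskstart hook: hand one buf to RAIDframe via
   raiddoaccess().  Fails with ENODEV if the set is not valid. */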
989 static int
990 raid_diskstart(device_t dev, struct buf *bp)
991 {
992 	struct raid_softc *rs = raidsoftc(dev);
993 	RF_Raid_t *raidPtr;
994 
995 	raidPtr = &rs->sc_r;
996 	if (!raidPtr->valid) {
997 		db1_printf(("raid is not valid..\n"));
998 		return ENODEV;
999 	}
1000 
1001 	/* XXX */
1002 	bp->b_resid = 0;
1003 
1004 	return raiddoaccess(raidPtr, bp);
1005 }
1006 
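/* Completion callback from RAIDframe: finish the buf with dk_done(),
   give back the opening, and wake the engine for more work. */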
1007 void
1008 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
1009 {
1010 	struct raid_softc *rs;
1011 	struct dk_softc *dksc;
1012 
1013 	rs = raidPtr->softc;
1014 	dksc = &rs->sc_dksc;
1015 
1016 	dk_done(dksc, bp);
1017 
1018 	rf_lock_mutex2(raidPtr->mutex);
1019 	raidPtr->openings++;
1020 	rf_unlock_mutex2(raidPtr->mutex);
1021 
1022 	/* schedule more IO */
1023 	raid_wakeup(raidPtr);
1024 }
1025 
1026 /* ARGSUSED */
1027 static int
1028 raidread(dev_t dev, struct uio *uio, int flags)
1029 {
1030 	int     unit = raidunit(dev);
1031 	struct raid_softc *rs;
1032 
1033 	if ((rs = raidget(unit, false)) == NULL)
1034 		return ENXIO;
1035 
1036 	if ((rs->sc_flags & RAIDF_INITED) == 0)
1037 		return ENXIO;
1038 
1039 	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
1040 
1041 }
1042 
1043 /* ARGSUSED */
1044 static int
1045 raidwrite(dev_t dev, struct uio *uio, int flags)
1046 {
1047 	int     unit = raidunit(dev);
1048 	struct raid_softc *rs;
1049 
1050 	if ((rs = raidget(unit, false)) == NULL)
1051 		return ENXIO;
1052 
1053 	if ((rs->sc_flags & RAIDF_INITED) == 0)
1054 		return ENXIO;
1055 
1056 	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
1057 
1058 }
1059 
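/* Shut down and detach an initialized set.  Refuses with EBUSY while
   the unit is open or a reconstruction, parity rewrite, or copyback is
   in progress; otherwise stops RAIDframe, drains queued buffers, and
   detaches the disk. */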
1060 static int
1061 raid_detach_unlocked(struct raid_softc *rs)
1062 {
1063 	struct dk_softc *dksc = &rs->sc_dksc;
1064 	RF_Raid_t *raidPtr;
1065 	int error;
1066 
1067 	raidPtr = &rs->sc_r;
1068 
1069 	if (DK_BUSY(dksc, 0) ||
1070 	    raidPtr->recon_in_progress != 0 ||
1071 	    raidPtr->parity_rewrite_in_progress != 0 ||
1072 	    raidPtr->copyback_in_progress != 0)
1073 		return EBUSY;
1074 
1075 	if ((rs->sc_flags & RAIDF_INITED) == 0)
1076 		return 0;
1077 
1078 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
1079 
1080 	if ((error = rf_Shutdown(raidPtr)) != 0)
1081 		return error;
1082 
1083 	rs->sc_flags &= ~RAIDF_INITED;
1084 
1085 	/* Kill off any queued buffers */
1086 	dk_drain(dksc);
1087 	bufq_free(dksc->sc_bufq);
1088 
1089 	/* Detach the disk. */
1090 	dkwedge_delall(&dksc->sc_dkdev);
1091 	disk_detach(&dksc->sc_dkdev);
1092 	disk_destroy(&dksc->sc_dkdev);
1093 	dk_detach(dksc);
1094 
1095 	return 0;
1096 }
1097 
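/* Mark the component in `rr' as failed and start the reconstruction
   thread.  Rejected for RAID 0, bad columns, spared disks, and sets
   that are already degraded or reconstructing. */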
1098 int
1099 rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
1100 {
1101 	struct rf_recon_req_internal *rrint;
1102 
1103 	if (raidPtr->Layout.map->faultsTolerated == 0) {
1104 		/* Can't do this on a RAID 0!! */
1105 		return EINVAL;
1106 	}
1107 
1108 	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
1109 		/* bad column */
1110 		return EINVAL;
1111 	}
1112 
1113 	rf_lock_mutex2(raidPtr->mutex);
1114 	if (raidPtr->status == rf_rs_reconstructing) {
1115 		/* you can't fail a disk while we're reconstructing! */
1116 		/* XXX wrong for RAID6 */
1117 		goto out;
1118 	}
1119 	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
1120 	    (raidPtr->numFailures > 0)) {
1121 		/* some other component has failed.  Let's not make
1122 		   things worse. XXX wrong for RAID6 */
1123 		goto out;
1124 	}
1125 	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1126 		/* Can't fail a spared disk! */
1127 		goto out;
1128 	}
1129 	rf_unlock_mutex2(raidPtr->mutex);
1130 
1131 	/* make a copy of the recon request so that we don't rely on
1132 	 * the user's buffer */
1133 	rrint = RF_Malloc(sizeof(*rrint));
1134 	if (rrint == NULL)
1135 		return(ENOMEM);
1136 	rrint->col = rr->col;
1137 	rrint->flags = rr->flags;
1138 	rrint->raidPtr = raidPtr;
1139 
1140 	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
1141 	    rrint, "raid_recon");
1142 out:
1143 	rf_unlock_mutex2(raidPtr->mutex);
1144 	return EINVAL;
1145 }
1146 
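/* Copy the layout-specific configuration blob in from userland and
   repoint k_cfg->layoutSpecific at the kernel copy (freed later by
   rf_construct()). */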
1147 static int
1148 rf_copyinspecificbuf(RF_Config_t *k_cfg)
1149 {
1150 	/* allocate a buffer for the layout-specific data, and copy it in */
1151 	if (k_cfg->layoutSpecificSize == 0)
1152 		return 0;
1153 
1154 	if (k_cfg->layoutSpecificSize > 10000) {
1155 	    /* sanity check */
1156 	    return EINVAL;
1157 	}
1158 
1159 	u_char *specific_buf;
1160 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
1161 	if (specific_buf == NULL)
1162 		return ENOMEM;
1163 
1164 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1165 	    k_cfg->layoutSpecificSize);
1166 	if (retcode) {
1167 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1168 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
1169 		return retcode;
1170 	}
1171 
1172 	k_cfg->layoutSpecific = specific_buf;
1173 	return 0;
1174 }
1175 
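/* Copy the user's RF_Config_t into a fresh kernel allocation for
   rf_construct().  On a copyin failure the unit is flagged for
   shutdown on close. */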
1176 static int
1177 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
1178 {
1179 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
1180 
1181 	if (rs->sc_r.valid) {
1182 		/* There is a valid RAID set running on this unit! */
1183 		printf("raid%d: Device already configured!\n", rs->sc_unit);
1184 		return EINVAL;
1185 	}
1186 
1187 	/* copy-in the configuration information */
1188 	/* data points to a pointer to the configuration structure */
1189 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
1190 	if (*k_cfg == NULL) {
1191 		return ENOMEM;
1192 	}
1193 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
1194 	if (retcode == 0)
1195 		return 0;
1196 	RF_Free(*k_cfg, sizeof(RF_Config_t));
1197 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
1198 	rs->sc_flags |= RAIDF_SHUTDOWN;
1199 	return retcode;
1200 }
1201 
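/* Build the set from k_cfg: nul-terminate all name strings, check the
   column and spare counts, and run rf_Configure().  On success the
   unit is brought up via raidinit(); k_cfg is always freed here. */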
1202 int
1203 rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
1204 {
1205 	int retcode, i;
1206 	RF_Raid_t *raidPtr = &rs->sc_r;
1207 
1208 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
1209 
1210 	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
1211 		goto out;
1212 
1213 	/* should do some kind of sanity check on the configuration.
1214 	 * Store the sum of all the bytes in the last byte? */
1215 
1216 	/* Force nul-termination on all strings. */
1217 #define ZERO_FINAL(s)	do { s[sizeof(s) - 1] = '\0'; } while (0)
1218 	for (i = 0; i < RF_MAXCOL; i++) {
1219 		ZERO_FINAL(k_cfg->devnames[0][i]);
1220 	}
1221 	for (i = 0; i < RF_MAXSPARE; i++) {
1222 		ZERO_FINAL(k_cfg->spare_names[i]);
1223 	}
1224 	for (i = 0; i < RF_MAXDBGV; i++) {
1225 		ZERO_FINAL(k_cfg->debugVars[i]);
1226 	}
1227 #undef ZERO_FINAL
1228 
1229 	/* Check some basic limits. */
1230 	if (k_cfg->numCol >= RF_MAXCOL || k_cfg->numCol < 0) {
1231 		retcode = EINVAL;
1232 		goto out;
1233 	}
1234 	if (k_cfg->numSpare >= RF_MAXSPARE || k_cfg->numSpare < 0) {
1235 		retcode = EINVAL;
1236 		goto out;
1237 	}
1238 
1239 	/* configure the system */
1240 
1241 	/*
1242 	 * Clear the entire RAID descriptor, just to make sure
1243 	 *  there is no stale data left in the case of a
1244 	 *  reconfiguration
1245 	 */
1246 	memset(raidPtr, 0, sizeof(*raidPtr));
1247 	raidPtr->softc = rs;
1248 	raidPtr->raidid = rs->sc_unit;
1249 
1250 	retcode = rf_Configure(raidPtr, k_cfg, NULL);
1251 
1252 	if (retcode == 0) {
1253 		/* allow this many simultaneous IO's to
1254 		   this RAID device */
1255 		raidPtr->openings = RAIDOUTSTANDING;
1256 
1257 		raidinit(rs);
1258 		raid_wakeup(raidPtr);
1259 		rf_markalldirty(raidPtr);
1260 	}
1261 
1262 	/* free the buffers.  No return code here. */
1263 	if (k_cfg->layoutSpecificSize) {
1264 		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
1265 	}
1266 out:
1267 	RF_Free(k_cfg, sizeof(RF_Config_t));
1268 	if (retcode) {
1269 		/*
1270 		 * If configuration failed, set sc_flags so that we
1271 		 * will detach the device when we close it.
1272 		 */
1273 		rs->sc_flags |= RAIDF_SHUTDOWN;
1274 	}
1275 	return retcode;
1276 }
1277 
1278 #if RF_DISABLED
1279 static int
1280 rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
1281 {
1282 
1283 	/* XXX check the label for valid stuff... */
1284 	/* Note that some things *should not* get modified --
1285 	   the user should be re-initing the labels instead of
1286 	   trying to patch things.
1287 	   */
1288 #ifdef DEBUG
1289 	int raidid = raidPtr->raidid;
1290 	printf("raid%d: Got component label:\n", raidid);
1291 	printf("raid%d: Version: %d\n", raidid, clabel->version);
1292 	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1293 	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1294 	printf("raid%d: Column: %d\n", raidid, clabel->column);
1295 	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1296 	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1297 	printf("raid%d: Status: %d\n", raidid, clabel->status);
1298 #endif	/* DEBUG */
1299 	clabel->row = 0;
1300 	int column = clabel->column;
1301 
1302 	if ((column < 0) || (column >= raidPtr->numCol)) {
1303 		return(EINVAL);
1304 	}
1305 
1306 	/* XXX this isn't allowed to do anything for now :-) */
1307 
1308 	/* XXX and before it is, we need to fill in the rest
1309 	   of the fields!?!?!?! */
1310 	memcpy(raidget_component_label(raidPtr, column),
1311 	    clabel, sizeof(*clabel));
1312 	raidflush_component_label(raidPtr, column);
1313 	return 0;
1314 }
1315 #endif
1316 
1317 static int
1318 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
1319 {
1320 	/*
1321 	   we only want the serial number from
1322 	   the above.  We get all the rest of the information
1323 	   from the config that was used to create this RAID
1324 	   set.
1325 	   */
1326 
1327 	raidPtr->serial_number = clabel->serial_number;
1328 
1329 	for (int column = 0; column < raidPtr->numCol; column++) {
1330 		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
1331 		if (RF_DEAD_DISK(diskPtr->status))
1332 			continue;
1333 		RF_ComponentLabel_t *ci_label = raidget_component_label(
1334 		    raidPtr, column);
1335 		/* Zeroing this is important. */
1336 		memset(ci_label, 0, sizeof(*ci_label));
1337 		raid_init_component_label(raidPtr, ci_label);
1338 		ci_label->serial_number = raidPtr->serial_number;
1339 		ci_label->row = 0; /* we don't pretend to support more */
1340 		rf_component_label_set_partitionsize(ci_label,
1341 		    diskPtr->partitionSize);
1342 		ci_label->column = column;
1343 		raidflush_component_label(raidPtr, column);
1344 		/* XXXjld what about the spares? */
1345 	}
1346 
1347 	return 0;
1348 }
1349 
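/* Reconstruct a failed component onto the same disk, in place.
   Rejected for RAID 0, while another reconstruction is running, and
   for spared or otherwise-failed configurations; the real work happens
   in rf_ReconstructInPlaceThread(). */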
1350 static int
1351 rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
1352 {
1353 
1354 	if (raidPtr->Layout.map->faultsTolerated == 0) {
1355 		/* Can't do this on a RAID 0!! */
1356 		return EINVAL;
1357 	}
1358 
1359 	if (raidPtr->recon_in_progress == 1) {
1360 		/* a reconstruct is already in progress! */
1361 		return EINVAL;
1362 	}
1363 
1364 	RF_SingleComponent_t component;
1365 	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
1366 	component.row = 0; /* we don't support any more */
1367 	int column = component.column;
1368 
1369 	if ((column < 0) || (column >= raidPtr->numCol)) {
1370 		return EINVAL;
1371 	}
1372 
1373 	rf_lock_mutex2(raidPtr->mutex);
1374 	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1375 	    (raidPtr->numFailures > 0)) {
1376 		/* XXX 0 above shouldn't be constant!!! */
1377 		/* some component other than this has failed.
1378 		   Let's not make things worse than they already
1379 		   are... */
1380 		printf("raid%d: Unable to reconstruct to disk at:\n",
1381 		       raidPtr->raidid);
1382 		printf("raid%d:     Col: %d   Too many failures.\n",
1383 		       raidPtr->raidid, column);
1384 		rf_unlock_mutex2(raidPtr->mutex);
1385 		return EINVAL;
1386 	}
1387 
1388 	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
1389 		printf("raid%d: Unable to reconstruct to disk at:\n",
1390 		       raidPtr->raidid);
1391 		printf("raid%d:    Col: %d   "
1392 		    "Reconstruction already occurring!\n",
1393 		    raidPtr->raidid, column);
1394 
1395 		rf_unlock_mutex2(raidPtr->mutex);
1396 		return EINVAL;
1397 	}
1398 
1399 	if (raidPtr->Disks[column].status == rf_ds_spared) {
1400 		rf_unlock_mutex2(raidPtr->mutex);
1401 		return EINVAL;
1402 	}
1403 
1404 	rf_unlock_mutex2(raidPtr->mutex);
1405 
1406 	struct rf_recon_req_internal *rrint;
1407 	rrint = RF_Malloc(sizeof(*rrint));
1408 	if (rrint == NULL)
1409 		return ENOMEM;
1410 
1411 	rrint->col = column;
1412 	rrint->raidPtr = raidPtr;
1413 
1414 	return RF_CREATE_THREAD(raidPtr->recon_thread,
1415 	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
1416 }
1417 
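/* Report reconstruction progress as a percentage; RAID 0 and sets that
   are not reconstructing report 100. */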
1418 static int
1419 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
1420 {
1421 	/*
1422 	 * This makes no sense on a RAID 0, or if we are not reconstructing
1423 	 * so tell the user it's done.
1424 	 */
1425 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
1426 	    raidPtr->status != rf_rs_reconstructing) {
1427 		*data = 100;
1428 		return 0;
1429 	}
1430 	if (raidPtr->reconControl->numRUsTotal == 0) {
1431 		*data = 0;
1432 		return 0;
1433 	}
1434 	*data = (raidPtr->reconControl->numRUsComplete * 100
1435 	    / raidPtr->reconControl->numRUsTotal);
1436 	return 0;
1437 }
1438 
1439 /*
1440  * Copy a RF_SingleComponent_t from 'data', ensuring nul-termination
1441  * on the component_name[] array.
1442  */
1443 static void
1444 rf_copy_single_component(RF_SingleComponent_t *component, void *data)
1445 {
1446 
1447 	memcpy(component, data, sizeof *component);
1448 	component->component_name[sizeof(component->component_name) - 1] = '\0';
1449 }
1450 
1451 static int
1452 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1453 {
1454 	int     unit = raidunit(dev);
1455 	int     part, pmask;
1456 	struct raid_softc *rs;
1457 	struct dk_softc *dksc;
1458 	RF_Config_t *k_cfg;
1459 	RF_Raid_t *raidPtr;
1460 	RF_AccTotals_t *totals;
1461 	RF_SingleComponent_t component;
1462 	RF_DeviceConfig_t *d_cfg, *ucfgp;
1463 	int retcode = 0;
1464 	int column;
1465 	RF_ComponentLabel_t *clabel;
1466 	int d;
1467 
1468 	if ((rs = raidget(unit, false)) == NULL)
1469 		return ENXIO;
1470 
1471 	dksc = &rs->sc_dksc;
1472 	raidPtr = &rs->sc_r;
1473 
1474 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1475 	    (int) DISKPART(dev), (int) unit, cmd));
1476 
1477 	/* Only CONFIGURE and RESCAN can be done without the RAID being initialized. */
1478 	switch (cmd) {
1479 	case RAIDFRAME_CONFIGURE:
1480 	case RAIDFRAME_RESCAN:
1481 		break;
1482 	default:
1483 		if (!rf_inited(rs))
1484 			return ENXIO;
1485 	}
1486 
1487 	switch (cmd) {
1488 		/* configure the system */
1489 	case RAIDFRAME_CONFIGURE:
1490 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
1491 			return retcode;
1492 		return rf_construct(rs, k_cfg);
1493 
1494 		/* shutdown the system */
1495 	case RAIDFRAME_SHUTDOWN:
1496 
1497 		part = DISKPART(dev);
1498 		pmask = (1 << part);
1499 
1500 		if ((retcode = raidlock(rs)) != 0)
1501 			return retcode;
1502 
1503 		if (DK_BUSY(dksc, pmask) ||
1504 		    raidPtr->recon_in_progress != 0 ||
1505 		    raidPtr->parity_rewrite_in_progress != 0 ||
1506 		    raidPtr->copyback_in_progress != 0)
1507 			retcode = EBUSY;
1508 		else {
1509 			/* detach and free on close */
1510 			rs->sc_flags |= RAIDF_SHUTDOWN;
1511 			retcode = 0;
1512 		}
1513 
1514 		raidunlock(rs);
1515 
1516 		return retcode;
1517 	case RAIDFRAME_GET_COMPONENT_LABEL:
1518 		return rf_get_component_label(raidPtr, data);
1519 
1520 #if RF_DISABLED
1521 	case RAIDFRAME_SET_COMPONENT_LABEL:
1522 		return rf_set_component_label(raidPtr, data);
1523 #endif
1524 
1525 	case RAIDFRAME_INIT_LABELS:
1526 		return rf_init_component_label(raidPtr, data);
1527 
1528 	case RAIDFRAME_SET_AUTOCONFIG:
1529 		d = rf_set_autoconfig(raidPtr, *(int *) data);
1530 		printf("raid%d: New autoconfig value is: %d\n",
1531 		       raidPtr->raidid, d);
1532 		*(int *) data = d;
1533 		return retcode;
1534 
1535 	case RAIDFRAME_SET_ROOT:
1536 		d = rf_set_rootpartition(raidPtr, *(int *) data);
1537 		printf("raid%d: New rootpartition value is: %d\n",
1538 		       raidPtr->raidid, d);
1539 		*(int *) data = d;
1540 		return retcode;
1541 
1542 		/* initialize all parity */
1543 	case RAIDFRAME_REWRITEPARITY:
1544 
1545 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1546 			/* Parity for RAID 0 is trivially correct */
1547 			raidPtr->parity_good = RF_RAID_CLEAN;
1548 			return 0;
1549 		}
1550 
1551 		if (raidPtr->parity_rewrite_in_progress == 1) {
1552 			/* Re-write is already in progress! */
1553 			return EINVAL;
1554 		}
1555 
1556 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1557 		    rf_RewriteParityThread, raidPtr,"raid_parity");
1558 
1559 	case RAIDFRAME_ADD_HOT_SPARE:
1560 		rf_copy_single_component(&component, data);
1561 		return rf_add_hot_spare(raidPtr, &component);
1562 
1563 	case RAIDFRAME_REMOVE_HOT_SPARE:
1564 		return retcode;
1565 
1566 	case RAIDFRAME_DELETE_COMPONENT:
1567 		rf_copy_single_component(&component, data);
1568 		return rf_delete_component(raidPtr, &component);
1569 
1570 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
1571 		rf_copy_single_component(&component, data);
1572 		return rf_incorporate_hot_spare(raidPtr, &component);
1573 
1574 	case RAIDFRAME_REBUILD_IN_PLACE:
1575 		return rf_rebuild_in_place(raidPtr, data);
1576 
1577 	case RAIDFRAME_GET_INFO:
1578 		ucfgp = *(RF_DeviceConfig_t **)data;
1579 		d_cfg = RF_Malloc(sizeof(*d_cfg));
1580 		if (d_cfg == NULL)
1581 			return ENOMEM;
1582 		retcode = rf_get_info(raidPtr, d_cfg);
1583 		if (retcode == 0) {
1584 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
1585 		}
1586 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1587 		return retcode;
1588 
1589 	case RAIDFRAME_CHECK_PARITY:
1590 		*(int *) data = raidPtr->parity_good;
1591 		return 0;
1592 
1593 	case RAIDFRAME_PARITYMAP_STATUS:
1594 		if (rf_paritymap_ineligible(raidPtr))
1595 			return EINVAL;
1596 		rf_paritymap_status(raidPtr->parity_map, data);
1597 		return 0;
1598 
1599 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
1600 		if (rf_paritymap_ineligible(raidPtr))
1601 			return EINVAL;
1602 		if (raidPtr->parity_map == NULL)
1603 			return ENOENT; /* ??? */
1604 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
1605 			return EINVAL;
1606 		return 0;
1607 
1608 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
1609 		if (rf_paritymap_ineligible(raidPtr))
1610 			return EINVAL;
1611 		*(int *) data = rf_paritymap_get_disable(raidPtr);
1612 		return 0;
1613 
1614 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
1615 		if (rf_paritymap_ineligible(raidPtr))
1616 			return EINVAL;
1617 		rf_paritymap_set_disable(raidPtr, *(int *)data);
1618 		/* XXX should errors be passed up? */
1619 		return 0;
1620 
1621 	case RAIDFRAME_RESCAN:
1622 		return rf_rescan();
1623 
1624 	case RAIDFRAME_RESET_ACCTOTALS:
1625 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1626 		return 0;
1627 
1628 	case RAIDFRAME_GET_ACCTOTALS:
1629 		totals = (RF_AccTotals_t *) data;
1630 		*totals = raidPtr->acc_totals;
1631 		return 0;
1632 
1633 	case RAIDFRAME_KEEP_ACCTOTALS:
1634 		raidPtr->keep_acc_totals = *(int *)data;
1635 		return 0;
1636 
1637 	case RAIDFRAME_GET_SIZE:
1638 		*(int *) data = raidPtr->totalSectors;
1639 		return 0;
1640 
1641 	case RAIDFRAME_FAIL_DISK:
1642 		return rf_fail_disk(raidPtr, data);
1643 
1644 		/* invoke a copyback operation after recon on whatever disk
1645 		 * needs it, if any */
1646 	case RAIDFRAME_COPYBACK:
1647 
1648 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1649 			/* This makes no sense on a RAID 0!! */
1650 			return EINVAL;
1651 		}
1652 
1653 		if (raidPtr->copyback_in_progress == 1) {
1654 			/* Copyback is already in progress! */
1655 			return EINVAL;
1656 		}
1657 
1658 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
1659 		    rf_CopybackThread, raidPtr, "raid_copyback");
1660 
1661 		/* return the percentage completion of reconstruction */
1662 	case RAIDFRAME_CHECK_RECON_STATUS:
1663 		return rf_check_recon_status(raidPtr, data);
1664 
1665 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1666 		rf_check_recon_status_ext(raidPtr, data);
1667 		return 0;
1668 
1669 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1670 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1671 			/* This makes no sense on a RAID 0, so tell the
1672 			   user it's done. */
1673 			*(int *) data = 100;
1674 			return 0;
1675 		}
1676 		if (raidPtr->parity_rewrite_in_progress == 1) {
1677 			*(int *) data = 100 *
1678 				raidPtr->parity_rewrite_stripes_done /
1679 				raidPtr->Layout.numStripe;
1680 		} else {
1681 			*(int *) data = 100;
1682 		}
1683 		return 0;
1684 
1685 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1686 		rf_check_parityrewrite_status_ext(raidPtr, data);
1687 		return 0;
1688 
1689 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
1690 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1691 			/* This makes no sense on a RAID 0 */
1692 			*(int *) data = 100;
1693 			return 0;
1694 		}
1695 		if (raidPtr->copyback_in_progress == 1) {
1696 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
1697 				raidPtr->Layout.numStripe;
1698 		} else {
1699 			*(int *) data = 100;
1700 		}
1701 		return 0;
1702 
1703 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1704 		rf_check_copyback_status_ext(raidPtr, data);
1705 		return 0;
1706 
1707 	case RAIDFRAME_SET_LAST_UNIT:
1708 		for (column = 0; column < raidPtr->numCol; column++)
1709 			if (raidPtr->Disks[column].status != rf_ds_optimal)
1710 				return EBUSY;
1711 
1712 		for (column = 0; column < raidPtr->numCol; column++) {
1713 			clabel = raidget_component_label(raidPtr, column);
1714 			clabel->last_unit = *(int *)data;
1715 			raidflush_component_label(raidPtr, column);
1716 		}
1717 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1718 		return 0;
1719 
1720 		/* the sparetable daemon calls this to wait for the kernel to
1721 		 * need a spare table. this ioctl does not return until a
1722 		 * spare table is needed. XXX -- calling mpsleep here in the
1723 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1724 		 * -- I should either compute the spare table in the kernel,
1725 		 * or have a different -- XXX XXX -- interface (a different
1726 		 * character device) for delivering the table     -- XXX */
1727 #if RF_DISABLED
1728 	case RAIDFRAME_SPARET_WAIT:
1729 		rf_lock_mutex2(rf_sparet_wait_mutex);
1730 		while (!rf_sparet_wait_queue)
1731 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1732 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
1733 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1734 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1735 
1736 		/* structure assignment */
1737 		*((RF_SparetWait_t *) data) = *waitreq;
1738 
1739 		RF_Free(waitreq, sizeof(*waitreq));
1740 		return 0;
1741 
1742 		/* wakes up a process waiting on SPARET_WAIT and puts an error
1743 		 * code in it that will cause the daemon to exit */
1744 	case RAIDFRAME_ABORT_SPARET_WAIT:
1745 		waitreq = RF_Malloc(sizeof(*waitreq));
1746 		waitreq->fcol = -1;
1747 		rf_lock_mutex2(rf_sparet_wait_mutex);
1748 		waitreq->next = rf_sparet_wait_queue;
1749 		rf_sparet_wait_queue = waitreq;
1750 		rf_broadcast_cond2(rf_sparet_wait_cv);
1751 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1752 		return 0;
1753 
1754 		/* used by the spare table daemon to deliver a spare table
1755 		 * into the kernel */
1756 	case RAIDFRAME_SEND_SPARET:
1757 
1758 		/* install the spare table */
1759 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1760 
1761 		/* respond to the requestor.  the return status of the spare
1762 		 * table installation is passed in the "fcol" field */
1763 		waitreq = RF_Malloc(sizeof(*waitreq));
1764 		waitreq->fcol = retcode;
1765 		rf_lock_mutex2(rf_sparet_wait_mutex);
1766 		waitreq->next = rf_sparet_resp_queue;
1767 		rf_sparet_resp_queue = waitreq;
1768 		rf_broadcast_cond2(rf_sparet_resp_cv);
1769 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1770 
1771 		return retcode;
1772 #endif
1773 	default:
1774 		/*
1775 		 * Don't bother trying to load compat modules
1776 		 * if it is not our ioctl. This is more efficient
1777 		 * and makes rump tests not depend on compat code
1778 		 */
1779 		if (IOCGROUP(cmd) != 'r')
1780 			break;
1781 #ifdef _LP64
1782 		if ((l->l_proc->p_flag & PK_32) != 0) {
1783 			module_autoload("compat_netbsd32_raid",
1784 			    MODULE_CLASS_EXEC);
1785 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
1786 			    (rs, cmd, data), enosys(), retcode);
1787 			if (retcode != EPASSTHROUGH)
1788 				return retcode;
1789 		}
1790 #endif
1791 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
1792 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
1793 		    (rs, cmd, data), enosys(), retcode);
1794 		if (retcode != EPASSTHROUGH)
1795 			return retcode;
1796 
1797 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
1798 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
1799 		    (rs, cmd, data), enosys(), retcode);
1800 		if (retcode != EPASSTHROUGH)
1801 			return retcode;
1802 		break; /* fall through to the os-specific code below */
1803 
1804 	}
1805 
1806 	if (!raidPtr->valid)
1807 		return EINVAL;
1808 
1809 	/*
1810 	 * Add support for "regular" device ioctls here.
1811 	 */
1812 
1813 	switch (cmd) {
1814 	case DIOCGCACHE:
1815 		retcode = rf_get_component_caches(raidPtr, (int *)data);
1816 		break;
1817 
1818 	case DIOCCACHESYNC:
1819 		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
1820 		break;
1821 
1822 	default:
1823 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
1824 		break;
1825 	}
1826 
1827 	return retcode;
1828 
1829 }
1830 
1831 
1832 /* raidinit -- complete the rest of the initialization for the
1833    RAIDframe device.  */
1834 
1835 
1836 static void
1837 raidinit(struct raid_softc *rs)
1838 {
1839 	cfdata_t cf;
1840 	unsigned int unit;
1841 	struct dk_softc *dksc = &rs->sc_dksc;
1842 	RF_Raid_t *raidPtr = &rs->sc_r;
1843 	device_t dev;
1844 
1845 	unit = raidPtr->raidid;
1846 
1847 	/* XXX doesn't check bounds. */
1848 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);
1849 
1850 	/* attach the pseudo device */
1851 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1852 	cf->cf_name = raid_cd.cd_name;
1853 	cf->cf_atname = raid_cd.cd_name;
1854 	cf->cf_unit = unit;
1855 	cf->cf_fstate = FSTATE_STAR;
1856 
1857 	dev = config_attach_pseudo(cf);
1858 	if (dev == NULL) {
1859 		printf("raid%d: config_attach_pseudo failed\n",
1860 		    raidPtr->raidid);
1861 		free(cf, M_RAIDFRAME);
1862 		return;
1863 	}
1864 
1865 	/* provide a backpointer to the real softc */
1866 	raidsoftc(dev) = rs;
1867 
1868 	/* disk_attach actually creates space for the CPU disklabel, among
1869 	 * other things, so it's critical to call this *BEFORE* we try putzing
1870 	 * with disklabels. */
1871 	dk_init(dksc, dev, DKTYPE_RAID);
1872 	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1873 
1874 	/* XXX There may be a weird interaction here between this and
1875 	 * protectedSectors, as used in RAIDframe.  */
1876 
1877 	rs->sc_size = raidPtr->totalSectors;
1878 
1879 	/* Attach dk and disk subsystems */
1880 	dk_attach(dksc);
1881 	disk_attach(&dksc->sc_dkdev);
1882 	rf_set_geometry(rs, raidPtr);
1883 
1884 	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
1885 
1886 	/* mark unit as usable */
1887 	rs->sc_flags |= RAIDF_INITED;
1888 
1889 	dkwedge_discover(&dksc->sc_dkdev);
1890 }
1891 
1892 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1893 /* wake up the daemon & tell it to get us a spare table
1894  * XXX
1895  * the entries in the queues should be tagged with the raidPtr
1896  * so that in the extremely rare case that two recons happen at once,
1897  * we know for which device we're requesting a spare table
1898  * XXX
1899  *
1900  * XXX This code is not currently used. GO
1901  */
1902 int
1903 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
1904 {
1905 	int     retcode;
1906 
1907 	rf_lock_mutex2(rf_sparet_wait_mutex);
1908 	req->next = rf_sparet_wait_queue;
1909 	rf_sparet_wait_queue = req;
1910 	rf_broadcast_cond2(rf_sparet_wait_cv);
1911 
1912 	/* rf_wait_cond2() drops the mutex while waiting */
1913 	while (!rf_sparet_resp_queue) {
1914 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
1915 	}
1916 	req = rf_sparet_resp_queue;
1917 	rf_sparet_resp_queue = req->next;
1918 	rf_unlock_mutex2(rf_sparet_wait_mutex);
1919 
1920 	retcode = req->fcol;
1921 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
1922 					 * alloc'd */
1923 	return retcode;
1924 }
1925 #endif
1926 
1927 /* A wrapper around rf_DoAccess that extracts the appropriate info from
1928  * the bp and passes it down.
1929  * Any calls originating in the kernel must use non-blocking I/O.
1930  * Do some extra sanity checking to return "appropriate" error values for
1931  * certain conditions (to make some standard utilities work).
1932  *
1933  * Formerly known as: rf_DoAccessKernel
1934  */
1935 void
1936 raidstart(RF_Raid_t *raidPtr)
1937 {
1938 	struct raid_softc *rs;
1939 	struct dk_softc *dksc;
1940 
1941 	rs = raidPtr->softc;
1942 	dksc = &rs->sc_dksc;
1943 	/* quick check to see if anything has died recently */
1944 	rf_lock_mutex2(raidPtr->mutex);
1945 	if (raidPtr->numNewFailures > 0) {
1946 		rf_unlock_mutex2(raidPtr->mutex);
1947 		rf_update_component_labels(raidPtr,
1948 					   RF_NORMAL_COMPONENT_UPDATE);
1949 		rf_lock_mutex2(raidPtr->mutex);
1950 		raidPtr->numNewFailures--;
1951 	}
1952 	rf_unlock_mutex2(raidPtr->mutex);
1953 
1954 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
1955 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
1956 		return;
1957 	}
1958 
1959 	dk_start(dksc, NULL);
1960 }
1961 
1962 static int
1963 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
1964 {
1965 	RF_SectorCount_t num_blocks, pb, sum;
1966 	RF_RaidAddr_t raid_addr;
1967 	daddr_t blocknum;
1968 	int rc;
1969 
1970 	rf_lock_mutex2(raidPtr->mutex);
1971 	if (raidPtr->openings == 0) {
1972 		rf_unlock_mutex2(raidPtr->mutex);
1973 		return EAGAIN;
1974 	}
1975 	rf_unlock_mutex2(raidPtr->mutex);
1976 
1977 	blocknum = bp->b_rawblkno;
1978 
1979 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
1980 		    (int) blocknum));
1981 
1982 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
1983 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
1984 
1985 	/* *THIS* is where we adjust what block we're going to...
1986 	 * but DO NOT TOUCH bp->b_blkno!!! */
1987 	raid_addr = blocknum;
1988 
1989 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
1990 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
1991 	sum = raid_addr + num_blocks + pb;
1992 	if (rf_debugKernelAccess) {
1993 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
1994 			    (int) raid_addr, (int) sum, (int) num_blocks,
1995 			    (int) pb, (int) bp->b_resid));
1996 	}
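	/*
	 * Bounds check: reject an access that runs past the end of the
	 * array, and catch wraparound in the sector arithmetic (if sum
	 * wrapped, it compares below raid_addr, num_blocks or pb).
	 */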
1997 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
1998 	    || (sum < num_blocks) || (sum < pb)) {
1999 		rc = ENOSPC;
2000 		goto done;
2001 	}
2002 	/*
2003 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
2004 	 */
2005 
2006 	if (bp->b_bcount & raidPtr->sectorMask) {
2007 		rc = ENOSPC;
2008 		goto done;
2009 	}
2010 	db1_printf(("Calling DoAccess..\n"));
2011 
2012 
2013 	rf_lock_mutex2(raidPtr->mutex);
2014 	raidPtr->openings--;
2015 	rf_unlock_mutex2(raidPtr->mutex);
2016 
2017 	/* don't ever condition on bp->b_flags & B_WRITE.
2018 	 * always condition on B_READ instead */
2019 
2020 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
2021 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
2022 			 raid_addr, num_blocks,
2023 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
2024 
2025 done:
2026 	return rc;
2027 }
2028 
2029 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
2030 
2031 int
2032 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
2033 {
2034 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
2035 	struct buf *bp;
2036 
2037 	req->queue = queue;
2038 	bp = req->bp;
2039 
2040 	switch (req->type) {
2041 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
2042 		/* XXX need to do something extra here.. */
2043 		/* I'm leaving this in, as I've never actually seen it used,
2044 		 * and I'd like folks to report it... GO */
2045 		printf("%s: WAKEUP CALLED\n", __func__);
2046 		queue->numOutstanding++;
2047 
2048 		bp->b_flags = 0;
2049 		bp->b_private = req;
2050 
2051 		KernelWakeupFunc(bp);
2052 		break;
2053 
2054 	case RF_IO_TYPE_READ:
2055 	case RF_IO_TYPE_WRITE:
2056 #if RF_ACC_TRACE > 0
2057 		if (req->tracerec) {
2058 			RF_ETIMER_START(req->tracerec->timer);
2059 		}
2060 #endif
2061 		InitBP(bp, queue->rf_cinfo->ci_vp,
2062 		    op, queue->rf_cinfo->ci_dev,
2063 		    req->sectorOffset, req->numSector,
2064 		    req->buf, KernelWakeupFunc, (void *) req,
2065 		    queue->raidPtr->logBytesPerSector);
2066 
2067 		if (rf_debugKernelAccess) {
2068 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
2069 				(long) bp->b_blkno));
2070 		}
2071 		queue->numOutstanding++;
2072 		queue->last_deq_sector = req->sectorOffset;
2073 		/* acc wouldn't have been let in if there were any pending
2074 		 * reqs at any other priority */
2075 		queue->curPriority = req->priority;
2076 
2077 		db1_printf(("Going for %c to unit %d col %d\n",
2078 			    req->type, queue->raidPtr->raidid,
2079 			    queue->col));
2080 		db1_printf(("sector %d count %d (%d bytes) %d\n",
2081 			(int) req->sectorOffset, (int) req->numSector,
2082 			(int) (req->numSector <<
2083 			    queue->raidPtr->logBytesPerSector),
2084 			(int) queue->raidPtr->logBytesPerSector));
2085 
2086 		/*
2087 		 * XXX: drop lock here since this can block at
2088 		 * least with backing SCSI devices.  Retake it
2089 		 * to minimize fuss with calling interfaces.
2090 		 */
2091 
2092 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2093 		bdev_strategy(bp);
2094 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2095 		break;
2096 
2097 	default:
2098 		panic("bad req->type in rf_DispatchKernelIO");
2099 	}
2100 	db1_printf(("Exiting from DispatchKernelIO\n"));
2101 
2102 	return 0;
2103 }
2104 /* this is the callback function associated with an I/O invoked from
2105    kernel code.
2106  */
2107 static void
2108 KernelWakeupFunc(struct buf *bp)
2109 {
2110 	RF_DiskQueueData_t *req = NULL;
2111 	RF_DiskQueue_t *queue;
2112 
2113 	db1_printf(("recovering the request queue:\n"));
2114 
2115 	req = bp->b_private;
2116 
2117 	queue = (RF_DiskQueue_t *) req->queue;
2118 
2119 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
2120 
2121 #if RF_ACC_TRACE > 0
2122 	if (req->tracerec) {
2123 		RF_ETIMER_STOP(req->tracerec->timer);
2124 		RF_ETIMER_EVAL(req->tracerec->timer);
2125 		rf_lock_mutex2(rf_tracing_mutex);
2126 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2127 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2128 		req->tracerec->num_phys_ios++;
2129 		rf_unlock_mutex2(rf_tracing_mutex);
2130 	}
2131 #endif
2132 
2133 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
2134 	 * ballistic, and mark the component as hosed... */
2135 
2136 	if (bp->b_error != 0) {
2137 		/* Mark the disk as dead */
2138 		/* but only mark it once... */
2139 		/* and only if it wouldn't leave this RAID set
2140 		   completely broken */
2141 		if (((queue->raidPtr->Disks[queue->col].status ==
2142 		      rf_ds_optimal) ||
2143 		     (queue->raidPtr->Disks[queue->col].status ==
2144 		      rf_ds_used_spare)) &&
2145 		     (queue->raidPtr->numFailures <
2146 		      queue->raidPtr->Layout.map->faultsTolerated)) {
2147 			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
2148 			       queue->raidPtr->raidid,
2149 			       bp->b_error,
2150 			       queue->raidPtr->Disks[queue->col].devname);
2151 			queue->raidPtr->Disks[queue->col].status =
2152 			    rf_ds_failed;
2153 			queue->raidPtr->status = rf_rs_degraded;
2154 			queue->raidPtr->numFailures++;
2155 			queue->raidPtr->numNewFailures++;
2156 		} else {	/* Disk is already dead... */
2157 			/* printf("Disk already marked as dead!\n"); */
2158 		}
2159 
2160 	}
2161 
2162 	/* Fill in the error value */
2163 	req->error = bp->b_error;
2164 
2165 	/* Drop this one on the "finished" queue... */
2166 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
2167 
2168 	/* Let the raidio thread know there is work to be done. */
2169 	rf_signal_cond2(queue->raidPtr->iodone_cv);
2170 
2171 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
2172 }
2173 
2174 
2175 /*
2176  * initialize a buf structure for doing an I/O in the kernel.
2177  */
2178 static void
2179 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2180        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2181        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
2182 {
2183 	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
2184 	bp->b_oflags = 0;
2185 	bp->b_cflags = 0;
2186 	bp->b_bcount = numSect << logBytesPerSector;
2187 	bp->b_bufsize = bp->b_bcount;
2188 	bp->b_error = 0;
2189 	bp->b_dev = dev;
2190 	bp->b_data = bf;
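	/*
	 * Convert the sector address to DEV_BSIZE units.  Worked
	 * example: with 4096-byte sectors (logBytesPerSector == 12) and
	 * DEV_BSHIFT == 9, b_blkno = startSect * 8; with 512-byte
	 * sectors the two shifts cancel and b_blkno == startSect.
	 */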
2191 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2192 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
2193 	if (bp->b_bcount == 0) {
2194 		panic("bp->b_bcount is zero in InitBP!!");
2195 	}
2196 	bp->b_iodone = cbFunc;
2197 	bp->b_private = cbArg;
2198 }
2199 
2200 /*
2201  * Wait interruptibly for an exclusive lock.
2202  *
2203  * XXX
2204  * Several drivers do this; it should be abstracted and made MP-safe.
2205  * (Hmm... where have we seen this warning before :->  GO )
2206  */
2207 static int
2208 raidlock(struct raid_softc *rs)
2209 {
2210 	int     error;
2211 
2212 	error = 0;
2213 	mutex_enter(&rs->sc_mutex);
2214 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2215 		rs->sc_flags |= RAIDF_WANTED;
2216 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2217 		if (error != 0)
2218 			goto done;
2219 	}
2220 	rs->sc_flags |= RAIDF_LOCKED;
2221 done:
2222 	mutex_exit(&rs->sc_mutex);
2223 	return error;
2224 }
2225 /*
2226  * Unlock and wake up any waiters.
2227  */
2228 static void
2229 raidunlock(struct raid_softc *rs)
2230 {
2231 
2232 	mutex_enter(&rs->sc_mutex);
2233 	rs->sc_flags &= ~RAIDF_LOCKED;
2234 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2235 		rs->sc_flags &= ~RAIDF_WANTED;
2236 		cv_broadcast(&rs->sc_cv);
2237 	}
2238 	mutex_exit(&rs->sc_mutex);
2239 }
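
/*
 * Illustrative usage of the lock/unlock pair above (a minimal sketch,
 * not part of the driver; the real callers, e.g. raid_detach(), follow
 * the same shape): take the interruptible exclusive lock around a
 * state change and always drop it again, even on error.
 */
#if 0
static int
example_locked_op(struct raid_softc *rs)
{
	int error;

	if ((error = raidlock(rs)) != 0)
		return error;	/* interrupted by a signal */
	/* ... modify softc state that requires exclusion ... */
	raidunlock(rs);
	return 0;
}
#endif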
2240 
2241 
2242 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
2243 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
2244 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
2245 
2246 static daddr_t
2247 rf_component_info_offset(void)
2248 {
2249 
2250 	return RF_COMPONENT_INFO_OFFSET;
2251 }
2252 
2253 static daddr_t
2254 rf_component_info_size(unsigned secsize)
2255 {
2256 	daddr_t info_size;
2257 
2258 	KASSERT(secsize);
2259 	if (secsize > RF_COMPONENT_INFO_SIZE)
2260 		info_size = secsize;
2261 	else
2262 		info_size = RF_COMPONENT_INFO_SIZE;
2263 
2264 	return info_size;
2265 }
2266 
2267 static daddr_t
2268 rf_parity_map_offset(RF_Raid_t *raidPtr)
2269 {
2270 	daddr_t map_offset;
2271 
2272 	KASSERT(raidPtr->bytesPerSector);
2273 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2274 		map_offset = raidPtr->bytesPerSector;
2275 	else
2276 		map_offset = RF_COMPONENT_INFO_SIZE;
2277 	map_offset += rf_component_info_offset();
2278 
2279 	return map_offset;
2280 }
2281 
2282 static daddr_t
2283 rf_parity_map_size(RF_Raid_t *raidPtr)
2284 {
2285 	daddr_t map_size;
2286 
2287 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2288 		map_size = raidPtr->bytesPerSector;
2289 	else
2290 		map_size = RF_PARITY_MAP_SIZE;
2291 
2292 	return map_size;
2293 }
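
/*
 * Worked example of the metadata layout computed by the helpers above:
 * with 512-byte sectors the component info area sits at byte offset
 * 16384 and spans max(secsize, 1024) = 1024 bytes, so the parity map
 * starts at 16384 + 1024 and spans max(secsize, RF_PARITYMAP_NBYTE)
 * bytes.  With 4096-byte sectors the info area rounds up to one
 * 4096-byte sector and the parity map starts at 16384 + 4096.
 */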
2294 
2295 int
2296 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2297 {
2298 	RF_ComponentLabel_t *clabel;
2299 
2300 	clabel = raidget_component_label(raidPtr, col);
2301 	clabel->clean = RF_RAID_CLEAN;
2302 	raidflush_component_label(raidPtr, col);
2303 	return(0);
2304 }
2305 
2306 
2307 int
2308 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2309 {
2310 	RF_ComponentLabel_t *clabel;
2311 
2312 	clabel = raidget_component_label(raidPtr, col);
2313 	clabel->clean = RF_RAID_DIRTY;
2314 	raidflush_component_label(raidPtr, col);
2315 	return(0);
2316 }
2317 
2318 int
2319 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2320 {
2321 	KASSERT(raidPtr->bytesPerSector);
2322 
2323 	return raidread_component_label(raidPtr->bytesPerSector,
2324 	    raidPtr->Disks[col].dev,
2325 	    raidPtr->raid_cinfo[col].ci_vp,
2326 	    &raidPtr->raid_cinfo[col].ci_label);
2327 }
2328 
2329 RF_ComponentLabel_t *
2330 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2331 {
2332 	return &raidPtr->raid_cinfo[col].ci_label;
2333 }
2334 
2335 int
2336 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2337 {
2338 	RF_ComponentLabel_t *label;
2339 
2340 	label = &raidPtr->raid_cinfo[col].ci_label;
2341 	label->mod_counter = raidPtr->mod_counter;
2342 #ifndef RF_NO_PARITY_MAP
2343 	label->parity_map_modcount = label->mod_counter;
2344 #endif
2345 	return raidwrite_component_label(raidPtr->bytesPerSector,
2346 	    raidPtr->Disks[col].dev,
2347 	    raidPtr->raid_cinfo[col].ci_vp, label);
2348 }
2349 
2350 /*
2351  * Swap the label endianness.
2352  *
2353  * Everything in the component label is 4-byte-swapped except the version,
2354  * which is kept in byte-swapped form at all times and indicates
2355  * to the writer that a swap is necessary.
2356  *
2357  * For reads it is expected that out_label == clabel, but writes expect
2358  * separate labels so that only the re-swapped copy is written out to disk,
2359  * leaving the swapped-except-version label in core.
2360  *
2361  * Only support swapping label version 2.
2362  */
2363 static void
2364 rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
2365 {
2366 	int	*in, *out, *in_last;
2367 
2368 	KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));
2369 
2370 	/* Don't swap the label, but do copy it. */
2371 	out_label->version = clabel->version;
2372 
2373 	in = &clabel->serial_number;
2374 	in_last = &clabel->future_use2[42];
2375 	out = &out_label->serial_number;
2376 
2377 	for (; in < in_last; in++, out++)
2378 		*out = bswap32(*in);
2379 }
2380 
2381 static int
2382 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2383     RF_ComponentLabel_t *clabel)
2384 {
2385 	int error;
2386 
2387 	error = raidread_component_area(dev, b_vp, clabel,
2388 	    sizeof(RF_ComponentLabel_t),
2389 	    rf_component_info_offset(),
2390 	    rf_component_info_size(secsize));
2391 
2392 	if (error == 0 &&
2393 	    clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2394 		rf_swap_label(clabel, clabel);
2395 	}
2396 
2397 	return error;
2398 }
2399 
2400 /* ARGSUSED */
2401 static int
2402 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2403     size_t msize, daddr_t offset, daddr_t dsize)
2404 {
2405 	struct buf *bp;
2406 	int error;
2407 
2408 	/* XXX should probably ensure that we don't try to do this if
2409 	   someone has changed rf_protected_sectors. */
2410 
2411 	if (b_vp == NULL) {
2412 		/* For whatever reason, this component is not valid.
2413 		   Don't try to read a component label from it. */
2414 		return(EINVAL);
2415 	}
2416 
2417 	/* get a block of the appropriate size... */
2418 	bp = geteblk((int)dsize);
2419 	bp->b_dev = dev;
2420 
2421 	/* get our ducks in a row for the read */
2422 	bp->b_blkno = offset / DEV_BSIZE;
2423 	bp->b_bcount = dsize;
2424 	bp->b_flags |= B_READ;
2425 	bp->b_resid = dsize;
2426 
2427 	bdev_strategy(bp);
2428 	error = biowait(bp);
2429 
2430 	if (!error) {
2431 		memcpy(data, bp->b_data, msize);
2432 	}
2433 
2434 	brelse(bp, 0);
2435 	return(error);
2436 }
2437 
2438 static int
2439 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2440     RF_ComponentLabel_t *clabel)
2441 {
2442 	RF_ComponentLabel_t *clabel_write = clabel;
2443 	RF_ComponentLabel_t lclabel;
2444 	int error;
2445 
2446 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2447 		clabel_write = &lclabel;
2448 		rf_swap_label(clabel, clabel_write);
2449 	}
2450 	error = raidwrite_component_area(dev, b_vp, clabel_write,
2451 	    sizeof(RF_ComponentLabel_t),
2452 	    rf_component_info_offset(),
2453 	    rf_component_info_size(secsize));
2454 
2455 	return error;
2456 }
2457 
2458 /* ARGSUSED */
2459 static int
2460 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2461     size_t msize, daddr_t offset, daddr_t dsize)
2462 {
2463 	struct buf *bp;
2464 	int error;
2465 
2466 	/* get a block of the appropriate size... */
2467 	bp = geteblk((int)dsize);
2468 	bp->b_dev = dev;
2469 
2470 	/* get our ducks in a row for the write */
2471 	bp->b_blkno = offset / DEV_BSIZE;
2472 	bp->b_bcount = dsize;
2473 	bp->b_flags |= B_WRITE;
2474 	bp->b_resid = dsize;
2475 
2476 	memset(bp->b_data, 0, dsize);
2477 	memcpy(bp->b_data, data, msize);
2478 
2479 	bdev_strategy(bp);
2480 	error = biowait(bp);
2481 	brelse(bp, 0);
2482 	if (error) {
2483 #if 1
2484 		printf("Failed to write RAID component info!\n");
2485 #endif
2486 	}
2487 
2488 	return(error);
2489 }
2490 
2491 void
2492 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2493 {
2494 	int c;
2495 
2496 	for (c = 0; c < raidPtr->numCol; c++) {
2497 		/* Skip dead disks. */
2498 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2499 			continue;
2500 		/* XXXjld: what if an error occurs here? */
2501 		raidwrite_component_area(raidPtr->Disks[c].dev,
2502 		    raidPtr->raid_cinfo[c].ci_vp, map,
2503 		    RF_PARITYMAP_NBYTE,
2504 		    rf_parity_map_offset(raidPtr),
2505 		    rf_parity_map_size(raidPtr));
2506 	}
2507 }
2508 
2509 void
2510 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2511 {
2512 	struct rf_paritymap_ondisk tmp;
2513 	int c, first;
2514 
2515 	first = 1;
2516 	for (c = 0; c < raidPtr->numCol; c++) {
2517 		/* Skip dead disks. */
2518 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2519 			continue;
2520 		raidread_component_area(raidPtr->Disks[c].dev,
2521 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
2522 		    RF_PARITYMAP_NBYTE,
2523 		    rf_parity_map_offset(raidPtr),
2524 		    rf_parity_map_size(raidPtr));
2525 		if (first) {
2526 			memcpy(map, &tmp, sizeof(*map));
2527 			first = 0;
2528 		} else {
2529 			rf_paritymap_merge(map, &tmp);
2530 		}
2531 	}
2532 }
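
/*
 * Note on the merge above: the per-component maps are combined with
 * rf_paritymap_merge().  The intent (an assumption here, not verified
 * in this file) is a conservative union: a region counts as dirty if
 * any surviving component's map marks it dirty.
 */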
2533 
2534 void
2535 rf_markalldirty(RF_Raid_t *raidPtr)
2536 {
2537 	RF_ComponentLabel_t *clabel;
2538 	int sparecol;
2539 	int c;
2540 	int j;
2541 	int scol = -1;
2542 
2543 	raidPtr->mod_counter++;
2544 	for (c = 0; c < raidPtr->numCol; c++) {
2545 		/* we don't want to touch (at all) a disk that has
2546 		   failed */
2547 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
2548 			clabel = raidget_component_label(raidPtr, c);
2549 			if (clabel->status == rf_ds_spared) {
2550 				/* XXX do something special...
2551 				   but whatever you do, don't
2552 				   try to access it!! */
2553 			} else {
2554 				raidmarkdirty(raidPtr, c);
2555 			}
2556 		}
2557 	}
2558 
2559 	for( c = 0; c < raidPtr->numSpare ; c++) {
2560 		sparecol = raidPtr->numCol + c;
2561 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2562 			/*
2563 
2564 			   we claim this disk is "optimal" if it's
2565 			   rf_ds_used_spare, as that means it should be
2566 			   directly substitutable for the disk it replaced.
2567 			   We note that too...
2568 
2569 			 */
2570 
2571 			for(j=0;j<raidPtr->numCol;j++) {
2572 				if (raidPtr->Disks[j].spareCol == sparecol) {
2573 					scol = j;
2574 					break;
2575 				}
2576 			}
2577 
2578 			clabel = raidget_component_label(raidPtr, sparecol);
2579 			/* make sure status is noted */
2580 
2581 			raid_init_component_label(raidPtr, clabel);
2582 
2583 			clabel->row = 0;
2584 			clabel->column = scol;
2585 			/* Note: we *don't* change status from rf_ds_used_spare
2586 			   to rf_ds_optimal */
2587 			/* clabel.status = rf_ds_optimal; */
2588 
2589 			raidmarkdirty(raidPtr, sparecol);
2590 		}
2591 	}
2592 }
2593 
2594 
2595 void
2596 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
2597 {
2598 	RF_ComponentLabel_t *clabel;
2599 	int sparecol;
2600 	int c;
2601 	int j;
2602 	int scol;
2603 	struct raid_softc *rs = raidPtr->softc;
2604 
2605 	scol = -1;
2606 
2607 	/* XXX should do extra checks to make sure things really are clean,
2608 	   rather than blindly setting the clean bit... */
2609 
2610 	raidPtr->mod_counter++;
2611 
2612 	for (c = 0; c < raidPtr->numCol; c++) {
2613 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
2614 			clabel = raidget_component_label(raidPtr, c);
2615 			/* make sure status is noted */
2616 			clabel->status = rf_ds_optimal;
2617 
2618 			/* note what unit we are configured as */
2619 			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2620 				clabel->last_unit = raidPtr->raidid;
2621 
2622 			raidflush_component_label(raidPtr, c);
2623 			if (final == RF_FINAL_COMPONENT_UPDATE) {
2624 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
2625 					raidmarkclean(raidPtr, c);
2626 				}
2627 			}
2628 		}
2629 		/* else we don't touch it.. */
2630 	}
2631 
2632 	for( c = 0; c < raidPtr->numSpare ; c++) {
2633 		sparecol = raidPtr->numCol + c;
2634 		/* Need to ensure that the reconstruct actually completed! */
2635 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2636 			/*
2637 
2638 			   we claim this disk is "optimal" if it's
2639 			   rf_ds_used_spare, as that means it should be
2640 			   directly substitutable for the disk it replaced.
2641 			   We note that too...
2642 
2643 			 */
2644 
2645 			for(j=0;j<raidPtr->numCol;j++) {
2646 				if (raidPtr->Disks[j].spareCol == sparecol) {
2647 					scol = j;
2648 					break;
2649 				}
2650 			}
2651 
2652 			/* XXX shouldn't *really* need this... */
2653 			clabel = raidget_component_label(raidPtr, sparecol);
2654 			/* make sure status is noted */
2655 
2656 			raid_init_component_label(raidPtr, clabel);
2657 
2658 			clabel->column = scol;
2659 			clabel->status = rf_ds_optimal;
2660 			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2661 				clabel->last_unit = raidPtr->raidid;
2662 
2663 			raidflush_component_label(raidPtr, sparecol);
2664 			if (final == RF_FINAL_COMPONENT_UPDATE) {
2665 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
2666 					raidmarkclean(raidPtr, sparecol);
2667 				}
2668 			}
2669 		}
2670 	}
2671 }
2672 
2673 void
2674 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2675 {
2676 
2677 	if (vp != NULL) {
2678 		if (auto_configured == 1) {
2679 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2680 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2681 			vput(vp);
2682 
2683 		} else {
2684 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2685 		}
2686 	}
2687 }
2688 
2689 
2690 void
2691 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2692 {
2693 	int r,c;
2694 	struct vnode *vp;
2695 	int acd;
2696 
2697 
2698 	/* We take this opportunity to close the vnodes like we should.. */
2699 
2700 	for (c = 0; c < raidPtr->numCol; c++) {
2701 		vp = raidPtr->raid_cinfo[c].ci_vp;
2702 		acd = raidPtr->Disks[c].auto_configured;
2703 		rf_close_component(raidPtr, vp, acd);
2704 		raidPtr->raid_cinfo[c].ci_vp = NULL;
2705 		raidPtr->Disks[c].auto_configured = 0;
2706 	}
2707 
2708 	for (r = 0; r < raidPtr->numSpare; r++) {
2709 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2710 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2711 		rf_close_component(raidPtr, vp, acd);
2712 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2713 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2714 	}
2715 }
2716 
2717 
2718 static void
2719 rf_ReconThread(struct rf_recon_req_internal *req)
2720 {
2721 	int     s;
2722 	RF_Raid_t *raidPtr;
2723 
2724 	s = splbio();
2725 	raidPtr = (RF_Raid_t *) req->raidPtr;
2726 	raidPtr->recon_in_progress = 1;
2727 
2728 	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2729 		raidPtr->forceRecon = 1;
2730 	}
2731 
2732 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2733 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2734 
2735 	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2736 		raidPtr->forceRecon = 0;
2737 	}
2738 
2739 	RF_Free(req, sizeof(*req));
2740 
2741 	raidPtr->recon_in_progress = 0;
2742 	splx(s);
2743 
2744 	/* That's all... */
2745 	kthread_exit(0);	/* does not return */
2746 }
2747 
2748 static void
2749 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2750 {
2751 	int retcode;
2752 	int s;
2753 
2754 	raidPtr->parity_rewrite_stripes_done = 0;
2755 	raidPtr->parity_rewrite_in_progress = 1;
2756 	s = splbio();
2757 	retcode = rf_RewriteParity(raidPtr);
2758 	splx(s);
2759 	if (retcode) {
2760 		printf("raid%d: Error re-writing parity (%d)!\n",
2761 		    raidPtr->raidid, retcode);
2762 	} else {
2763 		/* set the clean bit!  If we shutdown correctly,
2764 		   the clean bit on each component label will get
2765 		   set */
2766 		raidPtr->parity_good = RF_RAID_CLEAN;
2767 	}
2768 	raidPtr->parity_rewrite_in_progress = 0;
2769 
2770 	/* Anyone waiting for us to stop?  If so, inform them... */
2771 	if (raidPtr->waitShutdown) {
2772 		rf_lock_mutex2(raidPtr->rad_lock);
2773 		cv_broadcast(&raidPtr->parity_rewrite_cv);
2774 		rf_unlock_mutex2(raidPtr->rad_lock);
2775 	}
2776 
2777 	/* That's all... */
2778 	kthread_exit(0);	/* does not return */
2779 }
2780 
2781 
2782 static void
2783 rf_CopybackThread(RF_Raid_t *raidPtr)
2784 {
2785 	int s;
2786 
2787 	raidPtr->copyback_in_progress = 1;
2788 	s = splbio();
2789 	rf_CopybackReconstructedData(raidPtr);
2790 	splx(s);
2791 	raidPtr->copyback_in_progress = 0;
2792 
2793 	/* That's all... */
2794 	kthread_exit(0);	/* does not return */
2795 }
2796 
2797 
2798 static void
2799 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
2800 {
2801 	int s;
2802 	RF_Raid_t *raidPtr;
2803 
2804 	s = splbio();
2805 	raidPtr = req->raidPtr;
2806 	raidPtr->recon_in_progress = 1;
2807 
2808 	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2809 		raidPtr->forceRecon = 1;
2810 	}
2811 
2812 	rf_ReconstructInPlace(raidPtr, req->col);
2813 
2814 	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2815 		raidPtr->forceRecon = 0;
2816 	}
2817 
2818 	RF_Free(req, sizeof(*req));
2819 	raidPtr->recon_in_progress = 0;
2820 	splx(s);
2821 
2822 	/* That's all... */
2823 	kthread_exit(0);	/* does not return */
2824 }
2825 
2826 static RF_AutoConfig_t *
2827 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
2828     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
2829     unsigned secsize)
2830 {
2831 	int good_one = 0;
2832 	RF_ComponentLabel_t *clabel;
2833 	RF_AutoConfig_t *ac;
2834 
2835 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);
2836 
2837 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
2838 		/* Got the label.  Does it look reasonable? */
2839 		if (rf_reasonable_label(clabel, numsecs) &&
2840 		    (rf_component_label_partitionsize(clabel) <= size)) {
2841 #ifdef DEBUG
2842 			printf("Component on: %s: %llu\n",
2843 				cname, (unsigned long long)size);
2844 			rf_print_component_label(clabel);
2845 #endif
2846 			/* if it's reasonable, add it, else ignore it. */
2847 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
2848 				M_WAITOK);
2849 			strlcpy(ac->devname, cname, sizeof(ac->devname));
2850 			ac->dev = dev;
2851 			ac->vp = vp;
2852 			ac->clabel = clabel;
2853 			ac->next = ac_list;
2854 			ac_list = ac;
2855 			good_one = 1;
2856 		}
2857 	}
2858 	if (!good_one) {
2859 		/* cleanup */
2860 		free(clabel, M_RAIDFRAME);
2861 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2862 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2863 		vput(vp);
2864 	}
2865 	return ac_list;
2866 }
2867 
2868 static RF_AutoConfig_t *
2869 rf_find_raid_components(void)
2870 {
2871 	struct vnode *vp;
2872 	struct disklabel label;
2873 	device_t dv;
2874 	deviter_t di;
2875 	dev_t dev;
2876 	int bmajor, bminor, wedge, rf_part_found;
2877 	int error;
2878 	int i;
2879 	RF_AutoConfig_t *ac_list;
2880 	uint64_t numsecs;
2881 	unsigned secsize;
2882 	int dowedges;
2883 
2884 	/* initialize the AutoConfig list */
2885 	ac_list = NULL;
2886 
2887 	/*
2888 	 * we begin by trolling through *all* the devices on the system *twice*
2889 	 * first we scan for wedges, second for other devices. This avoids
2890 	 * using a raw partition instead of a wedge that covers the whole disk
2891 	 */
2892 
2893 	for (dowedges=1; dowedges>=0; --dowedges) {
2894 		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
2895 		     dv = deviter_next(&di)) {
2896 
2897 			/* we are only interested in disks */
2898 			if (device_class(dv) != DV_DISK)
2899 				continue;
2900 
2901 			/* we don't care about floppies */
2902 			if (device_is_a(dv, "fd")) {
2903 				continue;
2904 			}
2905 
2906 			/* we don't care about CDs. */
2907 			if (device_is_a(dv, "cd")) {
2908 				continue;
2909 			}
2910 
2911 			/* we don't care about md. */
2912 			if (device_is_a(dv, "md")) {
2913 				continue;
2914 			}
2915 
2916 			/* hdfd is the Atari/Hades floppy driver */
2917 			if (device_is_a(dv, "hdfd")) {
2918 				continue;
2919 			}
2920 
2921 			/* fdisa is the Atari/Milan floppy driver */
2922 			if (device_is_a(dv, "fdisa")) {
2923 				continue;
2924 			}
2925 
2926 			/* we don't care about spiflash */
2927 			if (device_is_a(dv, "spiflash")) {
2928 				continue;
2929 			}
2930 
2931 			/* are we in the wedges pass ? */
2932 			wedge = device_is_a(dv, "dk");
2933 			if (wedge != dowedges) {
2934 				continue;
2935 			}
2936 
2937 			/* need to find the device_name_to_block_device_major stuff */
2938 			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
2939 
2940 			rf_part_found = 0; /*No raid partition as yet*/
2941 
2942 			/* get a vnode for the raw partition of this disk */
2943 			bminor = minor(device_unit(dv));
2944 			dev = wedge ? makedev(bmajor, bminor) :
2945 			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
2946 			if (bdevvp(dev, &vp))
2947 				panic("RAID can't alloc vnode");
2948 
2949 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2950 			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
2951 
2952 			if (error) {
2953 				/* "Who cares."  Continue looking
2954 				   for something that exists */
2955 				vput(vp);
2956 				continue;
2957 			}
2958 
2959 			error = getdisksize(vp, &numsecs, &secsize);
2960 			if (error) {
2961 				/*
2962 				 * Pseudo devices like vnd and cgd can be
2963 				 * opened but may still need some configuration.
2964 				 * Ignore these quietly.
2965 				 */
2966 				if (error != ENXIO)
2967 					printf("RAIDframe: can't get disk size"
2968 					    " for dev %s (%d)\n",
2969 					    device_xname(dv), error);
2970 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2971 				vput(vp);
2972 				continue;
2973 			}
2974 			if (wedge) {
2975 				struct dkwedge_info dkw;
2976 				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
2977 				    NOCRED);
2978 				if (error) {
2979 					printf("RAIDframe: can't get wedge info for "
2980 					    "dev %s (%d)\n", device_xname(dv), error);
2981 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2982 					vput(vp);
2983 					continue;
2984 				}
2985 
2986 				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
2987 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2988 					vput(vp);
2989 					continue;
2990 				}
2991 
2992 				VOP_UNLOCK(vp);
2993 				ac_list = rf_get_component(ac_list, dev, vp,
2994 				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
2995 				rf_part_found = 1; /*There is a raid component on this disk*/
2996 				continue;
2997 			}
2998 
2999 			/* Ok, the disk exists.  Go get the disklabel. */
3000 			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
3001 			if (error) {
3002 				/*
3003 				 * XXX can't happen - open() would
3004 				 * have errored out (or faked up one)
3005 				 */
3006 				if (error != ENOTTY)
3007 					printf("RAIDframe: can't get label for dev "
3008 					    "%s (%d)\n", device_xname(dv), error);
3009 			}
3010 
3011 			/* don't need this any more.  We'll allocate it again
3012 			   a little later if we really do... */
3013 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3014 			vput(vp);
3015 
3016 			if (error)
3017 				continue;
3018 
3019 			rf_part_found = 0; /*No raid partitions yet*/
3020 			for (i = 0; i < label.d_npartitions; i++) {
3021 				char cname[sizeof(ac_list->devname)];
3022 
3023 				/* We only support partitions marked as RAID */
3024 				if (label.d_partitions[i].p_fstype != FS_RAID)
3025 					continue;
3026 
3027 				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
3028 				if (bdevvp(dev, &vp))
3029 					panic("RAID can't alloc vnode");
3030 
3031 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3032 				error = VOP_OPEN(vp, FREAD, NOCRED);
3033 				if (error) {
3034 					/* Not quite a 'whatever'.  In
3035 					 * this situation we know
3036 					 * there is a FS_RAID
3037 					 * partition, but we can't
3038 					 * open it.  The most likely
3039 					 * reason is that the
3040 					 * partition is already in
3041 					 * use by another RAID set.
3042 					 * So note that we've already
3043 					 * found a partition on this
3044 					 * disk so we don't attempt
3045 					 * to use the raw disk later. */
3046 					rf_part_found = 1;
3047 					vput(vp);
3048 					continue;
3049 				}
3050 				VOP_UNLOCK(vp);
3051 				snprintf(cname, sizeof(cname), "%s%c",
3052 				    device_xname(dv), 'a' + i);
3053 				ac_list = rf_get_component(ac_list, dev, vp, cname,
3054 					label.d_partitions[i].p_size, numsecs, secsize);
3055 				rf_part_found = 1; /*There is at least one raid partition on this disk*/
3056 			}
3057 
3058 			/*
3059 			 * If there is no raid component on this disk, either in a
3060 			 * disklabel or inside a wedge, check the raw partition as well,
3061 			 * as it is possible to configure raid components on raw disk
3062 			 * devices.
3063 			 */
3064 
3065 			if (!rf_part_found) {
3066 				char cname[sizeof(ac_list->devname)];
3067 
3068 				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
3069 				if (bdevvp(dev, &vp))
3070 					panic("RAID can't alloc vnode");
3071 
3072 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3073 
3074 				error = VOP_OPEN(vp, FREAD, NOCRED);
3075 				if (error) {
3076 					/* Whatever... */
3077 					vput(vp);
3078 					continue;
3079 				}
3080 				VOP_UNLOCK(vp);
3081 				snprintf(cname, sizeof(cname), "%s%c",
3082 				    device_xname(dv), 'a' + RAW_PART);
3083 				ac_list = rf_get_component(ac_list, dev, vp, cname,
3084 					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
3085 			}
3086 		}
3087 		deviter_release(&di);
3088 	}
3089 	return ac_list;
3090 }
3091 
3092 int
3093 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3094 {
3095 
3096 	if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
3097 	     clabel->version==RF_COMPONENT_LABEL_VERSION ||
3098 	     clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
3099 	    (clabel->clean == RF_RAID_CLEAN ||
3100 	     clabel->clean == RF_RAID_DIRTY) &&
3101 	    clabel->row >= 0 &&
3102 	    clabel->column >= 0 &&
3103 	    clabel->num_rows > 0 &&
3104 	    clabel->num_columns > 0 &&
3105 	    clabel->row < clabel->num_rows &&
3106 	    clabel->column < clabel->num_columns &&
3107 	    clabel->blockSize > 0 &&
3108 	    /*
3109 	     * numBlocksHi may contain garbage, but it is ok since
3110 	     * the type is unsigned.  If it is really garbage,
3111 	     * rf_fix_old_label_size() will fix it.
3112 	     */
3113 	    rf_component_label_numblocks(clabel) > 0) {
3114 		/*
3115 		 * label looks reasonable enough...
3116 		 * let's make sure it has no old garbage.
3117 		 */
3118 		if (numsecs)
3119 			rf_fix_old_label_size(clabel, numsecs);
3120 		return(1);
3121 	}
3122 	return(0);
3123 }
3124 
3125 
3126 /*
3127  * For reasons yet unknown, some old component labels have garbage in
3128  * the newer numBlocksHi region, and this causes lossage.  Since those
3129  * disks will also have numsecs set to less than 32 bits of sectors,
3130  * we can determine when this corruption has occurred, and fix it.
3131  *
3132  * The exact same problem, with the same unknown reason, happens to
3133  * the partitionSizeHi member as well.
3134  */
3135 static void
3136 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3137 {
3138 
3139 	if (numsecs < ((uint64_t)1 << 32)) {
3140 		if (clabel->numBlocksHi) {
3141 			printf("WARNING: total sectors < 32 bits, yet "
3142 			       "numBlocksHi set\n"
3143 			       "WARNING: resetting numBlocksHi to zero.\n");
3144 			clabel->numBlocksHi = 0;
3145 		}
3146 
3147 		if (clabel->partitionSizeHi) {
3148 			printf("WARNING: total sectors < 32 bits, yet "
3149 			       "partitionSizeHi set\n"
3150 			       "WARNING: resetting partitionSizeHi to zero.\n");
3151 			clabel->partitionSizeHi = 0;
3152 		}
3153 	}
3154 }
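
/*
 * Worked example: a component of 2^31 512-byte sectors (1 TiB) has
 * numsecs < 2^32, so numBlocksHi and partitionSizeHi must both be
 * zero; any nonzero value there is stale garbage and gets cleared.
 */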
3155 
3156 
3157 #ifdef DEBUG
3158 void
3159 rf_print_component_label(RF_ComponentLabel_t *clabel)
3160 {
3161 	uint64_t numBlocks;
3162 	static const char *rp[] = {
3163 	    "No", "Force", "Soft", "*invalid*"
3164 	};
3165 
3166 
3167 	numBlocks = rf_component_label_numblocks(clabel);
3168 
3169 	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
3170 	       clabel->row, clabel->column,
3171 	       clabel->num_rows, clabel->num_columns);
3172 	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
3173 	       clabel->version, clabel->serial_number,
3174 	       clabel->mod_counter);
3175 	printf("   Clean: %s Status: %d\n",
3176 	       clabel->clean ? "Yes" : "No", clabel->status);
3177 	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
3178 	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
3179 	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
3180 	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
3181 	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
3182 	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
3183 	printf("   Last configured as: raid%d\n", clabel->last_unit);
3184 #if 0
3185 	   printf("   Config order: %d\n", clabel->config_order);
3186 #endif
3187 
3188 }
3189 #endif
3190 
3191 static RF_ConfigSet_t *
3192 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3193 {
3194 	RF_AutoConfig_t *ac;
3195 	RF_ConfigSet_t *config_sets;
3196 	RF_ConfigSet_t *cset;
3197 	RF_AutoConfig_t *ac_next;
3198 
3199 
3200 	config_sets = NULL;
3201 
3202 	/* Go through the AutoConfig list, and figure out which components
3203 	   belong to what sets.  */
3204 	ac = ac_list;
3205 	while(ac!=NULL) {
3206 		/* we're going to putz with ac->next, so save it here
3207 		   for use at the end of the loop */
3208 		ac_next = ac->next;
3209 
3210 		if (config_sets == NULL) {
3211 			/* will need at least this one... */
3212 			config_sets = malloc(sizeof(RF_ConfigSet_t),
3213 				       M_RAIDFRAME, M_WAITOK);
3214 			/* this one is easy :) */
3215 			config_sets->ac = ac;
3216 			config_sets->next = NULL;
3217 			config_sets->rootable = 0;
3218 			ac->next = NULL;
3219 		} else {
3220 			/* which set does this component fit into? */
3221 			cset = config_sets;
3222 			while(cset!=NULL) {
3223 				if (rf_does_it_fit(cset, ac)) {
3224 					/* looks like it matches... */
3225 					ac->next = cset->ac;
3226 					cset->ac = ac;
3227 					break;
3228 				}
3229 				cset = cset->next;
3230 			}
3231 			if (cset==NULL) {
3232 				/* didn't find a match above... new set..*/
3233 				cset = malloc(sizeof(RF_ConfigSet_t),
3234 					       M_RAIDFRAME, M_WAITOK);
3235 				cset->ac = ac;
3236 				ac->next = NULL;
3237 				cset->next = config_sets;
3238 				cset->rootable = 0;
3239 				config_sets = cset;
3240 			}
3241 		}
3242 		ac = ac_next;
3243 	}
3244 
3245 
3246 	return(config_sets);
3247 }
3248 
3249 static int
3250 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3251 {
3252 	RF_ComponentLabel_t *clabel1, *clabel2;
3253 
3254 	/* If this one matches the *first* one in the set, that's good
3255 	   enough, since the other members of the set would have been
3256 	   through here too... */
3257 	/* note that we are not checking partitionSize here..
3258 
3259 	   Note that we are also not checking the mod_counters here.
3260 	   If everything else matches except the mod_counter, that's
3261 	   good enough for this test.  We will deal with the mod_counters
3262 	   a little later in the autoconfiguration process.
3263 
3264 	    (clabel1->mod_counter == clabel2->mod_counter) &&
3265 
3266 	   The reason we don't check for this is that failed disks
3267 	   will have lower modification counts.  If those disks are
3268 	   not added to the set they used to belong to, then they will
3269 	   form their own set, which may result in 2 different sets,
3270 	   for example, competing to be configured at raid0, and
3271 	   perhaps competing to be the root filesystem set.  If the
3272 	   wrong ones get configured, or both attempt to become /,
3273 	   weird behaviour and or serious lossage will occur.  Thus we
3274 	   need to bring them into the fold here, and kick them out at
3275 	   a later point.
3276 
3277 	*/
3278 
3279 	clabel1 = cset->ac->clabel;
3280 	clabel2 = ac->clabel;
3281 	if ((clabel1->version == clabel2->version) &&
3282 	    (clabel1->serial_number == clabel2->serial_number) &&
3283 	    (clabel1->num_rows == clabel2->num_rows) &&
3284 	    (clabel1->num_columns == clabel2->num_columns) &&
3285 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
3286 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3287 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3288 	    (clabel1->parityConfig == clabel2->parityConfig) &&
3289 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3290 	    (clabel1->blockSize == clabel2->blockSize) &&
3291 	    rf_component_label_numblocks(clabel1) ==
3292 	    rf_component_label_numblocks(clabel2) &&
3293 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
3294 	    (clabel1->root_partition == clabel2->root_partition) &&
3295 	    (clabel1->last_unit == clabel2->last_unit) &&
3296 	    (clabel1->config_order == clabel2->config_order)) {
3297 		/* if it gets here, it almost *has* to be a match */
3298 	} else {
3299 		/* it's not consistent with somebody in the set..
3300 		   punt */
3301 		return(0);
3302 	}
3303 	/* all was fine.. it must fit... */
3304 	return(1);
3305 }
3306 
3307 static int
3308 rf_have_enough_components(RF_ConfigSet_t *cset)
3309 {
3310 	RF_AutoConfig_t *ac;
3311 	RF_AutoConfig_t *auto_config;
3312 	RF_ComponentLabel_t *clabel;
3313 	int c;
3314 	int num_cols;
3315 	int num_missing;
3316 	int mod_counter;
3317 	int mod_counter_found;
3318 	int even_pair_failed;
3319 	char parity_type;
3320 
3321 
3322 	/* check to see that we have enough 'live' components
3323 	   of this set.  If so, we can configure it if necessary */
3324 
3325 	num_cols = cset->ac->clabel->num_columns;
3326 	parity_type = cset->ac->clabel->parityConfig;
3327 
3328 	/* XXX Check for duplicate components!?!?!? */
3329 
3330 	/* Determine what the mod_counter is supposed to be for this set. */
3331 
3332 	mod_counter_found = 0;
3333 	mod_counter = 0;
3334 	ac = cset->ac;
3335 	while(ac!=NULL) {
3336 		if (mod_counter_found==0) {
3337 			mod_counter = ac->clabel->mod_counter;
3338 			mod_counter_found = 1;
3339 		} else {
3340 			if (ac->clabel->mod_counter > mod_counter) {
3341 				mod_counter = ac->clabel->mod_counter;
3342 			}
3343 		}
3344 		ac = ac->next;
3345 	}
3346 
3347 	num_missing = 0;
3348 	auto_config = cset->ac;
3349 
3350 	even_pair_failed = 0;
3351 	for(c=0; c<num_cols; c++) {
3352 		ac = auto_config;
3353 		while(ac!=NULL) {
3354 			if ((ac->clabel->column == c) &&
3355 			    (ac->clabel->mod_counter == mod_counter)) {
3356 				/* it's this one... */
3357 #ifdef DEBUG
3358 				printf("Found: %s at %d\n",
3359 				       ac->devname,c);
3360 #endif
3361 				break;
3362 			}
3363 			ac=ac->next;
3364 		}
3365 		if (ac==NULL) {
3366 				/* Didn't find one here! */
3367 				/* special case for RAID 1, especially
3368 				   where there are more than 2
3369 				   components (where RAIDframe treats
3370 				   things a little differently :( ) */
3371 			if (parity_type == '1') {
3372 				if (c%2 == 0) { /* even component */
3373 					even_pair_failed = 1;
3374 				} else { /* odd component.  If
3375 					    we're failed, and
3376 					    so is the even
3377 					    component, it's
3378 					    "Good Night, Charlie" */
3379 					if (even_pair_failed == 1) {
3380 						return(0);
3381 					}
3382 				}
3383 			} else {
3384 				/* normal accounting */
3385 				num_missing++;
3386 			}
3387 		}
3388 		if ((parity_type == '1') && (c%2 == 1)) {
3389 				/* Just did the odd half of a mirror pair
3390 				   without bailing.. reset the even_pair_failed
3391 				   flag, and go on to the next component.... */
3392 			even_pair_failed = 0;
3393 		}
3394 	}
3395 
3396 	clabel = cset->ac->clabel;
3397 
3398 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
3399 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
3400 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
3401 		/* XXX this needs to be made *much* more general */
3402 		/* Too many failures */
3403 		return(0);
3404 	}
3405 	/* otherwise, all is well, and we've got enough to take a kick
3406 	   at autoconfiguring this set */
3407 	return(1);
3408 }
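
/*
 * Worked example of the RAID 1 pairing rule above: in a 4-component
 * set the mirror pairs are columns (0,1) and (2,3).  Losing columns
 * 0 and 2 leaves a survivor in each pair, so the set is still
 * configurable; losing both 0 and 1 kills the (0,1) pair and the set
 * is rejected.
 */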
3409 
3410 static void
3411 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3412 			RF_Raid_t *raidPtr)
3413 {
3414 	RF_ComponentLabel_t *clabel;
3415 	int i;
3416 
3417 	clabel = ac->clabel;
3418 
3419 	/* 1. Fill in the common stuff */
3420 	config->numCol = clabel->num_columns;
3421 	config->numSpare = 0; /* XXX should this be set here? */
3422 	config->sectPerSU = clabel->sectPerSU;
3423 	config->SUsPerPU = clabel->SUsPerPU;
3424 	config->SUsPerRU = clabel->SUsPerRU;
3425 	config->parityConfig = clabel->parityConfig;
3426 	/* XXX... */
3427 	strcpy(config->diskQueueType,"fifo");
3428 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3429 	config->layoutSpecificSize = 0; /* XXX ?? */
3430 
3431 	while(ac!=NULL) {
3432 		/* row/col values will be in range due to the checks
3433 		   in reasonable_label() */
3434 		strcpy(config->devnames[0][ac->clabel->column],
3435 		       ac->devname);
3436 		ac = ac->next;
3437 	}
3438 
3439 	for(i=0;i<RF_MAXDBGV;i++) {
3440 		config->debugVars[i][0] = 0;
3441 	}
3442 }
3443 
3444 static int
3445 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3446 {
3447 	RF_ComponentLabel_t *clabel;
3448 	int column;
3449 	int sparecol;
3450 
3451 	raidPtr->autoconfigure = new_value;
3452 
3453 	for(column=0; column<raidPtr->numCol; column++) {
3454 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3455 			clabel = raidget_component_label(raidPtr, column);
3456 			clabel->autoconfigure = new_value;
3457 			raidflush_component_label(raidPtr, column);
3458 		}
3459 	}
3460 	for(column = 0; column < raidPtr->numSpare ; column++) {
3461 		sparecol = raidPtr->numCol + column;
3462 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3463 			clabel = raidget_component_label(raidPtr, sparecol);
3464 			clabel->autoconfigure = new_value;
3465 			raidflush_component_label(raidPtr, sparecol);
3466 		}
3467 	}
3468 	return(new_value);
3469 }
3470 
3471 static int
3472 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3473 {
3474 	RF_ComponentLabel_t *clabel;
3475 	int column;
3476 	int sparecol;
3477 
3478 	raidPtr->root_partition = new_value;
3479 	for(column=0; column<raidPtr->numCol; column++) {
3480 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3481 			clabel = raidget_component_label(raidPtr, column);
3482 			clabel->root_partition = new_value;
3483 			raidflush_component_label(raidPtr, column);
3484 		}
3485 	}
3486 	for(column = 0; column < raidPtr->numSpare ; column++) {
3487 		sparecol = raidPtr->numCol + column;
3488 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3489 			clabel = raidget_component_label(raidPtr, sparecol);
3490 			clabel->root_partition = new_value;
3491 			raidflush_component_label(raidPtr, sparecol);
3492 		}
3493 	}
3494 	return(new_value);
3495 }
3496 
3497 static void
3498 rf_release_all_vps(RF_ConfigSet_t *cset)
3499 {
3500 	RF_AutoConfig_t *ac;
3501 
3502 	ac = cset->ac;
3503 	while(ac!=NULL) {
3504 		/* Close the vp, and give it back */
3505 		if (ac->vp) {
3506 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3507 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3508 			vput(ac->vp);
3509 			ac->vp = NULL;
3510 		}
3511 		ac = ac->next;
3512 	}
3513 }
3514 
3515 
3516 static void
3517 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3518 {
3519 	RF_AutoConfig_t *ac;
3520 	RF_AutoConfig_t *next_ac;
3521 
3522 	ac = cset->ac;
3523 	while(ac!=NULL) {
3524 		next_ac = ac->next;
3525 		/* nuke the label */
3526 		free(ac->clabel, M_RAIDFRAME);
3527 		/* cleanup the config structure */
3528 		free(ac, M_RAIDFRAME);
3529 		/* "next.." */
3530 		ac = next_ac;
3531 	}
3532 	/* and, finally, nuke the config set */
3533 	free(cset, M_RAIDFRAME);
3534 }
3535 
3536 
3537 void
3538 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3539 {
3540 	/* avoid over-writing byteswapped version. */
3541 	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
3542 		clabel->version = RF_COMPONENT_LABEL_VERSION;
3543 	clabel->serial_number = raidPtr->serial_number;
3544 	clabel->mod_counter = raidPtr->mod_counter;
3545 
3546 	clabel->num_rows = 1;
3547 	clabel->num_columns = raidPtr->numCol;
3548 	clabel->clean = RF_RAID_DIRTY; /* not clean */
3549 	clabel->status = rf_ds_optimal; /* "It's good!" */
3550 
3551 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3552 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3553 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3554 
3555 	clabel->blockSize = raidPtr->bytesPerSector;
3556 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
3557 
3558 	/* XXX not portable */
3559 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3560 	clabel->maxOutstanding = raidPtr->maxOutstanding;
3561 	clabel->autoconfigure = raidPtr->autoconfigure;
3562 	clabel->root_partition = raidPtr->root_partition;
3563 	clabel->last_unit = raidPtr->raidid;
3564 	clabel->config_order = raidPtr->config_order;
3565 
3566 #ifndef RF_NO_PARITY_MAP
3567 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
3568 #endif
3569 }
3570 
3571 static struct raid_softc *
3572 rf_auto_config_set(RF_ConfigSet_t *cset)
3573 {
3574 	RF_Raid_t *raidPtr;
3575 	RF_Config_t *config;
3576 	int raidID;
3577 	struct raid_softc *sc;
3578 
3579 #ifdef DEBUG
3580 	printf("RAID autoconfigure\n");
3581 #endif
3582 
3583 	/* 1. Create a config structure */
3584 	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);
3585 
3586 	/*
3587 	   2. Figure out what RAID ID this one is supposed to live at
3588 	   See if we can get the same RAID dev that it was configured
3589 	   on last time..
3590 	*/
3591 
3592 	raidID = cset->ac->clabel->last_unit;
3593 	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
3594 	     sc = raidget(++raidID, false))
3595 		continue;
3596 #ifdef DEBUG
3597 	printf("Configuring raid%d:\n",raidID);
3598 #endif
3599 
3600 	if (sc == NULL)
3601 		sc = raidget(raidID, true);
3602 	raidPtr = &sc->sc_r;
3603 
3604 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
3605 	raidPtr->softc = sc;
3606 	raidPtr->raidid = raidID;
3607 	raidPtr->openings = RAIDOUTSTANDING;
3608 
3609 	/* 3. Build the configuration structure */
3610 	rf_create_configuration(cset->ac, config, raidPtr);
3611 
3612 	/* 4. Do the configuration */
3613 	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
3614 		raidinit(sc);
3615 
3616 		rf_markalldirty(raidPtr);
3617 		raidPtr->autoconfigure = 1; /* XXX do this here? */
3618 		switch (cset->ac->clabel->root_partition) {
3619 		case 1:	/* Force Root */
3620 		case 2:	/* Soft Root: root when boot partition part of raid */
3621 			/*
3622 			 * everything configured just fine.  Make a note
3623 			 * that this set is eligible to be root,
3624 			 * or forced to be root
3625 			 */
3626 			cset->rootable = cset->ac->clabel->root_partition;
3627 			/* XXX do this here? */
3628 			raidPtr->root_partition = cset->rootable;
3629 			break;
3630 		default:
3631 			break;
3632 		}
3633 	} else {
3634 		raidput(sc);
3635 		sc = NULL;
3636 	}
3637 
3638 	/* 5. Cleanup */
3639 	free(config, M_RAIDFRAME);
3640 	return sc;
3641 }
3642 
3643 void
3644 rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
3645 	     size_t xmin, size_t xmax)
3646 {
3647 
3648 	/* Format: raid%d_foo */
3649 	snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);
3650 
3651 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
3652 	pool_sethiwat(p, xmax);
3653 	pool_prime(p, xmin);
3654 }
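
/*
 * Illustrative call (a sketch only; "wchan" and "example_pool" are
 * hypothetical names, not part of this driver): create a pool named
 * "raid<N>_dqd" for disk-queue data, prime it with 10 items and set
 * the high-water mark to 100.
 */
#if 0
	char wchan[RF_MAX_POOLNAMELEN];
	struct pool example_pool;

	rf_pool_init(raidPtr, wchan, &example_pool,
	    sizeof(RF_DiskQueueData_t), "dqd", 10, 100);
#endif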
3655 
3656 
3657 /*
3658  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3659  * to see if there is IO pending and if that IO could possibly be done
3660  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
3661  * otherwise.
3662  *
3663  */
3664 int
3665 rf_buf_queue_check(RF_Raid_t *raidPtr)
3666 {
3667 	struct raid_softc *rs;
3668 	struct dk_softc *dksc;
3669 
3670 	rs = raidPtr->softc;
3671 	dksc = &rs->sc_dksc;
3672 
3673 	if ((rs->sc_flags & RAIDF_INITED) == 0)
3674 		return 1;
3675 
3676 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3677 		/* there is work to do */
3678 		return 0;
3679 	}
3680 	/* default is nothing to do */
3681 	return 1;
3682 }
3683 
3684 int
3685 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3686 {
3687 	uint64_t numsecs;
3688 	unsigned secsize;
3689 	int error;
3690 
3691 	error = getdisksize(vp, &numsecs, &secsize);
3692 	if (error == 0) {
3693 		diskPtr->blockSize = secsize;
3694 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
3695 		diskPtr->partitionSize = numsecs;
3696 		return 0;
3697 	}
3698 	return error;
3699 }
3700 
3701 static int
3702 raid_match(device_t self, cfdata_t cfdata, void *aux)
3703 {
3704 	return 1;
3705 }
3706 
3707 static void
3708 raid_attach(device_t parent, device_t self, void *aux)
3709 {
3710 }
3711 
3712 
3713 static int
3714 raid_detach(device_t self, int flags)
3715 {
3716 	int error;
3717 	struct raid_softc *rs = raidsoftc(self);
3718 
3719 	if (rs == NULL)
3720 		return ENXIO;
3721 
3722 	if ((error = raidlock(rs)) != 0)
3723 		return error;
3724 
3725 	error = raid_detach_unlocked(rs);
3726 
3727 	raidunlock(rs);
3728 
3729 	/* XXX raid can be referenced here */
3730 
3731 	if (error)
3732 		return error;
3733 
3734 	/* Free the softc */
3735 	raidput(rs);
3736 
3737 	return 0;
3738 }
3739 
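/*
 * rf_set_geometry() -- synthesize a disk geometry for the dk(4) layer.
 * A RAID set has no physical geometry, so the sectors-per-track and
 * track counts are fabricated from the stripe layout and column count.
 */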
3740 static void
3741 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
3742 {
3743 	struct dk_softc *dksc = &rs->sc_dksc;
3744 	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
3745 
3746 	memset(dg, 0, sizeof(*dg));
3747 
3748 	dg->dg_secperunit = raidPtr->totalSectors;
3749 	dg->dg_secsize = raidPtr->bytesPerSector;
3750 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
3751 	dg->dg_ntracks = 4 * raidPtr->numCol;
3752 
3753 	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
3754 }
3755 
3756 /*
3757  * Get cache info for all the components (including spares).
3758  * Returns the intersection of the cache flags of all the disks, or
3759  * the first error encountered.
3760  * XXXfua feature flags can change as spares are added - lock down somehow
3761  */
3762 static int
3763 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3764 {
3765 	int c;
3766 	int error;
3767 	int dkwhole = 0, dkpart;
3768 
3769 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3770 		/*
3771 		 * Check any non-dead disk, even when currently being
3772 		 * reconstructed.
3773 		 */
3774 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
3775 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
3776 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3777 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
3778 			if (error) {
3779 				if (error != ENODEV) {
3780 					printf("raid%d: get cache for component %s failed\n",
3781 					    raidPtr->raidid,
3782 					    raidPtr->Disks[c].devname);
3783 				}
3784 
3785 				return error;
3786 			}
3787 
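			/*
			 * Component 0 seeds the result; subsequent
			 * components are folded in with DKCACHE_COMBINE()
			 * so only flags common to all survive.
			 */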
3788 			if (c == 0)
3789 				dkwhole = dkpart;
3790 			else
3791 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3792 		}
3793 	}
3794 
3795 	*data = dkwhole;
3796 
3797 	return 0;
3798 }
3799 
3800 /*
3801  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3802  * We end up returning whatever error was returned by the first cache flush
3803  * that fails.
3804  */
3805 
3806 static int
3807 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
3808 {
3809 	int e = 0;
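	/*
	 * Retry the flush a few times before giving up; bail out
	 * immediately on success or if the component does not
	 * implement DIOCCACHESYNC (ENODEV).
	 */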
3810 	for (int i = 0; i < 5; i++) {
3811 		e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3812 		    &force, FWRITE, NOCRED);
3813 		if (!e || e == ENODEV)
3814 			return e;
3815 		printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
3816 		    raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
3817 	}
3818 	return e;
3819 }
3820 
3821 int
3822 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
3823 {
3824 	int c, error;
3825 
3826 	error = 0;
3827 	for (c = 0; c < raidPtr->numCol; c++) {
3828 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
3829 			int e = rf_sync_component_cache(raidPtr, c, force);
3830 			if (e && !error)
3831 				error = e;
3832 		}
3833 	}
3834 
3835 	for (c = 0; c < raidPtr->numSpare; c++) {
3836 		int sparecol = raidPtr->numCol + c;
3837 		/* Need to ensure that the reconstruct actually completed! */
3838 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3839 			int e = rf_sync_component_cache(raidPtr, sparecol,
3840 			    force);
3841 			if (e && !error)
3842 				error = e;
3843 		}
3844 	}
3845 	return error;
3846 }
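
/*
 * For illustration only: the DIOCCACHESYNC handler in the ioctl path
 * is expected to forward the request roughly like this (hypothetical
 * surrounding code):
 *
 *	int force = *(int *)data;
 *
 *	return rf_sync_component_caches(raidPtr, force);
 */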
3847 
3848 /* Fill in info with the current status */
3849 void
3850 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3851 {
3852 
3853 	memset(info, 0, sizeof(*info));
3854 
3855 	if (raidPtr->status != rf_rs_reconstructing) {
3856 		info->total = 100;
3857 		info->completed = 100;
3858 	} else {
3859 		info->total = raidPtr->reconControl->numRUsTotal;
3860 		info->completed = raidPtr->reconControl->numRUsComplete;
3861 	}
3862 	info->remaining = info->total - info->completed;
3863 }
3864 
3865 /* Fill in info with the current status */
3866 void
3867 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3868 {
3869 
3870 	memset(info, 0, sizeof(*info));
3871 
3872 	if (raidPtr->parity_rewrite_in_progress == 1) {
3873 		info->total = raidPtr->Layout.numStripe;
3874 		info->completed = raidPtr->parity_rewrite_stripes_done;
3875 	} else {
3876 		info->completed = 100;
3877 		info->total = 100;
3878 	}
3879 	info->remaining = info->total - info->completed;
3880 }
3881 
3882 /* Fill in info with the current status */
3883 void
3884 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3885 {
3886 
3887 	memset(info, 0, sizeof(*info));
3888 
3889 	if (raidPtr->copyback_in_progress == 1) {
3890 		info->total = raidPtr->Layout.numStripe;
3891 		info->completed = raidPtr->copyback_stripes_done;
3892 		info->remaining = info->total - info->completed;
3893 	} else {
3894 		info->remaining = 0;
3895 		info->completed = 100;
3896 		info->total = 100;
3897 	}
3898 }
3899 
3900 /* Fill in config with the current info */
3901 int
3902 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
3903 {
3904 	int	d, i, j;
3905 
3906 	if (!raidPtr->valid)
3907 		return ENODEV;
3908 	config->cols = raidPtr->numCol;
3909 	config->ndevs = raidPtr->numCol;
3910 	if (config->ndevs >= RF_MAX_DISKS)
3911 		return ENOMEM;
3912 	config->nspares = raidPtr->numSpare;
3913 	if (config->nspares >= RF_MAX_DISKS)
3914 		return ENOMEM;
3915 	config->maxqdepth = raidPtr->maxQueueDepth;
3916 	d = 0;
3917 	for (j = 0; j < config->cols; j++) {
3918 		config->devs[d] = raidPtr->Disks[j];
3919 		d++;
3920 	}
3921 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
3922 		config->spares[i] = raidPtr->Disks[j];
3923 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
3924 			/* XXX: raidctl(8) expects to see this as a used spare */
3925 			config->spares[i].status = rf_ds_used_spare;
3926 		}
3927 	}
3928 	return 0;
3929 }
3930 
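/*
 * rf_get_component_label() -- copy the in-core component label for the
 * requested column out to the caller.  If the label on disk was written
 * with the opposite byte order, the version field is presented in
 * native order so userland need not care.
 */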
3931 int
3932 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
3933 {
3934 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
3935 	RF_ComponentLabel_t *raid_clabel;
3936 	int column = clabel->column;
3937 
3938 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
3939 		return EINVAL;
3940 	raid_clabel = raidget_component_label(raidPtr, column);
3941 	memcpy(clabel, raid_clabel, sizeof *clabel);
3942 	/* Fix-up for userland. */
3943 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
3944 		clabel->version = RF_COMPONENT_LABEL_VERSION;
3945 
3946 	return 0;
3947 }
3948 
3949 /*
3950  * Module interface
3951  */
3952 
3953 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
3954 
3955 #ifdef _MODULE
3956 CFDRIVER_DECL(raid, DV_DISK, NULL);
3957 #endif
3958 
3959 static int raid_modcmd(modcmd_t, void *);
3960 static int raid_modcmd_init(void);
3961 static int raid_modcmd_fini(void);
3962 
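/*
 * raid_modcmd() -- dispatch module commands to the init/fini helpers;
 * anything else is rejected with ENOTTY.
 */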
3963 static int
3964 raid_modcmd(modcmd_t cmd, void *data)
3965 {
3966 	int error;
3967 
3968 	error = 0;
3969 	switch (cmd) {
3970 	case MODULE_CMD_INIT:
3971 		error = raid_modcmd_init();
3972 		break;
3973 	case MODULE_CMD_FINI:
3974 		error = raid_modcmd_fini();
3975 		break;
3976 	default:
3977 		error = ENOTTY;
3978 		break;
3979 	}
3980 	return error;
3981 }
3982 
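/*
 * raid_modcmd_init() -- module initialization: attach the block and
 * character devsw entries, hook the driver into autoconfiguration,
 * boot the RAIDframe core, and register a finalizer that will
 * auto-configure RAID sets once all real hardware has been found.
 */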
3983 static int
3984 raid_modcmd_init(void)
3985 {
3986 	int error;
3987 	int bmajor, cmajor;
3988 
3989 	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
3990 	mutex_enter(&raid_lock);
3991 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
3992 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
3993 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
3994 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
3995 
3996 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
3997 #endif
3998 
3999 	bmajor = cmajor = -1;
4000 	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
4001 	    &raid_cdevsw, &cmajor);
4002 	if (error != 0 && error != EEXIST) {
4003 		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
4004 		mutex_exit(&raid_lock);
4005 		return error;
4006 	}
4007 #ifdef _MODULE
4008 	error = config_cfdriver_attach(&raid_cd);
4009 	if (error != 0) {
4010 		aprint_error("%s: config_cfdriver_attach failed %d\n",
4011 		    __func__, error);
4012 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
4013 		mutex_exit(&raid_lock);
4014 		return error;
4015 	}
4016 #endif
4017 	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
4018 	if (error != 0) {
4019 		aprint_error("%s: config_cfattach_attach failed %d\n",
4020 		    __func__, error);
4021 #ifdef _MODULE
4022 		config_cfdriver_detach(&raid_cd);
4023 #endif
4024 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
4025 		mutex_exit(&raid_lock);
4026 		return error;
4027 	}
4028 
4029 	raidautoconfigdone = false;
4030 
4031 	mutex_exit(&raid_lock);
4032 
4033 	if (error == 0) {
4034 		if (rf_BootRaidframe(true) == 0)
4035 			aprint_verbose("Kernelized RAIDframe activated\n");
4036 		else
4037 			panic("Serious error activating RAID!!");
4038 	}
4039 
4040 	/*
4041 	 * Register a finalizer which will be used to auto-config RAID
4042 	 * sets once all real hardware devices have been found.
4043 	 */
4044 	error = config_finalize_register(NULL, rf_autoconfig);
4045 	if (error != 0) {
4046 		aprint_error("WARNING: unable to register RAIDframe "
4047 		    "finalizer\n");
4048 		error = 0;
4049 	}
4050 
4051 	return error;
4052 }
4053 
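/*
 * raid_modcmd_fini() -- module unload: refuse while any raid device
 * still exists, then undo everything raid_modcmd_init() set up.
 */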
4054 static int
4055 raid_modcmd_fini(void)
4056 {
4057 	int error;
4058 
4059 	mutex_enter(&raid_lock);
4060 
4061 	/* Don't allow unload if raid device(s) exist.  */
4062 	if (!LIST_EMPTY(&raids)) {
4063 		mutex_exit(&raid_lock);
4064 		return EBUSY;
4065 	}
4066 
4067 	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
4068 	if (error != 0) {
4069 		aprint_error("%s: cannot detach cfattach\n", __func__);
4070 		mutex_exit(&raid_lock);
4071 		return error;
4072 	}
4073 #ifdef _MODULE
4074 	error = config_cfdriver_detach(&raid_cd);
4075 	if (error != 0) {
4076 		aprint_error("%s: cannot detach cfdriver\n", __func__);
4077 		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
4078 		mutex_exit(&raid_lock);
4079 		return error;
4080 	}
4081 #endif
4082 	devsw_detach(&raid_bdevsw, &raid_cdevsw);
4083 	rf_BootRaidframe(false);
4084 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
4085 	rf_destroy_mutex2(rf_sparet_wait_mutex);
4086 	rf_destroy_cond2(rf_sparet_wait_cv);
4087 	rf_destroy_cond2(rf_sparet_resp_cv);
4088 #endif
4089 	mutex_exit(&raid_lock);
4090 	mutex_destroy(&raid_lock);
4091 
4092 	return error;
4093 }
4094