xref: /netbsd-src/sys/dev/raidframe/rf_netbsdkintf.c (revision 7d62b00eb9ad855ffcd7da46b41e23feb5476fac)
1 /*	$NetBSD: rf_netbsdkintf.c,v 1.410 2022/08/28 00:37:41 oster Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Greg Oster; Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1988 University of Utah.
34  * Copyright (c) 1990, 1993
35  *      The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed to Berkeley by
38  * the Systems Programming Group of the University of Utah Computer
39  * Science Department.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
66  *
67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
68  */
69 
70 /*
71  * Copyright (c) 1995 Carnegie-Mellon University.
72  * All rights reserved.
73  *
74  * Authors: Mark Holland, Jim Zelenka
75  *
76  * Permission to use, copy, modify and distribute this software and
77  * its documentation is hereby granted, provided that both the copyright
78  * notice and this permission notice appear in all copies of the
79  * software, derivative works or modified versions, and any portions
80  * thereof, and that both notices appear in supporting documentation.
81  *
82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85  *
86  * Carnegie Mellon requests users of this software to return to
87  *
88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
89  *  School of Computer Science
90  *  Carnegie Mellon University
91  *  Pittsburgh PA 15213-3890
92  *
93  * any improvements or extensions that they make and grant Carnegie the
94  * rights to redistribute these changes.
95  */
96 
/***********************************************************
 *
 * rf_netbsdkintf.c -- the kernel interface routines for RAIDframe
 *
 ***********************************************************/
102 
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.410 2022/08/28 00:37:41 oster Exp $");
105 
106 #ifdef _KERNEL_OPT
107 #include "opt_raid_autoconfig.h"
108 #include "opt_compat_netbsd32.h"
109 #endif
110 
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130 #include <sys/compat_stub.h>
131 
132 #include <prop/proplib.h>
133 
134 #include <dev/raidframe/raidframevar.h>
135 #include <dev/raidframe/raidframeio.h>
136 #include <dev/raidframe/rf_paritymap.h>
137 
138 #include "rf_raid.h"
139 #include "rf_copyback.h"
140 #include "rf_dag.h"
141 #include "rf_dagflags.h"
142 #include "rf_desc.h"
143 #include "rf_diskqueue.h"
144 #include "rf_etimer.h"
145 #include "rf_general.h"
146 #include "rf_kintf.h"
147 #include "rf_options.h"
148 #include "rf_driver.h"
149 #include "rf_parityscan.h"
150 #include "rf_threadstuff.h"
151 
152 #include "ioconf.h"
153 
154 #ifdef DEBUG
155 int     rf_kdebug_level = 0;
156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
157 #else				/* DEBUG */
158 #define db1_printf(a) { }
159 #endif				/* DEBUG */
160 
161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
162 static rf_declare_mutex2(rf_sparet_wait_mutex);
163 static rf_declare_cond2(rf_sparet_wait_cv);
164 static rf_declare_cond2(rf_sparet_resp_cv);
165 
166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
167 						 * spare table */
168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
169 						 * installation process */
170 #endif
171 
172 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
173 
174 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
175 
/* prototypes */
/* Low-level component I/O helpers. */
static void KernelWakeupFunc(struct buf *);
static void InitBP(struct buf *, struct vnode *, unsigned,
    dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    void *, int);
static void raidinit(struct raid_softc *);
static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);

/* autoconf(9) glue. */
static int raid_match(device_t, cfdata_t, void *);
static void raid_attach(device_t, device_t, void *);
static int raid_detach(device_t, int);

/* Component label / reserved-area readers and writers. */
static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t);
static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t, int);

static int raidwrite_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);
static int raidread_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);

/* dkdriver callbacks (see rf_dkdriver below). */
static int raid_diskstart(device_t, struct buf *bp);
static int raid_dumpblocks(device_t, void *, daddr_t, int);
static int raid_lastclose(device_t);

/* devsw entry points. */
static dev_type_open(raidopen);
static dev_type_close(raidclose);
static dev_type_read(raidread);
static dev_type_write(raidwrite);
static dev_type_ioctl(raidioctl);
static dev_type_strategy(raidstrategy);
static dev_type_dump(raiddump);
static dev_type_size(raidsize);
211 
/* Block device switch entries for /dev/raidN[a-...]. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/* Character (raw) device switch entries for /dev/rraidN[a-...]. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/* Callbacks handed to the common disk(9)/dk framework. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
247 
/* Extract the raid unit number / per-device softc from a dev_t/device_t. */
#define	raidunit(x)	DISKUNIT(x)
#define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)

extern struct cfdriver raid_cd;
CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    DVF_DETACH_SHUTDOWN);

/* Internal representation of a rf_recon_req */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* column to fail/reconstruct */
	RF_ReconReqFlags_t flags;
	void   *raidPtr;		/* RF_Raid_t * the request applies to */
};
262 
/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even it if is used immediately, that's still 128K, which when multiplied
 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING   6
#endif

/* Raw-partition dev_t used for reading/writing disklabels. */
#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))

/* declared here, and made public, for the benefit of KVM stuff.. */

/* Per-set configuration lock (see raidlock()/raidunlock()). */
static int raidlock(struct raid_softc *);
static void raidunlock(struct raid_softc *);

static int raid_detach_unlocked(struct raid_softc *);

static void rf_markalldirty(RF_Raid_t *);
static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);

/* Kernel threads spawned for long-running maintenance operations. */
static void rf_ReconThread(struct rf_recon_req_internal *);
static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
static void rf_CopybackThread(RF_Raid_t *raidPtr);
static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
static int rf_autoconfig(device_t);
static int rf_rescan(void);
static void rf_buildroothack(RF_ConfigSet_t *);

/* Autoconfiguration helpers: component discovery and set assembly. */
static RF_AutoConfig_t *rf_find_raid_components(void);
static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
static int rf_set_autoconfig(RF_Raid_t *, int);
static int rf_set_rootpartition(RF_Raid_t *, int);
static void rf_release_all_vps(RF_ConfigSet_t *);
static void rf_cleanup_config_set(RF_ConfigSet_t *);
static int rf_have_enough_components(RF_ConfigSet_t *);
static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);

/*
 * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
 * Note that this is overridden by having RAID_AUTOCONFIG as an option
 * in the kernel config file.
 */
#ifdef RAID_AUTOCONFIG
int raidautoconfig = 1;
#else
int raidautoconfig = 0;
#endif
/* Set once rf_autoconfig() has run; autoconfig is a one-shot at boot. */
static bool raidautoconfigdone = false;

struct pool rf_alloclist_pool;   /* AllocList */

/* All known raid units; list and membership protected by raid_lock. */
static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
static kmutex_t raid_lock;
333 
334 static struct raid_softc *
335 raidcreate(int unit) {
336 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
337 	sc->sc_unit = unit;
338 	cv_init(&sc->sc_cv, "raidunit");
339 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
340 	return sc;
341 }
342 
343 static void
344 raiddestroy(struct raid_softc *sc) {
345 	cv_destroy(&sc->sc_cv);
346 	mutex_destroy(&sc->sc_mutex);
347 	kmem_free(sc, sizeof(*sc));
348 }
349 
350 static struct raid_softc *
351 raidget(int unit, bool create) {
352 	struct raid_softc *sc;
353 	if (unit < 0) {
354 #ifdef DIAGNOSTIC
355 		panic("%s: unit %d!", __func__, unit);
356 #endif
357 		return NULL;
358 	}
359 	mutex_enter(&raid_lock);
360 	LIST_FOREACH(sc, &raids, sc_link) {
361 		if (sc->sc_unit == unit) {
362 			mutex_exit(&raid_lock);
363 			return sc;
364 		}
365 	}
366 	mutex_exit(&raid_lock);
367 	if (!create)
368 		return NULL;
369 	sc = raidcreate(unit);
370 	mutex_enter(&raid_lock);
371 	LIST_INSERT_HEAD(&raids, sc, sc_link);
372 	mutex_exit(&raid_lock);
373 	return sc;
374 }
375 
/*
 * Unlink sc from the global unit list (under raid_lock) and free it.
 * Caller must guarantee no other references remain.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
383 
/*
 * Legacy pseudo-device attach hook; intentionally a no-op.
 * Device attachment and associated initialization now occurs as part
 * of the module initialization, so there is nothing to do here.
 */
void
raidattach(int num)
{
}
393 
/*
 * One-shot boot-time autoconfiguration of RAID sets.
 *
 * Scans all disks for RAIDframe component labels, groups them into
 * candidate sets, and configures the valid ones (via rf_buildroothack(),
 * which also handles root-device selection).  Returns 1 if a scan was
 * performed, 0 if autoconfig is disabled or already done.
 */
static int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return 0;

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
431 
432 int
433 rf_inited(const struct raid_softc *rs) {
434 	return (rs->sc_flags & RAIDF_INITED) != 0;
435 }
436 
/* Accessor: the RF_Raid_t embedded in the softc. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
441 
/* Accessor: the raid unit number of this softc. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
446 
/*
 * Does raid set "r" contain the boot device "bdv" as one of its
 * components?  Returns 1 if so, 0 otherwise (including bdv == NULL).
 *
 * Component names are compared after stripping the "/dev/" prefix;
 * wedge components ("dkN") are resolved to their parent disk first.
 * NOTE(review): the match is strncmp() over strlen(bootname), i.e. a
 * prefix match — "sd1" would also match a component on "sd10"; presumed
 * acceptable here, but worth confirming.
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it. exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		/* skip the leading "/dev/" */
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			/* wedge: compare against the parent disk's name */
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
477 
/*
 * Re-scan the system for autoconfigurable RAID sets (e.g. after new
 * disks appear) and configure any that are complete.
 *
 * The outer loop repeats the scan as long as at least one new set was
 * configured, so RAID-on-RAID ("recursive RAID") stacks are assembled
 * bottom-up.  Sets that are not configured have their component vnodes
 * released; every config set is cleaned up each pass.  Always returns 0.
 */
static int
rf_rescan(void)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets, *cset, *next_cset;
	struct raid_softc *sc;
	int raid_added;

	ac_list = rf_find_raid_components();
	config_sets = rf_create_auto_sets(ac_list);

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			/* save the link: rf_cleanup_config_set frees cset */
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	return 0;
}
526 
527 
/*
 * Configure all autoconfigurable RAID sets, then decide whether one of
 * them should become the root device.
 *
 * Phase 1 mirrors rf_rescan(): loop configuring sets until no new set
 * is added (handles recursive RAID), counting "rootable" sets.
 * Phase 2 (skipped if the user hardwired rootspec): if exactly one
 * rootable set exists, pick a candidate device from it (a wedge if the
 * set has wedges, else the disk itself) and override booted_device when
 * appropriate; if several sets are rootable, use booted_device to
 * disambiguate, or fall back to RB_ASKNAME.
 */
static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	int raid_added;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			/* save the link: rf_cleanup_config_set frees cset */
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
					if (cset->rootable) {
						rsc = sc;
						num_root++;
					}
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		aprint_debug("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			aprint_debug("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			aprint_debug("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		aprint_debug("%s: candidate root=%p booted_device=%p "
			     "root_partition=%d contains_boot=%d\n",
		    __func__, candidate_root, booted_device,
		    rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			aprint_debug("%s: set booted_device=%s(%p)\n", __func__,
			    device_xname(booted_device), booted_device);
		}
	} else if (num_root > 1) {
		aprint_debug("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* count the sets that actually contain the boot device */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
686 
687 static int
688 raidsize(dev_t dev)
689 {
690 	struct raid_softc *rs;
691 	struct dk_softc *dksc;
692 	unsigned int unit;
693 
694 	unit = raidunit(dev);
695 	if ((rs = raidget(unit, false)) == NULL)
696 		return -1;
697 	dksc = &rs->sc_dksc;
698 
699 	if ((rs->sc_flags & RAIDF_INITED) == 0)
700 		return -1;
701 
702 	return dk_size(dksc, dev);
703 }
704 
/*
 * d_dump entry point: dump kernel memory to the raid device during a
 * crash dump.  Delegates the real work to dk_dump(), which calls back
 * into raid_dumpblocks().
 */
static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

        /*
           Note that blkno is relative to this particular partition.
           By adding RF_PROTECTED_SECTORS, we get a value that
	   is relative to the partition used for the underlying component.
        */
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
}
729 
/*
 * dkdriver d_dumpblocks callback: write nblk blocks at blkno directly
 * to one live component of a RAID 1 set (the only level supported for
 * dumping).  Picks the preferred component per the priority list below
 * and passes the I/O straight to that component's block device d_dump.
 * Returns 0 on success, EINVAL/ENXIO on failure.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* find which column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
835 
/* ARGSUSED */
/*
 * Open entry point for both block and character devices.
 *
 * Creates the softc on first reference (raidget(..., true)), takes the
 * per-set configuration lock, refuses sets being shut down, and on the
 * first open of a configured set marks all components dirty before
 * handing off to dk_open().  Returns 0 or an errno.
 */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int     error = 0;
	int     part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return error;

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		/* set is going away; don't allow new opens */
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return error;


}
885 
/*
 * dkdriver d_lastclose callback: invoked when the last partition of the
 * device is closed.  Writes final (clean) component labels and, if a
 * shutdown was requested, arranges for raidclose() to detach the device.
 */
static int
raid_lastclose(device_t self)
{
	struct raid_softc *rs = raidsoftc(self);

	/* Last one... device is not unconfigured yet.
	   Device shutdown has taken care of setting the
	   clean bits if RAIDF_INITED is not set
	   mark things as clean... */

	rf_update_component_labels(&rs->sc_r,
	    RF_FINAL_COMPONENT_UPDATE);

	/* pass to unlocked code */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		rs->sc_flags |= RAIDF_DETACH;

	return 0;
}
905 
/* ARGSUSED */
/*
 * Close entry point.  Decides under the config lock whether this close
 * should trigger a device detach (RAIDF_DETACH, set by raid_lastclose)
 * or a softc release (RAIDF_SHUTDOWN on an unconfigured set), then
 * performs that action after dropping the lock — config_detach() and
 * raidput() must not run with the set locked.
 */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return error;

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return error;

}
945 
/*
 * Wake the RAIDframe I/O completion/dispatch thread by signalling
 * iodone_cv (under its lock) so queued work gets processed.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
953 
954 static void
955 raidstrategy(struct buf *bp)
956 {
957 	unsigned int unit;
958 	struct raid_softc *rs;
959 	struct dk_softc *dksc;
960 	RF_Raid_t *raidPtr;
961 
962 	unit = raidunit(bp->b_dev);
963 	if ((rs = raidget(unit, false)) == NULL) {
964 		bp->b_error = ENXIO;
965 		goto fail;
966 	}
967 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
968 		bp->b_error = ENXIO;
969 		goto fail;
970 	}
971 	dksc = &rs->sc_dksc;
972 	raidPtr = &rs->sc_r;
973 
974 	/* Queue IO only */
975 	if (dk_strategy_defer(dksc, bp))
976 		goto done;
977 
978 	/* schedule the IO to happen at the next convenient time */
979 	raid_wakeup(raidPtr);
980 
981 done:
982 	return;
983 
984 fail:
985 	bp->b_resid = bp->b_bcount;
986 	biodone(bp);
987 }
988 
989 static int
990 raid_diskstart(device_t dev, struct buf *bp)
991 {
992 	struct raid_softc *rs = raidsoftc(dev);
993 	RF_Raid_t *raidPtr;
994 
995 	raidPtr = &rs->sc_r;
996 	if (!raidPtr->valid) {
997 		db1_printf(("raid is not valid..\n"));
998 		return ENODEV;
999 	}
1000 
1001 	/* XXX */
1002 	bp->b_resid = 0;
1003 
1004 	return raiddoaccess(raidPtr, bp);
1005 }
1006 
/*
 * Completion handler called by RAIDframe when an access finishes.
 * Reports the buffer done to the dk layer, returns the I/O slot
 * (openings) under the raid mutex, and wakes the dispatch thread so
 * further queued I/O can start.
 */
void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}
1025 
1026 /* ARGSUSED */
1027 static int
1028 raidread(dev_t dev, struct uio *uio, int flags)
1029 {
1030 	int     unit = raidunit(dev);
1031 	struct raid_softc *rs;
1032 
1033 	if ((rs = raidget(unit, false)) == NULL)
1034 		return ENXIO;
1035 
1036 	if ((rs->sc_flags & RAIDF_INITED) == 0)
1037 		return ENXIO;
1038 
1039 	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
1040 
1041 }
1042 
1043 /* ARGSUSED */
1044 static int
1045 raidwrite(dev_t dev, struct uio *uio, int flags)
1046 {
1047 	int     unit = raidunit(dev);
1048 	struct raid_softc *rs;
1049 
1050 	if ((rs = raidget(unit, false)) == NULL)
1051 		return ENXIO;
1052 
1053 	if ((rs->sc_flags & RAIDF_INITED) == 0)
1054 		return ENXIO;
1055 
1056 	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
1057 
1058 }
1059 
/*
 * Tear down an initialized RAID set: shut down RAIDframe itself, drain
 * and free the buffer queue, then detach the dk(9)/disk(9) layers.
 * Returns EBUSY if the unit is open or a background operation
 * (reconstruction, parity rewrite, copyback) is running; 0 if the set
 * was never initialized.
 *
 * NOTE(review): "unlocked" presumably means the caller already holds
 * the softc lock (cf. raidlock() users) — confirm against callers.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while open or while any background operation runs. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
1097 
/*
 * Administratively fail the component in column rr->col and kick off a
 * reconstruction thread for it.  Returns EINVAL for RAID 0, a bad
 * column, or any state in which failing the disk is unsafe; ENOMEM if
 * the internal request cannot be allocated; otherwise the result of
 * thread creation.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	/* State checks below must be made with the RAID mutex held. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	/* rrint ownership passes to the recon thread — presumably it
	 * frees the request; confirm in rf_ReconThread. */
	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	/* Error exits from the checks above still hold the mutex. */
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
1146 
1147 static int
1148 rf_copyinspecificbuf(RF_Config_t *k_cfg)
1149 {
1150 	/* allocate a buffer for the layout-specific data, and copy it in */
1151 	if (k_cfg->layoutSpecificSize == 0)
1152 		return 0;
1153 
1154 	if (k_cfg->layoutSpecificSize > 10000) {
1155 	    /* sanity check */
1156 	    return EINVAL;
1157 	}
1158 
1159 	u_char *specific_buf;
1160 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
1161 	if (specific_buf == NULL)
1162 		return ENOMEM;
1163 
1164 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1165 	    k_cfg->layoutSpecificSize);
1166 	if (retcode) {
1167 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1168 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
1169 		return retcode;
1170 	}
1171 
1172 	k_cfg->layoutSpecific = specific_buf;
1173 	return 0;
1174 }
1175 
1176 static int
1177 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
1178 {
1179 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
1180 
1181 	if (rs->sc_r.valid) {
1182 		/* There is a valid RAID set running on this unit! */
1183 		printf("raid%d: Device already configured!\n", rs->sc_unit);
1184 		return EINVAL;
1185 	}
1186 
1187 	/* copy-in the configuration information */
1188 	/* data points to a pointer to the configuration structure */
1189 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
1190 	if (*k_cfg == NULL) {
1191 		return ENOMEM;
1192 	}
1193 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
1194 	if (retcode == 0)
1195 		return 0;
1196 	RF_Free(*k_cfg, sizeof(RF_Config_t));
1197 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
1198 	rs->sc_flags |= RAIDF_SHUTDOWN;
1199 	return retcode;
1200 }
1201 
/*
 * Build a RAID set from a kernel-resident configuration: sanitize the
 * config strings and limits, run rf_Configure(), and on success attach
 * the pseudo-device via raidinit().  Consumes k_cfg (and its
 * layout-specific buffer) in all cases.  On failure RAIDF_SHUTDOWN is
 * set so the unit is detached on close.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode, i;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/* Pull in the layout-specific blob from user space first. */
	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* Force nul-termination on all strings. */
#define ZERO_FINAL(s)	do { s[sizeof(s) - 1] = '\0'; } while (0)
	for (i = 0; i < RF_MAXCOL; i++) {
		/* row 0 only — presumably devnames is [row][col]; verify */
		ZERO_FINAL(k_cfg->devnames[0][i]);
	}
	for (i = 0; i < RF_MAXSPARE; i++) {
		ZERO_FINAL(k_cfg->spare_names[i]);
	}
	for (i = 0; i < RF_MAXDBGV; i++) {
		ZERO_FINAL(k_cfg->debugVars[i]);
	}
#undef ZERO_FINAL

	/* Check some basic limits. */
	if (k_cfg->numCol >= RF_MAXCOL || k_cfg->numCol < 0) {
		retcode = EINVAL;
		goto out;
	}
	if (k_cfg->numSpare >= RF_MAXSPARE || k_cfg->numSpare < 0) {
		retcode = EINVAL;
		goto out;
	}

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 *  there is no stale data left in the case of a
	 *  reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
1277 
#if RF_DISABLED
/*
 * Copy a user-supplied component label over the in-core label for its
 * column and flush it to disk.  Currently compiled out (RF_DISABLED);
 * kept for reference.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif
1316 
1317 static int
1318 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
1319 {
1320 	/*
1321 	   we only want the serial number from
1322 	   the above.  We get all the rest of the information
1323 	   from the config that was used to create this RAID
1324 	   set.
1325 	   */
1326 
1327 	raidPtr->serial_number = clabel->serial_number;
1328 
1329 	for (int column = 0; column < raidPtr->numCol; column++) {
1330 		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
1331 		if (RF_DEAD_DISK(diskPtr->status))
1332 			continue;
1333 		RF_ComponentLabel_t *ci_label = raidget_component_label(
1334 		    raidPtr, column);
1335 		/* Zeroing this is important. */
1336 		memset(ci_label, 0, sizeof(*ci_label));
1337 		raid_init_component_label(raidPtr, ci_label);
1338 		ci_label->serial_number = raidPtr->serial_number;
1339 		ci_label->row = 0; /* we dont' pretend to support more */
1340 		rf_component_label_set_partitionsize(ci_label,
1341 		    diskPtr->partitionSize);
1342 		ci_label->column = column;
1343 		raidflush_component_label(raidPtr, column);
1344 		/* XXXjld what about the spares? */
1345 	}
1346 
1347 	return 0;
1348 }
1349 
/*
 * Rebuild the component in componentPtr->column onto its existing
 * spindle ("in place") by spawning rf_ReconstructInPlaceThread.
 * Returns EINVAL for RAID 0, a bad column, or any unsafe state
 * (another failure outstanding, reconstruction already running, or a
 * spared disk); ENOMEM on allocation failure; otherwise the result of
 * thread creation.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* Work on a local copy so the caller's buffer isn't trusted. */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	/* Disk-state checks must be done under the RAID mutex. */
	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	/* rrint ownership passes to the thread — presumably it frees
	 * the request; confirm in rf_ReconstructInPlaceThread. */
	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
1417 
1418 static int
1419 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
1420 {
1421 	/*
1422 	 * This makes no sense on a RAID 0, or if we are not reconstructing
1423 	 * so tell the user it's done.
1424 	 */
1425 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
1426 	    raidPtr->status != rf_rs_reconstructing) {
1427 		*data = 100;
1428 		return 0;
1429 	}
1430 	if (raidPtr->reconControl->numRUsTotal == 0) {
1431 		*data = 0;
1432 		return 0;
1433 	}
1434 	*data = (raidPtr->reconControl->numRUsComplete * 100
1435 	    / raidPtr->reconControl->numRUsTotal);
1436 	return 0;
1437 }
1438 
1439 /*
1440  * Copy a RF_SingleComponent_t from 'data', ensuring nul-termination
1441  * on the component_name[] array.
1442  */
1443 static void
1444 rf_copy_single_component(RF_SingleComponent_t *component, void *data)
1445 {
1446 
1447 	memcpy(component, data, sizeof *component);
1448 	component->component_name[sizeof(component->component_name) - 1] = '\0';
1449 }
1450 
1451 static int
1452 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1453 {
1454 	int     unit = raidunit(dev);
1455 	int     part, pmask;
1456 	struct raid_softc *rs;
1457 	struct dk_softc *dksc;
1458 	RF_Config_t *k_cfg;
1459 	RF_Raid_t *raidPtr;
1460 	RF_AccTotals_t *totals;
1461 	RF_SingleComponent_t component;
1462 	RF_DeviceConfig_t *d_cfg, *ucfgp;
1463 	int retcode = 0;
1464 	int column;
1465 	RF_ComponentLabel_t *clabel;
1466 	int d;
1467 
1468 	if ((rs = raidget(unit, false)) == NULL)
1469 		return ENXIO;
1470 
1471 	dksc = &rs->sc_dksc;
1472 	raidPtr = &rs->sc_r;
1473 
1474 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1475 	    (int) DISKPART(dev), (int) unit, cmd));
1476 
1477 	/* Only CONFIGURE and RESCAN can be done without the RAID being initialized. */
1478 	switch (cmd) {
1479 	case RAIDFRAME_CONFIGURE:
1480 	case RAIDFRAME_RESCAN:
1481 		break;
1482 	default:
1483 		if (!rf_inited(rs))
1484 			return ENXIO;
1485 	}
1486 
1487 	switch (cmd) {
1488 		/* configure the system */
1489 	case RAIDFRAME_CONFIGURE:
1490 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
1491 			return retcode;
1492 		return rf_construct(rs, k_cfg);
1493 
1494 		/* shutdown the system */
1495 	case RAIDFRAME_SHUTDOWN:
1496 
1497 		part = DISKPART(dev);
1498 		pmask = (1 << part);
1499 
1500 		if ((retcode = raidlock(rs)) != 0)
1501 			return retcode;
1502 
1503 		if (DK_BUSY(dksc, pmask) ||
1504 		    raidPtr->recon_in_progress != 0 ||
1505 		    raidPtr->parity_rewrite_in_progress != 0 ||
1506 		    raidPtr->copyback_in_progress != 0)
1507 			retcode = EBUSY;
1508 		else {
1509 			/* detach and free on close */
1510 			rs->sc_flags |= RAIDF_SHUTDOWN;
1511 			retcode = 0;
1512 		}
1513 
1514 		raidunlock(rs);
1515 
1516 		return retcode;
1517 	case RAIDFRAME_GET_COMPONENT_LABEL:
1518 		return rf_get_component_label(raidPtr, data);
1519 
1520 #if RF_DISABLED
1521 	case RAIDFRAME_SET_COMPONENT_LABEL:
1522 		return rf_set_component_label(raidPtr, data);
1523 #endif
1524 
1525 	case RAIDFRAME_INIT_LABELS:
1526 		return rf_init_component_label(raidPtr, data);
1527 
1528 	case RAIDFRAME_SET_AUTOCONFIG:
1529 		d = rf_set_autoconfig(raidPtr, *(int *) data);
1530 		printf("raid%d: New autoconfig value is: %d\n",
1531 		       raidPtr->raidid, d);
1532 		*(int *) data = d;
1533 		return retcode;
1534 
1535 	case RAIDFRAME_SET_ROOT:
1536 		d = rf_set_rootpartition(raidPtr, *(int *) data);
1537 		printf("raid%d: New rootpartition value is: %d\n",
1538 		       raidPtr->raidid, d);
1539 		*(int *) data = d;
1540 		return retcode;
1541 
1542 		/* initialize all parity */
1543 	case RAIDFRAME_REWRITEPARITY:
1544 
1545 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1546 			/* Parity for RAID 0 is trivially correct */
1547 			raidPtr->parity_good = RF_RAID_CLEAN;
1548 			return 0;
1549 		}
1550 
1551 		if (raidPtr->parity_rewrite_in_progress == 1) {
1552 			/* Re-write is already in progress! */
1553 			return EINVAL;
1554 		}
1555 
1556 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1557 		    rf_RewriteParityThread, raidPtr,"raid_parity");
1558 
1559 	case RAIDFRAME_ADD_HOT_SPARE:
1560 		rf_copy_single_component(&component, data);
1561 		return rf_add_hot_spare(raidPtr, &component);
1562 
1563 	case RAIDFRAME_REMOVE_HOT_SPARE:
1564 		return retcode;
1565 
1566 	case RAIDFRAME_DELETE_COMPONENT:
1567 		rf_copy_single_component(&component, data);
1568 		return rf_delete_component(raidPtr, &component);
1569 
1570 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
1571 		rf_copy_single_component(&component, data);
1572 		return rf_incorporate_hot_spare(raidPtr, &component);
1573 
1574 	case RAIDFRAME_REBUILD_IN_PLACE:
1575 		return rf_rebuild_in_place(raidPtr, data);
1576 
1577 	case RAIDFRAME_GET_INFO:
1578 		ucfgp = *(RF_DeviceConfig_t **)data;
1579 		d_cfg = RF_Malloc(sizeof(*d_cfg));
1580 		if (d_cfg == NULL)
1581 			return ENOMEM;
1582 		retcode = rf_get_info(raidPtr, d_cfg);
1583 		if (retcode == 0) {
1584 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
1585 		}
1586 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1587 		return retcode;
1588 
1589 	case RAIDFRAME_CHECK_PARITY:
1590 		*(int *) data = raidPtr->parity_good;
1591 		return 0;
1592 
1593 	case RAIDFRAME_PARITYMAP_STATUS:
1594 		if (rf_paritymap_ineligible(raidPtr))
1595 			return EINVAL;
1596 		rf_paritymap_status(raidPtr->parity_map, data);
1597 		return 0;
1598 
1599 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
1600 		if (rf_paritymap_ineligible(raidPtr))
1601 			return EINVAL;
1602 		if (raidPtr->parity_map == NULL)
1603 			return ENOENT; /* ??? */
1604 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
1605 			return EINVAL;
1606 		return 0;
1607 
1608 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
1609 		if (rf_paritymap_ineligible(raidPtr))
1610 			return EINVAL;
1611 		*(int *) data = rf_paritymap_get_disable(raidPtr);
1612 		return 0;
1613 
1614 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
1615 		if (rf_paritymap_ineligible(raidPtr))
1616 			return EINVAL;
1617 		rf_paritymap_set_disable(raidPtr, *(int *)data);
1618 		/* XXX should errors be passed up? */
1619 		return 0;
1620 
1621 	case RAIDFRAME_RESCAN:
1622 		return rf_rescan();
1623 
1624 	case RAIDFRAME_RESET_ACCTOTALS:
1625 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1626 		return 0;
1627 
1628 	case RAIDFRAME_GET_ACCTOTALS:
1629 		totals = (RF_AccTotals_t *) data;
1630 		*totals = raidPtr->acc_totals;
1631 		return 0;
1632 
1633 	case RAIDFRAME_KEEP_ACCTOTALS:
1634 		raidPtr->keep_acc_totals = *(int *)data;
1635 		return 0;
1636 
1637 	case RAIDFRAME_GET_SIZE:
1638 		*(int *) data = raidPtr->totalSectors;
1639 		return 0;
1640 
1641 	case RAIDFRAME_FAIL_DISK:
1642 		return rf_fail_disk(raidPtr, data);
1643 
1644 		/* invoke a copyback operation after recon on whatever disk
1645 		 * needs it, if any */
1646 	case RAIDFRAME_COPYBACK:
1647 
1648 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1649 			/* This makes no sense on a RAID 0!! */
1650 			return EINVAL;
1651 		}
1652 
1653 		if (raidPtr->copyback_in_progress == 1) {
1654 			/* Copyback is already in progress! */
1655 			return EINVAL;
1656 		}
1657 
1658 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
1659 		    rf_CopybackThread, raidPtr, "raid_copyback");
1660 
1661 		/* return the percentage completion of reconstruction */
1662 	case RAIDFRAME_CHECK_RECON_STATUS:
1663 		return rf_check_recon_status(raidPtr, data);
1664 
1665 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1666 		rf_check_recon_status_ext(raidPtr, data);
1667 		return 0;
1668 
1669 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1670 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1671 			/* This makes no sense on a RAID 0, so tell the
1672 			   user it's done. */
1673 			*(int *) data = 100;
1674 			return 0;
1675 		}
1676 		if (raidPtr->parity_rewrite_in_progress == 1) {
1677 			*(int *) data = 100 *
1678 				raidPtr->parity_rewrite_stripes_done /
1679 				raidPtr->Layout.numStripe;
1680 		} else {
1681 			*(int *) data = 100;
1682 		}
1683 		return 0;
1684 
1685 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1686 		rf_check_parityrewrite_status_ext(raidPtr, data);
1687 		return 0;
1688 
1689 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
1690 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1691 			/* This makes no sense on a RAID 0 */
1692 			*(int *) data = 100;
1693 			return 0;
1694 		}
1695 		if (raidPtr->copyback_in_progress == 1) {
1696 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
1697 				raidPtr->Layout.numStripe;
1698 		} else {
1699 			*(int *) data = 100;
1700 		}
1701 		return 0;
1702 
1703 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1704 		rf_check_copyback_status_ext(raidPtr, data);
1705 		return 0;
1706 
1707 	case RAIDFRAME_SET_LAST_UNIT:
1708 		for (column = 0; column < raidPtr->numCol; column++)
1709 			if (raidPtr->Disks[column].status != rf_ds_optimal)
1710 				return EBUSY;
1711 
1712 		for (column = 0; column < raidPtr->numCol; column++) {
1713 			clabel = raidget_component_label(raidPtr, column);
1714 			clabel->last_unit = *(int *)data;
1715 			raidflush_component_label(raidPtr, column);
1716 		}
1717 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1718 		return 0;
1719 
1720 		/* the sparetable daemon calls this to wait for the kernel to
1721 		 * need a spare table. this ioctl does not return until a
1722 		 * spare table is needed. XXX -- calling mpsleep here in the
1723 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1724 		 * -- I should either compute the spare table in the kernel,
1725 		 * or have a different -- XXX XXX -- interface (a different
1726 		 * character device) for delivering the table     -- XXX */
1727 #if RF_DISABLED
1728 	case RAIDFRAME_SPARET_WAIT:
1729 		rf_lock_mutex2(rf_sparet_wait_mutex);
1730 		while (!rf_sparet_wait_queue)
1731 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1732 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
1733 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1734 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1735 
1736 		/* structure assignment */
1737 		*((RF_SparetWait_t *) data) = *waitreq;
1738 
1739 		RF_Free(waitreq, sizeof(*waitreq));
1740 		return 0;
1741 
1742 		/* wakes up a process waiting on SPARET_WAIT and puts an error
1743 		 * code in it that will cause the dameon to exit */
1744 	case RAIDFRAME_ABORT_SPARET_WAIT:
1745 		waitreq = RF_Malloc(sizeof(*waitreq));
1746 		waitreq->fcol = -1;
1747 		rf_lock_mutex2(rf_sparet_wait_mutex);
1748 		waitreq->next = rf_sparet_wait_queue;
1749 		rf_sparet_wait_queue = waitreq;
1750 		rf_broadcast_cond2(rf_sparet_wait_cv);
1751 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1752 		return 0;
1753 
1754 		/* used by the spare table daemon to deliver a spare table
1755 		 * into the kernel */
1756 	case RAIDFRAME_SEND_SPARET:
1757 
1758 		/* install the spare table */
1759 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1760 
1761 		/* respond to the requestor.  the return status of the spare
1762 		 * table installation is passed in the "fcol" field */
1763 		waitred = RF_Malloc(sizeof(*waitreq));
1764 		waitreq->fcol = retcode;
1765 		rf_lock_mutex2(rf_sparet_wait_mutex);
1766 		waitreq->next = rf_sparet_resp_queue;
1767 		rf_sparet_resp_queue = waitreq;
1768 		rf_broadcast_cond2(rf_sparet_resp_cv);
1769 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1770 
1771 		return retcode;
1772 #endif
1773 	default:
1774 		/*
1775 		 * Don't bother trying to load compat modules
1776 		 * if it is not our ioctl. This is more efficient
1777 		 * and makes rump tests not depend on compat code
1778 		 */
1779 		if (IOCGROUP(cmd) != 'r')
1780 			break;
1781 #ifdef _LP64
1782 		if ((l->l_proc->p_flag & PK_32) != 0) {
1783 			module_autoload("compat_netbsd32_raid",
1784 			    MODULE_CLASS_EXEC);
1785 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
1786 			    (rs, cmd, data), enosys(), retcode);
1787 			if (retcode != EPASSTHROUGH)
1788 				return retcode;
1789 		}
1790 #endif
1791 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
1792 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
1793 		    (rs, cmd, data), enosys(), retcode);
1794 		if (retcode != EPASSTHROUGH)
1795 			return retcode;
1796 
1797 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
1798 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
1799 		    (rs, cmd, data), enosys(), retcode);
1800 		if (retcode != EPASSTHROUGH)
1801 			return retcode;
1802 		break; /* fall through to the os-specific code below */
1803 
1804 	}
1805 
1806 	if (!raidPtr->valid)
1807 		return EINVAL;
1808 
1809 	/*
1810 	 * Add support for "regular" device ioctls here.
1811 	 */
1812 
1813 	switch (cmd) {
1814 	case DIOCGCACHE:
1815 		retcode = rf_get_component_caches(raidPtr, (int *)data);
1816 		break;
1817 
1818 	case DIOCCACHESYNC:
1819 		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
1820 		break;
1821 
1822 	default:
1823 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
1824 		break;
1825 	}
1826 
1827 	return retcode;
1828 
1829 }
1830 
1831 
/*
 * raidinit -- complete the remaining initialization for the RAIDframe
 * pseudo-device after a successful rf_Configure().
 */
1834 
1835 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* Attach failed: release the cfdata and leave the unit
		 * without RAIDF_INITED set. */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Have the wedge code look for partitions on the new disk. */
	dkwedge_discover(&dksc->sc_dkdev);
}
1891 
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Post the request on the wait queue and wake the daemon. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Take the daemon's response off the response queue. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
#endif
1926 
/*
 * A wrapper around rf_DoAccess that extracts the appropriate info from
 * the buf and passes it down.  Any calls originating in the kernel must
 * use non-blocking I/O.  Some extra sanity checking is done to return
 * "appropriate" error values for certain conditions (to make some
 * standard utilities work).
 *
 * Formerly known as: rf_DoAccessKernel
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* The mutex is dropped around the label update because
		 * rf_update_component_labels is called unlocked here. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Let the dk(9) layer push queued bufs through raid_diskstart. */
	dk_start(dksc, NULL);
}
1961 
/*
 * Translate one buf into an RF_DoAccess call.  Returns EAGAIN when no
 * openings are available (the buf stays queued), ENOSPC when the
 * request runs past the end of the set or is not sector-aligned, and
 * otherwise the result of rf_DoAccess().  Consumes one "opening" on
 * dispatch; raiddone() returns it.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int rc;

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* pb accounts for a trailing partial sector; sum is the
	 * exclusive end address, checked for wraparound below. */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): "1 ||" forces this branch; harmless since
	 * db1_printf is itself debug-gated, but looks like leftover
	 * debugging — confirm intent. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume one opening; raiddone() gives it back. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
2028 
2029 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
2030 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately via the normal callback. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up the buf; KernelWakeupFunc runs on completion. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
/* Completion callback for an I/O invoked from kernel code: accounts
   trace timing, marks the component failed on error (if the set can
   tolerate it), and hands the request to the raidio thread via the
   iodone queue.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The request was stashed in b_private by InitBP(). */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* Charge the elapsed time to this request's disk-wait stats. */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2173 
2174 
2175 /*
2176  * initialize a buf structure for doing an I/O in the kernel.
2177  */
2178 static void
2179 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2180        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2181        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
2182 {
2183 	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
2184 	bp->b_oflags = 0;
2185 	bp->b_cflags = 0;
2186 	bp->b_bcount = numSect << logBytesPerSector;
2187 	bp->b_bufsize = bp->b_bcount;
2188 	bp->b_error = 0;
2189 	bp->b_dev = dev;
2190 	bp->b_data = bf;
2191 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2192 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
2193 	if (bp->b_bcount == 0) {
2194 		panic("bp->b_bcount is zero in InitBP!!");
2195 	}
2196 	bp->b_iodone = cbFunc;
2197 	bp->b_private = cbArg;
2198 }
2199 
2200 /*
2201  * Wait interruptibly for an exclusive lock.
2202  *
2203  * XXX
2204  * Several drivers do this; it should be abstracted and made MP-safe.
2205  * (Hmm... where have we seen this warning before :->  GO )
2206  */
2207 static int
2208 raidlock(struct raid_softc *rs)
2209 {
2210 	int     error;
2211 
2212 	error = 0;
2213 	mutex_enter(&rs->sc_mutex);
2214 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2215 		rs->sc_flags |= RAIDF_WANTED;
2216 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2217 		if (error != 0)
2218 			goto done;
2219 	}
2220 	rs->sc_flags |= RAIDF_LOCKED;
2221 done:
2222 	mutex_exit(&rs->sc_mutex);
2223 	return error;
2224 }
2225 /*
2226  * Unlock and wake up any waiters.
2227  */
2228 static void
2229 raidunlock(struct raid_softc *rs)
2230 {
2231 
2232 	mutex_enter(&rs->sc_mutex);
2233 	rs->sc_flags &= ~RAIDF_LOCKED;
2234 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2235 		rs->sc_flags &= ~RAIDF_WANTED;
2236 		cv_broadcast(&rs->sc_cv);
2237 	}
2238 	mutex_exit(&rs->sc_mutex);
2239 }
2240 
2241 
2242 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
2243 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
2244 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
2245 
/* Byte offset of the component label area from the start of the
   component; currently a fixed constant. */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
2252 
2253 static daddr_t
2254 rf_component_info_size(unsigned secsize)
2255 {
2256 	daddr_t info_size;
2257 
2258 	KASSERT(secsize);
2259 	if (secsize > RF_COMPONENT_INFO_SIZE)
2260 		info_size = secsize;
2261 	else
2262 		info_size = RF_COMPONENT_INFO_SIZE;
2263 
2264 	return info_size;
2265 }
2266 
2267 static daddr_t
2268 rf_parity_map_offset(RF_Raid_t *raidPtr)
2269 {
2270 	daddr_t map_offset;
2271 
2272 	KASSERT(raidPtr->bytesPerSector);
2273 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2274 		map_offset = raidPtr->bytesPerSector;
2275 	else
2276 		map_offset = RF_COMPONENT_INFO_SIZE;
2277 	map_offset += rf_component_info_offset();
2278 
2279 	return map_offset;
2280 }
2281 
2282 static daddr_t
2283 rf_parity_map_size(RF_Raid_t *raidPtr)
2284 {
2285 	daddr_t map_size;
2286 
2287 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2288 		map_size = raidPtr->bytesPerSector;
2289 	else
2290 		map_size = RF_PARITY_MAP_SIZE;
2291 
2292 	return map_size;
2293 }
2294 
2295 int
2296 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2297 {
2298 	RF_ComponentLabel_t *clabel;
2299 
2300 	clabel = raidget_component_label(raidPtr, col);
2301 	clabel->clean = RF_RAID_CLEAN;
2302 	raidflush_component_label(raidPtr, col);
2303 	return(0);
2304 }
2305 
2306 
2307 int
2308 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2309 {
2310 	RF_ComponentLabel_t *clabel;
2311 
2312 	clabel = raidget_component_label(raidPtr, col);
2313 	clabel->clean = RF_RAID_DIRTY;
2314 	raidflush_component_label(raidPtr, col);
2315 	return(0);
2316 }
2317 
/* Read the on-disk component label for column `col' into the in-core
   copy kept in raid_cinfo.  Returns the raidread_component_label()
   error code. */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);

	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2328 
/* Return a pointer to the in-core component label for column `col'.
   No I/O is performed; use raidfetch/raidflush_component_label to
   sync with the on-disk copy. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2334 
2335 int
2336 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2337 {
2338 	RF_ComponentLabel_t *label;
2339 
2340 	label = &raidPtr->raid_cinfo[col].ci_label;
2341 	label->mod_counter = raidPtr->mod_counter;
2342 #ifndef RF_NO_PARITY_MAP
2343 	label->parity_map_modcount = label->mod_counter;
2344 #endif
2345 	return raidwrite_component_label(raidPtr->bytesPerSector,
2346 	    raidPtr->Disks[col].dev,
2347 	    raidPtr->raid_cinfo[col].ci_vp, label);
2348 }
2349 
2350 /*
2351  * Swap the label endianness.
2352  *
2353  * Everything in the component label is 4-byte-swapped except the version,
2354  * which is kept in the byte-swapped version at all times, and indicates
2355  * for the writer that a swap is necessary.
2356  *
2357  * For reads it is expected that out_label == clabel, but writes expect
2358  * separate labels so only the re-swapped label is written out to disk,
2359  * leaving the swapped-except-version internally.
2360  *
2361  * Only support swapping label version 2.
2362  */
static void
rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
{
	int	*in, *out, *in_last;

	KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));

	/* Don't swap the label, but do copy it. */
	out_label->version = clabel->version;

	/* Walk the label as an array of 32-bit words, from serial_number
	   up to (but not including) &future_use2[42], swapping each.
	   NOTE(review): this assumes everything between those members is
	   laid out as consecutive 32-bit fields -- keep in sync with the
	   RF_ComponentLabel_t definition. */
	in = &clabel->serial_number;
	in_last = &clabel->future_use2[42];
	out = &out_label->serial_number;

	for (; in < in_last; in++, out++)
		*out = bswap32(*in);
}
2380 
2381 static int
2382 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2383     RF_ComponentLabel_t *clabel)
2384 {
2385 	int error;
2386 
2387 	error = raidread_component_area(dev, b_vp, clabel,
2388 	    sizeof(RF_ComponentLabel_t),
2389 	    rf_component_info_offset(),
2390 	    rf_component_info_size(secsize));
2391 
2392 	if (error == 0 &&
2393 	    clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2394 		rf_swap_label(clabel, clabel);
2395 	}
2396 
2397 	return error;
2398 }
2399 
/* ARGSUSED */
/* Synchronously read dsize bytes at `offset' from the raw component
   device and copy the first msize bytes into `data'.  Returns 0 or a
   biowait()/EINVAL error. */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* Issue directly to the block device and wait for completion. */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
2437 
2438 static int
2439 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2440     RF_ComponentLabel_t *clabel)
2441 {
2442 	RF_ComponentLabel_t *clabel_write = clabel;
2443 	RF_ComponentLabel_t lclabel;
2444 	int error;
2445 
2446 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2447 		clabel_write = &lclabel;
2448 		rf_swap_label(clabel, clabel_write);
2449 	}
2450 	error = raidwrite_component_area(dev, b_vp, clabel_write,
2451 	    sizeof(RF_ComponentLabel_t),
2452 	    rf_component_info_offset(),
2453 	    rf_component_info_size(secsize), 0);
2454 
2455 	return error;
2456 }
2457 
/* ARGSUSED */
/* Write msize bytes from `data' (zero-padded to dsize) at `offset' on
   the raw component device.  With asyncp set the write is fired off
   with B_ASYNC and 0 is returned immediately; otherwise waits and
   returns the biowait() status. */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* Zero-fill first so the tail of the dsize-byte area beyond
	   msize is written as zeros. */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		/* NOTE(review): no brelse() on this path -- presumably
		   the B_ASYNC completion path releases the buffer;
		   confirm against the buffercache code. */
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
2492 
2493 void
2494 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2495 {
2496 	int c;
2497 
2498 	for (c = 0; c < raidPtr->numCol; c++) {
2499 		/* Skip dead disks. */
2500 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2501 			continue;
2502 		/* XXXjld: what if an error occurs here? */
2503 		raidwrite_component_area(raidPtr->Disks[c].dev,
2504 		    raidPtr->raid_cinfo[c].ci_vp, map,
2505 		    RF_PARITYMAP_NBYTE,
2506 		    rf_parity_map_offset(raidPtr),
2507 		    rf_parity_map_size(raidPtr), 0);
2508 	}
2509 }
2510 
2511 void
2512 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2513 {
2514 	struct rf_paritymap_ondisk tmp;
2515 	int c,first;
2516 
2517 	first=1;
2518 	for (c = 0; c < raidPtr->numCol; c++) {
2519 		/* Skip dead disks. */
2520 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2521 			continue;
2522 		raidread_component_area(raidPtr->Disks[c].dev,
2523 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
2524 		    RF_PARITYMAP_NBYTE,
2525 		    rf_parity_map_offset(raidPtr),
2526 		    rf_parity_map_size(raidPtr));
2527 		if (first) {
2528 			memcpy(map, &tmp, sizeof(*map));
2529 			first = 0;
2530 		} else {
2531 			rf_paritymap_merge(map, &tmp);
2532 		}
2533 	}
2534 }
2535 
/* Mark every live component (and in-use spare) of the set as dirty,
   bumping the mod counter so these labels supersede older copies. */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare stands in for.
			   NOTE(review): if no column matches, scol keeps
			   its previous value (initially -1) -- presumably
			   a used spare always has a match; confirm. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2595 
2596 
/* Refresh the on-disk labels of all optimal components and in-use
   spares.  When `final' is RF_FINAL_COMPONENT_UPDATE and parity is
   known good, the components are also marked clean. */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	/* First pass: optimal components. */
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	/* Second pass: spares that have been reconstructed onto. */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the column this spare replaced.
			   NOTE(review): scol stays at its previous value
			   (initially -1) if no column matches; confirm a
			   match always exists for a used spare. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2674 
2675 void
2676 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2677 {
2678 
2679 	if (vp != NULL) {
2680 		if (auto_configured == 1) {
2681 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2682 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2683 			vput(vp);
2684 
2685 		} else {
2686 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2687 		}
2688 	}
2689 }
2690 
2691 
2692 void
2693 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2694 {
2695 	int r,c;
2696 	struct vnode *vp;
2697 	int acd;
2698 
2699 
2700 	/* We take this opportunity to close the vnodes like we should.. */
2701 
2702 	for (c = 0; c < raidPtr->numCol; c++) {
2703 		vp = raidPtr->raid_cinfo[c].ci_vp;
2704 		acd = raidPtr->Disks[c].auto_configured;
2705 		rf_close_component(raidPtr, vp, acd);
2706 		raidPtr->raid_cinfo[c].ci_vp = NULL;
2707 		raidPtr->Disks[c].auto_configured = 0;
2708 	}
2709 
2710 	for (r = 0; r < raidPtr->numSpare; r++) {
2711 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2712 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2713 		rf_close_component(raidPtr, vp, acd);
2714 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2715 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2716 	}
2717 }
2718 
2719 
2720 static void
2721 rf_ReconThread(struct rf_recon_req_internal *req)
2722 {
2723 	int     s;
2724 	RF_Raid_t *raidPtr;
2725 
2726 	s = splbio();
2727 	raidPtr = (RF_Raid_t *) req->raidPtr;
2728 	raidPtr->recon_in_progress = 1;
2729 
2730 	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2731 		raidPtr->forceRecon = 1;
2732 	}
2733 
2734 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2735 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2736 
2737 	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2738 		raidPtr->forceRecon = 0;
2739 	}
2740 
2741 	RF_Free(req, sizeof(*req));
2742 
2743 	raidPtr->recon_in_progress = 0;
2744 	splx(s);
2745 
2746 	/* That's all... */
2747 	kthread_exit(0);	/* does not return */
2748 }
2749 
2750 static void
2751 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2752 {
2753 	int retcode;
2754 	int s;
2755 
2756 	raidPtr->parity_rewrite_stripes_done = 0;
2757 	raidPtr->parity_rewrite_in_progress = 1;
2758 	s = splbio();
2759 	retcode = rf_RewriteParity(raidPtr);
2760 	splx(s);
2761 	if (retcode) {
2762 		printf("raid%d: Error re-writing parity (%d)!\n",
2763 		    raidPtr->raidid, retcode);
2764 	} else {
2765 		/* set the clean bit!  If we shutdown correctly,
2766 		   the clean bit on each component label will get
2767 		   set */
2768 		raidPtr->parity_good = RF_RAID_CLEAN;
2769 	}
2770 	raidPtr->parity_rewrite_in_progress = 0;
2771 
2772 	/* Anyone waiting for us to stop?  If so, inform them... */
2773 	if (raidPtr->waitShutdown) {
2774 		rf_lock_mutex2(raidPtr->rad_lock);
2775 		cv_broadcast(&raidPtr->parity_rewrite_cv);
2776 		rf_unlock_mutex2(raidPtr->rad_lock);
2777 	}
2778 
2779 	/* That's all... */
2780 	kthread_exit(0);	/* does not return */
2781 }
2782 
2783 
2784 static void
2785 rf_CopybackThread(RF_Raid_t *raidPtr)
2786 {
2787 	int s;
2788 
2789 	raidPtr->copyback_in_progress = 1;
2790 	s = splbio();
2791 	rf_CopybackReconstructedData(raidPtr);
2792 	splx(s);
2793 	raidPtr->copyback_in_progress = 0;
2794 
2795 	/* That's all... */
2796 	kthread_exit(0);	/* does not return */
2797 }
2798 
2799 
2800 static void
2801 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
2802 {
2803 	int s;
2804 	RF_Raid_t *raidPtr;
2805 
2806 	s = splbio();
2807 	raidPtr = req->raidPtr;
2808 	raidPtr->recon_in_progress = 1;
2809 
2810 	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2811 		raidPtr->forceRecon = 1;
2812 	}
2813 
2814 	rf_ReconstructInPlace(raidPtr, req->col);
2815 
2816 	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2817 		raidPtr->forceRecon = 0;
2818 	}
2819 
2820 	RF_Free(req, sizeof(*req));
2821 	raidPtr->recon_in_progress = 0;
2822 	splx(s);
2823 
2824 	/* That's all... */
2825 	kthread_exit(0);	/* does not return */
2826 }
2827 
/*
 * Probe one candidate device for a RAIDframe component label.  On
 * success the component is prepended to ac_list, and the new entry
 * takes ownership of both the vnode and the label; on failure the
 * label is freed and the vnode closed and released.  Returns the
 * (possibly extended) list head.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;		/* ac now owns the vnode */
			ac->clabel = clabel;	/* ...and the label */
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: nothing took ownership of clabel or vp */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2869 
/*
 * Scan every disk-class device on the system for RAIDframe component
 * labels, returning a list of RF_AutoConfig_t candidates for
 * autoconfiguration.  Wedges are scanned first, then disklabel
 * partitions, then (only if neither yielded a component) the raw
 * partition itself.
 */
static RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CDs. */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md. */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* we don't care about spiflash */
			if (device_is_a(dv, "spiflash")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			VOP_UNLOCK(vp);
			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* Wedge pass: accept only wedges whose
				   partition type is RAIDframe. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component takes ownership of vp */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Not quite a 'whatever'.  In
					 * this situation we know
					 * there is a FS_RAID
					 * partition, but we can't
					 * open it.  The most likely
					 * reason is that the
					 * partition is already in
					 * use by another RAID set.
					 * So note that we've already
					 * found a partition on this
					 * disk so we don't attempt
					 * to use the raw disk later. */
					rf_part_found = 1;
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
3097 
3098 int
3099 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3100 {
3101 
3102 	if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
3103 	     clabel->version==RF_COMPONENT_LABEL_VERSION ||
3104 	     clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
3105 	    (clabel->clean == RF_RAID_CLEAN ||
3106 	     clabel->clean == RF_RAID_DIRTY) &&
3107 	    clabel->row >=0 &&
3108 	    clabel->column >= 0 &&
3109 	    clabel->num_rows > 0 &&
3110 	    clabel->num_columns > 0 &&
3111 	    clabel->row < clabel->num_rows &&
3112 	    clabel->column < clabel->num_columns &&
3113 	    clabel->blockSize > 0 &&
3114 	    /*
3115 	     * numBlocksHi may contain garbage, but it is ok since
3116 	     * the type is unsigned.  If it is really garbage,
3117 	     * rf_fix_old_label_size() will fix it.
3118 	     */
3119 	    rf_component_label_numblocks(clabel) > 0) {
3120 		/*
3121 		 * label looks reasonable enough...
3122 		 * let's make sure it has no old garbage.
3123 		 */
3124 		if (numsecs)
3125 			rf_fix_old_label_size(clabel, numsecs);
3126 		return(1);
3127 	}
3128 	return(0);
3129 }
3130 
3131 
3132 /*
3133  * For reasons yet unknown, some old component labels have garbage in
3134  * the newer numBlocksHi region, and this causes lossage.  Since those
3135  * disks will also have numsecs set to less than 32 bits of sectors,
3136  * we can determine when this corruption has occurred, and fix it.
3137  *
3138  * The exact same problem, with the same unknown reason, happens to
3139  * the partitionSizeHi member as well.
3140  */
3141 static void
3142 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3143 {
3144 
3145 	if (numsecs < ((uint64_t)1 << 32)) {
3146 		if (clabel->numBlocksHi) {
3147 			printf("WARNING: total sectors < 32 bits, yet "
3148 			       "numBlocksHi set\n"
3149 			       "WARNING: resetting numBlocksHi to zero.\n");
3150 			clabel->numBlocksHi = 0;
3151 		}
3152 
3153 		if (clabel->partitionSizeHi) {
3154 			printf("WARNING: total sectors < 32 bits, yet "
3155 			       "partitionSizeHi set\n"
3156 			       "WARNING: resetting partitionSizeHi to zero.\n");
3157 			clabel->partitionSizeHi = 0;
3158 		}
3159 	}
3160 }
3161 
3162 
3163 #ifdef DEBUG
/*
 * Dump the interesting fields of a component label to the console.
 * Debug-only (compiled under #ifdef DEBUG).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Printable names for root_partition values 0..2; 3 is invalid. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	/* Mask with 3 so out-of-range values print as "*invalid*". */
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
3195 #endif
3196 
3197 static RF_ConfigSet_t *
3198 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3199 {
3200 	RF_AutoConfig_t *ac;
3201 	RF_ConfigSet_t *config_sets;
3202 	RF_ConfigSet_t *cset;
3203 	RF_AutoConfig_t *ac_next;
3204 
3205 
3206 	config_sets = NULL;
3207 
3208 	/* Go through the AutoConfig list, and figure out which components
3209 	   belong to what sets.  */
3210 	ac = ac_list;
3211 	while(ac!=NULL) {
3212 		/* we're going to putz with ac->next, so save it here
3213 		   for use at the end of the loop */
3214 		ac_next = ac->next;
3215 
3216 		if (config_sets == NULL) {
3217 			/* will need at least this one... */
3218 			config_sets = malloc(sizeof(RF_ConfigSet_t),
3219 				       M_RAIDFRAME, M_WAITOK);
3220 			/* this one is easy :) */
3221 			config_sets->ac = ac;
3222 			config_sets->next = NULL;
3223 			config_sets->rootable = 0;
3224 			ac->next = NULL;
3225 		} else {
3226 			/* which set does this component fit into? */
3227 			cset = config_sets;
3228 			while(cset!=NULL) {
3229 				if (rf_does_it_fit(cset, ac)) {
3230 					/* looks like it matches... */
3231 					ac->next = cset->ac;
3232 					cset->ac = ac;
3233 					break;
3234 				}
3235 				cset = cset->next;
3236 			}
3237 			if (cset==NULL) {
3238 				/* didn't find a match above... new set..*/
3239 				cset = malloc(sizeof(RF_ConfigSet_t),
3240 					       M_RAIDFRAME, M_WAITOK);
3241 				cset->ac = ac;
3242 				ac->next = NULL;
3243 				cset->next = config_sets;
3244 				cset->rootable = 0;
3245 				config_sets = cset;
3246 			}
3247 		}
3248 		ac = ac_next;
3249 	}
3250 
3251 
3252 	return(config_sets);
3253 }
3254 
3255 static int
3256 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3257 {
3258 	RF_ComponentLabel_t *clabel1, *clabel2;
3259 
3260 	/* If this one matches the *first* one in the set, that's good
3261 	   enough, since the other members of the set would have been
3262 	   through here too... */
3263 	/* note that we are not checking partitionSize here..
3264 
3265 	   Note that we are also not checking the mod_counters here.
3266 	   If everything else matches except the mod_counter, that's
3267 	   good enough for this test.  We will deal with the mod_counters
3268 	   a little later in the autoconfiguration process.
3269 
3270 	    (clabel1->mod_counter == clabel2->mod_counter) &&
3271 
3272 	   The reason we don't check for this is that failed disks
3273 	   will have lower modification counts.  If those disks are
3274 	   not added to the set they used to belong to, then they will
3275 	   form their own set, which may result in 2 different sets,
3276 	   for example, competing to be configured at raid0, and
3277 	   perhaps competing to be the root filesystem set.  If the
3278 	   wrong ones get configured, or both attempt to become /,
3279 	   weird behaviour and or serious lossage will occur.  Thus we
3280 	   need to bring them into the fold here, and kick them out at
3281 	   a later point.
3282 
3283 	*/
3284 
3285 	clabel1 = cset->ac->clabel;
3286 	clabel2 = ac->clabel;
3287 	if ((clabel1->version == clabel2->version) &&
3288 	    (clabel1->serial_number == clabel2->serial_number) &&
3289 	    (clabel1->num_rows == clabel2->num_rows) &&
3290 	    (clabel1->num_columns == clabel2->num_columns) &&
3291 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
3292 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3293 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3294 	    (clabel1->parityConfig == clabel2->parityConfig) &&
3295 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3296 	    (clabel1->blockSize == clabel2->blockSize) &&
3297 	    rf_component_label_numblocks(clabel1) ==
3298 	    rf_component_label_numblocks(clabel2) &&
3299 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
3300 	    (clabel1->root_partition == clabel2->root_partition) &&
3301 	    (clabel1->last_unit == clabel2->last_unit) &&
3302 	    (clabel1->config_order == clabel2->config_order)) {
3303 		/* if it get's here, it almost *has* to be a match */
3304 	} else {
3305 		/* it's not consistent with somebody in the set..
3306 		   punt */
3307 		return(0);
3308 	}
3309 	/* all was fine.. it must fit... */
3310 	return(1);
3311 }
3312 
/*
 * Decide whether config set 'cset' has enough live, up-to-date
 * components to be worth configuring.  Returns 1 if so, 0 otherwise.
 *
 * A component only counts as present if its mod_counter equals the
 * highest mod_counter in the set — stale (previously failed) members
 * that rf_does_it_fit() let in are effectively ignored here.
 */
static int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set:
	   the maximum over all members. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for a member occupying it with the
	   current mod_counter. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd half of a RAID 1
				   pair without bailing out.. reset the
				   even_pair_failed flag for the next
				   pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3415 
3416 static void
3417 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3418 			RF_Raid_t *raidPtr)
3419 {
3420 	RF_ComponentLabel_t *clabel;
3421 	int i;
3422 
3423 	clabel = ac->clabel;
3424 
3425 	/* 1. Fill in the common stuff */
3426 	config->numCol = clabel->num_columns;
3427 	config->numSpare = 0; /* XXX should this be set here? */
3428 	config->sectPerSU = clabel->sectPerSU;
3429 	config->SUsPerPU = clabel->SUsPerPU;
3430 	config->SUsPerRU = clabel->SUsPerRU;
3431 	config->parityConfig = clabel->parityConfig;
3432 	/* XXX... */
3433 	strcpy(config->diskQueueType,"fifo");
3434 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3435 	config->layoutSpecificSize = 0; /* XXX ?? */
3436 
3437 	while(ac!=NULL) {
3438 		/* row/col values will be in range due to the checks
3439 		   in reasonable_label() */
3440 		strcpy(config->devnames[0][ac->clabel->column],
3441 		       ac->devname);
3442 		ac = ac->next;
3443 	}
3444 
3445 	for(i=0;i<RF_MAXDBGV;i++) {
3446 		config->debugVars[i][0] = 0;
3447 	}
3448 }
3449 
3450 static int
3451 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3452 {
3453 	RF_ComponentLabel_t *clabel;
3454 	int column;
3455 	int sparecol;
3456 
3457 	raidPtr->autoconfigure = new_value;
3458 
3459 	for(column=0; column<raidPtr->numCol; column++) {
3460 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3461 			clabel = raidget_component_label(raidPtr, column);
3462 			clabel->autoconfigure = new_value;
3463 			raidflush_component_label(raidPtr, column);
3464 		}
3465 	}
3466 	for(column = 0; column < raidPtr->numSpare ; column++) {
3467 		sparecol = raidPtr->numCol + column;
3468 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3469 			clabel = raidget_component_label(raidPtr, sparecol);
3470 			clabel->autoconfigure = new_value;
3471 			raidflush_component_label(raidPtr, sparecol);
3472 		}
3473 	}
3474 	return(new_value);
3475 }
3476 
3477 static int
3478 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3479 {
3480 	RF_ComponentLabel_t *clabel;
3481 	int column;
3482 	int sparecol;
3483 
3484 	raidPtr->root_partition = new_value;
3485 	for(column=0; column<raidPtr->numCol; column++) {
3486 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3487 			clabel = raidget_component_label(raidPtr, column);
3488 			clabel->root_partition = new_value;
3489 			raidflush_component_label(raidPtr, column);
3490 		}
3491 	}
3492 	for(column = 0; column < raidPtr->numSpare ; column++) {
3493 		sparecol = raidPtr->numCol + column;
3494 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3495 			clabel = raidget_component_label(raidPtr, sparecol);
3496 			clabel->root_partition = new_value;
3497 			raidflush_component_label(raidPtr, sparecol);
3498 		}
3499 	}
3500 	return(new_value);
3501 }
3502 
3503 static void
3504 rf_release_all_vps(RF_ConfigSet_t *cset)
3505 {
3506 	RF_AutoConfig_t *ac;
3507 
3508 	ac = cset->ac;
3509 	while(ac!=NULL) {
3510 		/* Close the vp, and give it back */
3511 		if (ac->vp) {
3512 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3513 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3514 			vput(ac->vp);
3515 			ac->vp = NULL;
3516 		}
3517 		ac = ac->next;
3518 	}
3519 }
3520 
3521 
3522 static void
3523 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3524 {
3525 	RF_AutoConfig_t *ac;
3526 	RF_AutoConfig_t *next_ac;
3527 
3528 	ac = cset->ac;
3529 	while(ac!=NULL) {
3530 		next_ac = ac->next;
3531 		/* nuke the label */
3532 		free(ac->clabel, M_RAIDFRAME);
3533 		/* cleanup the config structure */
3534 		free(ac, M_RAIDFRAME);
3535 		/* "next.." */
3536 		ac = next_ac;
3537 	}
3538 	/* and, finally, nuke the config set */
3539 	free(cset, M_RAIDFRAME);
3540 }
3541 
3542 
/*
 * Populate *clabel from the current state of the RAID set: identity,
 * geometry, layout parameters, and configuration preferences.  The
 * caller is responsible for writing the label to disk.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* avoid over-writing byteswapped version. */
	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* num_rows is always 1 here; multi-row sets are historical. */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3576 
/*
 * Autoconfigure one config set: pick a raid unit (preferring the one
 * recorded in the label), build an RF_Config_t from the labels, and
 * run rf_Configure().  On success the softc is returned and the set's
 * root eligibility is recorded; on failure the softc is released and
 * NULL is returned.
 */
static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Walk upwards from last_unit until we find a free unit. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* No existing softc for this unit — allocate one now. */
	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the softc back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3648 
/*
 * Initialize a per-set memory pool.  'w_chan' must be a buffer of at
 * least RF_MAX_POOLNAMELEN bytes; it receives the generated pool name
 * and must stay alive as long as the pool (pool_init keeps the
 * pointer).  'xmin' items are preallocated; 'xmax' is the high-water
 * mark.
 */
void
rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
	     size_t xmin, size_t xmax)
{

	/* Format: raid%d_foo */
	snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
}
3661 
3662 
3663 /*
3664  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3665  * to see if there is IO pending and if that IO could possibly be done
3666  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
3667  * otherwise.
3668  *
3669  */
3670 int
3671 rf_buf_queue_check(RF_Raid_t *raidPtr)
3672 {
3673 	struct raid_softc *rs;
3674 	struct dk_softc *dksc;
3675 
3676 	rs = raidPtr->softc;
3677 	dksc = &rs->sc_dksc;
3678 
3679 	if ((rs->sc_flags & RAIDF_INITED) == 0)
3680 		return 1;
3681 
3682 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3683 		/* there is work to do */
3684 		return 0;
3685 	}
3686 	/* default is nothing to do */
3687 	return 1;
3688 }
3689 
3690 int
3691 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3692 {
3693 	uint64_t numsecs;
3694 	unsigned secsize;
3695 	int error;
3696 
3697 	error = getdisksize(vp, &numsecs, &secsize);
3698 	if (error == 0) {
3699 		diskPtr->blockSize = secsize;
3700 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
3701 		diskPtr->partitionSize = numsecs;
3702 		return 0;
3703 	}
3704 	return error;
3705 }
3706 
/* autoconf match hook: raid pseudo-devices always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3712 
/* autoconf attach hook: intentionally empty; per-unit setup happens
   when the raid device is actually configured. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
3717 
3718 
/*
 * autoconf detach hook: tear down the raid unit under the softc lock
 * and release the softc on success.  Returns 0 or an errno.
 */
static int
raid_detach(device_t self, int flags)
{
	int error;
	struct raid_softc *rs = raidsoftc(self);

	if (rs == NULL)
		return ENXIO;

	/* serialize against other operations on this unit */
	if ((error = raidlock(rs)) != 0)
		return error;

	error = raid_detach_unlocked(rs);

	raidunlock(rs);

	/* XXX raid can be referenced here */

	if (error)
		return error;

	/* Free the softc */
	raidput(rs);

	return 0;
}
3745 
/*
 * Publish a synthetic disk geometry for the raid unit derived from the
 * set's layout (the ntracks value of 4 * numCol is fabricated, not a
 * physical geometry).
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
3761 
3762 /*
3763  * Get cache info for all the components (including spares).
3764  * Returns intersection of all the cache flags of all disks, or first
3765  * error if any encountered.
3766  * XXXfua feature flags can change as spares are added - lock down somehow
3767  */
3768 static int
3769 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3770 {
3771 	int c;
3772 	int error;
3773 	int dkwhole = 0, dkpart;
3774 
3775 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3776 		/*
3777 		 * Check any non-dead disk, even when currently being
3778 		 * reconstructed.
3779 		 */
3780 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
3781 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
3782 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3783 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
3784 			if (error) {
3785 				if (error != ENODEV) {
3786 					printf("raid%d: get cache for component %s failed\n",
3787 					    raidPtr->raidid,
3788 					    raidPtr->Disks[c].devname);
3789 				}
3790 
3791 				return error;
3792 			}
3793 
3794 			if (c == 0)
3795 				dkwhole = dkpart;
3796 			else
3797 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3798 		}
3799 	}
3800 
3801 	*data = dkwhole;
3802 
3803 	return 0;
3804 }
3805 
3806 /*
3807  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3808  * We end up returning whatever error was returned by the first cache flush
3809  * that fails.
3810  */
3811 
3812 static int
3813 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
3814 {
3815 	int e = 0;
3816 	for (int i = 0; i < 5; i++) {
3817 		e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3818 		    &force, FWRITE, NOCRED);
3819 		if (!e || e == ENODEV)
3820 			return e;
3821 		printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
3822 		    raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
3823 	}
3824 	return e;
3825 }
3826 
3827 int
3828 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
3829 {
3830 	int c, error;
3831 
3832 	error = 0;
3833 	for (c = 0; c < raidPtr->numCol; c++) {
3834 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
3835 			int e = rf_sync_component_cache(raidPtr, c, force);
3836 			if (e && !error)
3837 				error = e;
3838 		}
3839 	}
3840 
3841 	for (c = 0; c < raidPtr->numSpare ; c++) {
3842 		int sparecol = raidPtr->numCol + c;
3843 		/* Need to ensure that the reconstruct actually completed! */
3844 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3845 			int e = rf_sync_component_cache(raidPtr, sparecol,
3846 			    force);
3847 			if (e && !error)
3848 				error = e;
3849 		}
3850 	}
3851 	return error;
3852 }
3853 
3854 /* Fill in info with the current status */
3855 void
3856 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3857 {
3858 
3859 	memset(info, 0, sizeof(*info));
3860 
3861 	if (raidPtr->status != rf_rs_reconstructing) {
3862 		info->total = 100;
3863 		info->completed = 100;
3864 	} else {
3865 		info->total = raidPtr->reconControl->numRUsTotal;
3866 		info->completed = raidPtr->reconControl->numRUsComplete;
3867 	}
3868 	info->remaining = info->total - info->completed;
3869 }
3870 
3871 /* Fill in info with the current status */
3872 void
3873 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3874 {
3875 
3876 	memset(info, 0, sizeof(*info));
3877 
3878 	if (raidPtr->parity_rewrite_in_progress == 1) {
3879 		info->total = raidPtr->Layout.numStripe;
3880 		info->completed = raidPtr->parity_rewrite_stripes_done;
3881 	} else {
3882 		info->completed = 100;
3883 		info->total = 100;
3884 	}
3885 	info->remaining = info->total - info->completed;
3886 }
3887 
3888 /* Fill in info with the current status */
3889 void
3890 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3891 {
3892 
3893 	memset(info, 0, sizeof(*info));
3894 
3895 	if (raidPtr->copyback_in_progress == 1) {
3896 		info->total = raidPtr->Layout.numStripe;
3897 		info->completed = raidPtr->copyback_stripes_done;
3898 		info->remaining = info->total - info->completed;
3899 	} else {
3900 		info->remaining = 0;
3901 		info->completed = 100;
3902 		info->total = 100;
3903 	}
3904 }
3905 
3906 /* Fill in config with the current info */
3907 int
3908 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
3909 {
3910 	int	d, i, j;
3911 
3912 	if (!raidPtr->valid)
3913 		return ENODEV;
3914 	config->cols = raidPtr->numCol;
3915 	config->ndevs = raidPtr->numCol;
3916 	if (config->ndevs >= RF_MAX_DISKS)
3917 		return ENOMEM;
3918 	config->nspares = raidPtr->numSpare;
3919 	if (config->nspares >= RF_MAX_DISKS)
3920 		return ENOMEM;
3921 	config->maxqdepth = raidPtr->maxQueueDepth;
3922 	d = 0;
3923 	for (j = 0; j < config->cols; j++) {
3924 		config->devs[d] = raidPtr->Disks[j];
3925 		d++;
3926 	}
3927 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
3928 		config->spares[i] = raidPtr->Disks[j];
3929 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
3930 			/* XXX: raidctl(8) expects to see this as a used spare */
3931 			config->spares[i].status = rf_ds_used_spare;
3932 		}
3933 	}
3934 	return 0;
3935 }
3936 
/*
 * Copy the in-core component label for the column named in *data back
 * into *data for userland.  Returns 0, or EINVAL if the column index
 * is out of range (spares are valid columns here).
 */
int
rf_get_component_label(RF_Raid_t *raidPtr, void *data)
{
	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
	RF_ComponentLabel_t *raid_clabel;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
		return EINVAL;
	raid_clabel = raidget_component_label(raidPtr, column);
	memcpy(clabel, raid_clabel, sizeof *clabel);
	/* Fix-up for userland: present byte-swapped labels as native. */
	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;

	return 0;
}
3954 
3955 /*
3956  * Module interface
3957  */
3958 
/* Kernel module declaration; depends on dk_subr and bufq_fcfs. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
/* When built as a module, the cfdriver is declared here rather than
   by kernel config. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
3968 
3969 static int
3970 raid_modcmd(modcmd_t cmd, void *data)
3971 {
3972 	int error;
3973 
3974 	error = 0;
3975 	switch (cmd) {
3976 	case MODULE_CMD_INIT:
3977 		error = raid_modcmd_init();
3978 		break;
3979 	case MODULE_CMD_FINI:
3980 		error = raid_modcmd_fini();
3981 		break;
3982 	default:
3983 		error = ENOTTY;
3984 		break;
3985 	}
3986 	return error;
3987 }
3988 
/*
 * Module initialization: attach the devsw, cfdriver and cfattach
 * (unwinding in reverse order on failure), boot the RAIDframe core,
 * and register a finalizer that runs autoconfiguration once all real
 * hardware has attached.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* raid_lock is held across the attach sequence below. */
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach pick the majors; EEXIST is tolerated for
	   the statically-configured case. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* unwind: detach the devsw attached above */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* unwind: cfdriver first, then devsw */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is necessarily 0 here — all failure paths returned. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* non-fatal: autoconfig just won't run automatically */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
4059 
/*
 * Module teardown: refuse while any raid unit exists, then detach the
 * cfattach, cfdriver and devsw (re-attaching the cfattach if the
 * cfdriver detach fails, to leave the module usable), and shut down
 * the RAIDframe core.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* roll back: re-attach the cfattach detached above */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	devsw_detach(&raid_bdevsw, &raid_cdevsw);
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
4100