xref: /netbsd-src/sys/dev/raidframe/rf_netbsdkintf.c (revision 122b5006ee1bd67145794b4cde92f4fe4781a5ec)
1 /*	$NetBSD: rf_netbsdkintf.c,v 1.401 2021/09/09 23:26:37 riastradh Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Greg Oster; Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1988 University of Utah.
34  * Copyright (c) 1990, 1993
35  *      The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed to Berkeley by
38  * the Systems Programming Group of the University of Utah Computer
39  * Science Department.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
66  *
67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
68  */
69 
70 /*
71  * Copyright (c) 1995 Carnegie-Mellon University.
72  * All rights reserved.
73  *
74  * Authors: Mark Holland, Jim Zelenka
75  *
76  * Permission to use, copy, modify and distribute this software and
77  * its documentation is hereby granted, provided that both the copyright
78  * notice and this permission notice appear in all copies of the
79  * software, derivative works or modified versions, and any portions
80  * thereof, and that both notices appear in supporting documentation.
81  *
82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85  *
86  * Carnegie Mellon requests users of this software to return to
87  *
88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
89  *  School of Computer Science
90  *  Carnegie Mellon University
91  *  Pittsburgh PA 15213-3890
92  *
93  * any improvements or extensions that they make and grant Carnegie the
94  * rights to redistribute these changes.
95  */
96 
97 /***********************************************************
98  *
99  * rf_kintf.c -- the kernel interface routines for RAIDframe
100  *
101  ***********************************************************/
102 
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.401 2021/09/09 23:26:37 riastradh Exp $");
105 
106 #ifdef _KERNEL_OPT
107 #include "opt_raid_autoconfig.h"
108 #include "opt_compat_netbsd32.h"
109 #endif
110 
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130 #include <sys/compat_stub.h>
131 
132 #include <prop/proplib.h>
133 
134 #include <dev/raidframe/raidframevar.h>
135 #include <dev/raidframe/raidframeio.h>
136 #include <dev/raidframe/rf_paritymap.h>
137 
138 #include "rf_raid.h"
139 #include "rf_copyback.h"
140 #include "rf_dag.h"
141 #include "rf_dagflags.h"
142 #include "rf_desc.h"
143 #include "rf_diskqueue.h"
144 #include "rf_etimer.h"
145 #include "rf_general.h"
146 #include "rf_kintf.h"
147 #include "rf_options.h"
148 #include "rf_driver.h"
149 #include "rf_parityscan.h"
150 #include "rf_threadstuff.h"
151 
152 #include "ioconf.h"
153 
154 #ifdef DEBUG
155 int     rf_kdebug_level = 0;
156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
157 #else				/* DEBUG */
158 #define db1_printf(a) { }
159 #endif				/* DEBUG */
160 
161 #ifdef DEBUG_ROOT
162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
163 #else
164 #define DPRINTF(a, ...)
165 #endif
166 
167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
168 static rf_declare_mutex2(rf_sparet_wait_mutex);
169 static rf_declare_cond2(rf_sparet_wait_cv);
170 static rf_declare_cond2(rf_sparet_resp_cv);
171 
172 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
173 						 * spare table */
174 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
175 						 * installation process */
176 #endif
177 
178 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
179 
180 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
181 
182 /* prototypes */
183 static void KernelWakeupFunc(struct buf *);
184 static void InitBP(struct buf *, struct vnode *, unsigned,
185     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
186     void *, int);
187 static void raidinit(struct raid_softc *);
188 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
189 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
190 
191 static int raid_match(device_t, cfdata_t, void *);
192 static void raid_attach(device_t, device_t, void *);
193 static int raid_detach(device_t, int);
194 
195 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
196     daddr_t, daddr_t);
197 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
198     daddr_t, daddr_t, int);
199 
200 static int raidwrite_component_label(unsigned,
201     dev_t, struct vnode *, RF_ComponentLabel_t *);
202 static int raidread_component_label(unsigned,
203     dev_t, struct vnode *, RF_ComponentLabel_t *);
204 
205 static int raid_diskstart(device_t, struct buf *bp);
206 static int raid_dumpblocks(device_t, void *, daddr_t, int);
207 static int raid_lastclose(device_t);
208 
209 static dev_type_open(raidopen);
210 static dev_type_close(raidclose);
211 static dev_type_read(raidread);
212 static dev_type_write(raidwrite);
213 static dev_type_ioctl(raidioctl);
214 static dev_type_strategy(raidstrategy);
215 static dev_type_dump(raiddump);
216 static dev_type_size(raidsize);
217 
/* Block-device entry points; D_DISK marks raid(4) as a disk device. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
228 
/*
 * Character-device (raw) entry points; operations that make no sense
 * for a disk are wired to the standard no-op stubs.
 */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
243 
/* Hooks handed to the common disk driver framework (struct dk_softc). */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
253 
254 #define	raidunit(x)	DISKUNIT(x)
255 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
256 
257 extern struct cfdriver raid_cd;
258 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
259     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
260     DVF_DETACH_SHUTDOWN);
261 
/* Internal representation of a rf_recon_req */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* component column the request applies to */
	RF_ReconReqFlags_t flags;	/* reconstruction request flags */
	void   *raidPtr;		/* RF_Raid_t of the set being acted on */
};
268 
269 /*
270  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
271  * Be aware that large numbers can allow the driver to consume a lot of
272  * kernel memory, especially on writes, and in degraded mode reads.
273  *
274  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
275  * a single 64K write will typically require 64K for the old data,
276  * 64K for the old parity, and 64K for the new parity, for a total
277  * of 192K (if the parity buffer is not re-used immediately).
278  * Even it if is used immediately, that's still 128K, which when multiplied
279  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
280  *
281  * Now in degraded mode, for example, a 64K read on the above setup may
282  * require data reconstruction, which will require *all* of the 4 remaining
283  * disks to participate -- 4 * 32K/disk == 128K again.
284  */
285 
286 #ifndef RAIDOUTSTANDING
287 #define RAIDOUTSTANDING   6
288 #endif
289 
290 #define RAIDLABELDEV(dev)	\
291 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
292 
293 /* declared here, and made public, for the benefit of KVM stuff.. */
294 
295 static int raidlock(struct raid_softc *);
296 static void raidunlock(struct raid_softc *);
297 
298 static int raid_detach_unlocked(struct raid_softc *);
299 
300 static void rf_markalldirty(RF_Raid_t *);
301 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
302 
303 static void rf_ReconThread(struct rf_recon_req_internal *);
304 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
305 static void rf_CopybackThread(RF_Raid_t *raidPtr);
306 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
307 static int rf_autoconfig(device_t);
308 static int rf_rescan(void);
309 static void rf_buildroothack(RF_ConfigSet_t *);
310 
311 static RF_AutoConfig_t *rf_find_raid_components(void);
312 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
313 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
314 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
315 static int rf_set_autoconfig(RF_Raid_t *, int);
316 static int rf_set_rootpartition(RF_Raid_t *, int);
317 static void rf_release_all_vps(RF_ConfigSet_t *);
318 static void rf_cleanup_config_set(RF_ConfigSet_t *);
319 static int rf_have_enough_components(RF_ConfigSet_t *);
320 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
321 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
322 
323 /*
324  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
325  * Note that this is overridden by having RAID_AUTOCONFIG as an option
326  * in the kernel config file.
327  */
328 #ifdef RAID_AUTOCONFIG
329 int raidautoconfig = 1;
330 #else
331 int raidautoconfig = 0;
332 #endif
333 static bool raidautoconfigdone = false;
334 
335 struct pool rf_alloclist_pool;   /* AllocList */
336 
337 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
338 static kmutex_t raid_lock;
339 
340 static struct raid_softc *
341 raidcreate(int unit) {
342 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
343 	sc->sc_unit = unit;
344 	cv_init(&sc->sc_cv, "raidunit");
345 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
346 	return sc;
347 }
348 
/* Tear down and free a softc created by raidcreate(). */
static void
raiddestroy(struct raid_softc *sc) {
	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_mutex);
	kmem_free(sc, sizeof(*sc));
}
355 
356 static struct raid_softc *
357 raidget(int unit, bool create) {
358 	struct raid_softc *sc;
359 	if (unit < 0) {
360 #ifdef DIAGNOSTIC
361 		panic("%s: unit %d!", __func__, unit);
362 #endif
363 		return NULL;
364 	}
365 	mutex_enter(&raid_lock);
366 	LIST_FOREACH(sc, &raids, sc_link) {
367 		if (sc->sc_unit == unit) {
368 			mutex_exit(&raid_lock);
369 			return sc;
370 		}
371 	}
372 	mutex_exit(&raid_lock);
373 	if (!create)
374 		return NULL;
375 	sc = raidcreate(unit);
376 	mutex_enter(&raid_lock);
377 	LIST_INSERT_HEAD(&raids, sc, sc_link);
378 	mutex_exit(&raid_lock);
379 	return sc;
380 }
381 
/* Unlink a softc from the global list (under raid_lock) and free it. */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
389 
/* Legacy pseudo-device attach hook; `num' is ignored. */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
399 
/*
 * One-shot RAID autoconfiguration at boot: find all RAID components,
 * group them into configuration sets, and hand the sets to
 * rf_buildroothack() for configuration and root-device selection.
 * Returns 1 if autoconfiguration was attempted, 0 if disabled or
 * already done.
 */
static int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return 0;

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
437 
438 int
439 rf_inited(const struct raid_softc *rs) {
440 	return (rs->sc_flags & RAIDF_INITED) != 0;
441 }
442 
/* Accessor: the RF_Raid_t embedded in the softc. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
447 
/* Accessor: the raid(4) unit number of this softc. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
452 
/*
 * Return 1 if RAID set `r' contains device `bdv' (the presumed boot
 * device) as a component, else 0.
 *
 * Component names are compared after stripping the "/dev/" prefix;
 * for dk(4) wedges the wedge's parent device name is compared instead.
 * The comparison is a prefix match over strlen(bootname) characters so
 * that a component such as "wd0a" matches boot device "wd0".
 * NOTE(review): the prefix match would also let e.g. component "wd10"
 * match boot device "wd1" -- confirm this is acceptable in practice.
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it. exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			/* wedge: compare against the parent disk's name */
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
483 
/*
 * Re-scan the system for RAID components and autoconfigure any newly
 * complete sets whose labels request it.  The outer loop repeats after
 * every pass that configured at least one set, so RAID sets layered on
 * top of freshly configured ones (recursive RAID) are found as well.
 * Unlike rf_buildroothack(), no root-device selection happens here.
 * Always returns 0.
 */
static int
rf_rescan(void)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets, *cset, *next_cset;
	struct raid_softc *sc;
	int raid_added;

	ac_list = rf_find_raid_components();
	config_sets = rf_create_auto_sets(ac_list);

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	return 0;
}
532 
533 
534 static void
535 rf_buildroothack(RF_ConfigSet_t *config_sets)
536 {
537 	RF_AutoConfig_t *ac_list;
538 	RF_ConfigSet_t *cset;
539 	RF_ConfigSet_t *next_cset;
540 	int num_root;
541 	int raid_added;
542 	struct raid_softc *sc, *rsc;
543 	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */
544 
545 	sc = rsc = NULL;
546 	num_root = 0;
547 
548 	raid_added = 1;
549 	while (raid_added > 0) {
550 		raid_added = 0;
551 		cset = config_sets;
552 		while (cset != NULL) {
553 			next_cset = cset->next;
554 			if (rf_have_enough_components(cset) &&
555 			    cset->ac->clabel->autoconfigure == 1) {
556 				sc = rf_auto_config_set(cset);
557 				if (sc != NULL) {
558 					aprint_debug("raid%d: configured ok, rootable %d\n",
559 						     sc->sc_unit, cset->rootable);
560 					/* We added one RAID set */
561 					raid_added++;
562 					if (cset->rootable) {
563 						rsc = sc;
564 						num_root++;
565 					}
566 				} else {
567 					/* The autoconfig didn't work :( */
568 					aprint_debug("Autoconfig failed\n");
569 					rf_release_all_vps(cset);
570 				}
571 			} else {
572 				/* we're not autoconfiguring this set...
573 				   release the associated resources */
574 				rf_release_all_vps(cset);
575 			}
576 			/* cleanup */
577 			rf_cleanup_config_set(cset);
578 			cset = next_cset;
579 		}
580 		if (raid_added > 0) {
581 			/* We added at least one RAID set, so re-scan for recursive RAID */
582 			ac_list = rf_find_raid_components();
583 			config_sets = rf_create_auto_sets(ac_list);
584 		}
585 	}
586 
587 	/* if the user has specified what the root device should be
588 	   then we don't touch booted_device or boothowto... */
589 
590 	if (rootspec != NULL) {
591 		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
592 		return;
593 	}
594 
595 	/* we found something bootable... */
596 
597 	/*
598 	 * XXX: The following code assumes that the root raid
599 	 * is the first ('a') partition. This is about the best
600 	 * we can do with a BSD disklabel, but we might be able
601 	 * to do better with a GPT label, by setting a specified
602 	 * attribute to indicate the root partition. We can then
603 	 * stash the partition number in the r->root_partition
604 	 * high bits (the bottom 2 bits are already used). For
605 	 * now we just set booted_partition to 0 when we override
606 	 * root.
607 	 */
608 	if (num_root == 1) {
609 		device_t candidate_root;
610 		dksc = &rsc->sc_dksc;
611 		if (dksc->sc_dkdev.dk_nwedges != 0) {
612 			char cname[sizeof(cset->ac->devname)];
613 			/* XXX: assume partition 'a' first */
614 			snprintf(cname, sizeof(cname), "%s%c",
615 			    device_xname(dksc->sc_dev), 'a');
616 			candidate_root = dkwedge_find_by_wname(cname);
617 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
618 			    cname);
619 			if (candidate_root == NULL) {
620 				/*
621 				 * If that is not found, because we don't use
622 				 * disklabel, return the first dk child
623 				 * XXX: we can skip the 'a' check above
624 				 * and always do this...
625 				 */
626 				size_t i = 0;
627 				candidate_root = dkwedge_find_by_parent(
628 				    device_xname(dksc->sc_dev), &i);
629 			}
630 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
631 			    candidate_root);
632 		} else
633 			candidate_root = dksc->sc_dev;
634 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
635 		DPRINTF("%s: booted_device=%p root_partition=%d "
636 			"contains_boot=%d",
637 		    __func__, booted_device, rsc->sc_r.root_partition,
638 			   rf_containsboot(&rsc->sc_r, booted_device));
639 		/* XXX the check for booted_device == NULL can probably be
640 		 * dropped, now that rf_containsboot handles that case.
641 		 */
642 		if (booted_device == NULL ||
643 		    rsc->sc_r.root_partition == 1 ||
644 		    rf_containsboot(&rsc->sc_r, booted_device)) {
645 			booted_device = candidate_root;
646 			booted_method = "raidframe/single";
647 			booted_partition = 0;	/* XXX assume 'a' */
648 			DPRINTF("%s: set booted_device=%s(%p)\n", __func__,
649 			    device_xname(booted_device), booted_device);
650 		}
651 	} else if (num_root > 1) {
652 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
653 		    booted_device);
654 
655 		/*
656 		 * Maybe the MD code can help. If it cannot, then
657 		 * setroot() will discover that we have no
658 		 * booted_device and will ask the user if nothing was
659 		 * hardwired in the kernel config file
660 		 */
661 		if (booted_device == NULL)
662 			return;
663 
664 		num_root = 0;
665 		mutex_enter(&raid_lock);
666 		LIST_FOREACH(sc, &raids, sc_link) {
667 			RF_Raid_t *r = &sc->sc_r;
668 			if (r->valid == 0)
669 				continue;
670 
671 			if (r->root_partition == 0)
672 				continue;
673 
674 			if (rf_containsboot(r, booted_device)) {
675 				num_root++;
676 				rsc = sc;
677 				dksc = &rsc->sc_dksc;
678 			}
679 		}
680 		mutex_exit(&raid_lock);
681 
682 		if (num_root == 1) {
683 			booted_device = dksc->sc_dev;
684 			booted_method = "raidframe/multi";
685 			booted_partition = 0;	/* XXX assume 'a' */
686 		} else {
687 			/* we can't guess.. require the user to answer... */
688 			boothowto |= RB_ASKNAME;
689 		}
690 	}
691 }
692 
693 static int
694 raidsize(dev_t dev)
695 {
696 	struct raid_softc *rs;
697 	struct dk_softc *dksc;
698 	unsigned int unit;
699 
700 	unit = raidunit(dev);
701 	if ((rs = raidget(unit, false)) == NULL)
702 		return -1;
703 	dksc = &rs->sc_dksc;
704 
705 	if ((rs->sc_flags & RAIDF_INITED) == 0)
706 		return -1;
707 
708 	return dk_size(dksc, dev);
709 }
710 
/*
 * d_dump entry point: forward a crash dump to the common disk code,
 * which in turn calls raid_dumpblocks() for the actual component I/O.
 */
static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

        /*
           Note that blkno is relative to this particular partition.
           By adding RF_PROTECTED_SECTORS, we get a value that
	   is relative to the partition used for the underlying component.
        */
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
}
735 
/*
 * dkdriver dumpblocks hook: write `nblk' blocks from `va', starting at
 * `blkno', to a single live component of the set.  Only RAID 1 sets
 * (one data column, one parity column) are supported.  Returns 0 on
 * success, EINVAL when the layout is unsupported or no usable
 * component exists, ENXIO when the chosen component has no bdevsw, or
 * an error from raidlock().
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* hand the dump request to the chosen component's block driver */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
841 
842 /* ARGSUSED */
843 static int
844 raidopen(dev_t dev, int flags, int fmt,
845     struct lwp *l)
846 {
847 	int     unit = raidunit(dev);
848 	struct raid_softc *rs;
849 	struct dk_softc *dksc;
850 	int     error = 0;
851 	int     part, pmask;
852 
853 	if ((rs = raidget(unit, true)) == NULL)
854 		return ENXIO;
855 	if ((error = raidlock(rs)) != 0)
856 		return error;
857 
858 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
859 		error = EBUSY;
860 		goto bad;
861 	}
862 
863 	dksc = &rs->sc_dksc;
864 
865 	part = DISKPART(dev);
866 	pmask = (1 << part);
867 
868 	if (!DK_BUSY(dksc, pmask) &&
869 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
870 		/* First one... mark things as dirty... Note that we *MUST*
871 		 have done a configure before this.  I DO NOT WANT TO BE
872 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
873 		 THAT THEY BELONG TOGETHER!!!!! */
874 		/* XXX should check to see if we're only open for reading
875 		   here... If so, we needn't do this, but then need some
876 		   other way of keeping track of what's happened.. */
877 
878 		rf_markalldirty(&rs->sc_r);
879 	}
880 
881 	if ((rs->sc_flags & RAIDF_INITED) != 0)
882 		error = dk_open(dksc, dev, flags, fmt, l);
883 
884 bad:
885 	raidunlock(rs);
886 
887 	return error;
888 
889 
890 }
891 
892 static int
893 raid_lastclose(device_t self)
894 {
895 	struct raid_softc *rs = raidsoftc(self);
896 
897 	/* Last one... device is not unconfigured yet.
898 	   Device shutdown has taken care of setting the
899 	   clean bits if RAIDF_INITED is not set
900 	   mark things as clean... */
901 
902 	rf_update_component_labels(&rs->sc_r,
903 	    RF_FINAL_COMPONENT_UPDATE);
904 
905 	/* pass to unlocked code */
906 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
907 		rs->sc_flags |= RAIDF_DETACH;
908 
909 	return 0;
910 }
911 
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return error;

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		/* raid_lastclose() sets RAIDF_DETACH when shutdown is pending */
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	/* drop the softc lock before detaching/destroying */
	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return error;

}
951 
/* Signal raidPtr->iodone_cv (under iodone_lock) to wake any waiter. */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
959 
960 static void
961 raidstrategy(struct buf *bp)
962 {
963 	unsigned int unit;
964 	struct raid_softc *rs;
965 	struct dk_softc *dksc;
966 	RF_Raid_t *raidPtr;
967 
968 	unit = raidunit(bp->b_dev);
969 	if ((rs = raidget(unit, false)) == NULL) {
970 		bp->b_error = ENXIO;
971 		goto fail;
972 	}
973 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
974 		bp->b_error = ENXIO;
975 		goto fail;
976 	}
977 	dksc = &rs->sc_dksc;
978 	raidPtr = &rs->sc_r;
979 
980 	/* Queue IO only */
981 	if (dk_strategy_defer(dksc, bp))
982 		goto done;
983 
984 	/* schedule the IO to happen at the next convenient time */
985 	raid_wakeup(raidPtr);
986 
987 done:
988 	return;
989 
990 fail:
991 	bp->b_resid = bp->b_bcount;
992 	biodone(bp);
993 }
994 
995 static int
996 raid_diskstart(device_t dev, struct buf *bp)
997 {
998 	struct raid_softc *rs = raidsoftc(dev);
999 	RF_Raid_t *raidPtr;
1000 
1001 	raidPtr = &rs->sc_r;
1002 	if (!raidPtr->valid) {
1003 		db1_printf(("raid is not valid..\n"));
1004 		return ENODEV;
1005 	}
1006 
1007 	/* XXX */
1008 	bp->b_resid = 0;
1009 
1010 	return raiddoaccess(raidPtr, bp);
1011 }
1012 
/*
 * Completion path for a RAIDframe request: finish `bp' through the
 * common disk code, return the I/O slot (raidPtr->openings), and wake
 * the I/O machinery so queued work can proceed.
 */
void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	/* one more outstanding request may now be issued */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}
1031 
1032 /* ARGSUSED */
1033 static int
1034 raidread(dev_t dev, struct uio *uio, int flags)
1035 {
1036 	int     unit = raidunit(dev);
1037 	struct raid_softc *rs;
1038 
1039 	if ((rs = raidget(unit, false)) == NULL)
1040 		return ENXIO;
1041 
1042 	if ((rs->sc_flags & RAIDF_INITED) == 0)
1043 		return ENXIO;
1044 
1045 	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
1046 
1047 }
1048 
1049 /* ARGSUSED */
1050 static int
1051 raidwrite(dev_t dev, struct uio *uio, int flags)
1052 {
1053 	int     unit = raidunit(dev);
1054 	struct raid_softc *rs;
1055 
1056 	if ((rs = raidget(unit, false)) == NULL)
1057 		return ENXIO;
1058 
1059 	if ((rs->sc_flags & RAIDF_INITED) == 0)
1060 		return ENXIO;
1061 
1062 	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
1063 
1064 }
1065 
/*
 * Tear down a configured RAID set.  "unlocked" refers to the
 * RAIDframe internals; the unit itself is presumably locked by the
 * caller via raidlock() -- TODO confirm against callers.
 *
 * Fails with EBUSY while the device is open or a reconstruction,
 * parity rewrite or copyback is in progress.  On success the
 * RAIDframe engine is shut down, queued buffers are drained and
 * the disk(9)/dk(4) attachment is dismantled.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while open or while background operations run. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Nothing configured: trivially done. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
1103 
1104 static bool
1105 rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
1106 {
1107 	switch (cmd) {
1108 	case RAIDFRAME_ADD_HOT_SPARE:
1109 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
1110 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1111 	case RAIDFRAME_CHECK_PARITY:
1112 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1113 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1114 	case RAIDFRAME_CHECK_RECON_STATUS:
1115 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1116 	case RAIDFRAME_COPYBACK:
1117 	case RAIDFRAME_DELETE_COMPONENT:
1118 	case RAIDFRAME_FAIL_DISK:
1119 	case RAIDFRAME_GET_ACCTOTALS:
1120 	case RAIDFRAME_GET_COMPONENT_LABEL:
1121 	case RAIDFRAME_GET_INFO:
1122 	case RAIDFRAME_GET_SIZE:
1123 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
1124 	case RAIDFRAME_INIT_LABELS:
1125 	case RAIDFRAME_KEEP_ACCTOTALS:
1126 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
1127 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
1128 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
1129 	case RAIDFRAME_PARITYMAP_STATUS:
1130 	case RAIDFRAME_REBUILD_IN_PLACE:
1131 	case RAIDFRAME_REMOVE_HOT_SPARE:
1132 	case RAIDFRAME_RESET_ACCTOTALS:
1133 	case RAIDFRAME_REWRITEPARITY:
1134 	case RAIDFRAME_SET_AUTOCONFIG:
1135 	case RAIDFRAME_SET_COMPONENT_LABEL:
1136 	case RAIDFRAME_SET_ROOT:
1137 		return (rs->sc_flags & RAIDF_INITED) == 0;
1138 	}
1139 	return false;
1140 }
1141 
/*
 * Fail component rr->col of the set and start a reconstruction
 * thread (rf_ReconThread) with a kernel copy of the request.
 *
 * Returns EINVAL for RAID 0 sets, out-of-range columns, or states
 * where failing the disk would make things worse (reconstruction
 * already running, another component already failed, or the target
 * is a spared disk); ENOMEM if the request copy cannot be made.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	/* Error exits from the checks above: drop the lock here. */
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
1190 
1191 static int
1192 rf_copyinspecificbuf(RF_Config_t *k_cfg)
1193 {
1194 	/* allocate a buffer for the layout-specific data, and copy it in */
1195 	if (k_cfg->layoutSpecificSize == 0)
1196 		return 0;
1197 
1198 	if (k_cfg->layoutSpecificSize > 10000) {
1199 	    /* sanity check */
1200 	    return EINVAL;
1201 	}
1202 
1203 	u_char *specific_buf;
1204 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
1205 	if (specific_buf == NULL)
1206 		return ENOMEM;
1207 
1208 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1209 	    k_cfg->layoutSpecificSize);
1210 	if (retcode) {
1211 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1212 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
1213 		return retcode;
1214 	}
1215 
1216 	k_cfg->layoutSpecific = specific_buf;
1217 	return 0;
1218 }
1219 
1220 static int
1221 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
1222 {
1223 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
1224 
1225 	if (rs->sc_r.valid) {
1226 		/* There is a valid RAID set running on this unit! */
1227 		printf("raid%d: Device already configured!\n", rs->sc_unit);
1228 		return EINVAL;
1229 	}
1230 
1231 	/* copy-in the configuration information */
1232 	/* data points to a pointer to the configuration structure */
1233 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
1234 	if (*k_cfg == NULL) {
1235 		return ENOMEM;
1236 	}
1237 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
1238 	if (retcode == 0)
1239 		return 0;
1240 	RF_Free(*k_cfg, sizeof(RF_Config_t));
1241 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
1242 	rs->sc_flags |= RAIDF_SHUTDOWN;
1243 	return retcode;
1244 }
1245 
/*
 * Build a RAID set from the kernel copy of the user configuration
 * (k_cfg, as returned by rf_getConfiguration()).
 *
 * Always consumes k_cfg (and any layout-specific buffer).  On
 * failure RAIDF_SHUTDOWN is set so the unit is detached on close.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/* Pull the layout-specific data in from user space. */
	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 *  there is no stale data left in the case of a
	 *  reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		/* Attach the pseudo-device and disk glue. */
		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
1298 
#if RF_DISABLED
/*
 * Install a user-supplied component label (compiled out: guarded
 * by RF_DISABLED).  Currently only validates the column and copies
 * the label through unmodified -- see the XXX notes below.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif
1337 
/*
 * Initialize the component labels of every live component from the
 * current configuration, taking only the serial number from the
 * user-supplied label.  Always returns 0.
 */
static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	   we only want the serial number from
	   the above.  We get all the rest of the information
	   from the config that was used to create this RAID
	   set.
	   */

	raidPtr->serial_number = clabel->serial_number;

	for (int column = 0; column < raidPtr->numCol; column++) {
		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
		/* Skip failed/dead components. */
		if (RF_DEAD_DISK(diskPtr->status))
			continue;
		RF_ComponentLabel_t *ci_label = raidget_component_label(
		    raidPtr, column);
		/* Zeroing this is important. */
		memset(ci_label, 0, sizeof(*ci_label));
		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = raidPtr->serial_number;
		ci_label->row = 0; /* we don't pretend to support more */
		rf_component_label_set_partitionsize(ci_label,
		    diskPtr->partitionSize);
		ci_label->column = column;
		raidflush_component_label(raidPtr, column);
		/* XXXjld what about the spares? */
	}

	return 0;
}
1370 
/*
 * Rebuild the named component in place (back onto the same disk),
 * by starting an rf_ReconstructInPlaceThread with a kernel copy of
 * the request.
 *
 * Returns EINVAL for RAID 0, a running reconstruction, a bad
 * column, too many existing failures, a column already being
 * reconstructed, or a spared column; ENOMEM on allocation failure.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	/* Kernel copy of the request: the thread outlives the ioctl. */
	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
1438 
1439 static int
1440 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
1441 {
1442 	/*
1443 	 * This makes no sense on a RAID 0, or if we are not reconstructing
1444 	 * so tell the user it's done.
1445 	 */
1446 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
1447 	    raidPtr->status != rf_rs_reconstructing) {
1448 		*data = 100;
1449 		return 0;
1450 	}
1451 	if (raidPtr->reconControl->numRUsTotal == 0) {
1452 		*data = 0;
1453 		return 0;
1454 	}
1455 	*data = (raidPtr->reconControl->numRUsComplete * 100
1456 	    / raidPtr->reconControl->numRUsTotal);
1457 	return 0;
1458 }
1459 
1460 static int
1461 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1462 {
1463 	int     unit = raidunit(dev);
1464 	int     part, pmask;
1465 	struct raid_softc *rs;
1466 	struct dk_softc *dksc;
1467 	RF_Config_t *k_cfg;
1468 	RF_Raid_t *raidPtr;
1469 	RF_AccTotals_t *totals;
1470 	RF_SingleComponent_t component;
1471 	RF_DeviceConfig_t *d_cfg, *ucfgp;
1472 	int retcode = 0;
1473 	int column;
1474 	RF_ComponentLabel_t *clabel;
1475 	RF_SingleComponent_t *sparePtr,*componentPtr;
1476 	int d;
1477 
1478 	if ((rs = raidget(unit, false)) == NULL)
1479 		return ENXIO;
1480 
1481 	dksc = &rs->sc_dksc;
1482 	raidPtr = &rs->sc_r;
1483 
1484 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1485 	    (int) DISKPART(dev), (int) unit, cmd));
1486 
1487 	/* Must be initialized for these... */
1488 	if (rf_must_be_initialized(rs, cmd))
1489 		return ENXIO;
1490 
1491 	switch (cmd) {
1492 		/* configure the system */
1493 	case RAIDFRAME_CONFIGURE:
1494 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
1495 			return retcode;
1496 		return rf_construct(rs, k_cfg);
1497 
1498 		/* shutdown the system */
1499 	case RAIDFRAME_SHUTDOWN:
1500 
1501 		part = DISKPART(dev);
1502 		pmask = (1 << part);
1503 
1504 		if ((retcode = raidlock(rs)) != 0)
1505 			return retcode;
1506 
1507 		if (DK_BUSY(dksc, pmask) ||
1508 		    raidPtr->recon_in_progress != 0 ||
1509 		    raidPtr->parity_rewrite_in_progress != 0 ||
1510 		    raidPtr->copyback_in_progress != 0)
1511 			retcode = EBUSY;
1512 		else {
1513 			/* detach and free on close */
1514 			rs->sc_flags |= RAIDF_SHUTDOWN;
1515 			retcode = 0;
1516 		}
1517 
1518 		raidunlock(rs);
1519 
1520 		return retcode;
1521 	case RAIDFRAME_GET_COMPONENT_LABEL:
1522 		return rf_get_component_label(raidPtr, data);
1523 
1524 #if RF_DISABLED
1525 	case RAIDFRAME_SET_COMPONENT_LABEL:
1526 		return rf_set_component_label(raidPtr, data);
1527 #endif
1528 
1529 	case RAIDFRAME_INIT_LABELS:
1530 		return rf_init_component_label(raidPtr, data);
1531 
1532 	case RAIDFRAME_SET_AUTOCONFIG:
1533 		d = rf_set_autoconfig(raidPtr, *(int *) data);
1534 		printf("raid%d: New autoconfig value is: %d\n",
1535 		       raidPtr->raidid, d);
1536 		*(int *) data = d;
1537 		return retcode;
1538 
1539 	case RAIDFRAME_SET_ROOT:
1540 		d = rf_set_rootpartition(raidPtr, *(int *) data);
1541 		printf("raid%d: New rootpartition value is: %d\n",
1542 		       raidPtr->raidid, d);
1543 		*(int *) data = d;
1544 		return retcode;
1545 
1546 		/* initialize all parity */
1547 	case RAIDFRAME_REWRITEPARITY:
1548 
1549 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1550 			/* Parity for RAID 0 is trivially correct */
1551 			raidPtr->parity_good = RF_RAID_CLEAN;
1552 			return 0;
1553 		}
1554 
1555 		if (raidPtr->parity_rewrite_in_progress == 1) {
1556 			/* Re-write is already in progress! */
1557 			return EINVAL;
1558 		}
1559 
1560 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1561 		    rf_RewriteParityThread, raidPtr,"raid_parity");
1562 
1563 	case RAIDFRAME_ADD_HOT_SPARE:
1564 		sparePtr = (RF_SingleComponent_t *) data;
1565 		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
1566 		return rf_add_hot_spare(raidPtr, &component);
1567 
1568 	case RAIDFRAME_REMOVE_HOT_SPARE:
1569 		return retcode;
1570 
1571 	case RAIDFRAME_DELETE_COMPONENT:
1572 		componentPtr = (RF_SingleComponent_t *)data;
1573 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
1574 		return rf_delete_component(raidPtr, &component);
1575 
1576 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
1577 		componentPtr = (RF_SingleComponent_t *)data;
1578 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
1579 		return rf_incorporate_hot_spare(raidPtr, &component);
1580 
1581 	case RAIDFRAME_REBUILD_IN_PLACE:
1582 		return rf_rebuild_in_place(raidPtr, data);
1583 
1584 	case RAIDFRAME_GET_INFO:
1585 		ucfgp = *(RF_DeviceConfig_t **)data;
1586 		d_cfg = RF_Malloc(sizeof(*d_cfg));
1587 		if (d_cfg == NULL)
1588 			return ENOMEM;
1589 		retcode = rf_get_info(raidPtr, d_cfg);
1590 		if (retcode == 0) {
1591 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
1592 		}
1593 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1594 		return retcode;
1595 
1596 	case RAIDFRAME_CHECK_PARITY:
1597 		*(int *) data = raidPtr->parity_good;
1598 		return 0;
1599 
1600 	case RAIDFRAME_PARITYMAP_STATUS:
1601 		if (rf_paritymap_ineligible(raidPtr))
1602 			return EINVAL;
1603 		rf_paritymap_status(raidPtr->parity_map, data);
1604 		return 0;
1605 
1606 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
1607 		if (rf_paritymap_ineligible(raidPtr))
1608 			return EINVAL;
1609 		if (raidPtr->parity_map == NULL)
1610 			return ENOENT; /* ??? */
1611 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
1612 			return EINVAL;
1613 		return 0;
1614 
1615 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
1616 		if (rf_paritymap_ineligible(raidPtr))
1617 			return EINVAL;
1618 		*(int *) data = rf_paritymap_get_disable(raidPtr);
1619 		return 0;
1620 
1621 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
1622 		if (rf_paritymap_ineligible(raidPtr))
1623 			return EINVAL;
1624 		rf_paritymap_set_disable(raidPtr, *(int *)data);
1625 		/* XXX should errors be passed up? */
1626 		return 0;
1627 
1628 	case RAIDFRAME_RESCAN:
1629 		return rf_rescan();
1630 
1631 	case RAIDFRAME_RESET_ACCTOTALS:
1632 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1633 		return 0;
1634 
1635 	case RAIDFRAME_GET_ACCTOTALS:
1636 		totals = (RF_AccTotals_t *) data;
1637 		*totals = raidPtr->acc_totals;
1638 		return 0;
1639 
1640 	case RAIDFRAME_KEEP_ACCTOTALS:
1641 		raidPtr->keep_acc_totals = *(int *)data;
1642 		return 0;
1643 
1644 	case RAIDFRAME_GET_SIZE:
1645 		*(int *) data = raidPtr->totalSectors;
1646 		return 0;
1647 
1648 	case RAIDFRAME_FAIL_DISK:
1649 		return rf_fail_disk(raidPtr, data);
1650 
1651 		/* invoke a copyback operation after recon on whatever disk
1652 		 * needs it, if any */
1653 	case RAIDFRAME_COPYBACK:
1654 
1655 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1656 			/* This makes no sense on a RAID 0!! */
1657 			return EINVAL;
1658 		}
1659 
1660 		if (raidPtr->copyback_in_progress == 1) {
1661 			/* Copyback is already in progress! */
1662 			return EINVAL;
1663 		}
1664 
1665 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
1666 		    rf_CopybackThread, raidPtr, "raid_copyback");
1667 
1668 		/* return the percentage completion of reconstruction */
1669 	case RAIDFRAME_CHECK_RECON_STATUS:
1670 		return rf_check_recon_status(raidPtr, data);
1671 
1672 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1673 		rf_check_recon_status_ext(raidPtr, data);
1674 		return 0;
1675 
1676 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1677 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1678 			/* This makes no sense on a RAID 0, so tell the
1679 			   user it's done. */
1680 			*(int *) data = 100;
1681 			return 0;
1682 		}
1683 		if (raidPtr->parity_rewrite_in_progress == 1) {
1684 			*(int *) data = 100 *
1685 				raidPtr->parity_rewrite_stripes_done /
1686 				raidPtr->Layout.numStripe;
1687 		} else {
1688 			*(int *) data = 100;
1689 		}
1690 		return 0;
1691 
1692 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1693 		rf_check_parityrewrite_status_ext(raidPtr, data);
1694 		return 0;
1695 
1696 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
1697 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1698 			/* This makes no sense on a RAID 0 */
1699 			*(int *) data = 100;
1700 			return 0;
1701 		}
1702 		if (raidPtr->copyback_in_progress == 1) {
1703 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
1704 				raidPtr->Layout.numStripe;
1705 		} else {
1706 			*(int *) data = 100;
1707 		}
1708 		return 0;
1709 
1710 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1711 		rf_check_copyback_status_ext(raidPtr, data);
1712 		return 0;
1713 
1714 	case RAIDFRAME_SET_LAST_UNIT:
1715 		for (column = 0; column < raidPtr->numCol; column++)
1716 			if (raidPtr->Disks[column].status != rf_ds_optimal)
1717 				return EBUSY;
1718 
1719 		for (column = 0; column < raidPtr->numCol; column++) {
1720 			clabel = raidget_component_label(raidPtr, column);
1721 			clabel->last_unit = *(int *)data;
1722 			raidflush_component_label(raidPtr, column);
1723 		}
1724 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1725 		return 0;
1726 
1727 		/* the sparetable daemon calls this to wait for the kernel to
1728 		 * need a spare table. this ioctl does not return until a
1729 		 * spare table is needed. XXX -- calling mpsleep here in the
1730 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1731 		 * -- I should either compute the spare table in the kernel,
1732 		 * or have a different -- XXX XXX -- interface (a different
1733 		 * character device) for delivering the table     -- XXX */
1734 #if RF_DISABLED
1735 	case RAIDFRAME_SPARET_WAIT:
1736 		rf_lock_mutex2(rf_sparet_wait_mutex);
1737 		while (!rf_sparet_wait_queue)
1738 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1739 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
1740 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1741 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1742 
1743 		/* structure assignment */
1744 		*((RF_SparetWait_t *) data) = *waitreq;
1745 
1746 		RF_Free(waitreq, sizeof(*waitreq));
1747 		return 0;
1748 
1749 		/* wakes up a process waiting on SPARET_WAIT and puts an error
1750 		 * code in it that will cause the dameon to exit */
1751 	case RAIDFRAME_ABORT_SPARET_WAIT:
1752 		waitreq = RF_Malloc(sizeof(*waitreq));
1753 		waitreq->fcol = -1;
1754 		rf_lock_mutex2(rf_sparet_wait_mutex);
1755 		waitreq->next = rf_sparet_wait_queue;
1756 		rf_sparet_wait_queue = waitreq;
1757 		rf_broadcast_cond2(rf_sparet_wait_cv);
1758 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1759 		return 0;
1760 
1761 		/* used by the spare table daemon to deliver a spare table
1762 		 * into the kernel */
1763 	case RAIDFRAME_SEND_SPARET:
1764 
1765 		/* install the spare table */
1766 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1767 
1768 		/* respond to the requestor.  the return status of the spare
1769 		 * table installation is passed in the "fcol" field */
1770 		waitred = RF_Malloc(sizeof(*waitreq));
1771 		waitreq->fcol = retcode;
1772 		rf_lock_mutex2(rf_sparet_wait_mutex);
1773 		waitreq->next = rf_sparet_resp_queue;
1774 		rf_sparet_resp_queue = waitreq;
1775 		rf_broadcast_cond2(rf_sparet_resp_cv);
1776 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1777 
1778 		return retcode;
1779 #endif
1780 	default:
1781 		/*
1782 		 * Don't bother trying to load compat modules
1783 		 * if it is not our ioctl. This is more efficient
1784 		 * and makes rump tests not depend on compat code
1785 		 */
1786 		if (IOCGROUP(cmd) != 'r')
1787 			break;
1788 #ifdef _LP64
1789 		if ((l->l_proc->p_flag & PK_32) != 0) {
1790 			module_autoload("compat_netbsd32_raid",
1791 			    MODULE_CLASS_EXEC);
1792 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
1793 			    (rs, cmd, data), enosys(), retcode);
1794 			if (retcode != EPASSTHROUGH)
1795 				return retcode;
1796 		}
1797 #endif
1798 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
1799 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
1800 		    (rs, cmd, data), enosys(), retcode);
1801 		if (retcode != EPASSTHROUGH)
1802 			return retcode;
1803 
1804 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
1805 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
1806 		    (rs, cmd, data), enosys(), retcode);
1807 		if (retcode != EPASSTHROUGH)
1808 			return retcode;
1809 		break; /* fall through to the os-specific code below */
1810 
1811 	}
1812 
1813 	if (!raidPtr->valid)
1814 		return EINVAL;
1815 
1816 	/*
1817 	 * Add support for "regular" device ioctls here.
1818 	 */
1819 
1820 	switch (cmd) {
1821 	case DIOCGCACHE:
1822 		retcode = rf_get_component_caches(raidPtr, (int *)data);
1823 		break;
1824 
1825 	case DIOCCACHESYNC:
1826 		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
1827 		break;
1828 
1829 	default:
1830 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
1831 		break;
1832 	}
1833 
1834 	return retcode;
1835 
1836 }
1837 
1838 
/*
 * raidinit -- complete the rest of the initialization for the
 * RAIDframe device: attach a pseudo-device instance, set up the
 * dk(4)/disk(9) glue, allocate the buffer queue, mark the unit
 * usable and kick off wedge discovery.  On pseudo-device attach
 * failure the unit is left without RAIDF_INITED.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}
1898 
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device were requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Post the request on the wait queue and wake the daemon. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2() drops the mutex while we sleep */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
#endif
1933 
/* a wrapper around rf_DoAccess that extracts appropriate info from the
 * bp & passes it down.
 * any calls originating in the kernel must use non-blocking I/O
 * do some extra sanity checking to return "appropriate" error values for
 * certain conditions (to make some standard utilities work)
 *
 * Formerly known as: rf_DoAccessKernel
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Drop the lock around the label update, then re-take
		 * it to decrement the counter. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Drive the dk queue; dk calls back into raid_diskstart(). */
	dk_start(dksc, NULL);
}
1968 
/*
 * Translate a buf into a RAIDframe access and submit it to the DAG
 * engine via rf_DoAccess() (non-blocking I/O).
 *
 * Returns EAGAIN when no openings are available (so the caller may
 * retry later), ENOSPC when the request is out of range or not a
 * multiple of the sector size, otherwise the rf_DoAccess() result.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int rc;

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	/* pb: one extra sector for a partial trailing sector. */
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): "1 ||" forces this branch unconditionally;
	 * db1_printf is the debug hook. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* Range check; the "sum < ..." terms catch wraparound. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume an opening; returned in raiddone(). */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
2035 
/* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */

/*
 * Dispatch one RAIDframe disk-queue request to the underlying
 * component: NOPs complete immediately via KernelWakeupFunc();
 * reads/writes are initialized with InitBP() and sent down with
 * bdev_strategy().  Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately via the I/O callback. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up the component buf; KernelWakeupFunc is the
		 * completion callback, req its argument. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
2111 /* this is the callback function associated with a I/O invoked from
2112    kernel code.
2113  */
/*
 * biodone callback for component I/O issued by rf_DispatchKernelIO().
 * Records the error status, fails the component on I/O error (if doing
 * so would not exceed the array's fault tolerance), queues the request
 * on the raidPtr->iodone list and wakes the raidio thread.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* rf_DispatchKernelIO stashed the request in b_private. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* Account the physical-I/O time for this request. */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2180 
2181 
2182 /*
2183  * initialize a buf structure for doing an I/O in the kernel.
2184  */
/*
 * initialize a buf structure for doing an I/O in the kernel.
 *
 * rw_flag is B_READ or B_WRITE; startSect/numSect are in units of the
 * component's sectors (1 << logBytesPerSector bytes each).  cbFunc is
 * installed as b_iodone with cbArg in b_private.  Panics if the
 * resulting byte count is zero.
 */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
{
	/* Preserve only the rf_b_pass bits of any prior flags. */
	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	/* Convert the sector number to DEV_BSIZE units for b_blkno. */
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}
2206 
2207 /*
2208  * Wait interruptibly for an exclusive lock.
2209  *
2210  * XXX
2211  * Several drivers do this; it should be abstracted and made MP-safe.
2212  * (Hmm... where have we seen this warning before :->  GO )
2213  */
2214 static int
2215 raidlock(struct raid_softc *rs)
2216 {
2217 	int     error;
2218 
2219 	error = 0;
2220 	mutex_enter(&rs->sc_mutex);
2221 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2222 		rs->sc_flags |= RAIDF_WANTED;
2223 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2224 		if (error != 0)
2225 			goto done;
2226 	}
2227 	rs->sc_flags |= RAIDF_LOCKED;
2228 done:
2229 	mutex_exit(&rs->sc_mutex);
2230 	return error;
2231 }
2232 /*
2233  * Unlock and wake up any waiters.
2234  */
2235 static void
2236 raidunlock(struct raid_softc *rs)
2237 {
2238 
2239 	mutex_enter(&rs->sc_mutex);
2240 	rs->sc_flags &= ~RAIDF_LOCKED;
2241 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2242 		rs->sc_flags &= ~RAIDF_WANTED;
2243 		cv_broadcast(&rs->sc_cv);
2244 	}
2245 	mutex_exit(&rs->sc_mutex);
2246 }
2247 
2248 
2249 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
2250 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
2251 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
2252 
2253 static daddr_t
2254 rf_component_info_offset(void)
2255 {
2256 
2257 	return RF_COMPONENT_INFO_OFFSET;
2258 }
2259 
2260 static daddr_t
2261 rf_component_info_size(unsigned secsize)
2262 {
2263 	daddr_t info_size;
2264 
2265 	KASSERT(secsize);
2266 	if (secsize > RF_COMPONENT_INFO_SIZE)
2267 		info_size = secsize;
2268 	else
2269 		info_size = RF_COMPONENT_INFO_SIZE;
2270 
2271 	return info_size;
2272 }
2273 
2274 static daddr_t
2275 rf_parity_map_offset(RF_Raid_t *raidPtr)
2276 {
2277 	daddr_t map_offset;
2278 
2279 	KASSERT(raidPtr->bytesPerSector);
2280 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2281 		map_offset = raidPtr->bytesPerSector;
2282 	else
2283 		map_offset = RF_COMPONENT_INFO_SIZE;
2284 	map_offset += rf_component_info_offset();
2285 
2286 	return map_offset;
2287 }
2288 
2289 static daddr_t
2290 rf_parity_map_size(RF_Raid_t *raidPtr)
2291 {
2292 	daddr_t map_size;
2293 
2294 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2295 		map_size = raidPtr->bytesPerSector;
2296 	else
2297 		map_size = RF_PARITY_MAP_SIZE;
2298 
2299 	return map_size;
2300 }
2301 
2302 int
2303 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2304 {
2305 	RF_ComponentLabel_t *clabel;
2306 
2307 	clabel = raidget_component_label(raidPtr, col);
2308 	clabel->clean = RF_RAID_CLEAN;
2309 	raidflush_component_label(raidPtr, col);
2310 	return(0);
2311 }
2312 
2313 
2314 int
2315 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2316 {
2317 	RF_ComponentLabel_t *clabel;
2318 
2319 	clabel = raidget_component_label(raidPtr, col);
2320 	clabel->clean = RF_RAID_DIRTY;
2321 	raidflush_component_label(raidPtr, col);
2322 	return(0);
2323 }
2324 
2325 int
2326 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2327 {
2328 	KASSERT(raidPtr->bytesPerSector);
2329 
2330 	return raidread_component_label(raidPtr->bytesPerSector,
2331 	    raidPtr->Disks[col].dev,
2332 	    raidPtr->raid_cinfo[col].ci_vp,
2333 	    &raidPtr->raid_cinfo[col].ci_label);
2334 }
2335 
2336 RF_ComponentLabel_t *
2337 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2338 {
2339 	return &raidPtr->raid_cinfo[col].ci_label;
2340 }
2341 
2342 int
2343 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2344 {
2345 	RF_ComponentLabel_t *label;
2346 
2347 	label = &raidPtr->raid_cinfo[col].ci_label;
2348 	label->mod_counter = raidPtr->mod_counter;
2349 #ifndef RF_NO_PARITY_MAP
2350 	label->parity_map_modcount = label->mod_counter;
2351 #endif
2352 	return raidwrite_component_label(raidPtr->bytesPerSector,
2353 	    raidPtr->Disks[col].dev,
2354 	    raidPtr->raid_cinfo[col].ci_vp, label);
2355 }
2356 
2357 /*
2358  * Swap the label endianness.
2359  *
2360  * Everything in the component label is 4-byte-swapped except the version,
2361  * which is kept in the byte-swapped version at all times, and indicates
2362  * for the writer that a swap is necessary.
2363  *
2364  * For reads it is expected that out_label == clabel, but writes expect
2365  * separate labels so only the re-swapped label is written out to disk,
2366  * leaving the swapped-except-version internally.
2367  *
2368  * Only support swapping label version 2.
2369  */
static void
rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
{
	/* Walk the label as an array of 32-bit words; relies on the
	 * struct consisting of 4-byte fields between serial_number and
	 * future_use2 (layout assumption — see label definition). */
	int	*in, *out, *in_last;

	/* Only version-2 labels (stored byte-swapped) may be swapped. */
	KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));

	/* Don't swap the label, but do copy it. */
	out_label->version = clabel->version;

	/* Swap every word from serial_number up to (not including)
	 * future_use2[42], the end of the swappable region. */
	in = &clabel->serial_number;
	in_last = &clabel->future_use2[42];
	out = &out_label->serial_number;

	for (; in < in_last; in++, out++)
		*out = bswap32(*in);
}
2387 
2388 static int
2389 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2390     RF_ComponentLabel_t *clabel)
2391 {
2392 	int error;
2393 
2394 	error = raidread_component_area(dev, b_vp, clabel,
2395 	    sizeof(RF_ComponentLabel_t),
2396 	    rf_component_info_offset(),
2397 	    rf_component_info_size(secsize));
2398 
2399 	if (error == 0 &&
2400 	    clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2401 		rf_swap_label(clabel, clabel);
2402 	}
2403 
2404 	return error;
2405 }
2406 
2407 /* ARGSUSED */
2408 static int
2409 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2410     size_t msize, daddr_t offset, daddr_t dsize)
2411 {
2412 	struct buf *bp;
2413 	int error;
2414 
2415 	/* XXX should probably ensure that we don't try to do this if
2416 	   someone has changed rf_protected_sectors. */
2417 
2418 	if (b_vp == NULL) {
2419 		/* For whatever reason, this component is not valid.
2420 		   Don't try to read a component label from it. */
2421 		return(EINVAL);
2422 	}
2423 
2424 	/* get a block of the appropriate size... */
2425 	bp = geteblk((int)dsize);
2426 	bp->b_dev = dev;
2427 
2428 	/* get our ducks in a row for the read */
2429 	bp->b_blkno = offset / DEV_BSIZE;
2430 	bp->b_bcount = dsize;
2431 	bp->b_flags |= B_READ;
2432  	bp->b_resid = dsize;
2433 
2434 	bdev_strategy(bp);
2435 	error = biowait(bp);
2436 
2437 	if (!error) {
2438 		memcpy(data, bp->b_data, msize);
2439 	}
2440 
2441 	brelse(bp, 0);
2442 	return(error);
2443 }
2444 
2445 static int
2446 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2447     RF_ComponentLabel_t *clabel)
2448 {
2449 	RF_ComponentLabel_t *clabel_write = clabel;
2450 	RF_ComponentLabel_t lclabel;
2451 	int error;
2452 
2453 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2454 		clabel_write = &lclabel;
2455 		rf_swap_label(clabel, clabel_write);
2456 	}
2457 	error = raidwrite_component_area(dev, b_vp, clabel_write,
2458 	    sizeof(RF_ComponentLabel_t),
2459 	    rf_component_info_offset(),
2460 	    rf_component_info_size(secsize), 0);
2461 
2462 	return error;
2463 }
2464 
2465 /* ARGSUSED */
2466 static int
2467 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2468     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2469 {
2470 	struct buf *bp;
2471 	int error;
2472 
2473 	/* get a block of the appropriate size... */
2474 	bp = geteblk((int)dsize);
2475 	bp->b_dev = dev;
2476 
2477 	/* get our ducks in a row for the write */
2478 	bp->b_blkno = offset / DEV_BSIZE;
2479 	bp->b_bcount = dsize;
2480 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2481  	bp->b_resid = dsize;
2482 
2483 	memset(bp->b_data, 0, dsize);
2484 	memcpy(bp->b_data, data, msize);
2485 
2486 	bdev_strategy(bp);
2487 	if (asyncp)
2488 		return 0;
2489 	error = biowait(bp);
2490 	brelse(bp, 0);
2491 	if (error) {
2492 #if 1
2493 		printf("Failed to write RAID component info!\n");
2494 #endif
2495 	}
2496 
2497 	return(error);
2498 }
2499 
2500 void
2501 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2502 {
2503 	int c;
2504 
2505 	for (c = 0; c < raidPtr->numCol; c++) {
2506 		/* Skip dead disks. */
2507 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2508 			continue;
2509 		/* XXXjld: what if an error occurs here? */
2510 		raidwrite_component_area(raidPtr->Disks[c].dev,
2511 		    raidPtr->raid_cinfo[c].ci_vp, map,
2512 		    RF_PARITYMAP_NBYTE,
2513 		    rf_parity_map_offset(raidPtr),
2514 		    rf_parity_map_size(raidPtr), 0);
2515 	}
2516 }
2517 
2518 void
2519 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2520 {
2521 	struct rf_paritymap_ondisk tmp;
2522 	int c,first;
2523 
2524 	first=1;
2525 	for (c = 0; c < raidPtr->numCol; c++) {
2526 		/* Skip dead disks. */
2527 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2528 			continue;
2529 		raidread_component_area(raidPtr->Disks[c].dev,
2530 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
2531 		    RF_PARITYMAP_NBYTE,
2532 		    rf_parity_map_offset(raidPtr),
2533 		    rf_parity_map_size(raidPtr));
2534 		if (first) {
2535 			memcpy(map, &tmp, sizeof(*map));
2536 			first = 0;
2537 		} else {
2538 			rf_paritymap_merge(map, &tmp);
2539 		}
2540 	}
2541 }
2542 
/*
 * Bump the set's mod_counter and mark every live component (and every
 * in-use spare) dirty on disk.  Spared components are left untouched.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			/* Rebuild the spare's label from current config. */
			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2602 
2603 
/*
 * Refresh the component labels of all optimal components and in-use
 * spares: bump mod_counter, record status and (unless the unit number
 * changed) the current raid unit, and flush each label to disk.  On a
 * final update (RF_FINAL_COMPONENT_UPDATE) with good parity, also mark
 * the components clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare replaced. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2681 
2682 void
2683 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2684 {
2685 
2686 	if (vp != NULL) {
2687 		if (auto_configured == 1) {
2688 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2689 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2690 			vput(vp);
2691 
2692 		} else {
2693 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2694 		}
2695 	}
2696 }
2697 
2698 
2699 void
2700 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2701 {
2702 	int r,c;
2703 	struct vnode *vp;
2704 	int acd;
2705 
2706 
2707 	/* We take this opportunity to close the vnodes like we should.. */
2708 
2709 	for (c = 0; c < raidPtr->numCol; c++) {
2710 		vp = raidPtr->raid_cinfo[c].ci_vp;
2711 		acd = raidPtr->Disks[c].auto_configured;
2712 		rf_close_component(raidPtr, vp, acd);
2713 		raidPtr->raid_cinfo[c].ci_vp = NULL;
2714 		raidPtr->Disks[c].auto_configured = 0;
2715 	}
2716 
2717 	for (r = 0; r < raidPtr->numSpare; r++) {
2718 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2719 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2720 		rf_close_component(raidPtr, vp, acd);
2721 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2722 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2723 	}
2724 }
2725 
2726 
/*
 * Kernel thread body: fail the requested component (optionally forcing
 * reconstruction) and exit.  Frees `req' before exiting; the thread
 * never returns.
 */
static void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	/* splbio around the whole operation — legacy spl protection. */
	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* Temporarily enable forced reconstruction if requested. */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	/* The request was handed to us; we own and free it. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2756 
/*
 * Kernel thread body: rewrite all parity for the set.  On success mark
 * the in-core parity state clean; on failure log the error.  Wakes any
 * thread waiting in shutdown on parity_rewrite_cv, then exits.
 */
static void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2789 
2790 
/*
 * Kernel thread body: copy reconstructed data back from spares to the
 * replaced components, then exit.  The thread never returns.
 */
static void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2805 
2806 
/*
 * Kernel thread body: reconstruct the given column in place (onto the
 * same component), optionally forcing reconstruction.  Frees `req' and
 * exits; the thread never returns.
 */
static void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	/* splbio around the whole operation — legacy spl protection. */
	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* Temporarily enable forced reconstruction if requested. */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	rf_ReconstructInPlace(raidPtr, req->col);

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	/* The request was handed to us; we own and free it. */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2834 
/*
 * Try to read a RAIDframe component label from the open vnode `vp'.
 * If the label is reasonable and fits in `size', allocate an
 * RF_AutoConfig_t, prepend it to `ac_list' (the new entry takes
 * ownership of vp and the label) and return the new list head.
 * Otherwise free the label, close/release vp, and return the list
 * unchanged.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: nothing keeps clabel or the vnode now. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2876 
/*
 * Scan every disk-class device in the system for RAIDframe components
 * and return a list of candidates for auto-configuration.  Two passes
 * are made: wedges first, then everything else, so a wedge covering a
 * whole disk wins over that disk's raw partition.  For non-wedge disks
 * each FS_RAID disklabel partition is probed; if a disk yields no
 * component at all, its raw partition is probed as a last resort.
 */
static RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CDs. */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md. */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* we don't care about spiflash */
			if (device_is_a(dv, "spiflash")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* A wedge is a candidate only if its
				 * partition type says RAIDframe. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component takes ownership of vp. */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Not quite a 'whatever'.  In
					 * this situation we know
					 * there is a FS_RAID
					 * partition, but we can't
					 * open it.  The most likely
					 * reason is that the
					 * partition is already in
					 * use by another RAID set.
					 * So note that we've already
					 * found a partition on this
					 * disk so we don't attempt
					 * to use the raw disk later. */
					rf_part_found = 1;
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
3100 
3101 int
3102 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3103 {
3104 
3105 	if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
3106 	     clabel->version==RF_COMPONENT_LABEL_VERSION ||
3107 	     clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
3108 	    (clabel->clean == RF_RAID_CLEAN ||
3109 	     clabel->clean == RF_RAID_DIRTY) &&
3110 	    clabel->row >=0 &&
3111 	    clabel->column >= 0 &&
3112 	    clabel->num_rows > 0 &&
3113 	    clabel->num_columns > 0 &&
3114 	    clabel->row < clabel->num_rows &&
3115 	    clabel->column < clabel->num_columns &&
3116 	    clabel->blockSize > 0 &&
3117 	    /*
3118 	     * numBlocksHi may contain garbage, but it is ok since
3119 	     * the type is unsigned.  If it is really garbage,
3120 	     * rf_fix_old_label_size() will fix it.
3121 	     */
3122 	    rf_component_label_numblocks(clabel) > 0) {
3123 		/*
3124 		 * label looks reasonable enough...
3125 		 * let's make sure it has no old garbage.
3126 		 */
3127 		if (numsecs)
3128 			rf_fix_old_label_size(clabel, numsecs);
3129 		return(1);
3130 	}
3131 	return(0);
3132 }
3133 
3134 
3135 /*
3136  * For reasons yet unknown, some old component labels have garbage in
3137  * the newer numBlocksHi region, and this causes lossage.  Since those
3138  * disks will also have numsecs set to less than 32 bits of sectors,
3139  * we can determine when this corruption has occurred, and fix it.
3140  *
3141  * The exact same problem, with the same unknown reason, happens to
3142  * the partitionSizeHi member as well.
3143  */
3144 static void
3145 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3146 {
3147 
3148 	if (numsecs < ((uint64_t)1 << 32)) {
3149 		if (clabel->numBlocksHi) {
3150 			printf("WARNING: total sectors < 32 bits, yet "
3151 			       "numBlocksHi set\n"
3152 			       "WARNING: resetting numBlocksHi to zero.\n");
3153 			clabel->numBlocksHi = 0;
3154 		}
3155 
3156 		if (clabel->partitionSizeHi) {
3157 			printf("WARNING: total sectors < 32 bits, yet "
3158 			       "partitionSizeHi set\n"
3159 			       "WARNING: resetting partitionSizeHi to zero.\n");
3160 			clabel->partitionSizeHi = 0;
3161 		}
3162 	}
3163 }
3164 
3165 
3166 #ifdef DEBUG
/*
 * Dump a component label to the console, field by field, for
 * debugging autoconfiguration problems.  Compiled in only when
 * DEBUG is defined.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Printable names for root_partition values 0..2; 3 is invalid. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	/* Mask to 2 bits so garbage values print as "*invalid*". */
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
3198 #endif
3199 
3200 static RF_ConfigSet_t *
3201 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3202 {
3203 	RF_AutoConfig_t *ac;
3204 	RF_ConfigSet_t *config_sets;
3205 	RF_ConfigSet_t *cset;
3206 	RF_AutoConfig_t *ac_next;
3207 
3208 
3209 	config_sets = NULL;
3210 
3211 	/* Go through the AutoConfig list, and figure out which components
3212 	   belong to what sets.  */
3213 	ac = ac_list;
3214 	while(ac!=NULL) {
3215 		/* we're going to putz with ac->next, so save it here
3216 		   for use at the end of the loop */
3217 		ac_next = ac->next;
3218 
3219 		if (config_sets == NULL) {
3220 			/* will need at least this one... */
3221 			config_sets = malloc(sizeof(RF_ConfigSet_t),
3222 				       M_RAIDFRAME, M_WAITOK);
3223 			/* this one is easy :) */
3224 			config_sets->ac = ac;
3225 			config_sets->next = NULL;
3226 			config_sets->rootable = 0;
3227 			ac->next = NULL;
3228 		} else {
3229 			/* which set does this component fit into? */
3230 			cset = config_sets;
3231 			while(cset!=NULL) {
3232 				if (rf_does_it_fit(cset, ac)) {
3233 					/* looks like it matches... */
3234 					ac->next = cset->ac;
3235 					cset->ac = ac;
3236 					break;
3237 				}
3238 				cset = cset->next;
3239 			}
3240 			if (cset==NULL) {
3241 				/* didn't find a match above... new set..*/
3242 				cset = malloc(sizeof(RF_ConfigSet_t),
3243 					       M_RAIDFRAME, M_WAITOK);
3244 				cset->ac = ac;
3245 				ac->next = NULL;
3246 				cset->next = config_sets;
3247 				cset->rootable = 0;
3248 				config_sets = cset;
3249 			}
3250 		}
3251 		ac = ac_next;
3252 	}
3253 
3254 
3255 	return(config_sets);
3256 }
3257 
3258 static int
3259 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3260 {
3261 	RF_ComponentLabel_t *clabel1, *clabel2;
3262 
3263 	/* If this one matches the *first* one in the set, that's good
3264 	   enough, since the other members of the set would have been
3265 	   through here too... */
3266 	/* note that we are not checking partitionSize here..
3267 
3268 	   Note that we are also not checking the mod_counters here.
3269 	   If everything else matches except the mod_counter, that's
3270 	   good enough for this test.  We will deal with the mod_counters
3271 	   a little later in the autoconfiguration process.
3272 
3273 	    (clabel1->mod_counter == clabel2->mod_counter) &&
3274 
3275 	   The reason we don't check for this is that failed disks
3276 	   will have lower modification counts.  If those disks are
3277 	   not added to the set they used to belong to, then they will
3278 	   form their own set, which may result in 2 different sets,
3279 	   for example, competing to be configured at raid0, and
3280 	   perhaps competing to be the root filesystem set.  If the
3281 	   wrong ones get configured, or both attempt to become /,
3282 	   weird behaviour and or serious lossage will occur.  Thus we
3283 	   need to bring them into the fold here, and kick them out at
3284 	   a later point.
3285 
3286 	*/
3287 
3288 	clabel1 = cset->ac->clabel;
3289 	clabel2 = ac->clabel;
3290 	if ((clabel1->version == clabel2->version) &&
3291 	    (clabel1->serial_number == clabel2->serial_number) &&
3292 	    (clabel1->num_rows == clabel2->num_rows) &&
3293 	    (clabel1->num_columns == clabel2->num_columns) &&
3294 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
3295 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3296 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3297 	    (clabel1->parityConfig == clabel2->parityConfig) &&
3298 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3299 	    (clabel1->blockSize == clabel2->blockSize) &&
3300 	    rf_component_label_numblocks(clabel1) ==
3301 	    rf_component_label_numblocks(clabel2) &&
3302 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
3303 	    (clabel1->root_partition == clabel2->root_partition) &&
3304 	    (clabel1->last_unit == clabel2->last_unit) &&
3305 	    (clabel1->config_order == clabel2->config_order)) {
3306 		/* if it get's here, it almost *has* to be a match */
3307 	} else {
3308 		/* it's not consistent with somebody in the set..
3309 		   punt */
3310 		return(0);
3311 	}
3312 	/* all was fine.. it must fit... */
3313 	return(1);
3314 }
3315 
/*
 * Determine whether config set 'cset' has enough live components to
 * be worth configuring.  Components whose mod_counter is below the
 * set's maximum are treated as failed.  Returns 1 if the set can be
 * configured, 0 if too many components are missing.
 */
static int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* The set's mod_counter is the maximum over all members; members
	   with a lower value are stale (likely failed before the last
	   config change). */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	/* For each column, look for a member with a current mod_counter;
	   account for any column with no such member as missing. */
	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				/* RAID 1 mirrors component 2k with 2k+1;
				   the set survives unless both halves of
				   a pair are gone. */
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one.
	   (RAID 1 was fully handled by the pair logic above.) */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3418 
3419 static void
3420 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3421 			RF_Raid_t *raidPtr)
3422 {
3423 	RF_ComponentLabel_t *clabel;
3424 	int i;
3425 
3426 	clabel = ac->clabel;
3427 
3428 	/* 1. Fill in the common stuff */
3429 	config->numCol = clabel->num_columns;
3430 	config->numSpare = 0; /* XXX should this be set here? */
3431 	config->sectPerSU = clabel->sectPerSU;
3432 	config->SUsPerPU = clabel->SUsPerPU;
3433 	config->SUsPerRU = clabel->SUsPerRU;
3434 	config->parityConfig = clabel->parityConfig;
3435 	/* XXX... */
3436 	strcpy(config->diskQueueType,"fifo");
3437 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3438 	config->layoutSpecificSize = 0; /* XXX ?? */
3439 
3440 	while(ac!=NULL) {
3441 		/* row/col values will be in range due to the checks
3442 		   in reasonable_label() */
3443 		strcpy(config->devnames[0][ac->clabel->column],
3444 		       ac->devname);
3445 		ac = ac->next;
3446 	}
3447 
3448 	for(i=0;i<RF_MAXDBGV;i++) {
3449 		config->debugVars[i][0] = 0;
3450 	}
3451 }
3452 
3453 static int
3454 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3455 {
3456 	RF_ComponentLabel_t *clabel;
3457 	int column;
3458 	int sparecol;
3459 
3460 	raidPtr->autoconfigure = new_value;
3461 
3462 	for(column=0; column<raidPtr->numCol; column++) {
3463 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3464 			clabel = raidget_component_label(raidPtr, column);
3465 			clabel->autoconfigure = new_value;
3466 			raidflush_component_label(raidPtr, column);
3467 		}
3468 	}
3469 	for(column = 0; column < raidPtr->numSpare ; column++) {
3470 		sparecol = raidPtr->numCol + column;
3471 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3472 			clabel = raidget_component_label(raidPtr, sparecol);
3473 			clabel->autoconfigure = new_value;
3474 			raidflush_component_label(raidPtr, sparecol);
3475 		}
3476 	}
3477 	return(new_value);
3478 }
3479 
3480 static int
3481 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3482 {
3483 	RF_ComponentLabel_t *clabel;
3484 	int column;
3485 	int sparecol;
3486 
3487 	raidPtr->root_partition = new_value;
3488 	for(column=0; column<raidPtr->numCol; column++) {
3489 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3490 			clabel = raidget_component_label(raidPtr, column);
3491 			clabel->root_partition = new_value;
3492 			raidflush_component_label(raidPtr, column);
3493 		}
3494 	}
3495 	for(column = 0; column < raidPtr->numSpare ; column++) {
3496 		sparecol = raidPtr->numCol + column;
3497 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3498 			clabel = raidget_component_label(raidPtr, sparecol);
3499 			clabel->root_partition = new_value;
3500 			raidflush_component_label(raidPtr, sparecol);
3501 		}
3502 	}
3503 	return(new_value);
3504 }
3505 
3506 static void
3507 rf_release_all_vps(RF_ConfigSet_t *cset)
3508 {
3509 	RF_AutoConfig_t *ac;
3510 
3511 	ac = cset->ac;
3512 	while(ac!=NULL) {
3513 		/* Close the vp, and give it back */
3514 		if (ac->vp) {
3515 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3516 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3517 			vput(ac->vp);
3518 			ac->vp = NULL;
3519 		}
3520 		ac = ac->next;
3521 	}
3522 }
3523 
3524 
3525 static void
3526 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3527 {
3528 	RF_AutoConfig_t *ac;
3529 	RF_AutoConfig_t *next_ac;
3530 
3531 	ac = cset->ac;
3532 	while(ac!=NULL) {
3533 		next_ac = ac->next;
3534 		/* nuke the label */
3535 		free(ac->clabel, M_RAIDFRAME);
3536 		/* cleanup the config structure */
3537 		free(ac, M_RAIDFRAME);
3538 		/* "next.." */
3539 		ac = next_ac;
3540 	}
3541 	/* and, finally, nuke the config set */
3542 	free(cset, M_RAIDFRAME);
3543 }
3544 
3545 
/*
 * Initialize a component label from the current state of the RAID
 * set: version, geometry, layout parameters, and configuration flags.
 * The per-component fields (row/column/partitionSize) are not touched
 * here; callers fill those in.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* avoid over-writing byteswapped version. */
	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* rows are a legacy concept; modern sets always have one */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3579 
/*
 * Autoconfigure one config set: pick a unit number (preferring the
 * one recorded in the component labels), build an RF_Config_t, and
 * run the normal configuration path.  Returns the configured softc,
 * or NULL if configuration failed.
 */
static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Walk upward from the preferred unit until we find one that
	   is not already configured. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* no existing softc at that unit: create one */
	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: release the softc we claimed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3651 
/*
 * Initialize a per-set resource pool.  The wait-channel name is
 * formatted into 'w_chan' (caller-provided storage of at least
 * RF_MAX_POOLNAMELEN bytes) as "raid<unit>_<pool_name>"; the pool is
 * created with a high-water mark of 'xmax' items and pre-allocated
 * with 'xmin' items.
 */
void
rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
	     size_t xmin, size_t xmax)
{

	/* Format: raid%d_foo */
	snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
}
3664 
3665 
3666 /*
3667  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3668  * to see if there is IO pending and if that IO could possibly be done
3669  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
3670  * otherwise.
3671  *
3672  */
3673 int
3674 rf_buf_queue_check(RF_Raid_t *raidPtr)
3675 {
3676 	struct raid_softc *rs;
3677 	struct dk_softc *dksc;
3678 
3679 	rs = raidPtr->softc;
3680 	dksc = &rs->sc_dksc;
3681 
3682 	if ((rs->sc_flags & RAIDF_INITED) == 0)
3683 		return 1;
3684 
3685 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3686 		/* there is work to do */
3687 		return 0;
3688 	}
3689 	/* default is nothing to do */
3690 	return 1;
3691 }
3692 
3693 int
3694 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3695 {
3696 	uint64_t numsecs;
3697 	unsigned secsize;
3698 	int error;
3699 
3700 	error = getdisksize(vp, &numsecs, &secsize);
3701 	if (error == 0) {
3702 		diskPtr->blockSize = secsize;
3703 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
3704 		diskPtr->partitionSize = numsecs;
3705 		return 0;
3706 	}
3707 	return error;
3708 }
3709 
/*
 * Autoconf match routine: RAID pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3715 
/*
 * Autoconf attach routine.  Nothing to do here: real setup happens
 * when a set is configured (raidinit and friends).
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
3720 
3721 
3722 static int
3723 raid_detach(device_t self, int flags)
3724 {
3725 	int error;
3726 	struct raid_softc *rs = raidsoftc(self);
3727 
3728 	if (rs == NULL)
3729 		return ENXIO;
3730 
3731 	if ((error = raidlock(rs)) != 0)
3732 		return error;
3733 
3734 	error = raid_detach_unlocked(rs);
3735 
3736 	raidunlock(rs);
3737 
3738 	/* XXX raid can be referenced here */
3739 
3740 	if (error)
3741 		return error;
3742 
3743 	/* Free the softc */
3744 	raidput(rs);
3745 
3746 	return 0;
3747 }
3748 
3749 static void
3750 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
3751 {
3752 	struct dk_softc *dksc = &rs->sc_dksc;
3753 	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
3754 
3755 	memset(dg, 0, sizeof(*dg));
3756 
3757 	dg->dg_secperunit = raidPtr->totalSectors;
3758 	dg->dg_secsize = raidPtr->bytesPerSector;
3759 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
3760 	dg->dg_ntracks = 4 * raidPtr->numCol;
3761 
3762 	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
3763 }
3764 
3765 /*
3766  * Get cache info for all the components (including spares).
3767  * Returns intersection of all the cache flags of all disks, or first
3768  * error if any encountered.
3769  * XXXfua feature flags can change as spares are added - lock down somehow
3770  */
3771 static int
3772 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3773 {
3774 	int c;
3775 	int error;
3776 	int dkwhole = 0, dkpart;
3777 
3778 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3779 		/*
3780 		 * Check any non-dead disk, even when currently being
3781 		 * reconstructed.
3782 		 */
3783 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
3784 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
3785 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3786 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
3787 			if (error) {
3788 				if (error != ENODEV) {
3789 					printf("raid%d: get cache for component %s failed\n",
3790 					    raidPtr->raidid,
3791 					    raidPtr->Disks[c].devname);
3792 				}
3793 
3794 				return error;
3795 			}
3796 
3797 			if (c == 0)
3798 				dkwhole = dkpart;
3799 			else
3800 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3801 		}
3802 	}
3803 
3804 	*data = dkwhole;
3805 
3806 	return 0;
3807 }
3808 
3809 /*
3810  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3811  * We end up returning whatever error was returned by the first cache flush
3812  * that fails.
3813  */
3814 
3815 static int
3816 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
3817 {
3818 	int e = 0;
3819 	for (int i = 0; i < 5; i++) {
3820 		e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3821 		    &force, FWRITE, NOCRED);
3822 		if (!e || e == ENODEV)
3823 			return e;
3824 		printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
3825 		    raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
3826 	}
3827 	return e;
3828 }
3829 
3830 int
3831 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
3832 {
3833 	int c, error;
3834 
3835 	error = 0;
3836 	for (c = 0; c < raidPtr->numCol; c++) {
3837 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
3838 			int e = rf_sync_component_cache(raidPtr, c, force);
3839 			if (e && !error)
3840 				error = e;
3841 		}
3842 	}
3843 
3844 	for (c = 0; c < raidPtr->numSpare ; c++) {
3845 		int sparecol = raidPtr->numCol + c;
3846 		/* Need to ensure that the reconstruct actually completed! */
3847 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3848 			int e = rf_sync_component_cache(raidPtr, sparecol,
3849 			    force);
3850 			if (e && !error)
3851 				error = e;
3852 		}
3853 	}
3854 	return error;
3855 }
3856 
3857 /* Fill in info with the current status */
3858 void
3859 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3860 {
3861 
3862 	memset(info, 0, sizeof(*info));
3863 
3864 	if (raidPtr->status != rf_rs_reconstructing) {
3865 		info->total = 100;
3866 		info->completed = 100;
3867 	} else {
3868 		info->total = raidPtr->reconControl->numRUsTotal;
3869 		info->completed = raidPtr->reconControl->numRUsComplete;
3870 	}
3871 	info->remaining = info->total - info->completed;
3872 }
3873 
3874 /* Fill in info with the current status */
3875 void
3876 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3877 {
3878 
3879 	memset(info, 0, sizeof(*info));
3880 
3881 	if (raidPtr->parity_rewrite_in_progress == 1) {
3882 		info->total = raidPtr->Layout.numStripe;
3883 		info->completed = raidPtr->parity_rewrite_stripes_done;
3884 	} else {
3885 		info->completed = 100;
3886 		info->total = 100;
3887 	}
3888 	info->remaining = info->total - info->completed;
3889 }
3890 
3891 /* Fill in info with the current status */
3892 void
3893 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3894 {
3895 
3896 	memset(info, 0, sizeof(*info));
3897 
3898 	if (raidPtr->copyback_in_progress == 1) {
3899 		info->total = raidPtr->Layout.numStripe;
3900 		info->completed = raidPtr->copyback_stripes_done;
3901 		info->remaining = info->total - info->completed;
3902 	} else {
3903 		info->remaining = 0;
3904 		info->completed = 100;
3905 		info->total = 100;
3906 	}
3907 }
3908 
3909 /* Fill in config with the current info */
3910 int
3911 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
3912 {
3913 	int	d, i, j;
3914 
3915 	if (!raidPtr->valid)
3916 		return ENODEV;
3917 	config->cols = raidPtr->numCol;
3918 	config->ndevs = raidPtr->numCol;
3919 	if (config->ndevs >= RF_MAX_DISKS)
3920 		return ENOMEM;
3921 	config->nspares = raidPtr->numSpare;
3922 	if (config->nspares >= RF_MAX_DISKS)
3923 		return ENOMEM;
3924 	config->maxqdepth = raidPtr->maxQueueDepth;
3925 	d = 0;
3926 	for (j = 0; j < config->cols; j++) {
3927 		config->devs[d] = raidPtr->Disks[j];
3928 		d++;
3929 	}
3930 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
3931 		config->spares[i] = raidPtr->Disks[j];
3932 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
3933 			/* XXX: raidctl(8) expects to see this as a used spare */
3934 			config->spares[i].status = rf_ds_used_spare;
3935 		}
3936 	}
3937 	return 0;
3938 }
3939 
3940 int
3941 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
3942 {
3943 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
3944 	RF_ComponentLabel_t *raid_clabel;
3945 	int column = clabel->column;
3946 
3947 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
3948 		return EINVAL;
3949 	raid_clabel = raidget_component_label(raidPtr, column);
3950 	memcpy(clabel, raid_clabel, sizeof *clabel);
3951 	/* Fix-up for userland. */
3952 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
3953 		clabel->version = RF_COMPONENT_LABEL_VERSION;
3954 
3955 	return 0;
3956 }
3957 
3958 /*
3959  * Module interface
3960  */
3961 
3962 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
3963 
3964 #ifdef _MODULE
3965 CFDRIVER_DECL(raid, DV_DISK, NULL);
3966 #endif
3967 
3968 static int raid_modcmd(modcmd_t, void *);
3969 static int raid_modcmd_init(void);
3970 static int raid_modcmd_fini(void);
3971 
3972 static int
3973 raid_modcmd(modcmd_t cmd, void *data)
3974 {
3975 	int error;
3976 
3977 	error = 0;
3978 	switch (cmd) {
3979 	case MODULE_CMD_INIT:
3980 		error = raid_modcmd_init();
3981 		break;
3982 	case MODULE_CMD_FINI:
3983 		error = raid_modcmd_fini();
3984 		break;
3985 	default:
3986 		error = ENOTTY;
3987 		break;
3988 	}
3989 	return error;
3990 }
3991 
/*
 * Module initialization: create the global raid lock, attach the
 * device switch and autoconf glue (unwinding in reverse order on
 * failure), boot the RAIDframe core, and register a finalizer so
 * autoconfiguration runs after all real hardware has attached.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 majors request dynamic allocation; EEXIST means the
	   devsw is already attached (e.g. builtin) and is tolerated */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* unwind the devsw attach */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* unwind the cfdriver and devsw attaches */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* non-fatal: autoconfig just won't run automatically */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
4062 
/*
 * Module teardown: refuse to unload while any raid device exists,
 * then detach autoconf glue and the devsw in the reverse of the
 * init order, re-attaching what was already detached if a later
 * step fails, and finally shut down the RAIDframe core.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* roll back: re-attach what we just detached */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* roll back: re-attach cfdriver and cfattach */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
4112