xref: /netbsd-src/sys/dev/raidframe/rf_netbsdkintf.c (revision 63aea4bd5b445e491ff0389fe27ec78b3099dba3)
1 /*	$NetBSD: rf_netbsdkintf.c,v 1.326 2015/12/08 20:36:15 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Greg Oster; Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1988 University of Utah.
34  * Copyright (c) 1990, 1993
35  *      The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed to Berkeley by
38  * the Systems Programming Group of the University of Utah Computer
39  * Science Department.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
66  *
67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
68  */
69 
70 /*
71  * Copyright (c) 1995 Carnegie-Mellon University.
72  * All rights reserved.
73  *
74  * Authors: Mark Holland, Jim Zelenka
75  *
76  * Permission to use, copy, modify and distribute this software and
77  * its documentation is hereby granted, provided that both the copyright
78  * notice and this permission notice appear in all copies of the
79  * software, derivative works or modified versions, and any portions
80  * thereof, and that both notices appear in supporting documentation.
81  *
82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85  *
86  * Carnegie Mellon requests users of this software to return to
87  *
88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
89  *  School of Computer Science
90  *  Carnegie Mellon University
91  *  Pittsburgh PA 15213-3890
92  *
93  * any improvements or extensions that they make and grant Carnegie the
94  * rights to redistribute these changes.
95  */
96 
97 /***********************************************************
98  *
99  * rf_kintf.c -- the kernel interface routines for RAIDframe
100  *
101  ***********************************************************/
102 
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.326 2015/12/08 20:36:15 christos Exp $");
105 
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #endif
110 
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 
130 #include <prop/proplib.h>
131 
132 #include <dev/raidframe/raidframevar.h>
133 #include <dev/raidframe/raidframeio.h>
134 #include <dev/raidframe/rf_paritymap.h>
135 
136 #include "rf_raid.h"
137 #include "rf_copyback.h"
138 #include "rf_dag.h"
139 #include "rf_dagflags.h"
140 #include "rf_desc.h"
141 #include "rf_diskqueue.h"
142 #include "rf_etimer.h"
143 #include "rf_general.h"
144 #include "rf_kintf.h"
145 #include "rf_options.h"
146 #include "rf_driver.h"
147 #include "rf_parityscan.h"
148 #include "rf_threadstuff.h"
149 
150 #ifdef COMPAT_50
151 #include "rf_compat50.h"
152 #endif
153 
154 #include "ioconf.h"
155 
#ifdef DEBUG
/* Runtime-tunable debug verbosity; db1_printf() fires only when > 0. */
int     rf_kdebug_level = 0;
#define db1_printf(a) if (rf_kdebug_level > 0) printf a
#else				/* DEBUG */
#define db1_printf(a) { }
#endif				/* DEBUG */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/*
 * Synchronization for the spare-table installation handshake between
 * the kernel and the userland installation process (see the two queues
 * below).
 */
static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);

static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
						 * spare table */
static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
						 * installation process */
#endif

/* Malloc type under which RAIDframe kernel allocations are accounted. */
MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
175 
176 /* prototypes */
177 static void KernelWakeupFunc(struct buf *);
178 static void InitBP(struct buf *, struct vnode *, unsigned,
179     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
180     void *, int, struct proc *);
181 struct raid_softc;
182 static void raidinit(struct raid_softc *);
183 
184 static int raid_match(device_t, cfdata_t, void *);
185 static void raid_attach(device_t, device_t, void *);
186 static int raid_detach(device_t, int);
187 
188 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
189     daddr_t, daddr_t);
190 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
191     daddr_t, daddr_t, int);
192 
193 static int raidwrite_component_label(unsigned,
194     dev_t, struct vnode *, RF_ComponentLabel_t *);
195 static int raidread_component_label(unsigned,
196     dev_t, struct vnode *, RF_ComponentLabel_t *);
197 
198 
199 static dev_type_open(raidopen);
200 static dev_type_close(raidclose);
201 static dev_type_read(raidread);
202 static dev_type_write(raidwrite);
203 static dev_type_ioctl(raidioctl);
204 static dev_type_strategy(raidstrategy);
205 static dev_type_dump(raiddump);
206 static dev_type_size(raidsize);
207 
/*
 * Block-device switch: raidN presents itself as a disk (D_DISK), so the
 * standard disk entry points (open/close/strategy/ioctl/dump/psize) apply.
 */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/*
 * Character-device switch: raw reads/writes funnel through
 * raidread()/raidwrite(), which hand off to raidstrategy() via physio(9).
 */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/* Hooks the generic disk(9) layer uses to submit and bound transfers. */
static struct dkdriver rf_dkdriver = {
	.d_strategy = raidstrategy,
	.d_minphys = minphys
};
238 
/*
 * Per-unit software state for a raidN pseudo-device.  One exists per
 * configured (or partially configured) RAID set; all instances are
 * linked on the global 'raids' list, which is protected by raid_lock.
 */
struct raid_softc {
	device_t sc_dev;	/* autoconf(9) device handle */
	int	sc_unit;	/* raidN unit number */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
	RF_Raid_t sc_r;		/* the RAIDframe descriptor proper */
	LIST_ENTRY(raid_softc) sc_link;	/* linkage on the 'raids' list */
};
/* sc_flags */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_WLABEL	0x02	/* label area is writable */
#define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
#define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
#define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
#define RAIDF_LOCKED	0x80	/* unit is locked */

/* Extract the raidN unit number from a dev_t. */
#define	raidunit(x)	DISKUNIT(x)
260 
extern struct cfdriver raid_cd;
/* Glue the raid driver into autoconf(9); allow detaching at shutdown. */
CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    DVF_DETACH_SHUTDOWN);
265 
266 /*
267  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
268  * Be aware that large numbers can allow the driver to consume a lot of
269  * kernel memory, especially on writes, and in degraded mode reads.
270  *
271  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
272  * a single 64K write will typically require 64K for the old data,
273  * 64K for the old parity, and 64K for the new parity, for a total
274  * of 192K (if the parity buffer is not re-used immediately).
275  * Even it if is used immediately, that's still 128K, which when multiplied
276  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
277  *
278  * Now in degraded mode, for example, a 64K read on the above setup may
279  * require data reconstruction, which will require *all* of the 4 remaining
280  * disks to participate -- 4 * 32K/disk == 128K again.
281  */
282 
283 #ifndef RAIDOUTSTANDING
284 #define RAIDOUTSTANDING   6
285 #endif
286 
287 #define RAIDLABELDEV(dev)	\
288 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
289 
290 /* declared here, and made public, for the benefit of KVM stuff.. */
291 
292 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
293 				     struct disklabel *);
294 static void raidgetdisklabel(dev_t);
295 static void raidmakedisklabel(struct raid_softc *);
296 
297 static int raidlock(struct raid_softc *);
298 static void raidunlock(struct raid_softc *);
299 
300 static int raid_detach_unlocked(struct raid_softc *);
301 
302 static void rf_markalldirty(RF_Raid_t *);
303 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
304 
305 void rf_ReconThread(struct rf_recon_req *);
306 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
307 void rf_CopybackThread(RF_Raid_t *raidPtr);
308 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
309 int rf_autoconfig(device_t);
310 void rf_buildroothack(RF_ConfigSet_t *);
311 
312 RF_AutoConfig_t *rf_find_raid_components(void);
313 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
314 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
315 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
316 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
317 int rf_set_autoconfig(RF_Raid_t *, int);
318 int rf_set_rootpartition(RF_Raid_t *, int);
319 void rf_release_all_vps(RF_ConfigSet_t *);
320 void rf_cleanup_config_set(RF_ConfigSet_t *);
321 int rf_have_enough_components(RF_ConfigSet_t *);
322 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
323 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
324 
/*
 * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
 * Note that this is overridden by having RAID_AUTOCONFIG as an option
 * in the kernel config file.
 */
#ifdef RAID_AUTOCONFIG
int raidautoconfig = 1;
#else
int raidautoconfig = 0;
#endif
/* Latched once rf_autoconfig() has run; the component scan is one-shot. */
static bool raidautoconfigdone = false;

/* Resource pools shared by all RAIDframe instances. */
struct RF_Pools_s rf_pools;

/* All known raid_softc's; insertions/removals/walks require raid_lock. */
static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
static kmutex_t raid_lock;
341 
342 static struct raid_softc *
343 raidcreate(int unit) {
344 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
345 	if (sc == NULL) {
346 #ifdef DIAGNOSTIC
347 		printf("%s: out of memory\n", __func__);
348 #endif
349 		return NULL;
350 	}
351 	sc->sc_unit = unit;
352 	bufq_alloc(&sc->buf_queue, "fcfs", BUFQ_SORT_RAWBLOCK);
353 	return sc;
354 }
355 
/*
 * Free a softc created by raidcreate().  The caller must already have
 * unlinked it from the global 'raids' list (or never inserted it).
 * Order matters: release the buffer queue before freeing the softc
 * that owns it.
 */
static void
raiddestroy(struct raid_softc *sc) {
	bufq_free(sc->buf_queue);
	kmem_free(sc, sizeof(*sc));
}
361 
362 static struct raid_softc *
363 raidget(int unit) {
364 	struct raid_softc *sc;
365 	if (unit < 0) {
366 #ifdef DIAGNOSTIC
367 		panic("%s: unit %d!", __func__, unit);
368 #endif
369 		return NULL;
370 	}
371 	mutex_enter(&raid_lock);
372 	LIST_FOREACH(sc, &raids, sc_link) {
373 		if (sc->sc_unit == unit) {
374 			mutex_exit(&raid_lock);
375 			return sc;
376 		}
377 	}
378 	mutex_exit(&raid_lock);
379 	if ((sc = raidcreate(unit)) == NULL)
380 		return NULL;
381 	mutex_enter(&raid_lock);
382 	LIST_INSERT_HEAD(&raids, sc, sc_link);
383 	mutex_exit(&raid_lock);
384 	return sc;
385 }
386 
/*
 * Unlink a softc from the global list (under raid_lock) and free it.
 * The destroy happens after the lock is dropped so we never sleep in
 * raiddestroy() while holding raid_lock.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
394 
/*
 * Pseudo-device attach routine, run once at boot.  Initializes global
 * locks and the RAIDframe core, hooks the driver into autoconf(9), and
 * registers a finalizer so RAID sets are auto-configured only after all
 * real hardware disks have attached.  'num' is the configured unit
 * count and is unused here (units are created on demand by raidget()).
 */
void
raidattach(int num)
{
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* This is where all the initialization stuff gets done. */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Spare-table handshake state (parity declustering only). */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* Boot the RAIDframe core; a failure here is unrecoverable. */
	if (rf_BootRaidframe() == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
427 
/*
 * Autoconfiguration finalizer (registered by raidattach(), run by
 * config_finalize once all real hardware has attached).  Scans all
 * disks for RAIDframe component labels, groups them into sets, and
 * configures the eligible sets via rf_buildroothack().  One-shot:
 * returns 0 (no more work) on subsequent calls, 1 the time it runs.
 */
int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
465 
466 static int
467 rf_containsboot(RF_Raid_t *r, device_t bdv) {
468 	const char *bootname = device_xname(bdv);
469 	size_t len = strlen(bootname);
470 
471 	for (int col = 0; col < r->numCol; col++) {
472 		const char *devname = r->Disks[col].devname;
473 		devname += sizeof("/dev/") - 1;
474 		if (strncmp(devname, "dk", 2) == 0) {
475 			const char *parent =
476 			    dkwedge_get_parent_name(r->Disks[col].dev);
477 			if (parent != NULL)
478 				devname = parent;
479 		}
480 		if (strncmp(devname, bootname, len) == 0) {
481 			struct raid_softc *sc = r->softc;
482 			aprint_debug("raid%d includes boot device %s\n",
483 			    sc->sc_unit, devname);
484 			return 1;
485 		}
486 	}
487 	return 0;
488 }
489 
/*
 * Walk the auto-config sets discovered at boot: configure each eligible
 * set, then (unless the user hardwired root via rootspec) decide whether
 * one of the newly configured RAID sets should become the root device.
 * Consumes and frees every set on the 'config_sets' list.
 *
 * Root selection: if exactly one configured set is rootable, it becomes
 * booted_device when we either have no boot device yet, the set forces
 * root (root_partition == 1), or the set contains the boot device.  If
 * several sets claim root, prefer the one containing the boot device;
 * failing that, fall back to asking the user (RB_ASKNAME).
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;		/* number of configured, rootable sets */
	struct raid_softc *sc, *rsc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					/* remember the last root candidate */
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume 'a' */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
		} else
			candidate_root = rsc->sc_dev;
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Re-count, keeping only sets that contain the boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
600 
601 static int
602 raidsize(dev_t dev)
603 {
604 	struct raid_softc *rs;
605 	struct disklabel *lp;
606 	int     part, unit, omask, size;
607 
608 	unit = raidunit(dev);
609 	if ((rs = raidget(unit)) == NULL)
610 		return -1;
611 	if ((rs->sc_flags & RAIDF_INITED) == 0)
612 		return (-1);
613 
614 	part = DISKPART(dev);
615 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
616 	lp = rs->sc_dkdev.dk_label;
617 
618 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
619 		return (-1);
620 
621 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
622 		size = -1;
623 	else
624 		size = lp->d_partitions[part].p_size *
625 		    (lp->d_secsize / DEV_BSIZE);
626 
627 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
628 		return (-1);
629 
630 	return (size);
631 
632 }
633 
/*
 * Crash-dump entry point.  Dumping is only supported to RAID 1 sets
 * (one data column, one parity column).  Select the best surviving
 * component -- master, spared master, slave, spared slave, in that
 * order -- and pass the dump through to that component's d_dump with
 * the block offset adjusted for the partition start and the RAIDframe
 * protected sectors.
 */
static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int     part, c, sparecol, j, scol, dumpto;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	raidPtr = &rs->sc_r;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* Dumps must be in whole DEV_BSIZE units and fit inside the set. */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which column it spares. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
770 
/*
 * Open entry point (both block and character device).  Runs with the
 * unit lock held; validates the partition, reads the disklabel on the
 * first open of a configured unit, records the open in the per-format
 * open masks, and marks the components dirty on the very first open so
 * an unclean shutdown can be detected later.
 */
/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* Refuse new opens while the unit is being torn down. */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of a configured, un-wedged unit: fetch the label. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_nwedges == 0) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}


	/* Combined open mask covers both block and character opens. */
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
856 
/*
 * Close entry point.  Clears this open from the per-format open masks;
 * on the last close of a configured unit, writes the "clean" state into
 * the component labels so the set is recognized as cleanly shut down.
 * Always returns 0 (close cannot fail once the unit lock is obtained).
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	int     error = 0;
	int     part;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(&rs->sc_r,
						 RF_FINAL_COMPONENT_UPDATE);

		/* If the kernel is shutting down, it will detach
		 * this RAID set soon enough.
		 */
	}

	raidunlock(rs);
	return (0);

}
906 
/*
 * Strategy routine: validate and bounds-check the buffer, then enqueue
 * it on the unit's buffer queue and wake the RAIDframe I/O thread via
 * iodone_cv.  The request completes asynchronously; on any validation
 * failure the buffer is completed immediately with b_error set (or,
 * for a zero-length transfer, with no error).
 */
static void
raidstrategy(struct buf *bp)
{
	unsigned int unit = raidunit(bp->b_dev);
	RF_Raid_t *raidPtr;
	int     wlabel;
	struct raid_softc *rs;

	if ((rs = raidget(unit)) == NULL) {
		bp->b_error = ENXIO;
		goto done;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto done;
	}
	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		bp->b_error = ENODEV;
		goto done;
	}
	/* Zero-length transfers succeed trivially. */
	if (bp->b_bcount == 0) {
		db1_printf(("b_bcount is zero..\n"));
		goto done;
	}

	/*
	 * Do bounds checking and adjust transfer.  If there's an
	 * error, the bounds check will flag that for us.
	 */

	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
	if (DISKPART(bp->b_dev) == RAW_PART) {
		uint64_t size; /* device size in DEV_BSIZE unit */

		/* Convert totalSectors (native sector size) to DEV_BSIZE. */
		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
			size = raidPtr->totalSectors <<
			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
		} else {
			size = raidPtr->totalSectors >>
			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
		}
		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
			goto done;
		}
	} else {
		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
			db1_printf(("Bounds check failed!!:%d %d\n",
				(int) bp->b_blkno, (int) wlabel));
			goto done;
		}
	}

	rf_lock_mutex2(raidPtr->iodone_lock);

	bp->b_resid = 0;

	/* stuff it onto our queue */
	bufq_put(rs->buf_queue, bp);

	/* scheduled the IO to happen at the next convenient time */
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);

	return;

done:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
977 
978 /* ARGSUSED */
979 static int
980 raidread(dev_t dev, struct uio *uio, int flags)
981 {
982 	int     unit = raidunit(dev);
983 	struct raid_softc *rs;
984 
985 	if ((rs = raidget(unit)) == NULL)
986 		return ENXIO;
987 
988 	if ((rs->sc_flags & RAIDF_INITED) == 0)
989 		return (ENXIO);
990 
991 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
992 
993 }
994 
995 /* ARGSUSED */
996 static int
997 raidwrite(dev_t dev, struct uio *uio, int flags)
998 {
999 	int     unit = raidunit(dev);
1000 	struct raid_softc *rs;
1001 
1002 	if ((rs = raidget(unit)) == NULL)
1003 		return ENXIO;
1004 
1005 	if ((rs->sc_flags & RAIDF_INITED) == 0)
1006 		return (ENXIO);
1007 
1008 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
1009 
1010 }
1011 
/*
 * Tear down a raid unit.  Caller must hold the unit lock (raidlock).
 * Refuses (EBUSY) while any partition is open; otherwise shuts down the
 * RAIDframe core first (if configured) and then detaches and destroys
 * the generic disk.  Returns 0 on success or the rf_Shutdown() error.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	int error;
	RF_Raid_t *raidPtr;

	raidPtr = &rs->sc_r;

	/*
	 * If somebody has a partition mounted, we shouldn't
	 * shutdown.
	 */
	if (rs->sc_dkdev.dk_openmask != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		;	/* not initialized: nothing to do */
	else if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;
	else
		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);

	/* Detach the disk. */
	dkwedge_delall(&rs->sc_dkdev);
	disk_detach(&rs->sc_dkdev);
	disk_destroy(&rs->sc_dkdev);

	aprint_normal_dev(rs->sc_dev, "detached\n");

	return 0;
}
1043 
1044 static int
1045 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1046 {
1047 	int     unit = raidunit(dev);
1048 	int     error = 0;
1049 	int     part, pmask, s;
1050 	cfdata_t cf;
1051 	struct raid_softc *rs;
1052 	RF_Config_t *k_cfg, *u_cfg;
1053 	RF_Raid_t *raidPtr;
1054 	RF_RaidDisk_t *diskPtr;
1055 	RF_AccTotals_t *totals;
1056 	RF_DeviceConfig_t *d_cfg, **ucfgp;
1057 	u_char *specific_buf;
1058 	int retcode = 0;
1059 	int column;
1060 /*	int raidid; */
1061 	struct rf_recon_req *rrcopy, *rr;
1062 	RF_ComponentLabel_t *clabel;
1063 	RF_ComponentLabel_t *ci_label;
1064 	RF_ComponentLabel_t **clabel_ptr;
1065 	RF_SingleComponent_t *sparePtr,*componentPtr;
1066 	RF_SingleComponent_t component;
1067 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
1068 	int i, j, d;
1069 #ifdef __HAVE_OLD_DISKLABEL
1070 	struct disklabel newlabel;
1071 #endif
1072 
1073 	if ((rs = raidget(unit)) == NULL)
1074 		return ENXIO;
1075 	raidPtr = &rs->sc_r;
1076 
1077 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1078 		(int) DISKPART(dev), (int) unit, cmd));
1079 
1080 	/* Must be open for writes for these commands... */
1081 	switch (cmd) {
1082 #ifdef DIOCGSECTORSIZE
1083 	case DIOCGSECTORSIZE:
1084 		*(u_int *)data = raidPtr->bytesPerSector;
1085 		return 0;
1086 	case DIOCGMEDIASIZE:
1087 		*(off_t *)data =
1088 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
1089 		return 0;
1090 #endif
1091 	case DIOCSDINFO:
1092 	case DIOCWDINFO:
1093 #ifdef __HAVE_OLD_DISKLABEL
1094 	case ODIOCWDINFO:
1095 	case ODIOCSDINFO:
1096 #endif
1097 	case DIOCWLABEL:
1098 	case DIOCAWEDGE:
1099 	case DIOCDWEDGE:
1100 	case DIOCMWEDGES:
1101 	case DIOCSSTRATEGY:
1102 		if ((flag & FWRITE) == 0)
1103 			return (EBADF);
1104 	}
1105 
1106 	/* Must be initialized for these... */
1107 	switch (cmd) {
1108 	case DIOCGDINFO:
1109 	case DIOCSDINFO:
1110 	case DIOCWDINFO:
1111 #ifdef __HAVE_OLD_DISKLABEL
1112 	case ODIOCGDINFO:
1113 	case ODIOCWDINFO:
1114 	case ODIOCSDINFO:
1115 	case ODIOCGDEFLABEL:
1116 #endif
1117 	case DIOCGPARTINFO:
1118 	case DIOCWLABEL:
1119 	case DIOCGDEFLABEL:
1120 	case DIOCAWEDGE:
1121 	case DIOCDWEDGE:
1122 	case DIOCLWEDGES:
1123 	case DIOCMWEDGES:
1124 	case DIOCCACHESYNC:
1125 	case RAIDFRAME_SHUTDOWN:
1126 	case RAIDFRAME_REWRITEPARITY:
1127 	case RAIDFRAME_GET_INFO:
1128 	case RAIDFRAME_RESET_ACCTOTALS:
1129 	case RAIDFRAME_GET_ACCTOTALS:
1130 	case RAIDFRAME_KEEP_ACCTOTALS:
1131 	case RAIDFRAME_GET_SIZE:
1132 	case RAIDFRAME_FAIL_DISK:
1133 	case RAIDFRAME_COPYBACK:
1134 	case RAIDFRAME_CHECK_RECON_STATUS:
1135 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1136 	case RAIDFRAME_GET_COMPONENT_LABEL:
1137 	case RAIDFRAME_SET_COMPONENT_LABEL:
1138 	case RAIDFRAME_ADD_HOT_SPARE:
1139 	case RAIDFRAME_REMOVE_HOT_SPARE:
1140 	case RAIDFRAME_INIT_LABELS:
1141 	case RAIDFRAME_REBUILD_IN_PLACE:
1142 	case RAIDFRAME_CHECK_PARITY:
1143 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1144 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1145 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
1146 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1147 	case RAIDFRAME_SET_AUTOCONFIG:
1148 	case RAIDFRAME_SET_ROOT:
1149 	case RAIDFRAME_DELETE_COMPONENT:
1150 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
1151 	case RAIDFRAME_PARITYMAP_STATUS:
1152 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
1153 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
1154 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
1155 	case DIOCGSTRATEGY:
1156 	case DIOCSSTRATEGY:
1157 		if ((rs->sc_flags & RAIDF_INITED) == 0)
1158 			return (ENXIO);
1159 	}
1160 
1161 	switch (cmd) {
1162 #ifdef COMPAT_50
1163 	case RAIDFRAME_GET_INFO50:
1164 		return rf_get_info50(raidPtr, data);
1165 
1166 	case RAIDFRAME_CONFIGURE50:
1167 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1168 			return retcode;
1169 		goto config;
1170 #endif
1171 		/* configure the system */
1172 	case RAIDFRAME_CONFIGURE:
1173 
1174 		if (raidPtr->valid) {
1175 			/* There is a valid RAID set running on this unit! */
1176 			printf("raid%d: Device already configured!\n",unit);
1177 			return(EINVAL);
1178 		}
1179 
1180 		/* copy-in the configuration information */
1181 		/* data points to a pointer to the configuration structure */
1182 
1183 		u_cfg = *((RF_Config_t **) data);
1184 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1185 		if (k_cfg == NULL) {
1186 			return (ENOMEM);
1187 		}
1188 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1189 		if (retcode) {
1190 			RF_Free(k_cfg, sizeof(RF_Config_t));
1191 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1192 				retcode));
1193 			return (retcode);
1194 		}
1195 		goto config;
1196 	config:
1197 		/* allocate a buffer for the layout-specific data, and copy it
1198 		 * in */
1199 		if (k_cfg->layoutSpecificSize) {
1200 			if (k_cfg->layoutSpecificSize > 10000) {
1201 				/* sanity check */
1202 				RF_Free(k_cfg, sizeof(RF_Config_t));
1203 				return (EINVAL);
1204 			}
1205 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1206 			    (u_char *));
1207 			if (specific_buf == NULL) {
1208 				RF_Free(k_cfg, sizeof(RF_Config_t));
1209 				return (ENOMEM);
1210 			}
1211 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1212 			    k_cfg->layoutSpecificSize);
1213 			if (retcode) {
1214 				RF_Free(k_cfg, sizeof(RF_Config_t));
1215 				RF_Free(specific_buf,
1216 					k_cfg->layoutSpecificSize);
1217 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1218 					retcode));
1219 				return (retcode);
1220 			}
1221 		} else
1222 			specific_buf = NULL;
1223 		k_cfg->layoutSpecific = specific_buf;
1224 
1225 		/* should do some kind of sanity check on the configuration.
1226 		 * Store the sum of all the bytes in the last byte? */
1227 
1228 		/* configure the system */
1229 
1230 		/*
1231 		 * Clear the entire RAID descriptor, just to make sure
1232 		 *  there is no stale data left in the case of a
1233 		 *  reconfiguration
1234 		 */
1235 		memset(raidPtr, 0, sizeof(*raidPtr));
1236 		raidPtr->softc = rs;
1237 		raidPtr->raidid = unit;
1238 
1239 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
1240 
1241 		if (retcode == 0) {
1242 
1243 			/* allow this many simultaneous IO's to
1244 			   this RAID device */
1245 			raidPtr->openings = RAIDOUTSTANDING;
1246 
1247 			raidinit(rs);
1248 			rf_markalldirty(raidPtr);
1249 		}
1250 		/* free the buffers.  No return code here. */
1251 		if (k_cfg->layoutSpecificSize) {
1252 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1253 		}
1254 		RF_Free(k_cfg, sizeof(RF_Config_t));
1255 
1256 		return (retcode);
1257 
1258 		/* shutdown the system */
1259 	case RAIDFRAME_SHUTDOWN:
1260 
1261 		part = DISKPART(dev);
1262 		pmask = (1 << part);
1263 
1264 		if ((error = raidlock(rs)) != 0)
1265 			return (error);
1266 
1267 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1268 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1269 			(rs->sc_dkdev.dk_copenmask & pmask)))
1270 			retcode = EBUSY;
1271 		else {
1272 			rs->sc_flags |= RAIDF_SHUTDOWN;
1273 			rs->sc_dkdev.dk_copenmask &= ~pmask;
1274 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
1275 			rs->sc_dkdev.dk_openmask &= ~pmask;
1276 			retcode = 0;
1277 		}
1278 
1279 		raidunlock(rs);
1280 
1281 		if (retcode != 0)
1282 			return retcode;
1283 
1284 		/* free the pseudo device attach bits */
1285 
1286 		cf = device_cfdata(rs->sc_dev);
1287 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
1288 			free(cf, M_RAIDFRAME);
1289 
1290 		return (retcode);
1291 	case RAIDFRAME_GET_COMPONENT_LABEL:
1292 		clabel_ptr = (RF_ComponentLabel_t **) data;
1293 		/* need to read the component label for the disk indicated
1294 		   by row,column in clabel */
1295 
1296 		/*
1297 		 * Perhaps there should be an option to skip the in-core
1298 		 * copy and hit the disk, as with disklabel(8).
1299 		 */
1300 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1301 
1302 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1303 
1304 		if (retcode) {
1305 			RF_Free(clabel, sizeof(*clabel));
1306 			return retcode;
1307 		}
1308 
1309 		clabel->row = 0; /* Don't allow looking at anything else.*/
1310 
1311 		column = clabel->column;
1312 
1313 		if ((column < 0) || (column >= raidPtr->numCol +
1314 		    raidPtr->numSpare)) {
1315 			RF_Free(clabel, sizeof(*clabel));
1316 			return EINVAL;
1317 		}
1318 
1319 		RF_Free(clabel, sizeof(*clabel));
1320 
1321 		clabel = raidget_component_label(raidPtr, column);
1322 
1323 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1324 
1325 #if 0
1326 	case RAIDFRAME_SET_COMPONENT_LABEL:
1327 		clabel = (RF_ComponentLabel_t *) data;
1328 
1329 		/* XXX check the label for valid stuff... */
1330 		/* Note that some things *should not* get modified --
1331 		   the user should be re-initing the labels instead of
1332 		   trying to patch things.
1333 		   */
1334 
1335 		raidid = raidPtr->raidid;
1336 #ifdef DEBUG
1337 		printf("raid%d: Got component label:\n", raidid);
1338 		printf("raid%d: Version: %d\n", raidid, clabel->version);
1339 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1340 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1341 		printf("raid%d: Column: %d\n", raidid, clabel->column);
1342 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1343 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1344 		printf("raid%d: Status: %d\n", raidid, clabel->status);
1345 #endif
1346 		clabel->row = 0;
1347 		column = clabel->column;
1348 
1349 		if ((column < 0) || (column >= raidPtr->numCol)) {
1350 			return(EINVAL);
1351 		}
1352 
1353 		/* XXX this isn't allowed to do anything for now :-) */
1354 
1355 		/* XXX and before it is, we need to fill in the rest
1356 		   of the fields!?!?!?! */
1357 		memcpy(raidget_component_label(raidPtr, column),
1358 		    clabel, sizeof(*clabel));
1359 		raidflush_component_label(raidPtr, column);
1360 		return (0);
1361 #endif
1362 
1363 	case RAIDFRAME_INIT_LABELS:
1364 		clabel = (RF_ComponentLabel_t *) data;
1365 		/*
1366 		   we only want the serial number from
1367 		   the above.  We get all the rest of the information
1368 		   from the config that was used to create this RAID
1369 		   set.
1370 		   */
1371 
1372 		raidPtr->serial_number = clabel->serial_number;
1373 
1374 		for(column=0;column<raidPtr->numCol;column++) {
1375 			diskPtr = &raidPtr->Disks[column];
1376 			if (!RF_DEAD_DISK(diskPtr->status)) {
1377 				ci_label = raidget_component_label(raidPtr,
1378 				    column);
1379 				/* Zeroing this is important. */
1380 				memset(ci_label, 0, sizeof(*ci_label));
1381 				raid_init_component_label(raidPtr, ci_label);
1382 				ci_label->serial_number =
1383 				    raidPtr->serial_number;
1384 				ci_label->row = 0; /* we dont' pretend to support more */
1385 				rf_component_label_set_partitionsize(ci_label,
1386 				    diskPtr->partitionSize);
1387 				ci_label->column = column;
1388 				raidflush_component_label(raidPtr, column);
1389 			}
1390 			/* XXXjld what about the spares? */
1391 		}
1392 
1393 		return (retcode);
1394 	case RAIDFRAME_SET_AUTOCONFIG:
1395 		d = rf_set_autoconfig(raidPtr, *(int *) data);
1396 		printf("raid%d: New autoconfig value is: %d\n",
1397 		       raidPtr->raidid, d);
1398 		*(int *) data = d;
1399 		return (retcode);
1400 
1401 	case RAIDFRAME_SET_ROOT:
1402 		d = rf_set_rootpartition(raidPtr, *(int *) data);
1403 		printf("raid%d: New rootpartition value is: %d\n",
1404 		       raidPtr->raidid, d);
1405 		*(int *) data = d;
1406 		return (retcode);
1407 
1408 		/* initialize all parity */
1409 	case RAIDFRAME_REWRITEPARITY:
1410 
1411 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1412 			/* Parity for RAID 0 is trivially correct */
1413 			raidPtr->parity_good = RF_RAID_CLEAN;
1414 			return(0);
1415 		}
1416 
1417 		if (raidPtr->parity_rewrite_in_progress == 1) {
1418 			/* Re-write is already in progress! */
1419 			return(EINVAL);
1420 		}
1421 
1422 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1423 					   rf_RewriteParityThread,
1424 					   raidPtr,"raid_parity");
1425 		return (retcode);
1426 
1427 
1428 	case RAIDFRAME_ADD_HOT_SPARE:
1429 		sparePtr = (RF_SingleComponent_t *) data;
1430 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1431 		retcode = rf_add_hot_spare(raidPtr, &component);
1432 		return(retcode);
1433 
1434 	case RAIDFRAME_REMOVE_HOT_SPARE:
1435 		return(retcode);
1436 
1437 	case RAIDFRAME_DELETE_COMPONENT:
1438 		componentPtr = (RF_SingleComponent_t *)data;
1439 		memcpy( &component, componentPtr,
1440 			sizeof(RF_SingleComponent_t));
1441 		retcode = rf_delete_component(raidPtr, &component);
1442 		return(retcode);
1443 
1444 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
1445 		componentPtr = (RF_SingleComponent_t *)data;
1446 		memcpy( &component, componentPtr,
1447 			sizeof(RF_SingleComponent_t));
1448 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
1449 		return(retcode);
1450 
1451 	case RAIDFRAME_REBUILD_IN_PLACE:
1452 
1453 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1454 			/* Can't do this on a RAID 0!! */
1455 			return(EINVAL);
1456 		}
1457 
1458 		if (raidPtr->recon_in_progress == 1) {
1459 			/* a reconstruct is already in progress! */
1460 			return(EINVAL);
1461 		}
1462 
1463 		componentPtr = (RF_SingleComponent_t *) data;
1464 		memcpy( &component, componentPtr,
1465 			sizeof(RF_SingleComponent_t));
1466 		component.row = 0; /* we don't support any more */
1467 		column = component.column;
1468 
1469 		if ((column < 0) || (column >= raidPtr->numCol)) {
1470 			return(EINVAL);
1471 		}
1472 
1473 		rf_lock_mutex2(raidPtr->mutex);
1474 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1475 		    (raidPtr->numFailures > 0)) {
1476 			/* XXX 0 above shouldn't be constant!!! */
1477 			/* some component other than this has failed.
1478 			   Let's not make things worse than they already
1479 			   are... */
1480 			printf("raid%d: Unable to reconstruct to disk at:\n",
1481 			       raidPtr->raidid);
1482 			printf("raid%d:     Col: %d   Too many failures.\n",
1483 			       raidPtr->raidid, column);
1484 			rf_unlock_mutex2(raidPtr->mutex);
1485 			return (EINVAL);
1486 		}
1487 		if (raidPtr->Disks[column].status ==
1488 		    rf_ds_reconstructing) {
1489 			printf("raid%d: Unable to reconstruct to disk at:\n",
1490 			       raidPtr->raidid);
1491 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
1492 
1493 			rf_unlock_mutex2(raidPtr->mutex);
1494 			return (EINVAL);
1495 		}
1496 		if (raidPtr->Disks[column].status == rf_ds_spared) {
1497 			rf_unlock_mutex2(raidPtr->mutex);
1498 			return (EINVAL);
1499 		}
1500 		rf_unlock_mutex2(raidPtr->mutex);
1501 
1502 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1503 		if (rrcopy == NULL)
1504 			return(ENOMEM);
1505 
1506 		rrcopy->raidPtr = (void *) raidPtr;
1507 		rrcopy->col = column;
1508 
1509 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1510 					   rf_ReconstructInPlaceThread,
1511 					   rrcopy,"raid_reconip");
1512 		return(retcode);
1513 
1514 	case RAIDFRAME_GET_INFO:
1515 		if (!raidPtr->valid)
1516 			return (ENODEV);
1517 		ucfgp = (RF_DeviceConfig_t **) data;
1518 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1519 			  (RF_DeviceConfig_t *));
1520 		if (d_cfg == NULL)
1521 			return (ENOMEM);
1522 		d_cfg->rows = 1; /* there is only 1 row now */
1523 		d_cfg->cols = raidPtr->numCol;
1524 		d_cfg->ndevs = raidPtr->numCol;
1525 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
1526 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1527 			return (ENOMEM);
1528 		}
1529 		d_cfg->nspares = raidPtr->numSpare;
1530 		if (d_cfg->nspares >= RF_MAX_DISKS) {
1531 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1532 			return (ENOMEM);
1533 		}
1534 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1535 		d = 0;
1536 		for (j = 0; j < d_cfg->cols; j++) {
1537 			d_cfg->devs[d] = raidPtr->Disks[j];
1538 			d++;
1539 		}
1540 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1541 			d_cfg->spares[i] = raidPtr->Disks[j];
1542 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
1543 				/* XXX: raidctl(8) expects to see this as a used spare */
1544 				d_cfg->spares[i].status = rf_ds_used_spare;
1545 			}
1546 		}
1547 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1548 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1549 
1550 		return (retcode);
1551 
1552 	case RAIDFRAME_CHECK_PARITY:
1553 		*(int *) data = raidPtr->parity_good;
1554 		return (0);
1555 
1556 	case RAIDFRAME_PARITYMAP_STATUS:
1557 		if (rf_paritymap_ineligible(raidPtr))
1558 			return EINVAL;
1559 		rf_paritymap_status(raidPtr->parity_map,
1560 		    (struct rf_pmstat *)data);
1561 		return 0;
1562 
1563 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
1564 		if (rf_paritymap_ineligible(raidPtr))
1565 			return EINVAL;
1566 		if (raidPtr->parity_map == NULL)
1567 			return ENOENT; /* ??? */
1568 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1569 			(struct rf_pmparams *)data, 1))
1570 			return EINVAL;
1571 		return 0;
1572 
1573 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
1574 		if (rf_paritymap_ineligible(raidPtr))
1575 			return EINVAL;
1576 		*(int *) data = rf_paritymap_get_disable(raidPtr);
1577 		return 0;
1578 
1579 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
1580 		if (rf_paritymap_ineligible(raidPtr))
1581 			return EINVAL;
1582 		rf_paritymap_set_disable(raidPtr, *(int *)data);
1583 		/* XXX should errors be passed up? */
1584 		return 0;
1585 
1586 	case RAIDFRAME_RESET_ACCTOTALS:
1587 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1588 		return (0);
1589 
1590 	case RAIDFRAME_GET_ACCTOTALS:
1591 		totals = (RF_AccTotals_t *) data;
1592 		*totals = raidPtr->acc_totals;
1593 		return (0);
1594 
1595 	case RAIDFRAME_KEEP_ACCTOTALS:
1596 		raidPtr->keep_acc_totals = *(int *)data;
1597 		return (0);
1598 
1599 	case RAIDFRAME_GET_SIZE:
1600 		*(int *) data = raidPtr->totalSectors;
1601 		return (0);
1602 
1603 		/* fail a disk & optionally start reconstruction */
1604 	case RAIDFRAME_FAIL_DISK:
1605 
1606 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1607 			/* Can't do this on a RAID 0!! */
1608 			return(EINVAL);
1609 		}
1610 
1611 		rr = (struct rf_recon_req *) data;
1612 		rr->row = 0;
1613 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
1614 			return (EINVAL);
1615 
1616 
1617 		rf_lock_mutex2(raidPtr->mutex);
1618 		if (raidPtr->status == rf_rs_reconstructing) {
1619 			/* you can't fail a disk while we're reconstructing! */
1620 			/* XXX wrong for RAID6 */
1621 			rf_unlock_mutex2(raidPtr->mutex);
1622 			return (EINVAL);
1623 		}
1624 		if ((raidPtr->Disks[rr->col].status ==
1625 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1626 			/* some other component has failed.  Let's not make
1627 			   things worse. XXX wrong for RAID6 */
1628 			rf_unlock_mutex2(raidPtr->mutex);
1629 			return (EINVAL);
1630 		}
1631 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1632 			/* Can't fail a spared disk! */
1633 			rf_unlock_mutex2(raidPtr->mutex);
1634 			return (EINVAL);
1635 		}
1636 		rf_unlock_mutex2(raidPtr->mutex);
1637 
1638 		/* make a copy of the recon request so that we don't rely on
1639 		 * the user's buffer */
1640 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1641 		if (rrcopy == NULL)
1642 			return(ENOMEM);
1643 		memcpy(rrcopy, rr, sizeof(*rr));
1644 		rrcopy->raidPtr = (void *) raidPtr;
1645 
1646 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1647 					   rf_ReconThread,
1648 					   rrcopy,"raid_recon");
1649 		return (0);
1650 
1651 		/* invoke a copyback operation after recon on whatever disk
1652 		 * needs it, if any */
1653 	case RAIDFRAME_COPYBACK:
1654 
1655 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1656 			/* This makes no sense on a RAID 0!! */
1657 			return(EINVAL);
1658 		}
1659 
1660 		if (raidPtr->copyback_in_progress == 1) {
1661 			/* Copyback is already in progress! */
1662 			return(EINVAL);
1663 		}
1664 
1665 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1666 					   rf_CopybackThread,
1667 					   raidPtr,"raid_copyback");
1668 		return (retcode);
1669 
1670 		/* return the percentage completion of reconstruction */
1671 	case RAIDFRAME_CHECK_RECON_STATUS:
1672 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1673 			/* This makes no sense on a RAID 0, so tell the
1674 			   user it's done. */
1675 			*(int *) data = 100;
1676 			return(0);
1677 		}
1678 		if (raidPtr->status != rf_rs_reconstructing)
1679 			*(int *) data = 100;
1680 		else {
1681 			if (raidPtr->reconControl->numRUsTotal > 0) {
1682 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1683 			} else {
1684 				*(int *) data = 0;
1685 			}
1686 		}
1687 		return (0);
1688 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1689 		progressInfoPtr = (RF_ProgressInfo_t **) data;
1690 		if (raidPtr->status != rf_rs_reconstructing) {
1691 			progressInfo.remaining = 0;
1692 			progressInfo.completed = 100;
1693 			progressInfo.total = 100;
1694 		} else {
1695 			progressInfo.total =
1696 				raidPtr->reconControl->numRUsTotal;
1697 			progressInfo.completed =
1698 				raidPtr->reconControl->numRUsComplete;
1699 			progressInfo.remaining = progressInfo.total -
1700 				progressInfo.completed;
1701 		}
1702 		retcode = copyout(&progressInfo, *progressInfoPtr,
1703 				  sizeof(RF_ProgressInfo_t));
1704 		return (retcode);
1705 
1706 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1707 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1708 			/* This makes no sense on a RAID 0, so tell the
1709 			   user it's done. */
1710 			*(int *) data = 100;
1711 			return(0);
1712 		}
1713 		if (raidPtr->parity_rewrite_in_progress == 1) {
1714 			*(int *) data = 100 *
1715 				raidPtr->parity_rewrite_stripes_done /
1716 				raidPtr->Layout.numStripe;
1717 		} else {
1718 			*(int *) data = 100;
1719 		}
1720 		return (0);
1721 
1722 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1723 		progressInfoPtr = (RF_ProgressInfo_t **) data;
1724 		if (raidPtr->parity_rewrite_in_progress == 1) {
1725 			progressInfo.total = raidPtr->Layout.numStripe;
1726 			progressInfo.completed =
1727 				raidPtr->parity_rewrite_stripes_done;
1728 			progressInfo.remaining = progressInfo.total -
1729 				progressInfo.completed;
1730 		} else {
1731 			progressInfo.remaining = 0;
1732 			progressInfo.completed = 100;
1733 			progressInfo.total = 100;
1734 		}
1735 		retcode = copyout(&progressInfo, *progressInfoPtr,
1736 				  sizeof(RF_ProgressInfo_t));
1737 		return (retcode);
1738 
1739 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
1740 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1741 			/* This makes no sense on a RAID 0 */
1742 			*(int *) data = 100;
1743 			return(0);
1744 		}
1745 		if (raidPtr->copyback_in_progress == 1) {
1746 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
1747 				raidPtr->Layout.numStripe;
1748 		} else {
1749 			*(int *) data = 100;
1750 		}
1751 		return (0);
1752 
1753 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1754 		progressInfoPtr = (RF_ProgressInfo_t **) data;
1755 		if (raidPtr->copyback_in_progress == 1) {
1756 			progressInfo.total = raidPtr->Layout.numStripe;
1757 			progressInfo.completed =
1758 				raidPtr->copyback_stripes_done;
1759 			progressInfo.remaining = progressInfo.total -
1760 				progressInfo.completed;
1761 		} else {
1762 			progressInfo.remaining = 0;
1763 			progressInfo.completed = 100;
1764 			progressInfo.total = 100;
1765 		}
1766 		retcode = copyout(&progressInfo, *progressInfoPtr,
1767 				  sizeof(RF_ProgressInfo_t));
1768 		return (retcode);
1769 
1770 		/* the sparetable daemon calls this to wait for the kernel to
1771 		 * need a spare table. this ioctl does not return until a
1772 		 * spare table is needed. XXX -- calling mpsleep here in the
1773 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1774 		 * -- I should either compute the spare table in the kernel,
1775 		 * or have a different -- XXX XXX -- interface (a different
1776 		 * character device) for delivering the table     -- XXX */
1777 #if 0
1778 	case RAIDFRAME_SPARET_WAIT:
1779 		rf_lock_mutex2(rf_sparet_wait_mutex);
1780 		while (!rf_sparet_wait_queue)
1781 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1782 		waitreq = rf_sparet_wait_queue;
1783 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1784 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1785 
1786 		/* structure assignment */
1787 		*((RF_SparetWait_t *) data) = *waitreq;
1788 
1789 		RF_Free(waitreq, sizeof(*waitreq));
1790 		return (0);
1791 
1792 		/* wakes up a process waiting on SPARET_WAIT and puts an error
1793 		 * code in it that will cause the dameon to exit */
1794 	case RAIDFRAME_ABORT_SPARET_WAIT:
1795 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1796 		waitreq->fcol = -1;
1797 		rf_lock_mutex2(rf_sparet_wait_mutex);
1798 		waitreq->next = rf_sparet_wait_queue;
1799 		rf_sparet_wait_queue = waitreq;
1800 		rf_broadcast_conf2(rf_sparet_wait_cv);
1801 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1802 		return (0);
1803 
1804 		/* used by the spare table daemon to deliver a spare table
1805 		 * into the kernel */
1806 	case RAIDFRAME_SEND_SPARET:
1807 
1808 		/* install the spare table */
1809 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1810 
1811 		/* respond to the requestor.  the return status of the spare
1812 		 * table installation is passed in the "fcol" field */
1813 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1814 		waitreq->fcol = retcode;
1815 		rf_lock_mutex2(rf_sparet_wait_mutex);
1816 		waitreq->next = rf_sparet_resp_queue;
1817 		rf_sparet_resp_queue = waitreq;
1818 		rf_broadcast_cond2(rf_sparet_resp_cv);
1819 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1820 
1821 		return (retcode);
1822 #endif
1823 
1824 	default:
1825 		break; /* fall through to the os-specific code below */
1826 
1827 	}
1828 
1829 	if (!raidPtr->valid)
1830 		return (EINVAL);
1831 
1832 	/*
1833 	 * Add support for "regular" device ioctls here.
1834 	 */
1835 
1836 	error = disk_ioctl(&rs->sc_dkdev, dev, cmd, data, flag, l);
1837 	if (error != EPASSTHROUGH)
1838 		return (error);
1839 
1840 	switch (cmd) {
1841 	case DIOCWDINFO:
1842 	case DIOCSDINFO:
1843 #ifdef __HAVE_OLD_DISKLABEL
1844 	case ODIOCWDINFO:
1845 	case ODIOCSDINFO:
1846 #endif
1847 	{
1848 		struct disklabel *lp;
1849 #ifdef __HAVE_OLD_DISKLABEL
1850 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1851 			memset(&newlabel, 0, sizeof newlabel);
1852 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
1853 			lp = &newlabel;
1854 		} else
1855 #endif
1856 		lp = (struct disklabel *)data;
1857 
1858 		if ((error = raidlock(rs)) != 0)
1859 			return (error);
1860 
1861 		rs->sc_flags |= RAIDF_LABELLING;
1862 
1863 		error = setdisklabel(rs->sc_dkdev.dk_label,
1864 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
1865 		if (error == 0) {
1866 			if (cmd == DIOCWDINFO
1867 #ifdef __HAVE_OLD_DISKLABEL
1868 			    || cmd == ODIOCWDINFO
1869 #endif
1870 			   )
1871 				error = writedisklabel(RAIDLABELDEV(dev),
1872 				    raidstrategy, rs->sc_dkdev.dk_label,
1873 				    rs->sc_dkdev.dk_cpulabel);
1874 		}
1875 		rs->sc_flags &= ~RAIDF_LABELLING;
1876 
1877 		raidunlock(rs);
1878 
1879 		if (error)
1880 			return (error);
1881 		break;
1882 	}
1883 
1884 	case DIOCWLABEL:
1885 		if (*(int *) data != 0)
1886 			rs->sc_flags |= RAIDF_WLABEL;
1887 		else
1888 			rs->sc_flags &= ~RAIDF_WLABEL;
1889 		break;
1890 
1891 	case DIOCGDEFLABEL:
1892 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1893 		break;
1894 
1895 #ifdef __HAVE_OLD_DISKLABEL
1896 	case ODIOCGDEFLABEL:
1897 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
1898 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1899 			return ENOTTY;
1900 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
1901 		break;
1902 #endif
1903 
1904 	case DIOCCACHESYNC:
1905 		return rf_sync_component_caches(raidPtr);
1906 
1907 	case DIOCGSTRATEGY:
1908 	    {
1909 		struct disk_strategy *dks = (void *)data;
1910 
1911 		s = splbio();
1912 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
1913 		    sizeof(dks->dks_name));
1914 		splx(s);
1915 		dks->dks_paramlen = 0;
1916 
1917 		return 0;
1918 	    }
1919 
1920 	case DIOCSSTRATEGY:
1921 	    {
1922 		struct disk_strategy *dks = (void *)data;
1923 		struct bufq_state *new;
1924 		struct bufq_state *old;
1925 
1926 		if (dks->dks_param != NULL) {
1927 			return EINVAL;
1928 		}
1929 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
1930 		error = bufq_alloc(&new, dks->dks_name,
1931 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
1932 		if (error) {
1933 			return error;
1934 		}
1935 		s = splbio();
1936 		old = rs->buf_queue;
1937 		bufq_move(new, old);
1938 		rs->buf_queue = new;
1939 		splx(s);
1940 		bufq_free(old);
1941 
1942 		return 0;
1943 	    }
1944 
1945 	default:
1946 		retcode = ENOTTY;
1947 	}
1948 	return (retcode);
1949 
1950 }
1951 
1952 
1953 /* raidinit -- complete the rest of the initialization for the
1954    RAIDframe device.  */
1955 
1956 
1957 static void
1958 raidinit(struct raid_softc *rs)
1959 {
1960 	cfdata_t cf;
1961 	int     unit;
1962 	RF_Raid_t *raidPtr = &rs->sc_r;
1963 
1964 	unit = raidPtr->raidid;
1965 
1966 
1967 	/* XXX should check return code first... */
1968 	rs->sc_flags |= RAIDF_INITED;
1969 
1970 	/* XXX doesn't check bounds. */
1971 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
1972 
1973 	/* attach the pseudo device */
1974 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1975 	cf->cf_name = raid_cd.cd_name;
1976 	cf->cf_atname = raid_cd.cd_name;
1977 	cf->cf_unit = unit;
1978 	cf->cf_fstate = FSTATE_STAR;
1979 
1980 	rs->sc_dev = config_attach_pseudo(cf);
1981 
1982 	if (rs->sc_dev == NULL) {
1983 		printf("raid%d: config_attach_pseudo failed\n",
1984 		    raidPtr->raidid);
1985 		rs->sc_flags &= ~RAIDF_INITED;
1986 		free(cf, M_RAIDFRAME);
1987 		return;
1988 	}
1989 
1990 	/* disk_attach actually creates space for the CPU disklabel, among
1991 	 * other things, so it's critical to call this *BEFORE* we try putzing
1992 	 * with disklabels. */
1993 
1994 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1995 	disk_attach(&rs->sc_dkdev);
1996 
1997 	/* XXX There may be a weird interaction here between this, and
1998 	 * protectedSectors, as used in RAIDframe.  */
1999 
2000 	rs->sc_size = raidPtr->totalSectors;
2001 
2002 	rf_set_geometry(rs, raidPtr);
2003 
2004 	dkwedge_discover(&rs->sc_dkdev);
2005 
2006 }
2007 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
2008 /* wake up the daemon & tell it to get us a spare table
2009  * XXX
2010  * the entries in the queues should be tagged with the raidPtr
2011  * so that in the extremely rare case that two recons happen at once,
2012  * we know for which device were requesting a spare table
2013  * XXX
2014  *
2015  * XXX This code is not currently used. GO
2016  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Post our request on the wait queue and wake the daemon
	 * (RAIDFRAME_SPARET_WAIT); the queues are protected by
	 * rf_sparet_wait_mutex. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* Block until the daemon delivers a response via
	 * RAIDFRAME_SEND_SPARET (which signals rf_sparet_resp_cv). */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Note: req is reused here to point at the response entry,
	 * which is a different allocation than the request we queued. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* The install status travels back in the response's fcol field. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
2040 #endif
2041 
2042 /* a wrapper around rf_DoAccess that extracts appropriate info from the
2043  * bp & passes it down.
2044  * any calls originating in the kernel must use non-blocking I/O
2045  * do some extra sanity checking to return "appropriate" error values for
2046  * certain conditions (to make some standard utilities work)
2047  *
2048  * Formerly known as: rf_DoAccessKernel
2049  */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the mutex around the label update, then retake
		 * it before touching numNewFailures again */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* Loop invariant: raidPtr->mutex is held when the condition is
	 * evaluated and released while each request is processed. */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		/* convert from DEV_BSIZE units to RAID sector units */
		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		/* "1 ||" makes this condition always true; the trace is
		 * still subject to whatever db1_printf() is compiled to. */
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* bounds check, with the extra comparisons catching
		 * arithmetic wrap-around in the sum */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* requests must be a whole number of sectors */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
			/* NOTE(review): openings was decremented above and
			 * is not restored here — presumably recovered by the
			 * completion path; verify against rf_DoAccess(). */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
2167 
2168 
2169 
2170 
2171 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
2172 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	/* back-pointer so the completion handler can find the queue */
	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		/* fake an immediate, error-free completion */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up bp for the component I/O; KernelWakeupFunc will
		 * be called from biodone with req in b_private */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	/* always reports success; errors surface via KernelWakeupFunc */
	return (0);
}
2246 /* this is the callback function associated with a I/O invoked from
2247    kernel code.
2248  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by rf_DispatchKernelIO() */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		/* i.e. the component is currently live (optimal or a
		 * used spare) and the layout can tolerate one more
		 * failure */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2315 
2316 
2317 /*
2318  * initialize a buf structure for doing an I/O in the kernel.
2319  */
2320 static void
2321 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2322        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2323        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2324        struct proc *b_proc)
2325 {
2326 	/* bp->b_flags       = B_PHYS | rw_flag; */
2327 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
2328 	bp->b_oflags = 0;
2329 	bp->b_cflags = 0;
2330 	bp->b_bcount = numSect << logBytesPerSector;
2331 	bp->b_bufsize = bp->b_bcount;
2332 	bp->b_error = 0;
2333 	bp->b_dev = dev;
2334 	bp->b_data = bf;
2335 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2336 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
2337 	if (bp->b_bcount == 0) {
2338 		panic("bp->b_bcount is zero in InitBP!!");
2339 	}
2340 	bp->b_proc = b_proc;
2341 	bp->b_iodone = cbFunc;
2342 	bp->b_private = cbArg;
2343 }
2344 
2345 static void
2346 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2347 		    struct disklabel *lp)
2348 {
2349 	memset(lp, 0, sizeof(*lp));
2350 
2351 	/* fabricate a label... */
2352 	if (raidPtr->totalSectors > UINT32_MAX)
2353 		lp->d_secperunit = UINT32_MAX;
2354 	else
2355 		lp->d_secperunit = raidPtr->totalSectors;
2356 	lp->d_secsize = raidPtr->bytesPerSector;
2357 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2358 	lp->d_ntracks = 4 * raidPtr->numCol;
2359 	lp->d_ncylinders = raidPtr->totalSectors /
2360 		(lp->d_nsectors * lp->d_ntracks);
2361 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2362 
2363 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2364 	lp->d_type = DKTYPE_RAID;
2365 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2366 	lp->d_rpm = 3600;
2367 	lp->d_interleave = 1;
2368 	lp->d_flags = 0;
2369 
2370 	lp->d_partitions[RAW_PART].p_offset = 0;
2371 	lp->d_partitions[RAW_PART].p_size = lp->d_secperunit;
2372 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2373 	lp->d_npartitions = RAW_PART + 1;
2374 
2375 	lp->d_magic = DISKMAGIC;
2376 	lp->d_magic2 = DISKMAGIC;
2377 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2378 
2379 }
2380 /*
2381  * Read the disklabel from the raid device.  If one is not present, fake one
2382  * up.
2383  */
static void
raidgetdisklabel(dev_t dev)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const char   *errstring;
	struct disklabel *lp;
	struct cpu_disklabel *clp;
	RF_Raid_t *raidPtr;

	if ((rs = raidget(unit)) == NULL)
		return;

	lp = rs->sc_dkdev.dk_label;
	clp = rs->sc_dkdev.dk_cpulabel;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = &rs->sc_r;

	/* start from a fabricated default in case nothing is on disk */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		/* no readable on-disk label: synthesize one */
		raidmakedisklabel(rs);
	else {
		int     i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		/* d_secperunit saturates at UINT32_MAX, so only an
		 * exact-match check is meaningful below that limit */
		if (lp->d_secperunit < UINT32_MAX ?
		    lp->d_secperunit != rs->sc_size :
		    lp->d_secperunit > rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%ju) != "
			    "the size of raid (%ju)\n", unit, rs->sc_xname,
			    (uintmax_t)lp->d_secperunit,
			    (uintmax_t)rs->sc_size);
		/* warn about partitions that run past the end of the set */
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				       "exceeds the size of raid (%ju)\n",
				       unit, rs->sc_xname, 'a' + i,
				       (uintmax_t)rs->sc_size);
		}
	}

}
2446 /*
2447  * Take care of things one might want to take care of in the event
2448  * that a disklabel isn't present.
2449  */
2450 static void
2451 raidmakedisklabel(struct raid_softc *rs)
2452 {
2453 	struct disklabel *lp = rs->sc_dkdev.dk_label;
2454 	db1_printf(("Making a label..\n"));
2455 
2456 	/*
2457 	 * For historical reasons, if there's no disklabel present
2458 	 * the raw partition must be marked FS_BSDFFS.
2459 	 */
2460 
2461 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2462 
2463 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2464 
2465 	lp->d_checksum = dkcksum(lp);
2466 }
2467 /*
2468  * Wait interruptibly for an exclusive lock.
2469  *
2470  * XXX
2471  * Several drivers do this; it should be abstracted and made MP-safe.
2472  * (Hmm... where have we seen this warning before :->  GO )
2473  */
2474 static int
2475 raidlock(struct raid_softc *rs)
2476 {
2477 	int     error;
2478 
2479 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2480 		rs->sc_flags |= RAIDF_WANTED;
2481 		if ((error =
2482 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2483 			return (error);
2484 	}
2485 	rs->sc_flags |= RAIDF_LOCKED;
2486 	return (0);
2487 }
2488 /*
2489  * Unlock and wake up any waiters.
2490  */
2491 static void
2492 raidunlock(struct raid_softc *rs)
2493 {
2494 
2495 	rs->sc_flags &= ~RAIDF_LOCKED;
2496 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2497 		rs->sc_flags &= ~RAIDF_WANTED;
2498 		wakeup(rs);
2499 	}
2500 }
2501 
2502 
2503 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
2504 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
2505 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
2506 
2507 static daddr_t
2508 rf_component_info_offset(void)
2509 {
2510 
2511 	return RF_COMPONENT_INFO_OFFSET;
2512 }
2513 
2514 static daddr_t
2515 rf_component_info_size(unsigned secsize)
2516 {
2517 	daddr_t info_size;
2518 
2519 	KASSERT(secsize);
2520 	if (secsize > RF_COMPONENT_INFO_SIZE)
2521 		info_size = secsize;
2522 	else
2523 		info_size = RF_COMPONENT_INFO_SIZE;
2524 
2525 	return info_size;
2526 }
2527 
2528 static daddr_t
2529 rf_parity_map_offset(RF_Raid_t *raidPtr)
2530 {
2531 	daddr_t map_offset;
2532 
2533 	KASSERT(raidPtr->bytesPerSector);
2534 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2535 		map_offset = raidPtr->bytesPerSector;
2536 	else
2537 		map_offset = RF_COMPONENT_INFO_SIZE;
2538 	map_offset += rf_component_info_offset();
2539 
2540 	return map_offset;
2541 }
2542 
2543 static daddr_t
2544 rf_parity_map_size(RF_Raid_t *raidPtr)
2545 {
2546 	daddr_t map_size;
2547 
2548 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2549 		map_size = raidPtr->bytesPerSector;
2550 	else
2551 		map_size = RF_PARITY_MAP_SIZE;
2552 
2553 	return map_size;
2554 }
2555 
2556 int
2557 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2558 {
2559 	RF_ComponentLabel_t *clabel;
2560 
2561 	clabel = raidget_component_label(raidPtr, col);
2562 	clabel->clean = RF_RAID_CLEAN;
2563 	raidflush_component_label(raidPtr, col);
2564 	return(0);
2565 }
2566 
2567 
2568 int
2569 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2570 {
2571 	RF_ComponentLabel_t *clabel;
2572 
2573 	clabel = raidget_component_label(raidPtr, col);
2574 	clabel->clean = RF_RAID_DIRTY;
2575 	raidflush_component_label(raidPtr, col);
2576 	return(0);
2577 }
2578 
2579 int
2580 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2581 {
2582 	KASSERT(raidPtr->bytesPerSector);
2583 	return raidread_component_label(raidPtr->bytesPerSector,
2584 	    raidPtr->Disks[col].dev,
2585 	    raidPtr->raid_cinfo[col].ci_vp,
2586 	    &raidPtr->raid_cinfo[col].ci_label);
2587 }
2588 
2589 RF_ComponentLabel_t *
2590 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2591 {
2592 	return &raidPtr->raid_cinfo[col].ci_label;
2593 }
2594 
2595 int
2596 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2597 {
2598 	RF_ComponentLabel_t *label;
2599 
2600 	label = &raidPtr->raid_cinfo[col].ci_label;
2601 	label->mod_counter = raidPtr->mod_counter;
2602 #ifndef RF_NO_PARITY_MAP
2603 	label->parity_map_modcount = label->mod_counter;
2604 #endif
2605 	return raidwrite_component_label(raidPtr->bytesPerSector,
2606 	    raidPtr->Disks[col].dev,
2607 	    raidPtr->raid_cinfo[col].ci_vp, label);
2608 }
2609 
2610 
2611 static int
2612 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2613     RF_ComponentLabel_t *clabel)
2614 {
2615 	return raidread_component_area(dev, b_vp, clabel,
2616 	    sizeof(RF_ComponentLabel_t),
2617 	    rf_component_info_offset(),
2618 	    rf_component_info_size(secsize));
2619 }
2620 
2621 /* ARGSUSED */
2622 static int
2623 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2624     size_t msize, daddr_t offset, daddr_t dsize)
2625 {
2626 	struct buf *bp;
2627 	const struct bdevsw *bdev;
2628 	int error;
2629 
2630 	/* XXX should probably ensure that we don't try to do this if
2631 	   someone has changed rf_protected_sectors. */
2632 
2633 	if (b_vp == NULL) {
2634 		/* For whatever reason, this component is not valid.
2635 		   Don't try to read a component label from it. */
2636 		return(EINVAL);
2637 	}
2638 
2639 	/* get a block of the appropriate size... */
2640 	bp = geteblk((int)dsize);
2641 	bp->b_dev = dev;
2642 
2643 	/* get our ducks in a row for the read */
2644 	bp->b_blkno = offset / DEV_BSIZE;
2645 	bp->b_bcount = dsize;
2646 	bp->b_flags |= B_READ;
2647  	bp->b_resid = dsize;
2648 
2649 	bdev = bdevsw_lookup(bp->b_dev);
2650 	if (bdev == NULL)
2651 		return (ENXIO);
2652 	(*bdev->d_strategy)(bp);
2653 
2654 	error = biowait(bp);
2655 
2656 	if (!error) {
2657 		memcpy(data, bp->b_data, msize);
2658 	}
2659 
2660 	brelse(bp, 0);
2661 	return(error);
2662 }
2663 
2664 
2665 static int
2666 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2667     RF_ComponentLabel_t *clabel)
2668 {
2669 	return raidwrite_component_area(dev, b_vp, clabel,
2670 	    sizeof(RF_ComponentLabel_t),
2671 	    rf_component_info_offset(),
2672 	    rf_component_info_size(secsize), 0);
2673 }
2674 
2675 /* ARGSUSED */
2676 static int
2677 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2678     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2679 {
2680 	struct buf *bp;
2681 	const struct bdevsw *bdev;
2682 	int error;
2683 
2684 	/* get a block of the appropriate size... */
2685 	bp = geteblk((int)dsize);
2686 	bp->b_dev = dev;
2687 
2688 	/* get our ducks in a row for the write */
2689 	bp->b_blkno = offset / DEV_BSIZE;
2690 	bp->b_bcount = dsize;
2691 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2692  	bp->b_resid = dsize;
2693 
2694 	memset(bp->b_data, 0, dsize);
2695 	memcpy(bp->b_data, data, msize);
2696 
2697 	bdev = bdevsw_lookup(bp->b_dev);
2698 	if (bdev == NULL)
2699 		return (ENXIO);
2700 	(*bdev->d_strategy)(bp);
2701 	if (asyncp)
2702 		return 0;
2703 	error = biowait(bp);
2704 	brelse(bp, 0);
2705 	if (error) {
2706 #if 1
2707 		printf("Failed to write RAID component info!\n");
2708 #endif
2709 	}
2710 
2711 	return(error);
2712 }
2713 
2714 void
2715 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2716 {
2717 	int c;
2718 
2719 	for (c = 0; c < raidPtr->numCol; c++) {
2720 		/* Skip dead disks. */
2721 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2722 			continue;
2723 		/* XXXjld: what if an error occurs here? */
2724 		raidwrite_component_area(raidPtr->Disks[c].dev,
2725 		    raidPtr->raid_cinfo[c].ci_vp, map,
2726 		    RF_PARITYMAP_NBYTE,
2727 		    rf_parity_map_offset(raidPtr),
2728 		    rf_parity_map_size(raidPtr), 0);
2729 	}
2730 }
2731 
2732 void
2733 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2734 {
2735 	struct rf_paritymap_ondisk tmp;
2736 	int c,first;
2737 
2738 	first=1;
2739 	for (c = 0; c < raidPtr->numCol; c++) {
2740 		/* Skip dead disks. */
2741 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2742 			continue;
2743 		raidread_component_area(raidPtr->Disks[c].dev,
2744 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
2745 		    RF_PARITYMAP_NBYTE,
2746 		    rf_parity_map_offset(raidPtr),
2747 		    rf_parity_map_size(raidPtr));
2748 		if (first) {
2749 			memcpy(map, &tmp, sizeof(*map));
2750 			first = 0;
2751 		} else {
2752 			rf_paritymap_merge(map, &tmp);
2753 		}
2754 	}
2755 }
2756 
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	/* new mod_counter for the labels we are about to write */
	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* now handle any spares that are in active use */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			/* NOTE(review): if no column claims this spare,
			   scol keeps its prior value (initially -1) —
			   presumably a used spare always has a mapping;
			   verify. */
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2816 
2817 
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	/* new label generation for everything written below */
	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			/* only the final (shutdown) update may set the
			 * clean bit, and only when parity is known good */
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			/* NOTE(review): if no column claims this spare,
			   scol keeps its prior value (initially -1) —
			   presumably a used spare always has a mapping;
			   verify. */
			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2892 
2893 void
2894 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2895 {
2896 
2897 	if (vp != NULL) {
2898 		if (auto_configured == 1) {
2899 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2900 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2901 			vput(vp);
2902 
2903 		} else {
2904 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2905 		}
2906 	}
2907 }
2908 
2909 
2910 void
2911 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2912 {
2913 	int r,c;
2914 	struct vnode *vp;
2915 	int acd;
2916 
2917 
2918 	/* We take this opportunity to close the vnodes like we should.. */
2919 
2920 	for (c = 0; c < raidPtr->numCol; c++) {
2921 		vp = raidPtr->raid_cinfo[c].ci_vp;
2922 		acd = raidPtr->Disks[c].auto_configured;
2923 		rf_close_component(raidPtr, vp, acd);
2924 		raidPtr->raid_cinfo[c].ci_vp = NULL;
2925 		raidPtr->Disks[c].auto_configured = 0;
2926 	}
2927 
2928 	for (r = 0; r < raidPtr->numSpare; r++) {
2929 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2930 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2931 		rf_close_component(raidPtr, vp, acd);
2932 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2933 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2934 	}
2935 }
2936 
2937 
2938 void
2939 rf_ReconThread(struct rf_recon_req *req)
2940 {
2941 	int     s;
2942 	RF_Raid_t *raidPtr;
2943 
2944 	s = splbio();
2945 	raidPtr = (RF_Raid_t *) req->raidPtr;
2946 	raidPtr->recon_in_progress = 1;
2947 
2948 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2949 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2950 
2951 	RF_Free(req, sizeof(*req));
2952 
2953 	raidPtr->recon_in_progress = 0;
2954 	splx(s);
2955 
2956 	/* That's all... */
2957 	kthread_exit(0);	/* does not return */
2958 }
2959 
2960 void
2961 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2962 {
2963 	int retcode;
2964 	int s;
2965 
2966 	raidPtr->parity_rewrite_stripes_done = 0;
2967 	raidPtr->parity_rewrite_in_progress = 1;
2968 	s = splbio();
2969 	retcode = rf_RewriteParity(raidPtr);
2970 	splx(s);
2971 	if (retcode) {
2972 		printf("raid%d: Error re-writing parity (%d)!\n",
2973 		    raidPtr->raidid, retcode);
2974 	} else {
2975 		/* set the clean bit!  If we shutdown correctly,
2976 		   the clean bit on each component label will get
2977 		   set */
2978 		raidPtr->parity_good = RF_RAID_CLEAN;
2979 	}
2980 	raidPtr->parity_rewrite_in_progress = 0;
2981 
2982 	/* Anyone waiting for us to stop?  If so, inform them... */
2983 	if (raidPtr->waitShutdown) {
2984 		wakeup(&raidPtr->parity_rewrite_in_progress);
2985 	}
2986 
2987 	/* That's all... */
2988 	kthread_exit(0);	/* does not return */
2989 }
2990 
2991 
2992 void
2993 rf_CopybackThread(RF_Raid_t *raidPtr)
2994 {
2995 	int s;
2996 
2997 	raidPtr->copyback_in_progress = 1;
2998 	s = splbio();
2999 	rf_CopybackReconstructedData(raidPtr);
3000 	splx(s);
3001 	raidPtr->copyback_in_progress = 0;
3002 
3003 	/* That's all... */
3004 	kthread_exit(0);	/* does not return */
3005 }
3006 
3007 
3008 void
3009 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
3010 {
3011 	int s;
3012 	RF_Raid_t *raidPtr;
3013 
3014 	s = splbio();
3015 	raidPtr = req->raidPtr;
3016 	raidPtr->recon_in_progress = 1;
3017 	rf_ReconstructInPlace(raidPtr, req->col);
3018 	RF_Free(req, sizeof(*req));
3019 	raidPtr->recon_in_progress = 0;
3020 	splx(s);
3021 
3022 	/* That's all... */
3023 	kthread_exit(0);	/* does not return */
3024 }
3025 
/*
 * Probe one candidate component: read its label and, if it looks
 * reasonable, prepend a new RF_AutoConfig_t to ac_list (ownership of vp
 * transfers to that entry).  Otherwise the vnode is closed and released.
 * Returns the (possibly new) list head, or NULL after freeing the whole
 * list on memory exhaustion.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
		/* out of memory: discard everything collected so far */
oomem:
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			/* the list entry takes over clabel and vp */
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: not a RAID component, so close it back up */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
3083 
/*
 * Scan every disk-class device in the system for RAIDframe components
 * and collect them into an RF_AutoConfig_t list for autoconfiguration.
 * Wedges (dk) are checked via their partition type; other disks are
 * checked for disklabel partitions marked FS_RAID, and if none is
 * found, the raw partition is probed as well.  Returns the (possibly
 * empty) list of discovered components.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		/* wedges are addressed by unit alone; other disks go
		   through their raw partition */
		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		/* FSILENT: don't log an error for devices that aren't there */
		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* a wedge is accepted only if its partition
			   type says RAIDframe */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* NOTE(review): rf_get_component() appears to take
			   over vp here (it is not released on this path) —
			   confirm against rf_get_component(). */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			/* component name, e.g. "wd0a" */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			/* raw partition name, e.g. "wd0d" */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
3265 
3266 
3267 int
3268 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3269 {
3270 
3271 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3272 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3273 	    ((clabel->clean == RF_RAID_CLEAN) ||
3274 	     (clabel->clean == RF_RAID_DIRTY)) &&
3275 	    clabel->row >=0 &&
3276 	    clabel->column >= 0 &&
3277 	    clabel->num_rows > 0 &&
3278 	    clabel->num_columns > 0 &&
3279 	    clabel->row < clabel->num_rows &&
3280 	    clabel->column < clabel->num_columns &&
3281 	    clabel->blockSize > 0 &&
3282 	    /*
3283 	     * numBlocksHi may contain garbage, but it is ok since
3284 	     * the type is unsigned.  If it is really garbage,
3285 	     * rf_fix_old_label_size() will fix it.
3286 	     */
3287 	    rf_component_label_numblocks(clabel) > 0) {
3288 		/*
3289 		 * label looks reasonable enough...
3290 		 * let's make sure it has no old garbage.
3291 		 */
3292 		if (numsecs)
3293 			rf_fix_old_label_size(clabel, numsecs);
3294 		return(1);
3295 	}
3296 	return(0);
3297 }
3298 
3299 
3300 /*
3301  * For reasons yet unknown, some old component labels have garbage in
3302  * the newer numBlocksHi region, and this causes lossage.  Since those
3303  * disks will also have numsecs set to less than 32 bits of sectors,
3304  * we can determine when this corruption has occurred, and fix it.
3305  *
3306  * The exact same problem, with the same unknown reason, happens to
3307  * the partitionSizeHi member as well.
3308  */
3309 static void
3310 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3311 {
3312 
3313 	if (numsecs < ((uint64_t)1 << 32)) {
3314 		if (clabel->numBlocksHi) {
3315 			printf("WARNING: total sectors < 32 bits, yet "
3316 			       "numBlocksHi set\n"
3317 			       "WARNING: resetting numBlocksHi to zero.\n");
3318 			clabel->numBlocksHi = 0;
3319 		}
3320 
3321 		if (clabel->partitionSizeHi) {
3322 			printf("WARNING: total sectors < 32 bits, yet "
3323 			       "partitionSizeHi set\n"
3324 			       "WARNING: resetting partitionSizeHi to zero.\n");
3325 			clabel->partitionSizeHi = 0;
3326 		}
3327 	}
3328 }
3329 
3330 
#ifdef DEBUG
/*
 * Pretty-print the contents of a component label to the console
 * (DEBUG kernels only).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* names for root_partition values 0..2; 3 is out of range */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	/* mask with 3 so a corrupt value indexes "*invalid*" safely */
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
3364 
3365 RF_ConfigSet_t *
3366 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3367 {
3368 	RF_AutoConfig_t *ac;
3369 	RF_ConfigSet_t *config_sets;
3370 	RF_ConfigSet_t *cset;
3371 	RF_AutoConfig_t *ac_next;
3372 
3373 
3374 	config_sets = NULL;
3375 
3376 	/* Go through the AutoConfig list, and figure out which components
3377 	   belong to what sets.  */
3378 	ac = ac_list;
3379 	while(ac!=NULL) {
3380 		/* we're going to putz with ac->next, so save it here
3381 		   for use at the end of the loop */
3382 		ac_next = ac->next;
3383 
3384 		if (config_sets == NULL) {
3385 			/* will need at least this one... */
3386 			config_sets = (RF_ConfigSet_t *)
3387 				malloc(sizeof(RF_ConfigSet_t),
3388 				       M_RAIDFRAME, M_NOWAIT);
3389 			if (config_sets == NULL) {
3390 				panic("rf_create_auto_sets: No memory!");
3391 			}
3392 			/* this one is easy :) */
3393 			config_sets->ac = ac;
3394 			config_sets->next = NULL;
3395 			config_sets->rootable = 0;
3396 			ac->next = NULL;
3397 		} else {
3398 			/* which set does this component fit into? */
3399 			cset = config_sets;
3400 			while(cset!=NULL) {
3401 				if (rf_does_it_fit(cset, ac)) {
3402 					/* looks like it matches... */
3403 					ac->next = cset->ac;
3404 					cset->ac = ac;
3405 					break;
3406 				}
3407 				cset = cset->next;
3408 			}
3409 			if (cset==NULL) {
3410 				/* didn't find a match above... new set..*/
3411 				cset = (RF_ConfigSet_t *)
3412 					malloc(sizeof(RF_ConfigSet_t),
3413 					       M_RAIDFRAME, M_NOWAIT);
3414 				if (cset == NULL) {
3415 					panic("rf_create_auto_sets: No memory!");
3416 				}
3417 				cset->ac = ac;
3418 				ac->next = NULL;
3419 				cset->next = config_sets;
3420 				cset->rootable = 0;
3421 				config_sets = cset;
3422 			}
3423 		}
3424 		ac = ac_next;
3425 	}
3426 
3427 
3428 	return(config_sets);
3429 }
3430 
3431 static int
3432 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3433 {
3434 	RF_ComponentLabel_t *clabel1, *clabel2;
3435 
3436 	/* If this one matches the *first* one in the set, that's good
3437 	   enough, since the other members of the set would have been
3438 	   through here too... */
3439 	/* note that we are not checking partitionSize here..
3440 
3441 	   Note that we are also not checking the mod_counters here.
3442 	   If everything else matches except the mod_counter, that's
3443 	   good enough for this test.  We will deal with the mod_counters
3444 	   a little later in the autoconfiguration process.
3445 
3446 	    (clabel1->mod_counter == clabel2->mod_counter) &&
3447 
3448 	   The reason we don't check for this is that failed disks
3449 	   will have lower modification counts.  If those disks are
3450 	   not added to the set they used to belong to, then they will
3451 	   form their own set, which may result in 2 different sets,
3452 	   for example, competing to be configured at raid0, and
3453 	   perhaps competing to be the root filesystem set.  If the
3454 	   wrong ones get configured, or both attempt to become /,
3455 	   weird behaviour and or serious lossage will occur.  Thus we
3456 	   need to bring them into the fold here, and kick them out at
3457 	   a later point.
3458 
3459 	*/
3460 
3461 	clabel1 = cset->ac->clabel;
3462 	clabel2 = ac->clabel;
3463 	if ((clabel1->version == clabel2->version) &&
3464 	    (clabel1->serial_number == clabel2->serial_number) &&
3465 	    (clabel1->num_rows == clabel2->num_rows) &&
3466 	    (clabel1->num_columns == clabel2->num_columns) &&
3467 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
3468 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3469 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3470 	    (clabel1->parityConfig == clabel2->parityConfig) &&
3471 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3472 	    (clabel1->blockSize == clabel2->blockSize) &&
3473 	    rf_component_label_numblocks(clabel1) ==
3474 	    rf_component_label_numblocks(clabel2) &&
3475 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
3476 	    (clabel1->root_partition == clabel2->root_partition) &&
3477 	    (clabel1->last_unit == clabel2->last_unit) &&
3478 	    (clabel1->config_order == clabel2->config_order)) {
3479 		/* if it get's here, it almost *has* to be a match */
3480 	} else {
3481 		/* it's not consistent with somebody in the set..
3482 		   punt */
3483 		return(0);
3484 	}
3485 	/* all was fine.. it must fit... */
3486 	return(1);
3487 }
3488 
/*
 * Decide whether a configuration set has enough live components (at
 * the newest mod_counter) to be configured.  Returns 1 if so, 0 if
 * too many components are missing.  RAID 1 gets special treatment:
 * components pair up (even, odd), and only the loss of both halves
 * of a pair is fatal.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* (the maximum over all members: stale components have lower
	   counters and must not be counted as live below) */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* for each column, look for a component that is both in that
	   column and up-to-date */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd half of a
				   mirror pair without bailing.. reset
				   the even_pair_failed flag, and go on
				   to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3591 
3592 void
3593 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3594 			RF_Raid_t *raidPtr)
3595 {
3596 	RF_ComponentLabel_t *clabel;
3597 	int i;
3598 
3599 	clabel = ac->clabel;
3600 
3601 	/* 1. Fill in the common stuff */
3602 	config->numRow = clabel->num_rows = 1;
3603 	config->numCol = clabel->num_columns;
3604 	config->numSpare = 0; /* XXX should this be set here? */
3605 	config->sectPerSU = clabel->sectPerSU;
3606 	config->SUsPerPU = clabel->SUsPerPU;
3607 	config->SUsPerRU = clabel->SUsPerRU;
3608 	config->parityConfig = clabel->parityConfig;
3609 	/* XXX... */
3610 	strcpy(config->diskQueueType,"fifo");
3611 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3612 	config->layoutSpecificSize = 0; /* XXX ?? */
3613 
3614 	while(ac!=NULL) {
3615 		/* row/col values will be in range due to the checks
3616 		   in reasonable_label() */
3617 		strcpy(config->devnames[0][ac->clabel->column],
3618 		       ac->devname);
3619 		ac = ac->next;
3620 	}
3621 
3622 	for(i=0;i<RF_MAXDBGV;i++) {
3623 		config->debugVars[i][0] = 0;
3624 	}
3625 }
3626 
3627 int
3628 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3629 {
3630 	RF_ComponentLabel_t *clabel;
3631 	int column;
3632 	int sparecol;
3633 
3634 	raidPtr->autoconfigure = new_value;
3635 
3636 	for(column=0; column<raidPtr->numCol; column++) {
3637 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3638 			clabel = raidget_component_label(raidPtr, column);
3639 			clabel->autoconfigure = new_value;
3640 			raidflush_component_label(raidPtr, column);
3641 		}
3642 	}
3643 	for(column = 0; column < raidPtr->numSpare ; column++) {
3644 		sparecol = raidPtr->numCol + column;
3645 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3646 			clabel = raidget_component_label(raidPtr, sparecol);
3647 			clabel->autoconfigure = new_value;
3648 			raidflush_component_label(raidPtr, sparecol);
3649 		}
3650 	}
3651 	return(new_value);
3652 }
3653 
3654 int
3655 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3656 {
3657 	RF_ComponentLabel_t *clabel;
3658 	int column;
3659 	int sparecol;
3660 
3661 	raidPtr->root_partition = new_value;
3662 	for(column=0; column<raidPtr->numCol; column++) {
3663 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3664 			clabel = raidget_component_label(raidPtr, column);
3665 			clabel->root_partition = new_value;
3666 			raidflush_component_label(raidPtr, column);
3667 		}
3668 	}
3669 	for(column = 0; column < raidPtr->numSpare ; column++) {
3670 		sparecol = raidPtr->numCol + column;
3671 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3672 			clabel = raidget_component_label(raidPtr, sparecol);
3673 			clabel->root_partition = new_value;
3674 			raidflush_component_label(raidPtr, sparecol);
3675 		}
3676 	}
3677 	return(new_value);
3678 }
3679 
3680 void
3681 rf_release_all_vps(RF_ConfigSet_t *cset)
3682 {
3683 	RF_AutoConfig_t *ac;
3684 
3685 	ac = cset->ac;
3686 	while(ac!=NULL) {
3687 		/* Close the vp, and give it back */
3688 		if (ac->vp) {
3689 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3690 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
3691 			vput(ac->vp);
3692 			ac->vp = NULL;
3693 		}
3694 		ac = ac->next;
3695 	}
3696 }
3697 
3698 
3699 void
3700 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3701 {
3702 	RF_AutoConfig_t *ac;
3703 	RF_AutoConfig_t *next_ac;
3704 
3705 	ac = cset->ac;
3706 	while(ac!=NULL) {
3707 		next_ac = ac->next;
3708 		/* nuke the label */
3709 		free(ac->clabel, M_RAIDFRAME);
3710 		/* cleanup the config structure */
3711 		free(ac, M_RAIDFRAME);
3712 		/* "next.." */
3713 		ac = next_ac;
3714 	}
3715 	/* and, finally, nuke the config set */
3716 	free(cset, M_RAIDFRAME);
3717 }
3718 
3719 
/*
 * Initialize the array-wide fields of a component label from the
 * current state of raidPtr.  Per-component fields (row, column,
 * partitionSize, ...) are not touched here.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3752 
/*
 * Autoconfigure a complete configuration set: build an RF_Config_t
 * from the component labels and configure the RAID set at the first
 * free unit at or above its last-configured unit number.  Returns the
 * softc of the configured set, or NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* NOTE(review): assumes raidget() never returns NULL (it appears
	   to allocate the softc on demand) — verify against raidget(). */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3826 
3827 void
3828 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3829 {
3830 	struct buf *bp;
3831 	struct raid_softc *rs;
3832 
3833 	bp = (struct buf *)desc->bp;
3834 	rs = desc->raidPtr->softc;
3835 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
3836 	    (bp->b_flags & B_READ));
3837 }
3838 
/*
 * Initialize a pool of 'size'-byte objects at IPL_BIO, pre-allocate
 * xmin items, and set the low/high watermarks to xmin/xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	/* never cache more than xmax idle items */
	pool_sethiwat(p, xmax);
	/* pre-allocate xmin items, and keep at least that many around */
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3848 
3849 /*
3850  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
3851  * if there is IO pending and if that IO could possibly be done for a
3852  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
3853  * otherwise.
3854  *
3855  */
3856 
3857 int
3858 rf_buf_queue_check(RF_Raid_t *raidPtr)
3859 {
3860 	struct raid_softc *rs = raidPtr->softc;
3861 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
3862 		/* there is work to do */
3863 		return 0;
3864 	}
3865 	/* default is nothing to do */
3866 	return 1;
3867 }
3868 
3869 int
3870 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3871 {
3872 	uint64_t numsecs;
3873 	unsigned secsize;
3874 	int error;
3875 
3876 	error = getdisksize(vp, &numsecs, &secsize);
3877 	if (error == 0) {
3878 		diskPtr->blockSize = secsize;
3879 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
3880 		diskPtr->partitionSize = numsecs;
3881 		return 0;
3882 	}
3883 	return error;
3884 }
3885 
/* Autoconfiguration match function: always matches. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3891 
/* Autoconfiguration attach function: nothing to do at attach time. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
3897 
3898 
3899 static int
3900 raid_detach(device_t self, int flags)
3901 {
3902 	int error;
3903 	struct raid_softc *rs = raidget(device_unit(self));
3904 
3905 	if (rs == NULL)
3906 		return ENXIO;
3907 
3908 	if ((error = raidlock(rs)) != 0)
3909 		return (error);
3910 
3911 	error = raid_detach_unlocked(rs);
3912 
3913 	raidunlock(rs);
3914 
3915 	/* XXXkd: raidput(rs) ??? */
3916 
3917 	return error;
3918 }
3919 
/*
 * Publish a synthetic disk geometry for the RAID set.  A RAID set has
 * no physical geometry; nsectors/ntracks are fabricated values.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
}
3934 
3935 /*
3936  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3937  * We end up returning whatever error was returned by the first cache flush
3938  * that fails.
3939  */
3940 
3941 int
3942 rf_sync_component_caches(RF_Raid_t *raidPtr)
3943 {
3944 	int c, sparecol;
3945 	int e,error;
3946 	int force = 1;
3947 
3948 	error = 0;
3949 	for (c = 0; c < raidPtr->numCol; c++) {
3950 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
3951 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3952 					  &force, FWRITE, NOCRED);
3953 			if (e) {
3954 				if (e != ENODEV)
3955 					printf("raid%d: cache flush to component %s failed.\n",
3956 					       raidPtr->raidid, raidPtr->Disks[c].devname);
3957 				if (error == 0) {
3958 					error = e;
3959 				}
3960 			}
3961 		}
3962 	}
3963 
3964 	for( c = 0; c < raidPtr->numSpare ; c++) {
3965 		sparecol = raidPtr->numCol + c;
3966 		/* Need to ensure that the reconstruct actually completed! */
3967 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3968 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3969 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
3970 			if (e) {
3971 				if (e != ENODEV)
3972 					printf("raid%d: cache flush to component %s failed.\n",
3973 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3974 				if (error == 0) {
3975 					error = e;
3976 				}
3977 			}
3978 		}
3979 	}
3980 	return error;
3981 }
3982