xref: /netbsd-src/sys/dev/raidframe/rf_netbsdkintf.c (revision 80d9064ac03cbb6a4174695f0d5b237c8766d3d0)
1 /*	$NetBSD: rf_netbsdkintf.c,v 1.312 2014/07/25 08:10:38 dholland Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Greg Oster; Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1988 University of Utah.
34  * Copyright (c) 1990, 1993
35  *      The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed to Berkeley by
38  * the Systems Programming Group of the University of Utah Computer
39  * Science Department.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
66  *
67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
68  */
69 
70 /*
71  * Copyright (c) 1995 Carnegie-Mellon University.
72  * All rights reserved.
73  *
74  * Authors: Mark Holland, Jim Zelenka
75  *
76  * Permission to use, copy, modify and distribute this software and
77  * its documentation is hereby granted, provided that both the copyright
78  * notice and this permission notice appear in all copies of the
79  * software, derivative works or modified versions, and any portions
80  * thereof, and that both notices appear in supporting documentation.
81  *
82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85  *
86  * Carnegie Mellon requests users of this software to return to
87  *
88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
89  *  School of Computer Science
90  *  Carnegie Mellon University
91  *  Pittsburgh PA 15213-3890
92  *
93  * any improvements or extensions that they make and grant Carnegie the
94  * rights to redistribute these changes.
95  */
96 
97 /***********************************************************
98  *
99  * rf_kintf.c -- the kernel interface routines for RAIDframe
100  *
101  ***********************************************************/
102 
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.312 2014/07/25 08:10:38 dholland Exp $");
105 
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #endif
110 
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 
130 #include <prop/proplib.h>
131 
132 #include <dev/raidframe/raidframevar.h>
133 #include <dev/raidframe/raidframeio.h>
134 #include <dev/raidframe/rf_paritymap.h>
135 
136 #include "rf_raid.h"
137 #include "rf_copyback.h"
138 #include "rf_dag.h"
139 #include "rf_dagflags.h"
140 #include "rf_desc.h"
141 #include "rf_diskqueue.h"
142 #include "rf_etimer.h"
143 #include "rf_general.h"
144 #include "rf_kintf.h"
145 #include "rf_options.h"
146 #include "rf_driver.h"
147 #include "rf_parityscan.h"
148 #include "rf_threadstuff.h"
149 
150 #ifdef COMPAT_50
151 #include "rf_compat50.h"
152 #endif
153 
154 #ifdef DEBUG
155 int     rf_kdebug_level = 0;
156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
157 #else				/* DEBUG */
158 #define db1_printf(a) { }
159 #endif				/* DEBUG */
160 
161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
162 static rf_declare_mutex2(rf_sparet_wait_mutex);
163 static rf_declare_cond2(rf_sparet_wait_cv);
164 static rf_declare_cond2(rf_sparet_resp_cv);
165 
166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
167 						 * spare table */
168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
169 						 * installation process */
170 #endif
171 
172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
173 
174 /* prototypes */
175 static void KernelWakeupFunc(struct buf *);
176 static void InitBP(struct buf *, struct vnode *, unsigned,
177     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
178     void *, int, struct proc *);
179 struct raid_softc;
180 static void raidinit(struct raid_softc *);
181 
182 void raidattach(int);
183 static int raid_match(device_t, cfdata_t, void *);
184 static void raid_attach(device_t, device_t, void *);
185 static int raid_detach(device_t, int);
186 
187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
188     daddr_t, daddr_t);
189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
190     daddr_t, daddr_t, int);
191 
192 static int raidwrite_component_label(unsigned,
193     dev_t, struct vnode *, RF_ComponentLabel_t *);
194 static int raidread_component_label(unsigned,
195     dev_t, struct vnode *, RF_ComponentLabel_t *);
196 
197 
198 dev_type_open(raidopen);
199 dev_type_close(raidclose);
200 dev_type_read(raidread);
201 dev_type_write(raidwrite);
202 dev_type_ioctl(raidioctl);
203 dev_type_strategy(raidstrategy);
204 dev_type_dump(raiddump);
205 dev_type_size(raidsize);
206 
/* Block device switch: entry points for the block raidN device nodes. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/* Character device switch: entry points for the raw rraidN device nodes. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/* Glue handed to the generic disk(9) framework (strategy + minphys). */
static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
234 
/*
 * Per-unit software state for a RAID device.  One of these is allocated
 * (by raidcreate()) for each raidN unit and linked onto the global
 * "raids" list, which is protected by raid_lock.
 */
struct raid_softc {
	device_t sc_dev;	/* autoconf device handle */
	int	sc_unit;	/* raid unit number (the N in raidN) */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
	RF_Raid_t sc_r;		/* RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* linkage on the global raids list */
};
/* sc_flags */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_WLABEL	0x02	/* label area is writable */
#define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
#define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
#define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
#define RAIDF_LOCKED	0x80	/* unit is locked */

/* Extract the raid unit number from a dev_t. */
#define	raidunit(x)	DISKUNIT(x)
256 
257 extern struct cfdriver raid_cd;
258 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
259     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
260     DVF_DETACH_SHUTDOWN);
261 
262 /*
263  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
264  * Be aware that large numbers can allow the driver to consume a lot of
265  * kernel memory, especially on writes, and in degraded mode reads.
266  *
267  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
268  * a single 64K write will typically require 64K for the old data,
269  * 64K for the old parity, and 64K for the new parity, for a total
270  * of 192K (if the parity buffer is not re-used immediately).
271  * Even it if is used immediately, that's still 128K, which when multiplied
272  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
273  *
274  * Now in degraded mode, for example, a 64K read on the above setup may
275  * require data reconstruction, which will require *all* of the 4 remaining
276  * disks to participate -- 4 * 32K/disk == 128K again.
277  */
278 
279 #ifndef RAIDOUTSTANDING
280 #define RAIDOUTSTANDING   6
281 #endif
282 
283 #define RAIDLABELDEV(dev)	\
284 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
285 
286 /* declared here, and made public, for the benefit of KVM stuff.. */
287 
288 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
289 				     struct disklabel *);
290 static void raidgetdisklabel(dev_t);
291 static void raidmakedisklabel(struct raid_softc *);
292 
293 static int raidlock(struct raid_softc *);
294 static void raidunlock(struct raid_softc *);
295 
296 static int raid_detach_unlocked(struct raid_softc *);
297 
298 static void rf_markalldirty(RF_Raid_t *);
299 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
300 
301 void rf_ReconThread(struct rf_recon_req *);
302 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
303 void rf_CopybackThread(RF_Raid_t *raidPtr);
304 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
305 int rf_autoconfig(device_t);
306 void rf_buildroothack(RF_ConfigSet_t *);
307 
308 RF_AutoConfig_t *rf_find_raid_components(void);
309 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
310 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
311 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
312 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
313 int rf_set_autoconfig(RF_Raid_t *, int);
314 int rf_set_rootpartition(RF_Raid_t *, int);
315 void rf_release_all_vps(RF_ConfigSet_t *);
316 void rf_cleanup_config_set(RF_ConfigSet_t *);
317 int rf_have_enough_components(RF_ConfigSet_t *);
318 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
319 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
320 
321 /*
322  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
323  * Note that this is overridden by having RAID_AUTOCONFIG as an option
324  * in the kernel config file.
325  */
326 #ifdef RAID_AUTOCONFIG
327 int raidautoconfig = 1;
328 #else
329 int raidautoconfig = 0;
330 #endif
331 static bool raidautoconfigdone = false;
332 
333 struct RF_Pools_s rf_pools;
334 
335 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
336 static kmutex_t raid_lock;
337 
338 static struct raid_softc *
339 raidcreate(int unit) {
340 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
341 	if (sc == NULL) {
342 #ifdef DIAGNOSTIC
343 		printf("%s: out of memory\n", __func__);
344 #endif
345 		return NULL;
346 	}
347 	sc->sc_unit = unit;
348 	bufq_alloc(&sc->buf_queue, "fcfs", BUFQ_SORT_RAWBLOCK);
349 	return sc;
350 }
351 
static void
raiddestroy(struct raid_softc *sc) {
	/* Inverse of raidcreate(): release the buffer queue, then the softc. */
	bufq_free(sc->buf_queue);
	kmem_free(sc, sizeof(*sc));
}
357 
358 static struct raid_softc *
359 raidget(int unit) {
360 	struct raid_softc *sc;
361 	if (unit < 0) {
362 #ifdef DIAGNOSTIC
363 		panic("%s: unit %d!", __func__, unit);
364 #endif
365 		return NULL;
366 	}
367 	mutex_enter(&raid_lock);
368 	LIST_FOREACH(sc, &raids, sc_link) {
369 		if (sc->sc_unit == unit) {
370 			mutex_exit(&raid_lock);
371 			return sc;
372 		}
373 	}
374 	mutex_exit(&raid_lock);
375 	if ((sc = raidcreate(unit)) == NULL)
376 		return NULL;
377 	mutex_enter(&raid_lock);
378 	LIST_INSERT_HEAD(&raids, sc, sc_link);
379 	mutex_exit(&raid_lock);
380 	return sc;
381 }
382 
static void
raidput(struct raid_softc *sc) {
	/* Unlink sc from the global raids list, then free it. */
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
390 
void
raidattach(int num)
{
	/*
	 * One-time driver initialization, called at boot.  "num" is
	 * unused here; raid units are created on demand by raidget().
	 */
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* This is where all the initialization stuff gets done. */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization for the spare-table installation handshake. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* Bring up the RAIDframe core; failure here is unrecoverable. */
	if (rf_BootRaidframe() == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
423 
424 int
425 rf_autoconfig(device_t self)
426 {
427 	RF_AutoConfig_t *ac_list;
428 	RF_ConfigSet_t *config_sets;
429 
430 	if (!raidautoconfig || raidautoconfigdone == true)
431 		return (0);
432 
433 	/* XXX This code can only be run once. */
434 	raidautoconfigdone = true;
435 
436 #ifdef __HAVE_CPU_BOOTCONF
437 	/*
438 	 * 0. find the boot device if needed first so we can use it later
439 	 * this needs to be done before we autoconfigure any raid sets,
440 	 * because if we use wedges we are not going to be able to open
441 	 * the boot device later
442 	 */
443 	if (booted_device == NULL)
444 		cpu_bootconf();
445 #endif
446 	/* 1. locate all RAID components on the system */
447 	aprint_debug("Searching for RAID components...\n");
448 	ac_list = rf_find_raid_components();
449 
450 	/* 2. Sort them into their respective sets. */
451 	config_sets = rf_create_auto_sets(ac_list);
452 
453 	/*
454 	 * 3. Evaluate each set and configure the valid ones.
455 	 * This gets done in rf_buildroothack().
456 	 */
457 	rf_buildroothack(config_sets);
458 
459 	return 1;
460 }
461 
462 static int
463 rf_containsboot(RF_Raid_t *r, device_t bdv) {
464 	const char *bootname = device_xname(bdv);
465 	size_t len = strlen(bootname);
466 
467 	for (int col = 0; col < r->numCol; col++) {
468 		const char *devname = r->Disks[col].devname;
469 		devname += sizeof("/dev/") - 1;
470 		if (strncmp(devname, "dk", 2) == 0) {
471 			const char *parent =
472 			    dkwedge_get_parent_name(r->Disks[col].dev);
473 			if (parent != NULL)
474 				devname = parent;
475 		}
476 		if (strncmp(devname, bootname, len) == 0) {
477 			struct raid_softc *sc = r->softc;
478 			aprint_debug("raid%d includes boot device %s\n",
479 			    sc->sc_unit, devname);
480 			return 1;
481 		}
482 	}
483 	return 0;
484 }
485 
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	/*
	 * Configure all eligible autoconfig sets, then decide whether one
	 * of them should become the root device (booted_device).
	 */
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;		/* how many configured sets claim rootability */
	struct raid_softc *sc, *rsc;	/* rsc: last rootable set seen */

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	/*
	 * Pass 1: configure every complete set marked for autoconfig;
	 * release resources of everything else.  Each cset is consumed
	 * (cleaned up) regardless of outcome.
	 */
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		/* Exactly one rootable set: promote it to root if it
		 * contains (or forces itself over) the boot device. */
		device_t candidate_root;
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume 'a' */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
		} else
			candidate_root = rsc->sc_dev;
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Pass 2: of the contending sets, keep only those that
		 * actually contain the boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
596 
597 
598 int
599 raidsize(dev_t dev)
600 {
601 	struct raid_softc *rs;
602 	struct disklabel *lp;
603 	int     part, unit, omask, size;
604 
605 	unit = raidunit(dev);
606 	if ((rs = raidget(unit)) == NULL)
607 		return -1;
608 	if ((rs->sc_flags & RAIDF_INITED) == 0)
609 		return (-1);
610 
611 	part = DISKPART(dev);
612 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
613 	lp = rs->sc_dkdev.dk_label;
614 
615 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
616 		return (-1);
617 
618 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
619 		size = -1;
620 	else
621 		size = lp->d_partitions[part].p_size *
622 		    (lp->d_secsize / DEV_BSIZE);
623 
624 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
625 		return (-1);
626 
627 	return (size);
628 
629 }
630 
631 int
632 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
633 {
634 	int     unit = raidunit(dev);
635 	struct raid_softc *rs;
636 	const struct bdevsw *bdev;
637 	struct disklabel *lp;
638 	RF_Raid_t *raidPtr;
639 	daddr_t offset;
640 	int     part, c, sparecol, j, scol, dumpto;
641 	int     error = 0;
642 
643 	if ((rs = raidget(unit)) == NULL)
644 		return ENXIO;
645 
646 	raidPtr = &rs->sc_r;
647 
648 	if ((rs->sc_flags & RAIDF_INITED) == 0)
649 		return ENXIO;
650 
651 	/* we only support dumping to RAID 1 sets */
652 	if (raidPtr->Layout.numDataCol != 1 ||
653 	    raidPtr->Layout.numParityCol != 1)
654 		return EINVAL;
655 
656 
657 	if ((error = raidlock(rs)) != 0)
658 		return error;
659 
660 	if (size % DEV_BSIZE != 0) {
661 		error = EINVAL;
662 		goto out;
663 	}
664 
665 	if (blkno + size / DEV_BSIZE > rs->sc_size) {
666 		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
667 		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
668 		    size / DEV_BSIZE, rs->sc_size);
669 		error = EINVAL;
670 		goto out;
671 	}
672 
673 	part = DISKPART(dev);
674 	lp = rs->sc_dkdev.dk_label;
675 	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
676 
677 	/* figure out what device is alive.. */
678 
679 	/*
680 	   Look for a component to dump to.  The preference for the
681 	   component to dump to is as follows:
682 	   1) the master
683 	   2) a used_spare of the master
684 	   3) the slave
685 	   4) a used_spare of the slave
686 	*/
687 
688 	dumpto = -1;
689 	for (c = 0; c < raidPtr->numCol; c++) {
690 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
691 			/* this might be the one */
692 			dumpto = c;
693 			break;
694 		}
695 	}
696 
697 	/*
698 	   At this point we have possibly selected a live master or a
699 	   live slave.  We now check to see if there is a spared
700 	   master (or a spared slave), if we didn't find a live master
701 	   or a live slave.
702 	*/
703 
704 	for (c = 0; c < raidPtr->numSpare; c++) {
705 		sparecol = raidPtr->numCol + c;
706 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
707 			/* How about this one? */
708 			scol = -1;
709 			for(j=0;j<raidPtr->numCol;j++) {
710 				if (raidPtr->Disks[j].spareCol == sparecol) {
711 					scol = j;
712 					break;
713 				}
714 			}
715 			if (scol == 0) {
716 				/*
717 				   We must have found a spared master!
718 				   We'll take that over anything else
719 				   found so far.  (We couldn't have
720 				   found a real master before, since
721 				   this is a used spare, and it's
722 				   saying that it's replacing the
723 				   master.)  On reboot (with
724 				   autoconfiguration turned on)
725 				   sparecol will become the 1st
726 				   component (component0) of this set.
727 				*/
728 				dumpto = sparecol;
729 				break;
730 			} else if (scol != -1) {
731 				/*
732 				   Must be a spared slave.  We'll dump
733 				   to that if we havn't found anything
734 				   else so far.
735 				*/
736 				if (dumpto == -1)
737 					dumpto = sparecol;
738 			}
739 		}
740 	}
741 
742 	if (dumpto == -1) {
743 		/* we couldn't find any live components to dump to!?!?
744 		 */
745 		error = EINVAL;
746 		goto out;
747 	}
748 
749 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
750 
751 	/*
752 	   Note that blkno is relative to this particular partition.
753 	   By adding the offset of this partition in the RAID
754 	   set, and also adding RF_PROTECTED_SECTORS, we get a
755 	   value that is relative to the partition used for the
756 	   underlying component.
757 	*/
758 
759 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
760 				blkno + offset, va, size);
761 
762 out:
763 	raidunlock(rs);
764 
765 	return error;
766 }
767 /* ARGSUSED */
/*
 * raidopen: open a partition of a raid unit.  Maintains the disk(9)
 * open masks so the unit cannot be unconfigured while open, and marks
 * components dirty on the first open of an initialized set.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* Refuse opens on a unit that is being torn down. */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of an initialized unit: (re)read the disklabel. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}


	/* Recompute the combined open mask from the char/block masks. */
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
851 /* ARGSUSED */
/*
 * raidclose: close a partition of a raid unit.  On the last close of
 * an initialized set, write final ("clean") component labels.
 */
int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	int     error = 0;
	int     part;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(&rs->sc_r,
						 RF_FINAL_COMPONENT_UPDATE);

		/* If the kernel is shutting down, it will detach
		 * this RAID set soon enough.
		 */
	}

	raidunlock(rs);
	return (0);

}
900 
/*
 * raidstrategy: queue a buffer for I/O on a raid unit.  After bounds
 * checking, the buf is placed on the unit's queue and the RAIDframe
 * I/O thread is woken via iodone_cv; on any error the buf is completed
 * immediately with b_error set.
 */
void
raidstrategy(struct buf *bp)
{
	unsigned int unit = raidunit(bp->b_dev);
	RF_Raid_t *raidPtr;
	int     wlabel;
	struct raid_softc *rs;

	if ((rs = raidget(unit)) == NULL) {
		bp->b_error = ENXIO;
		goto done;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto done;
	}
	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		bp->b_error = ENODEV;
		goto done;
	}
	/* Zero-length transfers complete immediately with no error. */
	if (bp->b_bcount == 0) {
		db1_printf(("b_bcount is zero..\n"));
		goto done;
	}

	/*
	 * Do bounds checking and adjust transfer.  If there's an
	 * error, the bounds check will flag that for us.
	 */

	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
	if (DISKPART(bp->b_dev) == RAW_PART) {
		uint64_t size; /* device size in DEV_BSIZE unit */

		/* Convert totalSectors (native sector size) to DEV_BSIZE units. */
		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
			size = raidPtr->totalSectors <<
			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
		} else {
			size = raidPtr->totalSectors >>
			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
		}
		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
			goto done;
		}
	} else {
		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
			db1_printf(("Bounds check failed!!:%d %d\n",
				(int) bp->b_blkno, (int) wlabel));
			goto done;
		}
	}

	rf_lock_mutex2(raidPtr->iodone_lock);

	bp->b_resid = 0;

	/* stuff it onto our queue */
	bufq_put(rs->buf_queue, bp);

	/* scheduled the IO to happen at the next convenient time */
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);

	return;

done:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
971 /* ARGSUSED */
972 int
973 raidread(dev_t dev, struct uio *uio, int flags)
974 {
975 	int     unit = raidunit(dev);
976 	struct raid_softc *rs;
977 
978 	if ((rs = raidget(unit)) == NULL)
979 		return ENXIO;
980 
981 	if ((rs->sc_flags & RAIDF_INITED) == 0)
982 		return (ENXIO);
983 
984 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
985 
986 }
987 /* ARGSUSED */
988 int
989 raidwrite(dev_t dev, struct uio *uio, int flags)
990 {
991 	int     unit = raidunit(dev);
992 	struct raid_softc *rs;
993 
994 	if ((rs = raidget(unit)) == NULL)
995 		return ENXIO;
996 
997 	if ((rs->sc_flags & RAIDF_INITED) == 0)
998 		return (ENXIO);
999 
1000 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
1001 
1002 }
1003 
/*
 * Tear down a raid unit: shut down RAIDframe (if initialized) and
 * detach/destroy the generic disk.  Returns EBUSY if any partition
 * is still open, or the error from rf_Shutdown().
 * NOTE(review): the "_unlocked" suffix suggests the caller holds the
 * unit lock (raidlock) around this call -- confirm at call sites.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	int error;
	RF_Raid_t *raidPtr;

	raidPtr = &rs->sc_r;

	/*
	 * If somebody has a partition mounted, we shouldn't
	 * shutdown.
	 */
	if (rs->sc_dkdev.dk_openmask != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		;	/* not initialized: nothing to do */
	else if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;
	else
		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);

	/* Detach the disk. */
	dkwedge_delall(&rs->sc_dkdev);
	disk_detach(&rs->sc_dkdev);
	disk_destroy(&rs->sc_dkdev);

	aprint_normal_dev(rs->sc_dev, "detached\n");

	return 0;
}
1035 
/*
 * raidioctl: ioctl entry point for the raid device.  Handles both the
 * RAIDframe-specific commands (RAIDFRAME_*) and the standard disk
 * ioctls (DIOC*), which fall through to the generic code at the end.
 * Returns 0 on success or an errno value.
 */
int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int     unit = raidunit(dev);
	int     error = 0;
	int     part, pmask, s;
	cfdata_t cf;
	struct raid_softc *rs;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, **ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
/*	int raidid; */
	struct rf_recon_req *rrcopy, *rr;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_ComponentLabel_t **clabel_ptr;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	RF_SingleComponent_t component;
	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
	int i, j, d;
#ifdef __HAVE_OLD_DISKLABEL
	struct disklabel newlabel;
#endif
	struct dkwedge_info *dkw;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
		(int) DISKPART(dev), (int) unit, cmd));

	/* Must be open for writes for these commands... */
	switch (cmd) {
#ifdef DIOCGSECTORSIZE
	case DIOCGSECTORSIZE:
		*(u_int *)data = raidPtr->bytesPerSector;
		return 0;
	case DIOCGMEDIASIZE:
		*(off_t *)data =
		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
		return 0;
#endif
	case DIOCSDINFO:
	case DIOCWDINFO:
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCWDINFO:
	case ODIOCSDINFO:
#endif
	case DIOCWLABEL:
	case DIOCAWEDGE:
	case DIOCDWEDGE:
	case DIOCSSTRATEGY:
		if ((flag & FWRITE) == 0)
			return (EBADF);
	}

	/* Must be initialized for these... */
	switch (cmd) {
	case DIOCGDINFO:
	case DIOCSDINFO:
	case DIOCWDINFO:
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCGDINFO:
	case ODIOCWDINFO:
	case ODIOCSDINFO:
	case ODIOCGDEFLABEL:
#endif
	case DIOCGPART:
	case DIOCWLABEL:
	case DIOCGDEFLABEL:
	case DIOCAWEDGE:
	case DIOCDWEDGE:
	case DIOCLWEDGES:
	case DIOCCACHESYNC:
	case RAIDFRAME_SHUTDOWN:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_ROOT:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case DIOCGSTRATEGY:
	case DIOCSSTRATEGY:
		if ((rs->sc_flags & RAIDF_INITED) == 0)
			return (ENXIO);
	}

	/* Main dispatch for the RAIDframe-specific commands. */
	switch (cmd) {
#ifdef COMPAT_50
	case RAIDFRAME_GET_INFO50:
		return rf_get_info50(raidPtr, data);

	case RAIDFRAME_CONFIGURE50:
		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
			return retcode;
		goto config;
#endif
		/* configure the system */
	case RAIDFRAME_CONFIGURE:

		if (raidPtr->valid) {
			/* There is a valid RAID set running on this unit! */
			printf("raid%d: Device already configured!\n",unit);
			return(EINVAL);
		}

		/* copy-in the configuration information */
		/* data points to a pointer to the configuration structure */

		u_cfg = *((RF_Config_t **) data);
		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
		if (k_cfg == NULL) {
			return (ENOMEM);
		}
		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
		if (retcode) {
			RF_Free(k_cfg, sizeof(RF_Config_t));
			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
				retcode));
			return (retcode);
		}
		goto config;
	config:
		/* allocate a buffer for the layout-specific data, and copy it
		 * in */
		if (k_cfg->layoutSpecificSize) {
			if (k_cfg->layoutSpecificSize > 10000) {
				/* sanity check */
				RF_Free(k_cfg, sizeof(RF_Config_t));
				return (EINVAL);
			}
			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
			    (u_char *));
			if (specific_buf == NULL) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				return (ENOMEM);
			}
			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
			    k_cfg->layoutSpecificSize);
			if (retcode) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				RF_Free(specific_buf,
					k_cfg->layoutSpecificSize);
				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
					retcode));
				return (retcode);
			}
		} else
			specific_buf = NULL;
		k_cfg->layoutSpecific = specific_buf;

		/* should do some kind of sanity check on the configuration.
		 * Store the sum of all the bytes in the last byte? */

		/* configure the system */

		/*
		 * Clear the entire RAID descriptor, just to make sure
		 *  there is no stale data left in the case of a
		 *  reconfiguration
		 */
		memset(raidPtr, 0, sizeof(*raidPtr));
		raidPtr->softc = rs;
		raidPtr->raidid = unit;

		retcode = rf_Configure(raidPtr, k_cfg, NULL);

		if (retcode == 0) {

			/* allow this many simultaneous IO's to
			   this RAID device */
			raidPtr->openings = RAIDOUTSTANDING;

			raidinit(rs);
			rf_markalldirty(raidPtr);
		}
		/* free the buffers.  No return code here. */
		if (k_cfg->layoutSpecificSize) {
			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		}
		RF_Free(k_cfg, sizeof(RF_Config_t));

		return (retcode);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((error = raidlock(rs)) != 0)
			return (error);

		/* Only the partition used for this ioctl may be open. */
		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
			(rs->sc_dkdev.dk_copenmask & pmask)))
			retcode = EBUSY;
		else {
			rs->sc_flags |= RAIDF_SHUTDOWN;
			rs->sc_dkdev.dk_copenmask &= ~pmask;
			rs->sc_dkdev.dk_bopenmask &= ~pmask;
			rs->sc_dkdev.dk_openmask &= ~pmask;
			retcode = 0;
		}

		raidunlock(rs);

		if (retcode != 0)
			return retcode;

		/* free the pseudo device attach bits */

		cf = device_cfdata(rs->sc_dev);
		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
			free(cf, M_RAIDFRAME);

		return (retcode);
	case RAIDFRAME_GET_COMPONENT_LABEL:
		clabel_ptr = (RF_ComponentLabel_t **) data;
		/* need to read the component label for the disk indicated
		   by row,column in clabel */

		/*
		 * Perhaps there should be an option to skip the in-core
		 * copy and hit the disk, as with disklabel(8).
		 */
		/*
		 * NOTE(review): RF_Malloc result is not NULL-checked here
		 * before the copyin, unlike the RAIDFRAME_CONFIGURE path —
		 * confirm RF_Malloc cannot return NULL in this context.
		 */
		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));

		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));

		if (retcode) {
			RF_Free(clabel, sizeof(*clabel));
			return retcode;
		}

		clabel->row = 0; /* Don't allow looking at anything else.*/

		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol +
		    raidPtr->numSpare)) {
			RF_Free(clabel, sizeof(*clabel));
			return EINVAL;
		}

		/* The user copy was only needed to pick the column. */
		RF_Free(clabel, sizeof(*clabel));

		clabel = raidget_component_label(raidPtr, column);

		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));

#if 0
	case RAIDFRAME_SET_COMPONENT_LABEL:
		clabel = (RF_ComponentLabel_t *) data;

		/* XXX check the label for valid stuff... */
		/* Note that some things *should not* get modified --
		   the user should be re-initing the labels instead of
		   trying to patch things.
		   */

		raidid = raidPtr->raidid;
#ifdef DEBUG
		printf("raid%d: Got component label:\n", raidid);
		printf("raid%d: Version: %d\n", raidid, clabel->version);
		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
		printf("raid%d: Column: %d\n", raidid, clabel->column);
		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
		printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif
		clabel->row = 0;
		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* XXX this isn't allowed to do anything for now :-) */

		/* XXX and before it is, we need to fill in the rest
		   of the fields!?!?!?! */
		memcpy(raidget_component_label(raidPtr, column),
		    clabel, sizeof(*clabel));
		raidflush_component_label(raidPtr, column);
		return (0);
#endif

	case RAIDFRAME_INIT_LABELS:
		clabel = (RF_ComponentLabel_t *) data;
		/*
		   we only want the serial number from
		   the above.  We get all the rest of the information
		   from the config that was used to create this RAID
		   set.
		   */

		raidPtr->serial_number = clabel->serial_number;

		for(column=0;column<raidPtr->numCol;column++) {
			diskPtr = &raidPtr->Disks[column];
			if (!RF_DEAD_DISK(diskPtr->status)) {
				ci_label = raidget_component_label(raidPtr,
				    column);
				/* Zeroing this is important. */
				memset(ci_label, 0, sizeof(*ci_label));
				raid_init_component_label(raidPtr, ci_label);
				ci_label->serial_number =
				    raidPtr->serial_number;
				ci_label->row = 0; /* we dont' pretend to support more */
				rf_component_label_set_partitionsize(ci_label,
				    diskPtr->partitionSize);
				ci_label->column = column;
				raidflush_component_label(raidPtr, column);
			}
			/* XXXjld what about the spares? */
		}

		return (retcode);
	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return(0);
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
					   rf_RewriteParityThread,
					   raidPtr,"raid_parity");
		return (retcode);


	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
		retcode = rf_add_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* Not implemented; returns the initial retcode (0). */
		return(retcode);

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_delete_component(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_incorporate_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REBUILD_IN_PLACE:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->recon_in_progress == 1) {
			/* a reconstruct is already in progress! */
			return(EINVAL);
		}

		componentPtr = (RF_SingleComponent_t *) data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		component.row = 0; /* we don't support any more */
		column = component.column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		rf_lock_mutex2(raidPtr->mutex);
		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant!!! */
			/* some component other than this has failed.
			   Let's not make things worse than they already
			   are... */
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:     Col: %d   Too many failures.\n",
			       raidPtr->raidid, column);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status ==
		    rf_ds_reconstructing) {
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);

			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status == rf_ds_spared) {
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);

		rrcopy->raidPtr = (void *) raidPtr;
		rrcopy->col = column;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconstructInPlaceThread,
					   rrcopy,"raid_reconip");
		return(retcode);

	case RAIDFRAME_GET_INFO:
		if (!raidPtr->valid)
			return (ENODEV);
		ucfgp = (RF_DeviceConfig_t **) data;
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return (ENOMEM);
		d_cfg->rows = 1; /* there is only 1 row now */
		d_cfg->cols = raidPtr->numCol;
		d_cfg->ndevs = raidPtr->numCol;
		if (d_cfg->ndevs >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->nspares = raidPtr->numSpare;
		if (d_cfg->nspares >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
		d = 0;
		for (j = 0; j < d_cfg->cols; j++) {
			d_cfg->devs[d] = raidPtr->Disks[j];
			d++;
		}
		/* Spares live in Disks[] just past the data columns. */
		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
			d_cfg->spares[i] = raidPtr->Disks[j];
		}
		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

		return (retcode);

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return (0);

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map,
		    (struct rf_pmstat *)data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
			(struct rf_pmparams *)data, 1))
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return (0);

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return (0);

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return (0);

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return (0);

		/* fail a disk & optionally start reconstruction */
	case RAIDFRAME_FAIL_DISK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		rr = (struct rf_recon_req *) data;
		rr->row = 0;
		if (rr->col < 0 || rr->col >= raidPtr->numCol)
			return (EINVAL);


		rf_lock_mutex2(raidPtr->mutex);
		if (raidPtr->status == rf_rs_reconstructing) {
			/* you can't fail a disk while we're reconstructing! */
			/* XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if ((raidPtr->Disks[rr->col].status ==
		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
			/* some other component has failed.  Let's not make
			   things worse. XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
			/* Can't fail a spared disk! */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* make a copy of the recon request so that we don't rely on
		 * the user's buffer */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);
		memcpy(rrcopy, rr, sizeof(*rr));
		rrcopy->raidPtr = (void *) raidPtr;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconThread,
					   rrcopy,"raid_recon");
		/*
		 * NOTE(review): retcode from RF_CREATE_THREAD is discarded
		 * here, unlike the other thread-spawning cases which return
		 * it — confirm whether this is intentional.
		 */
		return (0);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
					   rf_CopybackThread,
					   raidPtr,"raid_copyback");
		return (retcode);

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->status != rf_rs_reconstructing)
			*(int *) data = 100;
		else {
			if (raidPtr->reconControl->numRUsTotal > 0) {
				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			} else {
				*(int *) data = 0;
			}
		}
		return (0);
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->status != rf_rs_reconstructing) {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		} else {
			progressInfo.total =
				raidPtr->reconControl->numRUsTotal;
			progressInfo.completed =
				raidPtr->reconControl->numRUsComplete;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->parity_rewrite_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->parity_rewrite_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->copyback_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->copyback_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table     -- XXX */
#if 0
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return (0);

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the dameon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		/*
		 * NOTE(review): rf_broadcast_conf2 looks like a typo for
		 * rf_broadcast_cond2 (cf. the SEND_SPARET case below); this
		 * code is dead under #if 0 so it has never been compiled.
		 */
		rf_broadcast_conf2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return (0);

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return (retcode);
#endif

	default:
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
	if (error != EPASSTHROUGH)
		return (error);

	switch (cmd) {
	case DIOCGDINFO:
		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
		break;
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCGDINFO:
		newlabel = *(rs->sc_dkdev.dk_label);
		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
			return ENOTTY;
		memcpy(data, &newlabel, sizeof (struct olddisklabel));
		break;
#endif

	case DIOCGPART:
		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
		((struct partinfo *) data)->part =
		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
		break;

	case DIOCWDINFO:
	case DIOCSDINFO:
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCWDINFO:
	case ODIOCSDINFO:
#endif
	{
		struct disklabel *lp;
#ifdef __HAVE_OLD_DISKLABEL
		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
			memset(&newlabel, 0, sizeof newlabel);
			memcpy(&newlabel, data, sizeof (struct olddisklabel));
			lp = &newlabel;
		} else
#endif
		lp = (struct disklabel *)data;

		if ((error = raidlock(rs)) != 0)
			return (error);

		rs->sc_flags |= RAIDF_LABELLING;

		error = setdisklabel(rs->sc_dkdev.dk_label,
		    lp, 0, rs->sc_dkdev.dk_cpulabel);
		if (error == 0) {
			if (cmd == DIOCWDINFO
#ifdef __HAVE_OLD_DISKLABEL
			    || cmd == ODIOCWDINFO
#endif
			   )
				error = writedisklabel(RAIDLABELDEV(dev),
				    raidstrategy, rs->sc_dkdev.dk_label,
				    rs->sc_dkdev.dk_cpulabel);
		}
		rs->sc_flags &= ~RAIDF_LABELLING;

		raidunlock(rs);

		if (error)
			return (error);
		break;
	}

	case DIOCWLABEL:
		if (*(int *) data != 0)
			rs->sc_flags |= RAIDF_WLABEL;
		else
			rs->sc_flags &= ~RAIDF_WLABEL;
		break;

	case DIOCGDEFLABEL:
		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
		break;

#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCGDEFLABEL:
		raidgetdefaultlabel(raidPtr, rs, &newlabel);
		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
			return ENOTTY;
		memcpy(data, &newlabel, sizeof (struct olddisklabel));
		break;
#endif

	case DIOCAWEDGE:
	case DIOCDWEDGE:
	    	dkw = (void *)data;

		/* If the ioctl happens here, the parent is us. */
		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);

	case DIOCLWEDGES:
		return dkwedge_list(&rs->sc_dkdev,
		    (struct dkwedge_list *)data, l);
	case DIOCCACHESYNC:
		return rf_sync_component_caches(raidPtr);

	case DIOCGSTRATEGY:
	    {
		struct disk_strategy *dks = (void *)data;

		s = splbio();
		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
		    sizeof(dks->dks_name));
		splx(s);
		dks->dks_paramlen = 0;

		return 0;
	    }

	case DIOCSSTRATEGY:
	    {
		struct disk_strategy *dks = (void *)data;
		struct bufq_state *new;
		struct bufq_state *old;

		if (dks->dks_param != NULL) {
			return EINVAL;
		}
		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
		error = bufq_alloc(&new, dks->dks_name,
		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
		if (error) {
			return error;
		}
		/* Swap in the new queue atomically w.r.t. I/O (splbio). */
		s = splbio();
		old = rs->buf_queue;
		bufq_move(new, old);
		rs->buf_queue = new;
		splx(s);
		bufq_free(old);

		return 0;
	    }

	default:
		retcode = ENOTTY;
	}
	return (retcode);

}
1967 
1968 
1969 /* raidinit -- complete the rest of the initialization for the
1970    RAIDframe device.  */
1971 
1972 
/*
 * raidinit: complete the rest of the initialization for the RAIDframe
 * device: attach the pseudo-device, set up the disk(9) structures,
 * and discover wedges.  On attach failure RAIDF_INITED is cleared
 * again and the function returns without attaching the disk.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	int     unit;
	RF_Raid_t *raidPtr = &rs->sc_r;

	unit = raidPtr->raidid;


	/* XXX should check return code first... */
	rs->sc_flags |= RAIDF_INITED;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	/* cf is freed by the SHUTDOWN path after config_detach(). */
	rs->sc_dev = config_attach_pseudo(cf);

	if (rs->sc_dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		rs->sc_flags &= ~RAIDF_INITED;
		free(cf, M_RAIDFRAME);
		return;
	}

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */

	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
	disk_attach(&rs->sc_dkdev);
	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	dkwedge_discover(&rs->sc_dkdev);

	rf_set_geometry(rs, raidPtr);

}
2024 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
2025 /* wake up the daemon & tell it to get us a spare table
2026  * XXX
2027  * the entries in the queues should be tagged with the raidPtr
2028  * so that in the extremely rare case that two recons happen at once,
2029  * we know for which device were requesting a spare table
2030  * XXX
2031  *
2032  * XXX This code is not currently used. GO
2033  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/*
	 * Post the request on the wait queue and wake the daemon, then
	 * block until a response appears on the response queue.  The
	 * daemon's installation status is returned in req->fcol.
	 */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the response; note this is a different object than the
	 * request we queued above. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
2057 #endif
2058 
/* a wrapper around rf_DoAccess that extracts appropriate info from the
 * bp & passes it down.
 * any calls originating in the kernel must use non-blocking I/O
 * do some extra sanity checking to return "appropriate" error values for
 * certain conditions (to make some standard utilities work)
 *
 * Called with raidPtr->mutex NOT held; the mutex is taken and dropped
 * internally.  Drains the softc's buf queue while openings remain,
 * handing each buf to RAIDframe via rf_DoAccess().
 *
 * Formerly known as: rf_DoAccessKernel
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* component labels are rewritten without the raid mutex
		 * held; retake it to adjust the failure counter */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* loop invariant: mutex is held at the top of each iteration */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			/* note: mutex is already dropped here */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		/* convert from DEV_BSIZE units to RAID sectors */
		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		/* pb accounts for a trailing partial sector, if any */
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		/* NOTE(review): the "1 ||" makes this debug output
		 * unconditional (db1_printf may still compile away). */
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* Reject requests past the end of the set; the "sum <"
		 * comparisons catch arithmetic wrap-around. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* reject transfers that are not sector-aligned in length */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* consume one opening for this request */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* dispatch failed: fail the buf immediately */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
2184 
2185 
2186 
2187 
/* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */

int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* complete the NOP immediately via the normal callback */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		/* start timing the physical I/O for tracing */
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up the buf for the component device; completion
		 * comes back through KernelWakeupFunc() */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* this is the callback function associated with a I/O invoked from
   kernel code.

   Runs in biodone context: records tracing info, marks the component
   failed on I/O error (if the set can tolerate it), and hands the
   request to the raidio thread via the iodone queue.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the originating request was stashed in b_private by InitBP() */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* account the elapsed physical I/O time for tracing */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error.  Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is noticed later by raidstart(),
			 * which triggers a component-label update */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2331 
2332 
2333 /*
2334  * initialize a buf structure for doing an I/O in the kernel.
2335  */
2336 static void
2337 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2338        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2339        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2340        struct proc *b_proc)
2341 {
2342 	/* bp->b_flags       = B_PHYS | rw_flag; */
2343 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
2344 	bp->b_oflags = 0;
2345 	bp->b_cflags = 0;
2346 	bp->b_bcount = numSect << logBytesPerSector;
2347 	bp->b_bufsize = bp->b_bcount;
2348 	bp->b_error = 0;
2349 	bp->b_dev = dev;
2350 	bp->b_data = bf;
2351 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2352 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
2353 	if (bp->b_bcount == 0) {
2354 		panic("bp->b_bcount is zero in InitBP!!");
2355 	}
2356 	bp->b_proc = b_proc;
2357 	bp->b_iodone = cbFunc;
2358 	bp->b_private = cbArg;
2359 }
2360 
2361 static void
2362 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2363 		    struct disklabel *lp)
2364 {
2365 	memset(lp, 0, sizeof(*lp));
2366 
2367 	/* fabricate a label... */
2368 	lp->d_secperunit = raidPtr->totalSectors;
2369 	lp->d_secsize = raidPtr->bytesPerSector;
2370 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2371 	lp->d_ntracks = 4 * raidPtr->numCol;
2372 	lp->d_ncylinders = raidPtr->totalSectors /
2373 		(lp->d_nsectors * lp->d_ntracks);
2374 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2375 
2376 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2377 	lp->d_type = DTYPE_RAID;
2378 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2379 	lp->d_rpm = 3600;
2380 	lp->d_interleave = 1;
2381 	lp->d_flags = 0;
2382 
2383 	lp->d_partitions[RAW_PART].p_offset = 0;
2384 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2385 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2386 	lp->d_npartitions = RAW_PART + 1;
2387 
2388 	lp->d_magic = DISKMAGIC;
2389 	lp->d_magic2 = DISKMAGIC;
2390 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2391 
2392 }
2393 /*
2394  * Read the disklabel from the raid device.  If one is not present, fake one
2395  * up.
2396  */
2397 static void
2398 raidgetdisklabel(dev_t dev)
2399 {
2400 	int     unit = raidunit(dev);
2401 	struct raid_softc *rs;
2402 	const char   *errstring;
2403 	struct disklabel *lp;
2404 	struct cpu_disklabel *clp;
2405 	RF_Raid_t *raidPtr;
2406 
2407 	if ((rs = raidget(unit)) == NULL)
2408 		return;
2409 
2410 	lp = rs->sc_dkdev.dk_label;
2411 	clp = rs->sc_dkdev.dk_cpulabel;
2412 
2413 	db1_printf(("Getting the disklabel...\n"));
2414 
2415 	memset(clp, 0, sizeof(*clp));
2416 
2417 	raidPtr = &rs->sc_r;
2418 
2419 	raidgetdefaultlabel(raidPtr, rs, lp);
2420 
2421 	/*
2422 	 * Call the generic disklabel extraction routine.
2423 	 */
2424 	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
2425 	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
2426 	if (errstring)
2427 		raidmakedisklabel(rs);
2428 	else {
2429 		int     i;
2430 		struct partition *pp;
2431 
2432 		/*
2433 		 * Sanity check whether the found disklabel is valid.
2434 		 *
2435 		 * This is necessary since total size of the raid device
2436 		 * may vary when an interleave is changed even though exactly
2437 		 * same components are used, and old disklabel may used
2438 		 * if that is found.
2439 		 */
2440 		if (lp->d_secperunit != rs->sc_size)
2441 			printf("raid%d: WARNING: %s: "
2442 			    "total sector size in disklabel (%" PRIu32 ") != "
2443 			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
2444 			    lp->d_secperunit, rs->sc_size);
2445 		for (i = 0; i < lp->d_npartitions; i++) {
2446 			pp = &lp->d_partitions[i];
2447 			if (pp->p_offset + pp->p_size > rs->sc_size)
2448 				printf("raid%d: WARNING: %s: end of partition `%c' "
2449 				       "exceeds the size of raid (%" PRIu64 ")\n",
2450 				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
2451 		}
2452 	}
2453 
2454 }
2455 /*
2456  * Take care of things one might want to take care of in the event
2457  * that a disklabel isn't present.
2458  */
2459 static void
2460 raidmakedisklabel(struct raid_softc *rs)
2461 {
2462 	struct disklabel *lp = rs->sc_dkdev.dk_label;
2463 	db1_printf(("Making a label..\n"));
2464 
2465 	/*
2466 	 * For historical reasons, if there's no disklabel present
2467 	 * the raw partition must be marked FS_BSDFFS.
2468 	 */
2469 
2470 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2471 
2472 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2473 
2474 	lp->d_checksum = dkcksum(lp);
2475 }
2476 /*
2477  * Wait interruptibly for an exclusive lock.
2478  *
2479  * XXX
2480  * Several drivers do this; it should be abstracted and made MP-safe.
2481  * (Hmm... where have we seen this warning before :->  GO )
2482  */
2483 static int
2484 raidlock(struct raid_softc *rs)
2485 {
2486 	int     error;
2487 
2488 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2489 		rs->sc_flags |= RAIDF_WANTED;
2490 		if ((error =
2491 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2492 			return (error);
2493 	}
2494 	rs->sc_flags |= RAIDF_LOCKED;
2495 	return (0);
2496 }
2497 /*
2498  * Unlock and wake up any waiters.
2499  */
2500 static void
2501 raidunlock(struct raid_softc *rs)
2502 {
2503 
2504 	rs->sc_flags &= ~RAIDF_LOCKED;
2505 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2506 		rs->sc_flags &= ~RAIDF_WANTED;
2507 		wakeup(rs);
2508 	}
2509 }
2510 
2511 
2512 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
2513 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
2514 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
2515 
2516 static daddr_t
2517 rf_component_info_offset(void)
2518 {
2519 
2520 	return RF_COMPONENT_INFO_OFFSET;
2521 }
2522 
2523 static daddr_t
2524 rf_component_info_size(unsigned secsize)
2525 {
2526 	daddr_t info_size;
2527 
2528 	KASSERT(secsize);
2529 	if (secsize > RF_COMPONENT_INFO_SIZE)
2530 		info_size = secsize;
2531 	else
2532 		info_size = RF_COMPONENT_INFO_SIZE;
2533 
2534 	return info_size;
2535 }
2536 
2537 static daddr_t
2538 rf_parity_map_offset(RF_Raid_t *raidPtr)
2539 {
2540 	daddr_t map_offset;
2541 
2542 	KASSERT(raidPtr->bytesPerSector);
2543 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2544 		map_offset = raidPtr->bytesPerSector;
2545 	else
2546 		map_offset = RF_COMPONENT_INFO_SIZE;
2547 	map_offset += rf_component_info_offset();
2548 
2549 	return map_offset;
2550 }
2551 
2552 static daddr_t
2553 rf_parity_map_size(RF_Raid_t *raidPtr)
2554 {
2555 	daddr_t map_size;
2556 
2557 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2558 		map_size = raidPtr->bytesPerSector;
2559 	else
2560 		map_size = RF_PARITY_MAP_SIZE;
2561 
2562 	return map_size;
2563 }
2564 
2565 int
2566 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2567 {
2568 	RF_ComponentLabel_t *clabel;
2569 
2570 	clabel = raidget_component_label(raidPtr, col);
2571 	clabel->clean = RF_RAID_CLEAN;
2572 	raidflush_component_label(raidPtr, col);
2573 	return(0);
2574 }
2575 
2576 
2577 int
2578 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2579 {
2580 	RF_ComponentLabel_t *clabel;
2581 
2582 	clabel = raidget_component_label(raidPtr, col);
2583 	clabel->clean = RF_RAID_DIRTY;
2584 	raidflush_component_label(raidPtr, col);
2585 	return(0);
2586 }
2587 
/*
 * Read column `col's component label off disk into its in-core copy
 * (raid_cinfo[col].ci_label).  Returns the raidread_component_label()
 * error code (0 on success).
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2597 
/* Return a pointer to column `col's in-core component label. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2603 
2604 int
2605 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2606 {
2607 	RF_ComponentLabel_t *label;
2608 
2609 	label = &raidPtr->raid_cinfo[col].ci_label;
2610 	label->mod_counter = raidPtr->mod_counter;
2611 #ifndef RF_NO_PARITY_MAP
2612 	label->parity_map_modcount = label->mod_counter;
2613 #endif
2614 	return raidwrite_component_label(raidPtr->bytesPerSector,
2615 	    raidPtr->Disks[col].dev,
2616 	    raidPtr->raid_cinfo[col].ci_vp, label);
2617 }
2618 
2619 
/*
 * Read a component label from the standard component-info area of the
 * given device/vnode into *clabel.  Thin wrapper that supplies the
 * fixed offset and sector-rounded size to raidread_component_area().
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
2629 
2630 /* ARGSUSED */
2631 static int
2632 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2633     size_t msize, daddr_t offset, daddr_t dsize)
2634 {
2635 	struct buf *bp;
2636 	const struct bdevsw *bdev;
2637 	int error;
2638 
2639 	/* XXX should probably ensure that we don't try to do this if
2640 	   someone has changed rf_protected_sectors. */
2641 
2642 	if (b_vp == NULL) {
2643 		/* For whatever reason, this component is not valid.
2644 		   Don't try to read a component label from it. */
2645 		return(EINVAL);
2646 	}
2647 
2648 	/* get a block of the appropriate size... */
2649 	bp = geteblk((int)dsize);
2650 	bp->b_dev = dev;
2651 
2652 	/* get our ducks in a row for the read */
2653 	bp->b_blkno = offset / DEV_BSIZE;
2654 	bp->b_bcount = dsize;
2655 	bp->b_flags |= B_READ;
2656  	bp->b_resid = dsize;
2657 
2658 	bdev = bdevsw_lookup(bp->b_dev);
2659 	if (bdev == NULL)
2660 		return (ENXIO);
2661 	(*bdev->d_strategy)(bp);
2662 
2663 	error = biowait(bp);
2664 
2665 	if (!error) {
2666 		memcpy(data, bp->b_data, msize);
2667 	}
2668 
2669 	brelse(bp, 0);
2670 	return(error);
2671 }
2672 
2673 
/*
 * Write *clabel synchronously to the standard component-info area of
 * the given device.  Thin wrapper that supplies the fixed offset,
 * sector-rounded size, and asyncp=0 to raidwrite_component_area().
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
2683 
2684 /* ARGSUSED */
2685 static int
2686 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2687     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2688 {
2689 	struct buf *bp;
2690 	const struct bdevsw *bdev;
2691 	int error;
2692 
2693 	/* get a block of the appropriate size... */
2694 	bp = geteblk((int)dsize);
2695 	bp->b_dev = dev;
2696 
2697 	/* get our ducks in a row for the write */
2698 	bp->b_blkno = offset / DEV_BSIZE;
2699 	bp->b_bcount = dsize;
2700 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2701  	bp->b_resid = dsize;
2702 
2703 	memset(bp->b_data, 0, dsize);
2704 	memcpy(bp->b_data, data, msize);
2705 
2706 	bdev = bdevsw_lookup(bp->b_dev);
2707 	if (bdev == NULL)
2708 		return (ENXIO);
2709 	(*bdev->d_strategy)(bp);
2710 	if (asyncp)
2711 		return 0;
2712 	error = biowait(bp);
2713 	brelse(bp, 0);
2714 	if (error) {
2715 #if 1
2716 		printf("Failed to write RAID component info!\n");
2717 #endif
2718 	}
2719 
2720 	return(error);
2721 }
2722 
2723 void
2724 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2725 {
2726 	int c;
2727 
2728 	for (c = 0; c < raidPtr->numCol; c++) {
2729 		/* Skip dead disks. */
2730 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2731 			continue;
2732 		/* XXXjld: what if an error occurs here? */
2733 		raidwrite_component_area(raidPtr->Disks[c].dev,
2734 		    raidPtr->raid_cinfo[c].ci_vp, map,
2735 		    RF_PARITYMAP_NBYTE,
2736 		    rf_parity_map_offset(raidPtr),
2737 		    rf_parity_map_size(raidPtr), 0);
2738 	}
2739 }
2740 
/*
 * Read the parity map from every live component and merge the copies
 * into *map: the first readable copy seeds the result, and each later
 * copy is OR-merged in via rf_paritymap_merge(), so a region is
 * considered dirty if any component says so.
 */
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* NOTE(review): read errors are not checked here; on
		 * failure tmp may hold stale data -- verify upstream. */
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			rf_paritymap_merge(map, &tmp);
		}
	}
}
2765 
/*
 * Mark the component labels of every live component (and every in-use
 * spare) dirty, bumping the set's mod_counter first.  Called when the
 * set goes into active use so an unclean shutdown can be detected.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the data column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column maps to this spare,
			 * scol keeps its previous value (initially -1)
			 * and is written to clabel->column below --
			 * presumably the mapping always exists; verify. */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2825 
2826 
/*
 * Rewrite the component labels of all optimal components and in-use
 * spares, bumping mod_counter.  When `final' is RF_FINAL_COMPONENT_UPDATE
 * and parity is known good, the clean bit is also set (clean shutdown).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the data column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column maps to this spare,
			 * scol keeps its previous value (initially -1)
			 * and is stored in clabel->column below --
			 * presumably the mapping always exists; verify. */

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2901 
2902 void
2903 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2904 {
2905 
2906 	if (vp != NULL) {
2907 		if (auto_configured == 1) {
2908 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2909 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2910 			vput(vp);
2911 
2912 		} else {
2913 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2914 		}
2915 	}
2916 }
2917 
2918 
2919 void
2920 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2921 {
2922 	int r,c;
2923 	struct vnode *vp;
2924 	int acd;
2925 
2926 
2927 	/* We take this opportunity to close the vnodes like we should.. */
2928 
2929 	for (c = 0; c < raidPtr->numCol; c++) {
2930 		vp = raidPtr->raid_cinfo[c].ci_vp;
2931 		acd = raidPtr->Disks[c].auto_configured;
2932 		rf_close_component(raidPtr, vp, acd);
2933 		raidPtr->raid_cinfo[c].ci_vp = NULL;
2934 		raidPtr->Disks[c].auto_configured = 0;
2935 	}
2936 
2937 	for (r = 0; r < raidPtr->numSpare; r++) {
2938 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2939 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2940 		rf_close_component(raidPtr, vp, acd);
2941 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2942 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2943 	}
2944 }
2945 
2946 
2947 void
2948 rf_ReconThread(struct rf_recon_req *req)
2949 {
2950 	int     s;
2951 	RF_Raid_t *raidPtr;
2952 
2953 	s = splbio();
2954 	raidPtr = (RF_Raid_t *) req->raidPtr;
2955 	raidPtr->recon_in_progress = 1;
2956 
2957 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2958 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2959 
2960 	RF_Free(req, sizeof(*req));
2961 
2962 	raidPtr->recon_in_progress = 0;
2963 	splx(s);
2964 
2965 	/* That's all... */
2966 	kthread_exit(0);	/* does not return */
2967 }
2968 
2969 void
2970 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2971 {
2972 	int retcode;
2973 	int s;
2974 
2975 	raidPtr->parity_rewrite_stripes_done = 0;
2976 	raidPtr->parity_rewrite_in_progress = 1;
2977 	s = splbio();
2978 	retcode = rf_RewriteParity(raidPtr);
2979 	splx(s);
2980 	if (retcode) {
2981 		printf("raid%d: Error re-writing parity (%d)!\n",
2982 		    raidPtr->raidid, retcode);
2983 	} else {
2984 		/* set the clean bit!  If we shutdown correctly,
2985 		   the clean bit on each component label will get
2986 		   set */
2987 		raidPtr->parity_good = RF_RAID_CLEAN;
2988 	}
2989 	raidPtr->parity_rewrite_in_progress = 0;
2990 
2991 	/* Anyone waiting for us to stop?  If so, inform them... */
2992 	if (raidPtr->waitShutdown) {
2993 		wakeup(&raidPtr->parity_rewrite_in_progress);
2994 	}
2995 
2996 	/* That's all... */
2997 	kthread_exit(0);	/* does not return */
2998 }
2999 
3000 
3001 void
3002 rf_CopybackThread(RF_Raid_t *raidPtr)
3003 {
3004 	int s;
3005 
3006 	raidPtr->copyback_in_progress = 1;
3007 	s = splbio();
3008 	rf_CopybackReconstructedData(raidPtr);
3009 	splx(s);
3010 	raidPtr->copyback_in_progress = 0;
3011 
3012 	/* That's all... */
3013 	kthread_exit(0);	/* does not return */
3014 }
3015 
3016 
3017 void
3018 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
3019 {
3020 	int s;
3021 	RF_Raid_t *raidPtr;
3022 
3023 	s = splbio();
3024 	raidPtr = req->raidPtr;
3025 	raidPtr->recon_in_progress = 1;
3026 	rf_ReconstructInPlace(raidPtr, req->col);
3027 	RF_Free(req, sizeof(*req));
3028 	raidPtr->recon_in_progress = 0;
3029 	splx(s);
3030 
3031 	/* That's all... */
3032 	kthread_exit(0);	/* does not return */
3033 }
3034 
/*
 * Probe one candidate device during autoconfiguration: read its
 * component label and, if the label looks reasonable for a device of
 * this size, prepend an RF_AutoConfig_t for it to ac_list.  On any
 * failure the vnode is closed and released.  Returns the (possibly
 * updated) list head, or NULL if memory ran out (the whole list is
 * freed in that case).
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* out of memory: tear down everything collected so
		     * far and give up on autoconfiguration */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			/* record the component and keep the vnode open;
			 * ownership of clabel moves into the list entry */
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
3092 
/*
 * Scan every disk-class device in the system for RAIDframe components.
 * Wedges (dk(4)) whose partition type is DKW_PTYPE_RAIDFRAME, disklabel
 * partitions of type FS_RAID, and — if neither is found on a disk —
 * the raw partition itself are probed via rf_get_component().
 *
 * Returns a linked list of RF_AutoConfig_t entries, one per component
 * label found (NULL if none).
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		/* FSILENT: do not log noise for devices that fail to open */
		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* Wedges carry their partition type in the wedge
			   info rather than in a disklabel. */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component() takes ownership of vp */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
3274 
3275 
3276 int
3277 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3278 {
3279 
3280 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3281 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3282 	    ((clabel->clean == RF_RAID_CLEAN) ||
3283 	     (clabel->clean == RF_RAID_DIRTY)) &&
3284 	    clabel->row >=0 &&
3285 	    clabel->column >= 0 &&
3286 	    clabel->num_rows > 0 &&
3287 	    clabel->num_columns > 0 &&
3288 	    clabel->row < clabel->num_rows &&
3289 	    clabel->column < clabel->num_columns &&
3290 	    clabel->blockSize > 0 &&
3291 	    /*
3292 	     * numBlocksHi may contain garbage, but it is ok since
3293 	     * the type is unsigned.  If it is really garbage,
3294 	     * rf_fix_old_label_size() will fix it.
3295 	     */
3296 	    rf_component_label_numblocks(clabel) > 0) {
3297 		/*
3298 		 * label looks reasonable enough...
3299 		 * let's make sure it has no old garbage.
3300 		 */
3301 		if (numsecs)
3302 			rf_fix_old_label_size(clabel, numsecs);
3303 		return(1);
3304 	}
3305 	return(0);
3306 }
3307 
3308 
3309 /*
3310  * For reasons yet unknown, some old component labels have garbage in
3311  * the newer numBlocksHi region, and this causes lossage.  Since those
3312  * disks will also have numsecs set to less than 32 bits of sectors,
3313  * we can determine when this corruption has occurred, and fix it.
3314  *
3315  * The exact same problem, with the same unknown reason, happens to
3316  * the partitionSizeHi member as well.
3317  */
3318 static void
3319 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3320 {
3321 
3322 	if (numsecs < ((uint64_t)1 << 32)) {
3323 		if (clabel->numBlocksHi) {
3324 			printf("WARNING: total sectors < 32 bits, yet "
3325 			       "numBlocksHi set\n"
3326 			       "WARNING: resetting numBlocksHi to zero.\n");
3327 			clabel->numBlocksHi = 0;
3328 		}
3329 
3330 		if (clabel->partitionSizeHi) {
3331 			printf("WARNING: total sectors < 32 bits, yet "
3332 			       "partitionSizeHi set\n"
3333 			       "WARNING: resetting partitionSizeHi to zero.\n");
3334 			clabel->partitionSizeHi = 0;
3335 		}
3336 	}
3337 }
3338 
3339 
#ifdef DEBUG
/* Dump the interesting fields of a component label to the console. */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t nblocks;
	static const char *root_names[] = {
	    "No", "Force", "Soft", "*invalid*"
	};

	nblocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, nblocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	/* root_partition is masked to index the 4-entry name table safely */
	printf("   Root partition: %s\n", root_names[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif
}
#endif
3373 
3374 RF_ConfigSet_t *
3375 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3376 {
3377 	RF_AutoConfig_t *ac;
3378 	RF_ConfigSet_t *config_sets;
3379 	RF_ConfigSet_t *cset;
3380 	RF_AutoConfig_t *ac_next;
3381 
3382 
3383 	config_sets = NULL;
3384 
3385 	/* Go through the AutoConfig list, and figure out which components
3386 	   belong to what sets.  */
3387 	ac = ac_list;
3388 	while(ac!=NULL) {
3389 		/* we're going to putz with ac->next, so save it here
3390 		   for use at the end of the loop */
3391 		ac_next = ac->next;
3392 
3393 		if (config_sets == NULL) {
3394 			/* will need at least this one... */
3395 			config_sets = (RF_ConfigSet_t *)
3396 				malloc(sizeof(RF_ConfigSet_t),
3397 				       M_RAIDFRAME, M_NOWAIT);
3398 			if (config_sets == NULL) {
3399 				panic("rf_create_auto_sets: No memory!");
3400 			}
3401 			/* this one is easy :) */
3402 			config_sets->ac = ac;
3403 			config_sets->next = NULL;
3404 			config_sets->rootable = 0;
3405 			ac->next = NULL;
3406 		} else {
3407 			/* which set does this component fit into? */
3408 			cset = config_sets;
3409 			while(cset!=NULL) {
3410 				if (rf_does_it_fit(cset, ac)) {
3411 					/* looks like it matches... */
3412 					ac->next = cset->ac;
3413 					cset->ac = ac;
3414 					break;
3415 				}
3416 				cset = cset->next;
3417 			}
3418 			if (cset==NULL) {
3419 				/* didn't find a match above... new set..*/
3420 				cset = (RF_ConfigSet_t *)
3421 					malloc(sizeof(RF_ConfigSet_t),
3422 					       M_RAIDFRAME, M_NOWAIT);
3423 				if (cset == NULL) {
3424 					panic("rf_create_auto_sets: No memory!");
3425 				}
3426 				cset->ac = ac;
3427 				ac->next = NULL;
3428 				cset->next = config_sets;
3429 				cset->rootable = 0;
3430 				config_sets = cset;
3431 			}
3432 		}
3433 		ac = ac_next;
3434 	}
3435 
3436 
3437 	return(config_sets);
3438 }
3439 
3440 static int
3441 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3442 {
3443 	RF_ComponentLabel_t *clabel1, *clabel2;
3444 
3445 	/* If this one matches the *first* one in the set, that's good
3446 	   enough, since the other members of the set would have been
3447 	   through here too... */
3448 	/* note that we are not checking partitionSize here..
3449 
3450 	   Note that we are also not checking the mod_counters here.
3451 	   If everything else matches except the mod_counter, that's
3452 	   good enough for this test.  We will deal with the mod_counters
3453 	   a little later in the autoconfiguration process.
3454 
3455 	    (clabel1->mod_counter == clabel2->mod_counter) &&
3456 
3457 	   The reason we don't check for this is that failed disks
3458 	   will have lower modification counts.  If those disks are
3459 	   not added to the set they used to belong to, then they will
3460 	   form their own set, which may result in 2 different sets,
3461 	   for example, competing to be configured at raid0, and
3462 	   perhaps competing to be the root filesystem set.  If the
3463 	   wrong ones get configured, or both attempt to become /,
3464 	   weird behaviour and or serious lossage will occur.  Thus we
3465 	   need to bring them into the fold here, and kick them out at
3466 	   a later point.
3467 
3468 	*/
3469 
3470 	clabel1 = cset->ac->clabel;
3471 	clabel2 = ac->clabel;
3472 	if ((clabel1->version == clabel2->version) &&
3473 	    (clabel1->serial_number == clabel2->serial_number) &&
3474 	    (clabel1->num_rows == clabel2->num_rows) &&
3475 	    (clabel1->num_columns == clabel2->num_columns) &&
3476 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
3477 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3478 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3479 	    (clabel1->parityConfig == clabel2->parityConfig) &&
3480 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3481 	    (clabel1->blockSize == clabel2->blockSize) &&
3482 	    rf_component_label_numblocks(clabel1) ==
3483 	    rf_component_label_numblocks(clabel2) &&
3484 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
3485 	    (clabel1->root_partition == clabel2->root_partition) &&
3486 	    (clabel1->last_unit == clabel2->last_unit) &&
3487 	    (clabel1->config_order == clabel2->config_order)) {
3488 		/* if it get's here, it almost *has* to be a match */
3489 	} else {
3490 		/* it's not consistent with somebody in the set..
3491 		   punt */
3492 		return(0);
3493 	}
3494 	/* all was fine.. it must fit... */
3495 	return(1);
3496 }
3497 
/*
 * Decide whether the configuration set has enough 'live' components
 * to be configured.  A component counts as live if its column is
 * present with the highest mod_counter seen in the set.  RAID 1 gets
 * special treatment: components are checked in even/odd pairs, and
 * the set only fails if both halves of a pair are missing.  For
 * levels 0/4/5 a simple missing-component count is used.
 * Returns 1 if the set is configurable, 0 if too much is missing.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* The highest mod_counter in the set wins; components with a
	   lower one are stale (e.g. failed disks) and don't count. */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did the odd component of a pair,
				   and we didn't bail.. reset the
				   even_pair_failed flag, and go on to
				   the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3600 
3601 void
3602 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3603 			RF_Raid_t *raidPtr)
3604 {
3605 	RF_ComponentLabel_t *clabel;
3606 	int i;
3607 
3608 	clabel = ac->clabel;
3609 
3610 	/* 1. Fill in the common stuff */
3611 	config->numRow = clabel->num_rows = 1;
3612 	config->numCol = clabel->num_columns;
3613 	config->numSpare = 0; /* XXX should this be set here? */
3614 	config->sectPerSU = clabel->sectPerSU;
3615 	config->SUsPerPU = clabel->SUsPerPU;
3616 	config->SUsPerRU = clabel->SUsPerRU;
3617 	config->parityConfig = clabel->parityConfig;
3618 	/* XXX... */
3619 	strcpy(config->diskQueueType,"fifo");
3620 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3621 	config->layoutSpecificSize = 0; /* XXX ?? */
3622 
3623 	while(ac!=NULL) {
3624 		/* row/col values will be in range due to the checks
3625 		   in reasonable_label() */
3626 		strcpy(config->devnames[0][ac->clabel->column],
3627 		       ac->devname);
3628 		ac = ac->next;
3629 	}
3630 
3631 	for(i=0;i<RF_MAXDBGV;i++) {
3632 		config->debugVars[i][0] = 0;
3633 	}
3634 }
3635 
3636 int
3637 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3638 {
3639 	RF_ComponentLabel_t *clabel;
3640 	int column;
3641 	int sparecol;
3642 
3643 	raidPtr->autoconfigure = new_value;
3644 
3645 	for(column=0; column<raidPtr->numCol; column++) {
3646 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3647 			clabel = raidget_component_label(raidPtr, column);
3648 			clabel->autoconfigure = new_value;
3649 			raidflush_component_label(raidPtr, column);
3650 		}
3651 	}
3652 	for(column = 0; column < raidPtr->numSpare ; column++) {
3653 		sparecol = raidPtr->numCol + column;
3654 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3655 			clabel = raidget_component_label(raidPtr, sparecol);
3656 			clabel->autoconfigure = new_value;
3657 			raidflush_component_label(raidPtr, sparecol);
3658 		}
3659 	}
3660 	return(new_value);
3661 }
3662 
3663 int
3664 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3665 {
3666 	RF_ComponentLabel_t *clabel;
3667 	int column;
3668 	int sparecol;
3669 
3670 	raidPtr->root_partition = new_value;
3671 	for(column=0; column<raidPtr->numCol; column++) {
3672 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3673 			clabel = raidget_component_label(raidPtr, column);
3674 			clabel->root_partition = new_value;
3675 			raidflush_component_label(raidPtr, column);
3676 		}
3677 	}
3678 	for(column = 0; column < raidPtr->numSpare ; column++) {
3679 		sparecol = raidPtr->numCol + column;
3680 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3681 			clabel = raidget_component_label(raidPtr, sparecol);
3682 			clabel->root_partition = new_value;
3683 			raidflush_component_label(raidPtr, sparecol);
3684 		}
3685 	}
3686 	return(new_value);
3687 }
3688 
3689 void
3690 rf_release_all_vps(RF_ConfigSet_t *cset)
3691 {
3692 	RF_AutoConfig_t *ac;
3693 
3694 	ac = cset->ac;
3695 	while(ac!=NULL) {
3696 		/* Close the vp, and give it back */
3697 		if (ac->vp) {
3698 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3699 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
3700 			vput(ac->vp);
3701 			ac->vp = NULL;
3702 		}
3703 		ac = ac->next;
3704 	}
3705 }
3706 
3707 
3708 void
3709 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3710 {
3711 	RF_AutoConfig_t *ac;
3712 	RF_AutoConfig_t *next_ac;
3713 
3714 	ac = cset->ac;
3715 	while(ac!=NULL) {
3716 		next_ac = ac->next;
3717 		/* nuke the label */
3718 		free(ac->clabel, M_RAIDFRAME);
3719 		/* cleanup the config structure */
3720 		free(ac, M_RAIDFRAME);
3721 		/* "next.." */
3722 		ac = next_ac;
3723 	}
3724 	/* and, finally, nuke the config set */
3725 	free(cset, M_RAIDFRAME);
3726 }
3727 
3728 
/*
 * Populate a component label from the current state of the RAID set.
 * Per-component fields (e.g. column, partitionSize) are not set here;
 * only the set-wide fields are filled in.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3761 
/*
 * Configure a RAID set from an autoconfiguration config set: build an
 * RF_Config_t from the component labels, pick a unit number (starting
 * at the unit the set was last configured on and walking forward to
 * the first free one) and call rf_Configure().  On success the set's
 * root-eligibility is recorded in cset->rootable and the softc is
 * returned; on failure NULL is returned.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	/* walk forward from last_unit to the first unconfigured unit */
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3835 
3836 void
3837 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3838 {
3839 	struct buf *bp;
3840 	struct raid_softc *rs;
3841 
3842 	bp = (struct buf *)desc->bp;
3843 	rs = desc->raidPtr->softc;
3844 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
3845 	    (bp->b_flags & B_READ));
3846 }
3847 
/*
 * Initialize a pool(9) for RAIDframe structures of the given size,
 * pre-priming it with xmin items and setting the high/low watermarks
 * to xmax/xmin.  w_chan is the wait channel name shown by the pool.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3857 
3858 /*
3859  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
3860  * if there is IO pending and if that IO could possibly be done for a
3861  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
3862  * otherwise.
3863  *
3864  */
3865 
3866 int
3867 rf_buf_queue_check(RF_Raid_t *raidPtr)
3868 {
3869 	struct raid_softc *rs = raidPtr->softc;
3870 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
3871 		/* there is work to do */
3872 		return 0;
3873 	}
3874 	/* default is nothing to do */
3875 	return 1;
3876 }
3877 
3878 int
3879 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3880 {
3881 	uint64_t numsecs;
3882 	unsigned secsize;
3883 	int error;
3884 
3885 	error = getdisksize(vp, &numsecs, &secsize);
3886 	if (error == 0) {
3887 		diskPtr->blockSize = secsize;
3888 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
3889 		diskPtr->partitionSize = numsecs;
3890 		return 0;
3891 	}
3892 	return error;
3893 }
3894 
/* Autoconfiguration match routine: raid(4) pseudo-devices always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3900 
/* Attach routine: intentionally empty; no per-device setup is done here. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
3906 
3907 
3908 static int
3909 raid_detach(device_t self, int flags)
3910 {
3911 	int error;
3912 	struct raid_softc *rs = raidget(device_unit(self));
3913 
3914 	if (rs == NULL)
3915 		return ENXIO;
3916 
3917 	if ((error = raidlock(rs)) != 0)
3918 		return (error);
3919 
3920 	error = raid_detach_unlocked(rs);
3921 
3922 	raidunlock(rs);
3923 
3924 	/* XXXkd: raidput(rs) ??? */
3925 
3926 	return error;
3927 }
3928 
/*
 * Fill in the disk geometry for the RAID device and push it to the
 * disk(9) layer.  The track count (4 * numCol) is synthetic; only
 * secperunit and secsize reflect real values of the set.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
}
3943 
3944 /*
3945  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3946  * We end up returning whatever error was returned by the first cache flush
3947  * that fails.
3948  */
3949 
3950 int
3951 rf_sync_component_caches(RF_Raid_t *raidPtr)
3952 {
3953 	int c, sparecol;
3954 	int e,error;
3955 	int force = 1;
3956 
3957 	error = 0;
3958 	for (c = 0; c < raidPtr->numCol; c++) {
3959 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
3960 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3961 					  &force, FWRITE, NOCRED);
3962 			if (e) {
3963 				if (e != ENODEV)
3964 					printf("raid%d: cache flush to component %s failed.\n",
3965 					       raidPtr->raidid, raidPtr->Disks[c].devname);
3966 				if (error == 0) {
3967 					error = e;
3968 				}
3969 			}
3970 		}
3971 	}
3972 
3973 	for( c = 0; c < raidPtr->numSpare ; c++) {
3974 		sparecol = raidPtr->numCol + c;
3975 		/* Need to ensure that the reconstruct actually completed! */
3976 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3977 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3978 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
3979 			if (e) {
3980 				if (e != ENODEV)
3981 					printf("raid%d: cache flush to component %s failed.\n",
3982 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3983 				if (error == 0) {
3984 					error = e;
3985 				}
3986 			}
3987 		}
3988 	}
3989 	return error;
3990 }
3991