xref: /netbsd-src/sys/dev/raidframe/rf_netbsdkintf.c (revision b7b7574d3bf8eeb51a1fa3977b59142ec6434a55)
1 /*	$NetBSD: rf_netbsdkintf.c,v 1.310 2014/05/12 15:53:01 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Greg Oster; Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1988 University of Utah.
34  * Copyright (c) 1990, 1993
35  *      The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed to Berkeley by
38  * the Systems Programming Group of the University of Utah Computer
39  * Science Department.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
66  *
67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
68  */
69 
70 /*
71  * Copyright (c) 1995 Carnegie-Mellon University.
72  * All rights reserved.
73  *
74  * Authors: Mark Holland, Jim Zelenka
75  *
76  * Permission to use, copy, modify and distribute this software and
77  * its documentation is hereby granted, provided that both the copyright
78  * notice and this permission notice appear in all copies of the
79  * software, derivative works or modified versions, and any portions
80  * thereof, and that both notices appear in supporting documentation.
81  *
82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85  *
86  * Carnegie Mellon requests users of this software to return to
87  *
88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
89  *  School of Computer Science
90  *  Carnegie Mellon University
91  *  Pittsburgh PA 15213-3890
92  *
93  * any improvements or extensions that they make and grant Carnegie the
94  * rights to redistribute these changes.
95  */
96 
97 /***********************************************************
98  *
99  * rf_kintf.c -- the kernel interface routines for RAIDframe
100  *
101  ***********************************************************/
102 
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.310 2014/05/12 15:53:01 christos Exp $");
105 
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #endif
110 
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 
130 #include <prop/proplib.h>
131 
132 #include <dev/raidframe/raidframevar.h>
133 #include <dev/raidframe/raidframeio.h>
134 #include <dev/raidframe/rf_paritymap.h>
135 
136 #include "rf_raid.h"
137 #include "rf_copyback.h"
138 #include "rf_dag.h"
139 #include "rf_dagflags.h"
140 #include "rf_desc.h"
141 #include "rf_diskqueue.h"
142 #include "rf_etimer.h"
143 #include "rf_general.h"
144 #include "rf_kintf.h"
145 #include "rf_options.h"
146 #include "rf_driver.h"
147 #include "rf_parityscan.h"
148 #include "rf_threadstuff.h"
149 
150 #ifdef COMPAT_50
151 #include "rf_compat50.h"
152 #endif
153 
154 #ifdef DEBUG
155 int     rf_kdebug_level = 0;
156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
157 #else				/* DEBUG */
158 #define db1_printf(a) { }
159 #endif				/* DEBUG */
160 
161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
162 static rf_declare_mutex2(rf_sparet_wait_mutex);
163 static rf_declare_cond2(rf_sparet_wait_cv);
164 static rf_declare_cond2(rf_sparet_resp_cv);
165 
166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
167 						 * spare table */
168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
169 						 * installation process */
170 #endif
171 
172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
173 
174 /* prototypes */
175 static void KernelWakeupFunc(struct buf *);
176 static void InitBP(struct buf *, struct vnode *, unsigned,
177     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
178     void *, int, struct proc *);
179 struct raid_softc;
180 static void raidinit(struct raid_softc *);
181 
182 void raidattach(int);
183 static int raid_match(device_t, cfdata_t, void *);
184 static void raid_attach(device_t, device_t, void *);
185 static int raid_detach(device_t, int);
186 
187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
188     daddr_t, daddr_t);
189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
190     daddr_t, daddr_t, int);
191 
192 static int raidwrite_component_label(unsigned,
193     dev_t, struct vnode *, RF_ComponentLabel_t *);
194 static int raidread_component_label(unsigned,
195     dev_t, struct vnode *, RF_ComponentLabel_t *);
196 
197 
198 dev_type_open(raidopen);
199 dev_type_close(raidclose);
200 dev_type_read(raidread);
201 dev_type_write(raidwrite);
202 dev_type_ioctl(raidioctl);
203 dev_type_strategy(raidstrategy);
204 dev_type_dump(raiddump);
205 dev_type_size(raidsize);
206 
/* Block-device entry points for raidN units (disk semantics). */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_flag = D_DISK
};

/* Character (raw) device entry points for raidN units. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_flag = D_DISK
};

/* Hooks handed to the generic disk(9) layer: strategy and minphys. */
static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
232 
/*
 * Per-unit software state for a RAID pseudo-disk.  Units live on the
 * global `raids' list (linked via sc_link) and are looked up by unit
 * number in raidget().
 */
struct raid_softc {
	device_t sc_dev;	/* autoconf device handle */
	int	sc_unit;	/* unit number (the N in raidN) */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
	RF_Raid_t sc_r;		/* RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* entry on the global raids list */
};
245 /* sc_flags */
246 #define RAIDF_INITED	0x01	/* unit has been initialized */
247 #define RAIDF_WLABEL	0x02	/* label area is writable */
248 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
249 #define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
250 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
251 #define RAIDF_LOCKED	0x80	/* unit is locked */
252 
253 #define	raidunit(x)	DISKUNIT(x)
254 
255 extern struct cfdriver raid_cd;
256 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
257     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
258     DVF_DETACH_SHUTDOWN);
259 
260 /*
261  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
262  * Be aware that large numbers can allow the driver to consume a lot of
263  * kernel memory, especially on writes, and in degraded mode reads.
264  *
265  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
266  * a single 64K write will typically require 64K for the old data,
267  * 64K for the old parity, and 64K for the new parity, for a total
268  * of 192K (if the parity buffer is not re-used immediately).
269  * Even it if is used immediately, that's still 128K, which when multiplied
270  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
271  *
272  * Now in degraded mode, for example, a 64K read on the above setup may
273  * require data reconstruction, which will require *all* of the 4 remaining
274  * disks to participate -- 4 * 32K/disk == 128K again.
275  */
276 
277 #ifndef RAIDOUTSTANDING
278 #define RAIDOUTSTANDING   6
279 #endif
280 
281 #define RAIDLABELDEV(dev)	\
282 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
283 
284 /* declared here, and made public, for the benefit of KVM stuff.. */
285 
286 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
287 				     struct disklabel *);
288 static void raidgetdisklabel(dev_t);
289 static void raidmakedisklabel(struct raid_softc *);
290 
291 static int raidlock(struct raid_softc *);
292 static void raidunlock(struct raid_softc *);
293 
294 static int raid_detach_unlocked(struct raid_softc *);
295 
296 static void rf_markalldirty(RF_Raid_t *);
297 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
298 
299 void rf_ReconThread(struct rf_recon_req *);
300 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
301 void rf_CopybackThread(RF_Raid_t *raidPtr);
302 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
303 int rf_autoconfig(device_t);
304 void rf_buildroothack(RF_ConfigSet_t *);
305 
306 RF_AutoConfig_t *rf_find_raid_components(void);
307 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
308 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
309 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
310 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
311 int rf_set_autoconfig(RF_Raid_t *, int);
312 int rf_set_rootpartition(RF_Raid_t *, int);
313 void rf_release_all_vps(RF_ConfigSet_t *);
314 void rf_cleanup_config_set(RF_ConfigSet_t *);
315 int rf_have_enough_components(RF_ConfigSet_t *);
316 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
317 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
318 
319 /*
320  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
321  * Note that this is overridden by having RAID_AUTOCONFIG as an option
322  * in the kernel config file.
323  */
324 #ifdef RAID_AUTOCONFIG
325 int raidautoconfig = 1;
326 #else
327 int raidautoconfig = 0;
328 #endif
329 static bool raidautoconfigdone = false;
330 
331 struct RF_Pools_s rf_pools;
332 
333 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
334 static kmutex_t raid_lock;
335 
336 static struct raid_softc *
337 raidcreate(int unit) {
338 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
339 	if (sc == NULL) {
340 #ifdef DIAGNOSTIC
341 		printf("%s: out of memory\n", __func__);
342 #endif
343 		return NULL;
344 	}
345 	sc->sc_unit = unit;
346 	bufq_alloc(&sc->buf_queue, "fcfs", BUFQ_SORT_RAWBLOCK);
347 	return sc;
348 }
349 
/*
 * raiddestroy: release a softc created by raidcreate().  Frees the
 * buffer queue first (it is a separate allocation referenced from
 * the softc), then the softc itself.  raidput() unlinks the softc
 * from the global list before calling this.
 */
static void
raiddestroy(struct raid_softc *sc) {
	bufq_free(sc->buf_queue);
	kmem_free(sc, sizeof(*sc));
}
355 
356 static struct raid_softc *
357 raidget(int unit) {
358 	struct raid_softc *sc;
359 	if (unit < 0) {
360 #ifdef DIAGNOSTIC
361 		panic("%s: unit %d!", __func__, unit);
362 #endif
363 		return NULL;
364 	}
365 	mutex_enter(&raid_lock);
366 	LIST_FOREACH(sc, &raids, sc_link) {
367 		if (sc->sc_unit == unit) {
368 			mutex_exit(&raid_lock);
369 			return sc;
370 		}
371 	}
372 	mutex_exit(&raid_lock);
373 	if ((sc = raidcreate(unit)) == NULL)
374 		return NULL;
375 	mutex_enter(&raid_lock);
376 	LIST_INSERT_HEAD(&raids, sc, sc_link);
377 	mutex_exit(&raid_lock);
378 	return sc;
379 }
380 
/*
 * raidput: unlink `sc' from the global list of RAID units (under
 * raid_lock) and destroy it.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
388 
/*
 * raidattach: pseudo-device attach routine, called once at boot.
 * Initializes global driver state, boots the RAIDframe core, hooks
 * the driver into autoconf, and registers a finalizer that performs
 * RAID auto-configuration after all real hardware has been found.
 * The `num' hint from the config file is unused; units are created
 * on demand by raidget().
 */
void
raidattach(int num)
{
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* This is where all the initialization stuff gets done. */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization state for the spare-table request/response
	   queues declared above. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	if (rf_BootRaidframe() == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
421 
/*
 * rf_autoconfig: config-finalize callback that performs one-shot
 * auto-configuration of RAID sets.  Scans all disks for RAIDframe
 * component labels, groups them into sets, and configures the valid
 * ones (including possible root-device override) via
 * rf_buildroothack().  Returns 1 after doing the work, 0 if
 * autoconfig is disabled or has already run.
 */
int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
459 
460 static int
461 rf_containsboot(RF_Raid_t *r, device_t bdv) {
462 	const char *bootname = device_xname(bdv);
463 	size_t len = strlen(bootname);
464 
465 	for (int col = 0; col < r->numCol; col++) {
466 		const char *devname = r->Disks[col].devname;
467 		devname += sizeof("/dev/") - 1;
468 		if (strncmp(devname, "dk", 2) == 0) {
469 			const char *parent =
470 			    dkwedge_get_parent_name(r->Disks[col].dev);
471 			if (parent != NULL)
472 				devname = parent;
473 		}
474 		if (strncmp(devname, bootname, len) == 0) {
475 			struct raid_softc *sc = r->softc;
476 			aprint_debug("raid%d includes boot device %s\n",
477 			    sc->sc_unit, devname);
478 			return 1;
479 		}
480 	}
481 	return 0;
482 }
483 
/*
 * rf_buildroothack: auto-configure all eligible RAID sets, then
 * decide whether one of them should provide the root device.
 *
 * Each set with enough components and the autoconfigure flag set in
 * its component label is configured via rf_auto_config_set();
 * resources of sets we do not configure are released.  Afterwards,
 * unless the user hardwired root (rootspec): if exactly one
 * configured set is rootable it becomes booted_device; if several
 * are, prefer the one containing the device we booted from; if that
 * is still ambiguous, fall back to asking the user (RB_ASKNAME).
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;			/* count of rootable configured sets */
	struct raid_softc *sc, *rsc;	/* rsc: last rootable set seen */

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			/* The set has wedges: root is the 'a'-named
			   wedge rather than the raw raid device. */
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume 'a' */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
		} else
			candidate_root = rsc->sc_dev;
		/* Override root if nothing else was found, if the
		   set's root_partition flag is 1, or if the set
		   contains the device we actually booted from. */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Narrow the candidates down to the sets that contain
		   the boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
594 
595 
596 int
597 raidsize(dev_t dev)
598 {
599 	struct raid_softc *rs;
600 	struct disklabel *lp;
601 	int     part, unit, omask, size;
602 
603 	unit = raidunit(dev);
604 	if ((rs = raidget(unit)) == NULL)
605 		return -1;
606 	if ((rs->sc_flags & RAIDF_INITED) == 0)
607 		return (-1);
608 
609 	part = DISKPART(dev);
610 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
611 	lp = rs->sc_dkdev.dk_label;
612 
613 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
614 		return (-1);
615 
616 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
617 		size = -1;
618 	else
619 		size = lp->d_partitions[part].p_size *
620 		    (lp->d_secsize / DEV_BSIZE);
621 
622 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
623 		return (-1);
624 
625 	return (size);
626 
627 }
628 
/*
 * raiddump: crash-dump entry point (d_dump).
 *
 * Dumping is only supported on RAID 1 sets (one data column, one
 * parity column).  A live component is chosen to receive the dump --
 * preferring the master, then a used spare of the master, then the
 * slave, then a used spare of the slave -- and the request is passed
 * straight through to that component's own d_dump routine.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int     part, c, sparecol, j, scol, dumpto;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	raidPtr = &rs->sc_r;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* The dump must be a whole number of DEV_BSIZE blocks... */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* ...and must fit entirely within the device. */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		/* NOTE(review): blkno is a signed daddr_t printed with
		   PRIu64; same width, but a negative value would print
		   as a huge unsigned number. */
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which column this
			   spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
/*
 * raidopen: open a partition of a RAID unit.
 *
 * Fails with EBUSY if the unit is being shut down, or if wedges
 * exist on the unit and anything other than the raw partition is
 * opened.  On the first open of an initialized unit the disklabel
 * is re-read and all components are marked dirty (see the comment
 * below for why).  Open bitmasks are updated so the unit cannot be
 * unconfigured while any partition remains open.
 */
/* ARGSUSED */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of a configured unit: (re)read the disklabel. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
/*
 * raidclose: close a partition of a RAID unit.
 *
 * Clears the partition's bit in the char/block open masks.  On the
 * last close of an initialized unit the component labels are given
 * a final update so the set is recorded as cleanly shut down.
 */
/* ARGSUSED */
int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	int     error = 0;
	int     part;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(&rs->sc_r,
						 RF_FINAL_COMPONENT_UPDATE);

		/* If the kernel is shutting down, it will detach
		 * this RAID set soon enough.
		 */
	}

	raidunlock(rs);
	return (0);

}
898 
/*
 * raidstrategy: disk-driver strategy entry point.
 *
 * Validates the request (unit exists, configured, array valid,
 * non-zero transfer), bounds-checks it against either the raw
 * device size or the disklabel partition, then queues the buf on
 * the unit's buffer queue and signals iodone_cv so the I/O gets
 * serviced.  On error the buf is completed immediately with
 * b_error set and b_resid equal to the full count.
 */
void
raidstrategy(struct buf *bp)
{
	unsigned int unit = raidunit(bp->b_dev);
	RF_Raid_t *raidPtr;
	int     wlabel;
	struct raid_softc *rs;

	if ((rs = raidget(unit)) == NULL) {
		bp->b_error = ENXIO;
		goto done;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto done;
	}
	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		bp->b_error = ENODEV;
		goto done;
	}
	if (bp->b_bcount == 0) {
		db1_printf(("b_bcount is zero..\n"));
		goto done;
	}

	/*
	 * Do bounds checking and adjust transfer.  If there's an
	 * error, the bounds check will flag that for us.
	 */

	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
	if (DISKPART(bp->b_dev) == RAW_PART) {
		uint64_t size; /* device size in DEV_BSIZE unit */

		/* Convert totalSectors (in array sectors) to DEV_BSIZE
		   blocks, shifting in whichever direction the sector
		   size requires. */
		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
			size = raidPtr->totalSectors <<
			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
		} else {
			size = raidPtr->totalSectors >>
			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
		}
		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
			goto done;
		}
	} else {
		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
			db1_printf(("Bounds check failed!!:%d %d\n",
				(int) bp->b_blkno, (int) wlabel));
			goto done;
		}
	}

	rf_lock_mutex2(raidPtr->iodone_lock);

	bp->b_resid = 0;

	/* stuff it onto our queue */
	bufq_put(rs->buf_queue, bp);

	/* scheduled the IO to happen at the next convenient time */
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);

	return;

done:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
969 /* ARGSUSED */
970 int
971 raidread(dev_t dev, struct uio *uio, int flags)
972 {
973 	int     unit = raidunit(dev);
974 	struct raid_softc *rs;
975 
976 	if ((rs = raidget(unit)) == NULL)
977 		return ENXIO;
978 
979 	if ((rs->sc_flags & RAIDF_INITED) == 0)
980 		return (ENXIO);
981 
982 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
983 
984 }
985 /* ARGSUSED */
986 int
987 raidwrite(dev_t dev, struct uio *uio, int flags)
988 {
989 	int     unit = raidunit(dev);
990 	struct raid_softc *rs;
991 
992 	if ((rs = raidget(unit)) == NULL)
993 		return ENXIO;
994 
995 	if ((rs->sc_flags & RAIDF_INITED) == 0)
996 		return (ENXIO);
997 
998 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
999 
1000 }
1001 
/*
 * raid_detach_unlocked: tear down a RAID unit.  Caller must already
 * hold the unit lock (raidlock).
 *
 * Refuses with EBUSY while any partition is open.  If the unit was
 * configured, shuts down the RAIDframe layer first; a shutdown
 * failure is returned immediately and the disk structures are left
 * intact.  Otherwise the generic disk is detached and destroyed.
 * Returns 0 on success.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	int error;
	RF_Raid_t *raidPtr;

	raidPtr = &rs->sc_r;

	/*
	 * If somebody has a partition mounted, we shouldn't
	 * shutdown.
	 */
	if (rs->sc_dkdev.dk_openmask != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		;	/* not initialized: nothing to do */
	else if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;
	else
		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);

	/* Detach the disk. */
	dkwedge_delall(&rs->sc_dkdev);
	disk_detach(&rs->sc_dkdev);
	disk_destroy(&rs->sc_dkdev);

	aprint_normal_dev(rs->sc_dev, "detached\n");

	return 0;
}
1033 
/*
 * raidioctl: ioctl entry point for RAIDframe devices.
 *
 * Handles both RAIDframe-specific commands (RAIDFRAME_*) and the
 * standard disk ioctls (DIOC*).  The first two switch statements are
 * pure permission/state gates: commands that modify the device require
 * FWRITE, and most commands require the set to be configured
 * (RAIDF_INITED).  The third switch dispatches the RAIDframe commands;
 * anything not handled there falls through to disk_ioctl() and the
 * disklabel/wedge handling at the bottom.
 */
int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int     unit = raidunit(dev);
	int     error = 0;
	int     part, pmask, s;
	cfdata_t cf;
	struct raid_softc *rs;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, **ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
/*	int raidid; */
	struct rf_recon_req *rrcopy, *rr;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_ComponentLabel_t **clabel_ptr;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	RF_SingleComponent_t component;
	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
	int i, j, d;
#ifdef __HAVE_OLD_DISKLABEL
	struct disklabel newlabel;
#endif
	struct dkwedge_info *dkw;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
		(int) DISKPART(dev), (int) unit, cmd));

	/* Must be open for writes for these commands... */
	switch (cmd) {
#ifdef DIOCGSECTORSIZE
	case DIOCGSECTORSIZE:
		*(u_int *)data = raidPtr->bytesPerSector;
		return 0;
	case DIOCGMEDIASIZE:
		*(off_t *)data =
		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
		return 0;
#endif
	case DIOCSDINFO:
	case DIOCWDINFO:
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCWDINFO:
	case ODIOCSDINFO:
#endif
	case DIOCWLABEL:
	case DIOCAWEDGE:
	case DIOCDWEDGE:
	case DIOCSSTRATEGY:
		if ((flag & FWRITE) == 0)
			return (EBADF);
	}

	/* Must be initialized for these... */
	switch (cmd) {
	case DIOCGDINFO:
	case DIOCSDINFO:
	case DIOCWDINFO:
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCGDINFO:
	case ODIOCWDINFO:
	case ODIOCSDINFO:
	case ODIOCGDEFLABEL:
#endif
	case DIOCGPART:
	case DIOCWLABEL:
	case DIOCGDEFLABEL:
	case DIOCAWEDGE:
	case DIOCDWEDGE:
	case DIOCLWEDGES:
	case DIOCCACHESYNC:
	case RAIDFRAME_SHUTDOWN:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_ROOT:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case DIOCGSTRATEGY:
	case DIOCSSTRATEGY:
		if ((rs->sc_flags & RAIDF_INITED) == 0)
			return (ENXIO);
	}

	switch (cmd) {
#ifdef COMPAT_50
	case RAIDFRAME_GET_INFO50:
		return rf_get_info50(raidPtr, data);

	case RAIDFRAME_CONFIGURE50:
		/* compat path allocates k_cfg itself, then joins the
		 * common configuration code below */
		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
			return retcode;
		goto config;
#endif
		/* configure the system */
	case RAIDFRAME_CONFIGURE:

		if (raidPtr->valid) {
			/* There is a valid RAID set running on this unit! */
			printf("raid%d: Device already configured!\n",unit);
			return(EINVAL);
		}

		/* copy-in the configuration information */
		/* data points to a pointer to the configuration structure */

		u_cfg = *((RF_Config_t **) data);
		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
		if (k_cfg == NULL) {
			return (ENOMEM);
		}
		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
		if (retcode) {
			RF_Free(k_cfg, sizeof(RF_Config_t));
			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
				retcode));
			return (retcode);
		}
		goto config;
	config:
		/* Common configuration code: on entry, k_cfg is an
		 * allocated, copied-in RF_Config_t (from either the
		 * native or the COMPAT_50 path). */
		/* allocate a buffer for the layout-specific data, and copy it
		 * in */
		if (k_cfg->layoutSpecificSize) {
			if (k_cfg->layoutSpecificSize > 10000) {
				/* sanity check */
				RF_Free(k_cfg, sizeof(RF_Config_t));
				return (EINVAL);
			}
			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
			    (u_char *));
			if (specific_buf == NULL) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				return (ENOMEM);
			}
			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
			    k_cfg->layoutSpecificSize);
			if (retcode) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				RF_Free(specific_buf,
					k_cfg->layoutSpecificSize);
				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
					retcode));
				return (retcode);
			}
		} else
			specific_buf = NULL;
		k_cfg->layoutSpecific = specific_buf;

		/* should do some kind of sanity check on the configuration.
		 * Store the sum of all the bytes in the last byte? */

		/* configure the system */

		/*
		 * Clear the entire RAID descriptor, just to make sure
		 *  there is no stale data left in the case of a
		 *  reconfiguration
		 */
		memset(raidPtr, 0, sizeof(*raidPtr));
		raidPtr->softc = rs;
		raidPtr->raidid = unit;

		retcode = rf_Configure(raidPtr, k_cfg, NULL);

		if (retcode == 0) {

			/* allow this many simultaneous IO's to
			   this RAID device */
			raidPtr->openings = RAIDOUTSTANDING;

			raidinit(rs);
			rf_markalldirty(raidPtr);
		}
		/* free the buffers.  No return code here. */
		if (k_cfg->layoutSpecificSize) {
			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		}
		RF_Free(k_cfg, sizeof(RF_Config_t));

		return (retcode);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((error = raidlock(rs)) != 0)
			return (error);

		/* Busy if any partition other than the one the ioctl
		 * came in on is open, or if ours is open both as a
		 * block and a character device. */
		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
			(rs->sc_dkdev.dk_copenmask & pmask)))
			retcode = EBUSY;
		else {
			rs->sc_flags |= RAIDF_SHUTDOWN;
			rs->sc_dkdev.dk_copenmask &= ~pmask;
			rs->sc_dkdev.dk_bopenmask &= ~pmask;
			rs->sc_dkdev.dk_openmask &= ~pmask;
			retcode = 0;
		}

		raidunlock(rs);

		if (retcode != 0)
			return retcode;

		/* free the pseudo device attach bits */

		cf = device_cfdata(rs->sc_dev);
		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
			free(cf, M_RAIDFRAME);

		return (retcode);
	case RAIDFRAME_GET_COMPONENT_LABEL:
		clabel_ptr = (RF_ComponentLabel_t **) data;
		/* need to read the component label for the disk indicated
		   by row,column in clabel */

		/*
		 * Perhaps there should be an option to skip the in-core
		 * copy and hit the disk, as with disklabel(8).
		 */
		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));

		/* The copyin is only used to learn which column the
		 * caller wants; the temporary is freed before the
		 * in-core label is copied back out. */
		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));

		if (retcode) {
			RF_Free(clabel, sizeof(*clabel));
			return retcode;
		}

		clabel->row = 0; /* Don't allow looking at anything else.*/

		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol +
		    raidPtr->numSpare)) {
			RF_Free(clabel, sizeof(*clabel));
			return EINVAL;
		}

		RF_Free(clabel, sizeof(*clabel));

		clabel = raidget_component_label(raidPtr, column);

		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));

#if 0
	case RAIDFRAME_SET_COMPONENT_LABEL:
		clabel = (RF_ComponentLabel_t *) data;

		/* XXX check the label for valid stuff... */
		/* Note that some things *should not* get modified --
		   the user should be re-initing the labels instead of
		   trying to patch things.
		   */

		raidid = raidPtr->raidid;
#ifdef DEBUG
		printf("raid%d: Got component label:\n", raidid);
		printf("raid%d: Version: %d\n", raidid, clabel->version);
		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
		printf("raid%d: Column: %d\n", raidid, clabel->column);
		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
		printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif
		clabel->row = 0;
		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* XXX this isn't allowed to do anything for now :-) */

		/* XXX and before it is, we need to fill in the rest
		   of the fields!?!?!?! */
		memcpy(raidget_component_label(raidPtr, column),
		    clabel, sizeof(*clabel));
		raidflush_component_label(raidPtr, column);
		return (0);
#endif

	case RAIDFRAME_INIT_LABELS:
		clabel = (RF_ComponentLabel_t *) data;
		/*
		   we only want the serial number from
		   the above.  We get all the rest of the information
		   from the config that was used to create this RAID
		   set.
		   */

		raidPtr->serial_number = clabel->serial_number;

		for(column=0;column<raidPtr->numCol;column++) {
			diskPtr = &raidPtr->Disks[column];
			if (!RF_DEAD_DISK(diskPtr->status)) {
				ci_label = raidget_component_label(raidPtr,
				    column);
				/* Zeroing this is important. */
				memset(ci_label, 0, sizeof(*ci_label));
				raid_init_component_label(raidPtr, ci_label);
				ci_label->serial_number =
				    raidPtr->serial_number;
				ci_label->row = 0; /* we dont' pretend to support more */
				rf_component_label_set_partitionsize(ci_label,
				    diskPtr->partitionSize);
				ci_label->column = column;
				raidflush_component_label(raidPtr, column);
			}
			/* XXXjld what about the spares? */
		}

		return (retcode);
	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return(0);
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return(EINVAL);
		}

		/* rewrite proceeds asynchronously in its own thread */
		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
					   rf_RewriteParityThread,
					   raidPtr,"raid_parity");
		return (retcode);


	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
		retcode = rf_add_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* not implemented; returns the (still zero) retcode */
		return(retcode);

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_delete_component(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_incorporate_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REBUILD_IN_PLACE:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->recon_in_progress == 1) {
			/* a reconstruct is already in progress! */
			return(EINVAL);
		}

		componentPtr = (RF_SingleComponent_t *) data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		component.row = 0; /* we don't support any more */
		column = component.column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		rf_lock_mutex2(raidPtr->mutex);
		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant!!! */
			/* some component other than this has failed.
			   Let's not make things worse than they already
			   are... */
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:     Col: %d   Too many failures.\n",
			       raidPtr->raidid, column);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status ==
		    rf_ds_reconstructing) {
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);

			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status == rf_ds_spared) {
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* the request outlives this ioctl, so it gets its own
		 * allocation; the recon thread consumes it */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);

		rrcopy->raidPtr = (void *) raidPtr;
		rrcopy->col = column;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconstructInPlaceThread,
					   rrcopy,"raid_reconip");
		return(retcode);

	case RAIDFRAME_GET_INFO:
		if (!raidPtr->valid)
			return (ENODEV);
		ucfgp = (RF_DeviceConfig_t **) data;
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return (ENOMEM);
		d_cfg->rows = 1; /* there is only 1 row now */
		d_cfg->cols = raidPtr->numCol;
		d_cfg->ndevs = raidPtr->numCol;
		if (d_cfg->ndevs >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->nspares = raidPtr->numSpare;
		if (d_cfg->nspares >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
		d = 0;
		for (j = 0; j < d_cfg->cols; j++) {
			d_cfg->devs[d] = raidPtr->Disks[j];
			d++;
		}
		/* spares are stored after the data columns in Disks[] */
		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
			d_cfg->spares[i] = raidPtr->Disks[j];
		}
		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

		return (retcode);

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return (0);

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map,
		    (struct rf_pmstat *)data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
			(struct rf_pmparams *)data, 1))
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return (0);

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return (0);

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return (0);

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return (0);

		/* fail a disk & optionally start reconstruction */
	case RAIDFRAME_FAIL_DISK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		rr = (struct rf_recon_req *) data;
		rr->row = 0;
		if (rr->col < 0 || rr->col >= raidPtr->numCol)
			return (EINVAL);


		rf_lock_mutex2(raidPtr->mutex);
		if (raidPtr->status == rf_rs_reconstructing) {
			/* you can't fail a disk while we're reconstructing! */
			/* XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if ((raidPtr->Disks[rr->col].status ==
		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
			/* some other component has failed.  Let's not make
			   things worse. XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
			/* Can't fail a spared disk! */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* make a copy of the recon request so that we don't rely on
		 * the user's buffer */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);
		memcpy(rrcopy, rr, sizeof(*rr));
		rrcopy->raidPtr = (void *) raidPtr;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconThread,
					   rrcopy,"raid_recon");
		/* NOTE(review): returns 0 unconditionally; other thread
		 * creation cases return retcode.  A thread-creation
		 * failure is silently dropped here -- confirm whether
		 * this is intentional. */
		return (0);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
					   rf_CopybackThread,
					   raidPtr,"raid_copyback");
		return (retcode);

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->status != rf_rs_reconstructing)
			*(int *) data = 100;
		else {
			if (raidPtr->reconControl->numRUsTotal > 0) {
				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			} else {
				*(int *) data = 0;
			}
		}
		return (0);
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->status != rf_rs_reconstructing) {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		} else {
			progressInfo.total =
				raidPtr->reconControl->numRUsTotal;
			progressInfo.completed =
				raidPtr->reconControl->numRUsComplete;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->parity_rewrite_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->parity_rewrite_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->copyback_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->copyback_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table     -- XXX */
#if 0
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return (0);

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the dameon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		/* NOTE(review): "rf_broadcast_conf2" looks like a typo
		 * for rf_broadcast_cond2 -- harmless today because this
		 * code is compiled out under #if 0, but it would not
		 * build if re-enabled. */
		rf_broadcast_conf2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return (0);

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return (retcode);
#endif

	default:
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
	if (error != EPASSTHROUGH)
		return (error);

	switch (cmd) {
	case DIOCGDINFO:
		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
		break;
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCGDINFO:
		newlabel = *(rs->sc_dkdev.dk_label);
		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
			return ENOTTY;
		memcpy(data, &newlabel, sizeof (struct olddisklabel));
		break;
#endif

	case DIOCGPART:
		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
		((struct partinfo *) data)->part =
		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
		break;

	case DIOCWDINFO:
	case DIOCSDINFO:
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCWDINFO:
	case ODIOCSDINFO:
#endif
	{
		struct disklabel *lp;
#ifdef __HAVE_OLD_DISKLABEL
		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
			memset(&newlabel, 0, sizeof newlabel);
			memcpy(&newlabel, data, sizeof (struct olddisklabel));
			lp = &newlabel;
		} else
#endif
		lp = (struct disklabel *)data;

		if ((error = raidlock(rs)) != 0)
			return (error);

		rs->sc_flags |= RAIDF_LABELLING;

		error = setdisklabel(rs->sc_dkdev.dk_label,
		    lp, 0, rs->sc_dkdev.dk_cpulabel);
		if (error == 0) {
			/* WDINFO additionally writes the label to disk */
			if (cmd == DIOCWDINFO
#ifdef __HAVE_OLD_DISKLABEL
			    || cmd == ODIOCWDINFO
#endif
			   )
				error = writedisklabel(RAIDLABELDEV(dev),
				    raidstrategy, rs->sc_dkdev.dk_label,
				    rs->sc_dkdev.dk_cpulabel);
		}
		rs->sc_flags &= ~RAIDF_LABELLING;

		raidunlock(rs);

		if (error)
			return (error);
		break;
	}

	case DIOCWLABEL:
		if (*(int *) data != 0)
			rs->sc_flags |= RAIDF_WLABEL;
		else
			rs->sc_flags &= ~RAIDF_WLABEL;
		break;

	case DIOCGDEFLABEL:
		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
		break;

#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCGDEFLABEL:
		raidgetdefaultlabel(raidPtr, rs, &newlabel);
		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
			return ENOTTY;
		memcpy(data, &newlabel, sizeof (struct olddisklabel));
		break;
#endif

	case DIOCAWEDGE:
	case DIOCDWEDGE:
	    	dkw = (void *)data;

		/* If the ioctl happens here, the parent is us. */
		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);

	case DIOCLWEDGES:
		return dkwedge_list(&rs->sc_dkdev,
		    (struct dkwedge_list *)data, l);
	case DIOCCACHESYNC:
		return rf_sync_component_caches(raidPtr);

	case DIOCGSTRATEGY:
	    {
		struct disk_strategy *dks = (void *)data;

		s = splbio();
		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
		    sizeof(dks->dks_name));
		splx(s);
		dks->dks_paramlen = 0;

		return 0;
	    }

	case DIOCSSTRATEGY:
	    {
		struct disk_strategy *dks = (void *)data;
		struct bufq_state *new;
		struct bufq_state *old;

		if (dks->dks_param != NULL) {
			return EINVAL;
		}
		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
		error = bufq_alloc(&new, dks->dks_name,
		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
		if (error) {
			return error;
		}
		/* swap in the new queue at splbio, migrating any
		 * pending buffers, then free the old one */
		s = splbio();
		old = rs->buf_queue;
		bufq_move(new, old);
		rs->buf_queue = new;
		splx(s);
		bufq_free(old);

		return 0;
	    }

	default:
		retcode = ENOTTY;
	}
	return (retcode);

}
1965 
1966 
1967 /* raidinit -- complete the rest of the initialization for the
1968    RAIDframe device.  */
1969 
1970 
1971 static void
1972 raidinit(struct raid_softc *rs)
1973 {
1974 	cfdata_t cf;
1975 	int     unit;
1976 	RF_Raid_t *raidPtr = &rs->sc_r;
1977 
1978 	unit = raidPtr->raidid;
1979 
1980 
1981 	/* XXX should check return code first... */
1982 	rs->sc_flags |= RAIDF_INITED;
1983 
1984 	/* XXX doesn't check bounds. */
1985 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
1986 
1987 	/* attach the pseudo device */
1988 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1989 	cf->cf_name = raid_cd.cd_name;
1990 	cf->cf_atname = raid_cd.cd_name;
1991 	cf->cf_unit = unit;
1992 	cf->cf_fstate = FSTATE_STAR;
1993 
1994 	rs->sc_dev = config_attach_pseudo(cf);
1995 
1996 	if (rs->sc_dev == NULL) {
1997 		printf("raid%d: config_attach_pseudo failed\n",
1998 		    raidPtr->raidid);
1999 		rs->sc_flags &= ~RAIDF_INITED;
2000 		free(cf, M_RAIDFRAME);
2001 		return;
2002 	}
2003 
2004 	/* disk_attach actually creates space for the CPU disklabel, among
2005 	 * other things, so it's critical to call this *BEFORE* we try putzing
2006 	 * with disklabels. */
2007 
2008 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
2009 	disk_attach(&rs->sc_dkdev);
2010 	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
2011 
2012 	/* XXX There may be a weird interaction here between this, and
2013 	 * protectedSectors, as used in RAIDframe.  */
2014 
2015 	rs->sc_size = raidPtr->totalSectors;
2016 
2017 	dkwedge_discover(&rs->sc_dkdev);
2018 
2019 	rf_set_geometry(rs, raidPtr);
2020 
2021 }
2022 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
2023 /* wake up the daemon & tell it to get us a spare table
2024  * XXX
2025  * the entries in the queues should be tagged with the raidPtr
2026  * so that in the extremely rare case that two recons happen at once,
2027  * we know for which device were requesting a spare table
2028  * XXX
2029  *
2030  * XXX This code is not currently used. GO
2031  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Post the request on the shared wait queue and wake the daemon. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2() drops the mutex while asleep and retakes it;
	   block until the daemon posts a response. */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Reuse `req' to hold the response entry popped off the queue;
	   it is a different object from the request queued above. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
2055 #endif
2056 
2057 /* a wrapper around rf_DoAccess that extracts appropriate info from the
2058  * bp & passes it down.
2059  * any calls originating in the kernel must use non-blocking I/O
2060  * do some extra sanity checking to return "appropriate" error values for
2061  * certain conditions (to make some standard utilities work)
2062  *
2063  * Formerly known as: rf_DoAccessKernel
2064  */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Component label updates must run without the raid
		   mutex held; drop and retake it around the call. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		/* b_blkno is in DEV_BSIZE units; convert to raid sectors. */
		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		/* pb is 1 iff the byte count is not sector-aligned. */
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		/* NOTE(review): the "1 ||" makes this branch unconditional;
		   looks like debugging leftover -- confirm before removing. */
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* Reject requests past the end of the array; the extra
		   "sum < x" tests catch arithmetic wrap-around. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* Reject requests whose size is not a whole number of
		   raid sectors. */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* Claim one opening for this request. */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		/* NOTE(review): on failure the buf is completed here, but
		   the opening taken above is not returned and disk_busy()
		   is not paired with disk_unbusy() on this path -- verify
		   whether the iodone path accounts for these. */
		if (rc) {
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
2182 
2183 
2184 
2185 
2186 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
2187 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately via the normal iodone path. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		/* Start timing the physical I/O for access tracing. */
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Point bp at the component's device/vnode;
		   KernelWakeupFunc() runs when the I/O completes. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
2261 /* this is the callback function associated with a I/O invoked from
2262    kernel code.
2263  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The request was stashed in b_private by rf_DispatchKernelIO()
	   / InitBP(). */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* Charge the elapsed time to the access-trace record. */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error.  Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a component-label
			   update from raidstart(). */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2329 
2330 
2331 /*
2332  * initialize a buf structure for doing an I/O in the kernel.
2333  */
2334 static void
2335 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2336        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2337        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2338        struct proc *b_proc)
2339 {
2340 	/* bp->b_flags       = B_PHYS | rw_flag; */
2341 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
2342 	bp->b_oflags = 0;
2343 	bp->b_cflags = 0;
2344 	bp->b_bcount = numSect << logBytesPerSector;
2345 	bp->b_bufsize = bp->b_bcount;
2346 	bp->b_error = 0;
2347 	bp->b_dev = dev;
2348 	bp->b_data = bf;
2349 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2350 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
2351 	if (bp->b_bcount == 0) {
2352 		panic("bp->b_bcount is zero in InitBP!!");
2353 	}
2354 	bp->b_proc = b_proc;
2355 	bp->b_iodone = cbFunc;
2356 	bp->b_private = cbArg;
2357 }
2358 
2359 static void
2360 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2361 		    struct disklabel *lp)
2362 {
2363 	memset(lp, 0, sizeof(*lp));
2364 
2365 	/* fabricate a label... */
2366 	lp->d_secperunit = raidPtr->totalSectors;
2367 	lp->d_secsize = raidPtr->bytesPerSector;
2368 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2369 	lp->d_ntracks = 4 * raidPtr->numCol;
2370 	lp->d_ncylinders = raidPtr->totalSectors /
2371 		(lp->d_nsectors * lp->d_ntracks);
2372 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2373 
2374 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2375 	lp->d_type = DTYPE_RAID;
2376 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2377 	lp->d_rpm = 3600;
2378 	lp->d_interleave = 1;
2379 	lp->d_flags = 0;
2380 
2381 	lp->d_partitions[RAW_PART].p_offset = 0;
2382 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2383 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2384 	lp->d_npartitions = RAW_PART + 1;
2385 
2386 	lp->d_magic = DISKMAGIC;
2387 	lp->d_magic2 = DISKMAGIC;
2388 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2389 
2390 }
2391 /*
2392  * Read the disklabel from the raid device.  If one is not present, fake one
2393  * up.
2394  */
/*
 * raidgetdisklabel: read the disklabel from the raid device and sanity
 * check it against the configured size; if none is present (or the
 * read fails), fake one up.
 */
static void
raidgetdisklabel(dev_t dev)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const char   *errstring;
	struct disklabel *lp;
	struct cpu_disklabel *clp;
	RF_Raid_t *raidPtr;

	if ((rs = raidget(unit)) == NULL)
		return;

	lp = rs->sc_dkdev.dk_label;
	clp = rs->sc_dkdev.dk_cpulabel;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = &rs->sc_r;

	/* Start from a fabricated default in case the read fails. */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		raidmakedisklabel(rs);
	else {
		int     i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%" PRIu32 ") != "
			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
			    lp->d_secperunit, rs->sc_size);
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				       "exceeds the size of raid (%" PRIu64 ")\n",
				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
		}
	}

}
2453 /*
2454  * Take care of things one might want to take care of in the event
2455  * that a disklabel isn't present.
2456  */
2457 static void
2458 raidmakedisklabel(struct raid_softc *rs)
2459 {
2460 	struct disklabel *lp = rs->sc_dkdev.dk_label;
2461 	db1_printf(("Making a label..\n"));
2462 
2463 	/*
2464 	 * For historical reasons, if there's no disklabel present
2465 	 * the raw partition must be marked FS_BSDFFS.
2466 	 */
2467 
2468 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2469 
2470 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2471 
2472 	lp->d_checksum = dkcksum(lp);
2473 }
2474 /*
2475  * Wait interruptibly for an exclusive lock.
2476  *
2477  * XXX
2478  * Several drivers do this; it should be abstracted and made MP-safe.
2479  * (Hmm... where have we seen this warning before :->  GO )
2480  */
2481 static int
2482 raidlock(struct raid_softc *rs)
2483 {
2484 	int     error;
2485 
2486 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2487 		rs->sc_flags |= RAIDF_WANTED;
2488 		if ((error =
2489 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2490 			return (error);
2491 	}
2492 	rs->sc_flags |= RAIDF_LOCKED;
2493 	return (0);
2494 }
2495 /*
2496  * Unlock and wake up any waiters.
2497  */
2498 static void
2499 raidunlock(struct raid_softc *rs)
2500 {
2501 
2502 	rs->sc_flags &= ~RAIDF_LOCKED;
2503 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2504 		rs->sc_flags &= ~RAIDF_WANTED;
2505 		wakeup(rs);
2506 	}
2507 }
2508 
2509 
2510 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
2511 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
2512 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
2513 
static daddr_t
rf_component_info_offset(void)
{

	/* Fixed byte offset of the component info area on each component. */
	return RF_COMPONENT_INFO_OFFSET;
}
2520 
2521 static daddr_t
2522 rf_component_info_size(unsigned secsize)
2523 {
2524 	daddr_t info_size;
2525 
2526 	KASSERT(secsize);
2527 	if (secsize > RF_COMPONENT_INFO_SIZE)
2528 		info_size = secsize;
2529 	else
2530 		info_size = RF_COMPONENT_INFO_SIZE;
2531 
2532 	return info_size;
2533 }
2534 
2535 static daddr_t
2536 rf_parity_map_offset(RF_Raid_t *raidPtr)
2537 {
2538 	daddr_t map_offset;
2539 
2540 	KASSERT(raidPtr->bytesPerSector);
2541 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2542 		map_offset = raidPtr->bytesPerSector;
2543 	else
2544 		map_offset = RF_COMPONENT_INFO_SIZE;
2545 	map_offset += rf_component_info_offset();
2546 
2547 	return map_offset;
2548 }
2549 
2550 static daddr_t
2551 rf_parity_map_size(RF_Raid_t *raidPtr)
2552 {
2553 	daddr_t map_size;
2554 
2555 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2556 		map_size = raidPtr->bytesPerSector;
2557 	else
2558 		map_size = RF_PARITY_MAP_SIZE;
2559 
2560 	return map_size;
2561 }
2562 
2563 int
2564 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2565 {
2566 	RF_ComponentLabel_t *clabel;
2567 
2568 	clabel = raidget_component_label(raidPtr, col);
2569 	clabel->clean = RF_RAID_CLEAN;
2570 	raidflush_component_label(raidPtr, col);
2571 	return(0);
2572 }
2573 
2574 
2575 int
2576 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2577 {
2578 	RF_ComponentLabel_t *clabel;
2579 
2580 	clabel = raidget_component_label(raidPtr, col);
2581 	clabel->clean = RF_RAID_DIRTY;
2582 	raidflush_component_label(raidPtr, col);
2583 	return(0);
2584 }
2585 
2586 int
2587 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2588 {
2589 	KASSERT(raidPtr->bytesPerSector);
2590 	return raidread_component_label(raidPtr->bytesPerSector,
2591 	    raidPtr->Disks[col].dev,
2592 	    raidPtr->raid_cinfo[col].ci_vp,
2593 	    &raidPtr->raid_cinfo[col].ci_label);
2594 }
2595 
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	/* Return the in-core component label for column `col'. */
	return &raidPtr->raid_cinfo[col].ci_label;
}
2601 
2602 int
2603 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2604 {
2605 	RF_ComponentLabel_t *label;
2606 
2607 	label = &raidPtr->raid_cinfo[col].ci_label;
2608 	label->mod_counter = raidPtr->mod_counter;
2609 #ifndef RF_NO_PARITY_MAP
2610 	label->parity_map_modcount = label->mod_counter;
2611 #endif
2612 	return raidwrite_component_label(raidPtr->bytesPerSector,
2613 	    raidPtr->Disks[col].dev,
2614 	    raidPtr->raid_cinfo[col].ci_vp, label);
2615 }
2616 
2617 
2618 static int
2619 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2620     RF_ComponentLabel_t *clabel)
2621 {
2622 	return raidread_component_area(dev, b_vp, clabel,
2623 	    sizeof(RF_ComponentLabel_t),
2624 	    rf_component_info_offset(),
2625 	    rf_component_info_size(secsize));
2626 }
2627 
2628 /* ARGSUSED */
2629 static int
2630 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2631     size_t msize, daddr_t offset, daddr_t dsize)
2632 {
2633 	struct buf *bp;
2634 	const struct bdevsw *bdev;
2635 	int error;
2636 
2637 	/* XXX should probably ensure that we don't try to do this if
2638 	   someone has changed rf_protected_sectors. */
2639 
2640 	if (b_vp == NULL) {
2641 		/* For whatever reason, this component is not valid.
2642 		   Don't try to read a component label from it. */
2643 		return(EINVAL);
2644 	}
2645 
2646 	/* get a block of the appropriate size... */
2647 	bp = geteblk((int)dsize);
2648 	bp->b_dev = dev;
2649 
2650 	/* get our ducks in a row for the read */
2651 	bp->b_blkno = offset / DEV_BSIZE;
2652 	bp->b_bcount = dsize;
2653 	bp->b_flags |= B_READ;
2654  	bp->b_resid = dsize;
2655 
2656 	bdev = bdevsw_lookup(bp->b_dev);
2657 	if (bdev == NULL)
2658 		return (ENXIO);
2659 	(*bdev->d_strategy)(bp);
2660 
2661 	error = biowait(bp);
2662 
2663 	if (!error) {
2664 		memcpy(data, bp->b_data, msize);
2665 	}
2666 
2667 	brelse(bp, 0);
2668 	return(error);
2669 }
2670 
2671 
2672 static int
2673 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2674     RF_ComponentLabel_t *clabel)
2675 {
2676 	return raidwrite_component_area(dev, b_vp, clabel,
2677 	    sizeof(RF_ComponentLabel_t),
2678 	    rf_component_info_offset(),
2679 	    rf_component_info_size(secsize), 0);
2680 }
2681 
2682 /* ARGSUSED */
2683 static int
2684 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2685     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2686 {
2687 	struct buf *bp;
2688 	const struct bdevsw *bdev;
2689 	int error;
2690 
2691 	/* get a block of the appropriate size... */
2692 	bp = geteblk((int)dsize);
2693 	bp->b_dev = dev;
2694 
2695 	/* get our ducks in a row for the write */
2696 	bp->b_blkno = offset / DEV_BSIZE;
2697 	bp->b_bcount = dsize;
2698 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2699  	bp->b_resid = dsize;
2700 
2701 	memset(bp->b_data, 0, dsize);
2702 	memcpy(bp->b_data, data, msize);
2703 
2704 	bdev = bdevsw_lookup(bp->b_dev);
2705 	if (bdev == NULL)
2706 		return (ENXIO);
2707 	(*bdev->d_strategy)(bp);
2708 	if (asyncp)
2709 		return 0;
2710 	error = biowait(bp);
2711 	brelse(bp, 0);
2712 	if (error) {
2713 #if 1
2714 		printf("Failed to write RAID component info!\n");
2715 #endif
2716 	}
2717 
2718 	return(error);
2719 }
2720 
2721 void
2722 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2723 {
2724 	int c;
2725 
2726 	for (c = 0; c < raidPtr->numCol; c++) {
2727 		/* Skip dead disks. */
2728 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2729 			continue;
2730 		/* XXXjld: what if an error occurs here? */
2731 		raidwrite_component_area(raidPtr->Disks[c].dev,
2732 		    raidPtr->raid_cinfo[c].ci_vp, map,
2733 		    RF_PARITYMAP_NBYTE,
2734 		    rf_parity_map_offset(raidPtr),
2735 		    rf_parity_map_size(raidPtr), 0);
2736 	}
2737 }
2738 
2739 void
2740 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2741 {
2742 	struct rf_paritymap_ondisk tmp;
2743 	int c,first;
2744 
2745 	first=1;
2746 	for (c = 0; c < raidPtr->numCol; c++) {
2747 		/* Skip dead disks. */
2748 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2749 			continue;
2750 		raidread_component_area(raidPtr->Disks[c].dev,
2751 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
2752 		    RF_PARITYMAP_NBYTE,
2753 		    rf_parity_map_offset(raidPtr),
2754 		    rf_parity_map_size(raidPtr));
2755 		if (first) {
2756 			memcpy(map, &tmp, sizeof(*map));
2757 			first = 0;
2758 		} else {
2759 			rf_paritymap_merge(map, &tmp);
2760 		}
2761 	}
2762 }
2763 
/*
 * rf_markalldirty: bump the mod counter and mark every live component
 * (and every in-use spare) dirty, so an unclean shutdown can be
 * detected at the next configuration.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column matched, scol keeps its
			   previous value (initially -1) -- confirm a used
			   spare always has a matching spareCol. */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2823 
2824 
/*
 * rf_update_component_labels: bump the mod counter and rewrite the
 * component labels on all optimal components and in-use spares.  When
 * `final' is RF_FINAL_COMPONENT_UPDATE and parity is known good, the
 * clean bit is set as well (shutdown/unconfigure time).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column matched, scol keeps its
			   previous value (initially -1) -- confirm a used
			   spare always has a matching spareCol. */

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2899 
2900 void
2901 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2902 {
2903 
2904 	if (vp != NULL) {
2905 		if (auto_configured == 1) {
2906 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2907 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2908 			vput(vp);
2909 
2910 		} else {
2911 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2912 		}
2913 	}
2914 }
2915 
2916 
2917 void
2918 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2919 {
2920 	int r,c;
2921 	struct vnode *vp;
2922 	int acd;
2923 
2924 
2925 	/* We take this opportunity to close the vnodes like we should.. */
2926 
2927 	for (c = 0; c < raidPtr->numCol; c++) {
2928 		vp = raidPtr->raid_cinfo[c].ci_vp;
2929 		acd = raidPtr->Disks[c].auto_configured;
2930 		rf_close_component(raidPtr, vp, acd);
2931 		raidPtr->raid_cinfo[c].ci_vp = NULL;
2932 		raidPtr->Disks[c].auto_configured = 0;
2933 	}
2934 
2935 	for (r = 0; r < raidPtr->numSpare; r++) {
2936 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2937 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2938 		rf_close_component(raidPtr, vp, acd);
2939 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2940 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2941 	}
2942 }
2943 
2944 
2945 void
2946 rf_ReconThread(struct rf_recon_req *req)
2947 {
2948 	int     s;
2949 	RF_Raid_t *raidPtr;
2950 
2951 	s = splbio();
2952 	raidPtr = (RF_Raid_t *) req->raidPtr;
2953 	raidPtr->recon_in_progress = 1;
2954 
2955 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2956 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2957 
2958 	RF_Free(req, sizeof(*req));
2959 
2960 	raidPtr->recon_in_progress = 0;
2961 	splx(s);
2962 
2963 	/* That's all... */
2964 	kthread_exit(0);	/* does not return */
2965 }
2966 
2967 void
2968 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2969 {
2970 	int retcode;
2971 	int s;
2972 
2973 	raidPtr->parity_rewrite_stripes_done = 0;
2974 	raidPtr->parity_rewrite_in_progress = 1;
2975 	s = splbio();
2976 	retcode = rf_RewriteParity(raidPtr);
2977 	splx(s);
2978 	if (retcode) {
2979 		printf("raid%d: Error re-writing parity (%d)!\n",
2980 		    raidPtr->raidid, retcode);
2981 	} else {
2982 		/* set the clean bit!  If we shutdown correctly,
2983 		   the clean bit on each component label will get
2984 		   set */
2985 		raidPtr->parity_good = RF_RAID_CLEAN;
2986 	}
2987 	raidPtr->parity_rewrite_in_progress = 0;
2988 
2989 	/* Anyone waiting for us to stop?  If so, inform them... */
2990 	if (raidPtr->waitShutdown) {
2991 		wakeup(&raidPtr->parity_rewrite_in_progress);
2992 	}
2993 
2994 	/* That's all... */
2995 	kthread_exit(0);	/* does not return */
2996 }
2997 
2998 
2999 void
3000 rf_CopybackThread(RF_Raid_t *raidPtr)
3001 {
3002 	int s;
3003 
3004 	raidPtr->copyback_in_progress = 1;
3005 	s = splbio();
3006 	rf_CopybackReconstructedData(raidPtr);
3007 	splx(s);
3008 	raidPtr->copyback_in_progress = 0;
3009 
3010 	/* That's all... */
3011 	kthread_exit(0);	/* does not return */
3012 }
3013 
3014 
3015 void
3016 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
3017 {
3018 	int s;
3019 	RF_Raid_t *raidPtr;
3020 
3021 	s = splbio();
3022 	raidPtr = req->raidPtr;
3023 	raidPtr->recon_in_progress = 1;
3024 	rf_ReconstructInPlace(raidPtr, req->col);
3025 	RF_Free(req, sizeof(*req));
3026 	raidPtr->recon_in_progress = 0;
3027 	splx(s);
3028 
3029 	/* That's all... */
3030 	kthread_exit(0);	/* does not return */
3031 }
3032 
/*
 * rf_get_component: read and validate the component label on (dev, vp).
 * If the label looks reasonable, prepend a new RF_AutoConfig_t to
 * ac_list (which takes ownership of vp); otherwise the vnode is closed
 * and released.  Returns the (possibly updated) list head, or NULL if
 * we ran out of memory, in which case the whole list is freed.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* Out of memory: free every entry accumulated so far.
		       NOTE(review): vp is not closed/released on this
		       path -- verify whether that reference leaks. */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				/* This clabel won't be on the list;
				   free it before the shared cleanup. */
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
3090 
/*
 * Scan every disk-class device in the system for RAIDframe components.
 *
 * For each candidate disk this opens the raw partition, then looks for
 * components in three places: a wedge of type DKW_PTYPE_RAIDFRAME, any
 * disklabel partition marked FS_RAID, and (if neither was found) the raw
 * partition itself.  Each component found is prepended to the returned
 * RF_AutoConfig_t list by rf_get_component().
 *
 * Returns the (possibly empty) list of discovered components.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		/* Wedges (dk) address the whole unit; everything else goes
		   through the traditional raw partition. */
		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			/* open succeeded but size probe failed; release
			   the vnode and move on */
			vput(vp);
			continue;
		}
		if (wedge) {
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* only wedges explicitly typed as RAIDframe count */
			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component() takes ownership of vp from
			   here on */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			/* NOTE(review): the assignment two lines below is
			   inside this loop body despite its extra indent. */
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
3272 
3273 
3274 int
3275 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3276 {
3277 
3278 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3279 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3280 	    ((clabel->clean == RF_RAID_CLEAN) ||
3281 	     (clabel->clean == RF_RAID_DIRTY)) &&
3282 	    clabel->row >=0 &&
3283 	    clabel->column >= 0 &&
3284 	    clabel->num_rows > 0 &&
3285 	    clabel->num_columns > 0 &&
3286 	    clabel->row < clabel->num_rows &&
3287 	    clabel->column < clabel->num_columns &&
3288 	    clabel->blockSize > 0 &&
3289 	    /*
3290 	     * numBlocksHi may contain garbage, but it is ok since
3291 	     * the type is unsigned.  If it is really garbage,
3292 	     * rf_fix_old_label_size() will fix it.
3293 	     */
3294 	    rf_component_label_numblocks(clabel) > 0) {
3295 		/*
3296 		 * label looks reasonable enough...
3297 		 * let's make sure it has no old garbage.
3298 		 */
3299 		if (numsecs)
3300 			rf_fix_old_label_size(clabel, numsecs);
3301 		return(1);
3302 	}
3303 	return(0);
3304 }
3305 
3306 
3307 /*
3308  * For reasons yet unknown, some old component labels have garbage in
3309  * the newer numBlocksHi region, and this causes lossage.  Since those
3310  * disks will also have numsecs set to less than 32 bits of sectors,
3311  * we can determine when this corruption has occurred, and fix it.
3312  *
3313  * The exact same problem, with the same unknown reason, happens to
3314  * the partitionSizeHi member as well.
3315  */
3316 static void
3317 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3318 {
3319 
3320 	if (numsecs < ((uint64_t)1 << 32)) {
3321 		if (clabel->numBlocksHi) {
3322 			printf("WARNING: total sectors < 32 bits, yet "
3323 			       "numBlocksHi set\n"
3324 			       "WARNING: resetting numBlocksHi to zero.\n");
3325 			clabel->numBlocksHi = 0;
3326 		}
3327 
3328 		if (clabel->partitionSizeHi) {
3329 			printf("WARNING: total sectors < 32 bits, yet "
3330 			       "partitionSizeHi set\n"
3331 			       "WARNING: resetting partitionSizeHi to zero.\n");
3332 			clabel->partitionSizeHi = 0;
3333 		}
3334 	}
3335 }
3336 
3337 
3338 #ifdef DEBUG
/*
 * Debug helper: dump the interesting fields of a component label to the
 * console.  Output order matters only for human readability.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Decoded root_partition values; index masked to 2 bits below. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
3370 #endif
3371 
3372 RF_ConfigSet_t *
3373 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3374 {
3375 	RF_AutoConfig_t *ac;
3376 	RF_ConfigSet_t *config_sets;
3377 	RF_ConfigSet_t *cset;
3378 	RF_AutoConfig_t *ac_next;
3379 
3380 
3381 	config_sets = NULL;
3382 
3383 	/* Go through the AutoConfig list, and figure out which components
3384 	   belong to what sets.  */
3385 	ac = ac_list;
3386 	while(ac!=NULL) {
3387 		/* we're going to putz with ac->next, so save it here
3388 		   for use at the end of the loop */
3389 		ac_next = ac->next;
3390 
3391 		if (config_sets == NULL) {
3392 			/* will need at least this one... */
3393 			config_sets = (RF_ConfigSet_t *)
3394 				malloc(sizeof(RF_ConfigSet_t),
3395 				       M_RAIDFRAME, M_NOWAIT);
3396 			if (config_sets == NULL) {
3397 				panic("rf_create_auto_sets: No memory!");
3398 			}
3399 			/* this one is easy :) */
3400 			config_sets->ac = ac;
3401 			config_sets->next = NULL;
3402 			config_sets->rootable = 0;
3403 			ac->next = NULL;
3404 		} else {
3405 			/* which set does this component fit into? */
3406 			cset = config_sets;
3407 			while(cset!=NULL) {
3408 				if (rf_does_it_fit(cset, ac)) {
3409 					/* looks like it matches... */
3410 					ac->next = cset->ac;
3411 					cset->ac = ac;
3412 					break;
3413 				}
3414 				cset = cset->next;
3415 			}
3416 			if (cset==NULL) {
3417 				/* didn't find a match above... new set..*/
3418 				cset = (RF_ConfigSet_t *)
3419 					malloc(sizeof(RF_ConfigSet_t),
3420 					       M_RAIDFRAME, M_NOWAIT);
3421 				if (cset == NULL) {
3422 					panic("rf_create_auto_sets: No memory!");
3423 				}
3424 				cset->ac = ac;
3425 				ac->next = NULL;
3426 				cset->next = config_sets;
3427 				cset->rootable = 0;
3428 				config_sets = cset;
3429 			}
3430 		}
3431 		ac = ac_next;
3432 	}
3433 
3434 
3435 	return(config_sets);
3436 }
3437 
3438 static int
3439 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3440 {
3441 	RF_ComponentLabel_t *clabel1, *clabel2;
3442 
3443 	/* If this one matches the *first* one in the set, that's good
3444 	   enough, since the other members of the set would have been
3445 	   through here too... */
3446 	/* note that we are not checking partitionSize here..
3447 
3448 	   Note that we are also not checking the mod_counters here.
3449 	   If everything else matches except the mod_counter, that's
3450 	   good enough for this test.  We will deal with the mod_counters
3451 	   a little later in the autoconfiguration process.
3452 
3453 	    (clabel1->mod_counter == clabel2->mod_counter) &&
3454 
3455 	   The reason we don't check for this is that failed disks
3456 	   will have lower modification counts.  If those disks are
3457 	   not added to the set they used to belong to, then they will
3458 	   form their own set, which may result in 2 different sets,
3459 	   for example, competing to be configured at raid0, and
3460 	   perhaps competing to be the root filesystem set.  If the
3461 	   wrong ones get configured, or both attempt to become /,
3462 	   weird behaviour and or serious lossage will occur.  Thus we
3463 	   need to bring them into the fold here, and kick them out at
3464 	   a later point.
3465 
3466 	*/
3467 
3468 	clabel1 = cset->ac->clabel;
3469 	clabel2 = ac->clabel;
3470 	if ((clabel1->version == clabel2->version) &&
3471 	    (clabel1->serial_number == clabel2->serial_number) &&
3472 	    (clabel1->num_rows == clabel2->num_rows) &&
3473 	    (clabel1->num_columns == clabel2->num_columns) &&
3474 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
3475 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3476 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3477 	    (clabel1->parityConfig == clabel2->parityConfig) &&
3478 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3479 	    (clabel1->blockSize == clabel2->blockSize) &&
3480 	    rf_component_label_numblocks(clabel1) ==
3481 	    rf_component_label_numblocks(clabel2) &&
3482 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
3483 	    (clabel1->root_partition == clabel2->root_partition) &&
3484 	    (clabel1->last_unit == clabel2->last_unit) &&
3485 	    (clabel1->config_order == clabel2->config_order)) {
3486 		/* if it get's here, it almost *has* to be a match */
3487 	} else {
3488 		/* it's not consistent with somebody in the set..
3489 		   punt */
3490 		return(0);
3491 	}
3492 	/* all was fine.. it must fit... */
3493 	return(1);
3494 }
3495 
/*
 * Decide whether a configuration set has enough live components to be
 * configured.  The authoritative mod_counter is the maximum over the
 * set; components with stale counters do not count as present.  RAID 1
 * is special-cased: a mirror survives as long as no even/odd pair is
 * missing both members.  For levels 0/4/5 a simple missing-component
 * count is used.  Returns 1 if configurable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a current (max mod_counter) component in
		   column c */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd member of a
				   mirror pair without bailing.. reset
				   the even_pair_failed flag, and go on
				   to the next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3598 
3599 void
3600 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3601 			RF_Raid_t *raidPtr)
3602 {
3603 	RF_ComponentLabel_t *clabel;
3604 	int i;
3605 
3606 	clabel = ac->clabel;
3607 
3608 	/* 1. Fill in the common stuff */
3609 	config->numRow = clabel->num_rows = 1;
3610 	config->numCol = clabel->num_columns;
3611 	config->numSpare = 0; /* XXX should this be set here? */
3612 	config->sectPerSU = clabel->sectPerSU;
3613 	config->SUsPerPU = clabel->SUsPerPU;
3614 	config->SUsPerRU = clabel->SUsPerRU;
3615 	config->parityConfig = clabel->parityConfig;
3616 	/* XXX... */
3617 	strcpy(config->diskQueueType,"fifo");
3618 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3619 	config->layoutSpecificSize = 0; /* XXX ?? */
3620 
3621 	while(ac!=NULL) {
3622 		/* row/col values will be in range due to the checks
3623 		   in reasonable_label() */
3624 		strcpy(config->devnames[0][ac->clabel->column],
3625 		       ac->devname);
3626 		ac = ac->next;
3627 	}
3628 
3629 	for(i=0;i<RF_MAXDBGV;i++) {
3630 		config->debugVars[i][0] = 0;
3631 	}
3632 }
3633 
3634 int
3635 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3636 {
3637 	RF_ComponentLabel_t *clabel;
3638 	int column;
3639 	int sparecol;
3640 
3641 	raidPtr->autoconfigure = new_value;
3642 
3643 	for(column=0; column<raidPtr->numCol; column++) {
3644 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3645 			clabel = raidget_component_label(raidPtr, column);
3646 			clabel->autoconfigure = new_value;
3647 			raidflush_component_label(raidPtr, column);
3648 		}
3649 	}
3650 	for(column = 0; column < raidPtr->numSpare ; column++) {
3651 		sparecol = raidPtr->numCol + column;
3652 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3653 			clabel = raidget_component_label(raidPtr, sparecol);
3654 			clabel->autoconfigure = new_value;
3655 			raidflush_component_label(raidPtr, sparecol);
3656 		}
3657 	}
3658 	return(new_value);
3659 }
3660 
3661 int
3662 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3663 {
3664 	RF_ComponentLabel_t *clabel;
3665 	int column;
3666 	int sparecol;
3667 
3668 	raidPtr->root_partition = new_value;
3669 	for(column=0; column<raidPtr->numCol; column++) {
3670 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3671 			clabel = raidget_component_label(raidPtr, column);
3672 			clabel->root_partition = new_value;
3673 			raidflush_component_label(raidPtr, column);
3674 		}
3675 	}
3676 	for(column = 0; column < raidPtr->numSpare ; column++) {
3677 		sparecol = raidPtr->numCol + column;
3678 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3679 			clabel = raidget_component_label(raidPtr, sparecol);
3680 			clabel->root_partition = new_value;
3681 			raidflush_component_label(raidPtr, sparecol);
3682 		}
3683 	}
3684 	return(new_value);
3685 }
3686 
3687 void
3688 rf_release_all_vps(RF_ConfigSet_t *cset)
3689 {
3690 	RF_AutoConfig_t *ac;
3691 
3692 	ac = cset->ac;
3693 	while(ac!=NULL) {
3694 		/* Close the vp, and give it back */
3695 		if (ac->vp) {
3696 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3697 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
3698 			vput(ac->vp);
3699 			ac->vp = NULL;
3700 		}
3701 		ac = ac->next;
3702 	}
3703 }
3704 
3705 
3706 void
3707 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3708 {
3709 	RF_AutoConfig_t *ac;
3710 	RF_AutoConfig_t *next_ac;
3711 
3712 	ac = cset->ac;
3713 	while(ac!=NULL) {
3714 		next_ac = ac->next;
3715 		/* nuke the label */
3716 		free(ac->clabel, M_RAIDFRAME);
3717 		/* cleanup the config structure */
3718 		free(ac, M_RAIDFRAME);
3719 		/* "next.." */
3720 		ac = next_ac;
3721 	}
3722 	/* and, finally, nuke the config set */
3723 	free(cset, M_RAIDFRAME);
3724 }
3725 
3726 
/*
 * Populate a component label from the current state of the RAID set:
 * identity (serial number, mod counter), geometry (rows/columns, stripe
 * layout, block counts), and policy (autoconfigure, root_partition,
 * last unit, config order).  The label is marked dirty and optimal.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3759 
/*
 * Configure one autoconfiguration set: build an RF_Config_t from the
 * set's labels, pick a free raid unit (preferring the unit the set was
 * last configured as), and run rf_Configure().  On success the softc is
 * initialized, all components are marked dirty, and the set's
 * root-eligibility is recorded.  Returns the softc, or NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk upward from last_unit until a unit with no valid
	   configuration is found */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; release the unit we claimed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3833 
3834 void
3835 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3836 {
3837 	struct buf *bp;
3838 	struct raid_softc *rs;
3839 
3840 	bp = (struct buf *)desc->bp;
3841 	rs = desc->raidPtr->softc;
3842 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
3843 	    (bp->b_flags & B_READ));
3844 }
3845 
/*
 * Initialize a pool at IPL_BIO and set its watermarks: high water at
 * xmax, then pre-allocate and keep at least xmin items (prime + low
 * water).  pool_init() must run before the watermark calls.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3855 
3856 /*
3857  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
3858  * if there is IO pending and if that IO could possibly be done for a
3859  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
3860  * otherwise.
3861  *
3862  */
3863 
3864 int
3865 rf_buf_queue_check(RF_Raid_t *raidPtr)
3866 {
3867 	struct raid_softc *rs = raidPtr->softc;
3868 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
3869 		/* there is work to do */
3870 		return 0;
3871 	}
3872 	/* default is nothing to do */
3873 	return 1;
3874 }
3875 
3876 int
3877 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3878 {
3879 	uint64_t numsecs;
3880 	unsigned secsize;
3881 	int error;
3882 
3883 	error = getdisksize(vp, &numsecs, &secsize);
3884 	if (error == 0) {
3885 		diskPtr->blockSize = secsize;
3886 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
3887 		diskPtr->partitionSize = numsecs;
3888 		return 0;
3889 	}
3890 	return error;
3891 }
3892 
/* Autoconf match hook: raid is a pseudo-device, so always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3898 
/*
 * Autoconf attach hook.  Intentionally empty: nothing to do here;
 * real setup happens when a RAID set is configured.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
3904 
3905 
3906 static int
3907 raid_detach(device_t self, int flags)
3908 {
3909 	int error;
3910 	struct raid_softc *rs = raidget(device_unit(self));
3911 
3912 	if (rs == NULL)
3913 		return ENXIO;
3914 
3915 	if ((error = raidlock(rs)) != 0)
3916 		return (error);
3917 
3918 	error = raid_detach_unlocked(rs);
3919 
3920 	raidunlock(rs);
3921 
3922 	/* XXXkd: raidput(rs) ??? */
3923 
3924 	return error;
3925 }
3926 
/*
 * Publish a synthetic disk geometry for the RAID device.  Sectors per
 * track is set to the data sectors per stripe and the track count to
 * 4 * numCol; only the total size and sector size are real values.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
}
3941 
3942 /*
3943  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3944  * We end up returning whatever error was returned by the first cache flush
3945  * that fails.
3946  */
3947 
3948 int
3949 rf_sync_component_caches(RF_Raid_t *raidPtr)
3950 {
3951 	int c, sparecol;
3952 	int e,error;
3953 	int force = 1;
3954 
3955 	error = 0;
3956 	for (c = 0; c < raidPtr->numCol; c++) {
3957 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
3958 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3959 					  &force, FWRITE, NOCRED);
3960 			if (e) {
3961 				if (e != ENODEV)
3962 					printf("raid%d: cache flush to component %s failed.\n",
3963 					       raidPtr->raidid, raidPtr->Disks[c].devname);
3964 				if (error == 0) {
3965 					error = e;
3966 				}
3967 			}
3968 		}
3969 	}
3970 
3971 	for( c = 0; c < raidPtr->numSpare ; c++) {
3972 		sparecol = raidPtr->numCol + c;
3973 		/* Need to ensure that the reconstruct actually completed! */
3974 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3975 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3976 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
3977 			if (e) {
3978 				if (e != ENODEV)
3979 					printf("raid%d: cache flush to component %s failed.\n",
3980 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3981 				if (error == 0) {
3982 					error = e;
3983 				}
3984 			}
3985 		}
3986 	}
3987 	return error;
3988 }
3989