xref: /netbsd-src/sys/dev/raidframe/rf_netbsdkintf.c (revision 413d532bcc3f62d122e56d92e13ac64825a40baf)
1 /*	$NetBSD: rf_netbsdkintf.c,v 1.308 2014/04/03 18:55:26 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Greg Oster; Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1988 University of Utah.
34  * Copyright (c) 1990, 1993
35  *      The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed to Berkeley by
38  * the Systems Programming Group of the University of Utah Computer
39  * Science Department.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
66  *
67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
68  */
69 
70 /*
71  * Copyright (c) 1995 Carnegie-Mellon University.
72  * All rights reserved.
73  *
74  * Authors: Mark Holland, Jim Zelenka
75  *
76  * Permission to use, copy, modify and distribute this software and
77  * its documentation is hereby granted, provided that both the copyright
78  * notice and this permission notice appear in all copies of the
79  * software, derivative works or modified versions, and any portions
80  * thereof, and that both notices appear in supporting documentation.
81  *
82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85  *
86  * Carnegie Mellon requests users of this software to return to
87  *
88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
89  *  School of Computer Science
90  *  Carnegie Mellon University
91  *  Pittsburgh PA 15213-3890
92  *
93  * any improvements or extensions that they make and grant Carnegie the
94  * rights to redistribute these changes.
95  */
96 
97 /***********************************************************
98  *
99  * rf_kintf.c -- the kernel interface routines for RAIDframe
100  *
101  ***********************************************************/
102 
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.308 2014/04/03 18:55:26 christos Exp $");
105 
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #endif
110 
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 
130 #include <prop/proplib.h>
131 
132 #include <dev/raidframe/raidframevar.h>
133 #include <dev/raidframe/raidframeio.h>
134 #include <dev/raidframe/rf_paritymap.h>
135 
136 #include "rf_raid.h"
137 #include "rf_copyback.h"
138 #include "rf_dag.h"
139 #include "rf_dagflags.h"
140 #include "rf_desc.h"
141 #include "rf_diskqueue.h"
142 #include "rf_etimer.h"
143 #include "rf_general.h"
144 #include "rf_kintf.h"
145 #include "rf_options.h"
146 #include "rf_driver.h"
147 #include "rf_parityscan.h"
148 #include "rf_threadstuff.h"
149 
150 #ifdef COMPAT_50
151 #include "rf_compat50.h"
152 #endif
153 
154 #ifdef DEBUG
155 int     rf_kdebug_level = 0;
156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
157 #else				/* DEBUG */
158 #define db1_printf(a) { }
159 #endif				/* DEBUG */
160 
161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
162 static rf_declare_mutex2(rf_sparet_wait_mutex);
163 static rf_declare_cond2(rf_sparet_wait_cv);
164 static rf_declare_cond2(rf_sparet_resp_cv);
165 
166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
167 						 * spare table */
168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
169 						 * installation process */
170 #endif
171 
172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
173 
174 /* prototypes */
175 static void KernelWakeupFunc(struct buf *);
176 static void InitBP(struct buf *, struct vnode *, unsigned,
177     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
178     void *, int, struct proc *);
179 struct raid_softc;
180 static void raidinit(struct raid_softc *);
181 
182 void raidattach(int);
183 static int raid_match(device_t, cfdata_t, void *);
184 static void raid_attach(device_t, device_t, void *);
185 static int raid_detach(device_t, int);
186 
187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
188     daddr_t, daddr_t);
189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
190     daddr_t, daddr_t, int);
191 
192 static int raidwrite_component_label(unsigned,
193     dev_t, struct vnode *, RF_ComponentLabel_t *);
194 static int raidread_component_label(unsigned,
195     dev_t, struct vnode *, RF_ComponentLabel_t *);
196 
197 
198 dev_type_open(raidopen);
199 dev_type_close(raidclose);
200 dev_type_read(raidread);
201 dev_type_write(raidwrite);
202 dev_type_ioctl(raidioctl);
203 dev_type_strategy(raidstrategy);
204 dev_type_dump(raiddump);
205 dev_type_size(raidsize);
206 
/* Block-device switch: entry points for the raidN block devices. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_flag = D_DISK
};
216 
/* Character-device switch: entry points for the raw rraidN devices. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_flag = D_DISK
};
230 
231 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
232 
/*
 * Per-unit softc for a RAIDframe pseudo-disk.  One exists for every
 * raidN unit and is linked into the global `raids' list (raid_lock).
 */
struct raid_softc {
	device_t sc_dev;	/* autoconf device handle */
	int	sc_unit;	/* unit number (the N in raidN) */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
	RF_Raid_t sc_r;		/* the RAIDframe engine state proper */
	LIST_ENTRY(raid_softc) sc_link;	/* linkage on the global list */
};
245 /* sc_flags */
246 #define RAIDF_INITED	0x01	/* unit has been initialized */
247 #define RAIDF_WLABEL	0x02	/* label area is writable */
248 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
249 #define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
250 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
251 #define RAIDF_LOCKED	0x80	/* unit is locked */
252 
253 #define	raidunit(x)	DISKUNIT(x)
254 
255 extern struct cfdriver raid_cd;
256 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
257     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
258     DVF_DETACH_SHUTDOWN);
259 
260 /*
261  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
262  * Be aware that large numbers can allow the driver to consume a lot of
263  * kernel memory, especially on writes, and in degraded mode reads.
264  *
265  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
266  * a single 64K write will typically require 64K for the old data,
267  * 64K for the old parity, and 64K for the new parity, for a total
268  * of 192K (if the parity buffer is not re-used immediately).
269  * Even it if is used immediately, that's still 128K, which when multiplied
270  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
271  *
272  * Now in degraded mode, for example, a 64K read on the above setup may
273  * require data reconstruction, which will require *all* of the 4 remaining
274  * disks to participate -- 4 * 32K/disk == 128K again.
275  */
276 
277 #ifndef RAIDOUTSTANDING
278 #define RAIDOUTSTANDING   6
279 #endif
280 
281 #define RAIDLABELDEV(dev)	\
282 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
283 
284 /* declared here, and made public, for the benefit of KVM stuff.. */
285 
286 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
287 				     struct disklabel *);
288 static void raidgetdisklabel(dev_t);
289 static void raidmakedisklabel(struct raid_softc *);
290 
291 static int raidlock(struct raid_softc *);
292 static void raidunlock(struct raid_softc *);
293 
294 static int raid_detach_unlocked(struct raid_softc *);
295 
296 static void rf_markalldirty(RF_Raid_t *);
297 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
298 
299 void rf_ReconThread(struct rf_recon_req *);
300 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
301 void rf_CopybackThread(RF_Raid_t *raidPtr);
302 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
303 int rf_autoconfig(device_t);
304 void rf_buildroothack(RF_ConfigSet_t *);
305 
306 RF_AutoConfig_t *rf_find_raid_components(void);
307 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
308 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
309 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
310 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
311 int rf_set_autoconfig(RF_Raid_t *, int);
312 int rf_set_rootpartition(RF_Raid_t *, int);
313 void rf_release_all_vps(RF_ConfigSet_t *);
314 void rf_cleanup_config_set(RF_ConfigSet_t *);
315 int rf_have_enough_components(RF_ConfigSet_t *);
316 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
317 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
318 
319 /*
320  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
321  * Note that this is overridden by having RAID_AUTOCONFIG as an option
322  * in the kernel config file.
323  */
324 #ifdef RAID_AUTOCONFIG
325 int raidautoconfig = 1;
326 #else
327 int raidautoconfig = 0;
328 #endif
329 static bool raidautoconfigdone = false;
330 
331 struct RF_Pools_s rf_pools;
332 
333 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
334 static kmutex_t raid_lock;
335 
336 static struct raid_softc *
337 raidcreate(int unit) {
338 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
339 	if (sc == NULL) {
340 #ifdef DIAGNOSTIC
341 		printf("%s: out of memory\n", __func__);
342 #endif
343 		return NULL;
344 	}
345 	sc->sc_unit = unit;
346 	bufq_alloc(&sc->buf_queue, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK);
347 	return sc;
348 }
349 
/*
 * Release a softc created by raidcreate().  The caller must already
 * have removed it from the global list (or never inserted it).
 */
static void
raiddestroy(struct raid_softc *sc) {
	bufq_free(sc->buf_queue);
	kmem_free(sc, sizeof(*sc));
}
355 
356 static struct raid_softc *
357 raidget(int unit) {
358 	struct raid_softc *sc;
359 	if (unit < 0) {
360 #ifdef DIAGNOSTIC
361 		panic("%s: unit %d!", __func__, unit);
362 #endif
363 		return NULL;
364 	}
365 	mutex_enter(&raid_lock);
366 	LIST_FOREACH(sc, &raids, sc_link) {
367 		if (sc->sc_unit == unit) {
368 			mutex_exit(&raid_lock);
369 			return sc;
370 		}
371 	}
372 	mutex_exit(&raid_lock);
373 	if ((sc = raidcreate(unit)) == NULL)
374 		return NULL;
375 	mutex_enter(&raid_lock);
376 	LIST_INSERT_HEAD(&raids, sc, sc_link);
377 	mutex_exit(&raid_lock);
378 	return sc;
379 }
380 
/*
 * Remove a softc from the global list and free it.  The unit must no
 * longer be referenced by anyone else.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
388 
/*
 * Pseudo-device attach routine, called once at boot (the `num'
 * argument — the unit count from the kernel config — is unused; units
 * are created lazily by raidget()).  Initializes global locks, boots
 * the RAIDframe engine, hooks the cfattach into autoconf, and
 * registers a finalizer so component auto-configuration runs after
 * all real hardware has attached.
 */
void
raidattach(int num)
{
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* This is where all the initialization stuff gets done. */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	if (rf_BootRaidframe() == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
421 
422 int
423 rf_autoconfig(device_t self)
424 {
425 	RF_AutoConfig_t *ac_list;
426 	RF_ConfigSet_t *config_sets;
427 
428 	if (!raidautoconfig || raidautoconfigdone == true)
429 		return (0);
430 
431 	/* XXX This code can only be run once. */
432 	raidautoconfigdone = true;
433 
434 #ifdef __HAVE_CPU_BOOTCONF
435 	/*
436 	 * 0. find the boot device if needed first so we can use it later
437 	 * this needs to be done before we autoconfigure any raid sets,
438 	 * because if we use wedges we are not going to be able to open
439 	 * the boot device later
440 	 */
441 	if (booted_device == NULL)
442 		cpu_bootconf();
443 #endif
444 	/* 1. locate all RAID components on the system */
445 	aprint_debug("Searching for RAID components...\n");
446 	ac_list = rf_find_raid_components();
447 
448 	/* 2. Sort them into their respective sets. */
449 	config_sets = rf_create_auto_sets(ac_list);
450 
451 	/*
452 	 * 3. Evaluate each set and configure the valid ones.
453 	 * This gets done in rf_buildroothack().
454 	 */
455 	rf_buildroothack(config_sets);
456 
457 	return 1;
458 }
459 
460 static int
461 rf_containsboot(RF_Raid_t *r, device_t bdv) {
462 	const char *bootname = device_xname(bdv);
463 	size_t len = strlen(bootname);
464 
465 	for (int col = 0; col < r->numCol; col++) {
466 		const char *devname = r->Disks[col].devname;
467 		devname += sizeof("/dev/") - 1;
468 		if (strncmp(devname, "dk", 2) == 0) {
469 			const char *parent =
470 			    dkwedge_get_parent_name(r->Disks[col].dev);
471 			if (parent != NULL)
472 				devname = parent;
473 		}
474 		if (strncmp(devname, bootname, len) == 0) {
475 			struct raid_softc *sc = r->softc;
476 			aprint_debug("raid%d includes boot device %s\n",
477 			    sc->sc_unit, devname);
478 			return 1;
479 		}
480 	}
481 	return 0;
482 }
483 
/*
 * Walk the list of auto-configuration sets, configure every set that
 * is complete and marked autoconfigure, and then try to determine the
 * root device:
 *
 *  - if exactly one configured set is rootable, point booted_device at
 *    it (or at its 'a' wedge if the set carries wedges);
 *  - if several are rootable, narrow the field to sets that physically
 *    contain the device the kernel booted from; if that still isn't
 *    unique, fall back to asking the user (RB_ASKNAME).
 *
 * An explicit "root on ..." in the kernel config (rootspec != NULL)
 * disables all of this.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		/* rf_cleanup_config_set() frees cset; grab next first. */
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		device_t candidate_root;
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			/* XXX: How do we find the real root partition? */
			char cname[sizeof(cset->ac->devname)];
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
		} else
			candidate_root = rsc->sc_dev;
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device))
			booted_device = candidate_root;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Recount, keeping only sets that contain the boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
580 
581 
582 int
583 raidsize(dev_t dev)
584 {
585 	struct raid_softc *rs;
586 	struct disklabel *lp;
587 	int     part, unit, omask, size;
588 
589 	unit = raidunit(dev);
590 	if ((rs = raidget(unit)) == NULL)
591 		return -1;
592 	if ((rs->sc_flags & RAIDF_INITED) == 0)
593 		return (-1);
594 
595 	part = DISKPART(dev);
596 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
597 	lp = rs->sc_dkdev.dk_label;
598 
599 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
600 		return (-1);
601 
602 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
603 		size = -1;
604 	else
605 		size = lp->d_partitions[part].p_size *
606 		    (lp->d_secsize / DEV_BSIZE);
607 
608 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
609 		return (-1);
610 
611 	return (size);
612 
613 }
614 
/*
 * d_dump entry point: write `size' bytes of crash-dump data from `va'
 * at block `blkno' of the given partition.  Only RAID 1 sets are
 * supported.  Picks one live component (preferring the master, then a
 * spare standing in for the master, then the slave, then a spare for
 * the slave) and forwards the dump to that component's block device.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int     part, c, sparecol, j, scol, dumpto;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	raidPtr = &rs->sc_r;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* The dump must be a whole number of DEV_BSIZE blocks... */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* ...and must fit inside the raid device. */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which column this
			   spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
751 /* ARGSUSED */
/*
 * d_open entry point for both the block and character devices.
 *
 * Takes the unit lock, refuses opens on units being shut down or on
 * non-raw partitions when wedges exist, fetches the disklabel on the
 * first open of a configured set, validates the partition, records the
 * open in the appropriate open mask (keeping the unit configured while
 * open), and marks all components dirty on the first open so that an
 * unclean shutdown is detectable.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of a configured set: (re)read the disklabel. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
835 /* ARGSUSED */
836 int
837 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
838 {
839 	int     unit = raidunit(dev);
840 	struct raid_softc *rs;
841 	int     error = 0;
842 	int     part;
843 
844 	if ((rs = raidget(unit)) == NULL)
845 		return ENXIO;
846 
847 	if ((error = raidlock(rs)) != 0)
848 		return (error);
849 
850 	part = DISKPART(dev);
851 
852 	/* ...that much closer to allowing unconfiguration... */
853 	switch (fmt) {
854 	case S_IFCHR:
855 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
856 		break;
857 
858 	case S_IFBLK:
859 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
860 		break;
861 	}
862 	rs->sc_dkdev.dk_openmask =
863 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
864 
865 	if ((rs->sc_dkdev.dk_openmask == 0) &&
866 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
867 		/* Last one... device is not unconfigured yet.
868 		   Device shutdown has taken care of setting the
869 		   clean bits if RAIDF_INITED is not set
870 		   mark things as clean... */
871 
872 		rf_update_component_labels(&rs->sc_r,
873 						 RF_FINAL_COMPONENT_UPDATE);
874 
875 		/* If the kernel is shutting down, it will detach
876 		 * this RAID set soon enough.
877 		 */
878 	}
879 
880 	raidunlock(rs);
881 	return (0);
882 
883 }
884 
/*
 * d_strategy entry point: validate the buffer against the device
 * bounds (media size for the raw partition, disklabel otherwise),
 * queue it on the per-unit bufq, and wake the RAIDframe engine thread
 * via iodone_cv.  Errors are reported by completing the buffer with
 * b_error set.
 */
void
raidstrategy(struct buf *bp)
{
	unsigned int unit = raidunit(bp->b_dev);
	RF_Raid_t *raidPtr;
	int     wlabel;
	struct raid_softc *rs;

	if ((rs = raidget(unit)) == NULL) {
		bp->b_error = ENXIO;
		goto done;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto done;
	}
	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		bp->b_error = ENODEV;
		goto done;
	}
	/* Zero-length transfers succeed trivially. */
	if (bp->b_bcount == 0) {
		db1_printf(("b_bcount is zero..\n"));
		goto done;
	}

	/*
	 * Do bounds checking and adjust transfer.  If there's an
	 * error, the bounds check will flag that for us.
	 */

	/* Writing to the label area is only allowed while (re)labelling. */
	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
	if (DISKPART(bp->b_dev) == RAW_PART) {
		uint64_t size; /* device size in DEV_BSIZE unit */

		/* Convert totalSectors to DEV_BSIZE units, whichever
		   way the sector size differs from DEV_BSIZE. */
		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
			size = raidPtr->totalSectors <<
			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
		} else {
			size = raidPtr->totalSectors >>
			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
		}
		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
			goto done;
		}
	} else {
		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
			db1_printf(("Bounds check failed!!:%d %d\n",
				(int) bp->b_blkno, (int) wlabel));
			goto done;
		}
	}

	rf_lock_mutex2(raidPtr->iodone_lock);

	bp->b_resid = 0;

	/* stuff it onto our queue */
	bufq_put(rs->buf_queue, bp);

	/* scheduled the IO to happen at the next convenient time */
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);

	return;

done:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
955 /* ARGSUSED */
956 int
957 raidread(dev_t dev, struct uio *uio, int flags)
958 {
959 	int     unit = raidunit(dev);
960 	struct raid_softc *rs;
961 
962 	if ((rs = raidget(unit)) == NULL)
963 		return ENXIO;
964 
965 	if ((rs->sc_flags & RAIDF_INITED) == 0)
966 		return (ENXIO);
967 
968 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
969 
970 }
971 /* ARGSUSED */
972 int
973 raidwrite(dev_t dev, struct uio *uio, int flags)
974 {
975 	int     unit = raidunit(dev);
976 	struct raid_softc *rs;
977 
978 	if ((rs = raidget(unit)) == NULL)
979 		return ENXIO;
980 
981 	if ((rs->sc_flags & RAIDF_INITED) == 0)
982 		return (ENXIO);
983 
984 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
985 
986 }
987 
988 static int
989 raid_detach_unlocked(struct raid_softc *rs)
990 {
991 	int error;
992 	RF_Raid_t *raidPtr;
993 
994 	raidPtr = &rs->sc_r;
995 
996 	/*
997 	 * If somebody has a partition mounted, we shouldn't
998 	 * shutdown.
999 	 */
1000 	if (rs->sc_dkdev.dk_openmask != 0)
1001 		return EBUSY;
1002 
1003 	if ((rs->sc_flags & RAIDF_INITED) == 0)
1004 		;	/* not initialized: nothing to do */
1005 	else if ((error = rf_Shutdown(raidPtr)) != 0)
1006 		return error;
1007 	else
1008 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
1009 
1010 	/* Detach the disk. */
1011 	dkwedge_delall(&rs->sc_dkdev);
1012 	disk_detach(&rs->sc_dkdev);
1013 	disk_destroy(&rs->sc_dkdev);
1014 
1015 	aprint_normal_dev(rs->sc_dev, "detached\n");
1016 
1017 	return 0;
1018 }
1019 
/*
 * raidioctl: ioctl entry point for the RAIDframe pseudo-device.
 *
 * Dispatches the RAIDframe-specific RAIDFRAME_* commands (configure,
 * shutdown, component-label management, reconstruction / parity-rewrite /
 * copyback control and progress reporting), then falls through to the
 * generic disk ioctls (disklabel, wedges, cache sync, bufq strategy).
 *
 * `data' is the kernel-side copy of the user argument as prepared by
 * sys_ioctl; commands whose argument is a pointer (e.g. RF_Config_t **)
 * still need explicit copyin/copyout of the pointed-to object.
 */
int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int     unit = raidunit(dev);
	int     error = 0;
	int     part, pmask, s;
	cfdata_t cf;
	struct raid_softc *rs;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, **ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
/*	int raidid; */
	struct rf_recon_req *rrcopy, *rr;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_ComponentLabel_t **clabel_ptr;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	RF_SingleComponent_t component;
	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
	int i, j, d;
#ifdef __HAVE_OLD_DISKLABEL
	struct disklabel newlabel;
#endif
	struct dkwedge_info *dkw;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
		(int) DISKPART(dev), (int) unit, cmd));

	/* Must be open for writes for these commands... */
	switch (cmd) {
#ifdef DIOCGSECTORSIZE
	case DIOCGSECTORSIZE:
		*(u_int *)data = raidPtr->bytesPerSector;
		return 0;
	case DIOCGMEDIASIZE:
		*(off_t *)data =
		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
		return 0;
#endif
	case DIOCSDINFO:
	case DIOCWDINFO:
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCWDINFO:
	case ODIOCSDINFO:
#endif
	case DIOCWLABEL:
	case DIOCAWEDGE:
	case DIOCDWEDGE:
	case DIOCSSTRATEGY:
		if ((flag & FWRITE) == 0)
			return (EBADF);
	}

	/* Must be initialized for these... */
	switch (cmd) {
	case DIOCGDINFO:
	case DIOCSDINFO:
	case DIOCWDINFO:
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCGDINFO:
	case ODIOCWDINFO:
	case ODIOCSDINFO:
	case ODIOCGDEFLABEL:
#endif
	case DIOCGPART:
	case DIOCWLABEL:
	case DIOCGDEFLABEL:
	case DIOCAWEDGE:
	case DIOCDWEDGE:
	case DIOCLWEDGES:
	case DIOCCACHESYNC:
	case RAIDFRAME_SHUTDOWN:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_ROOT:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case DIOCGSTRATEGY:
	case DIOCSSTRATEGY:
		if ((rs->sc_flags & RAIDF_INITED) == 0)
			return (ENXIO);
	}

	switch (cmd) {
#ifdef COMPAT_50
	case RAIDFRAME_GET_INFO50:
		return rf_get_info50(raidPtr, data);

	case RAIDFRAME_CONFIGURE50:
		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
			return retcode;
		goto config;
#endif
		/* configure the system */
	case RAIDFRAME_CONFIGURE:

		if (raidPtr->valid) {
			/* There is a valid RAID set running on this unit! */
			printf("raid%d: Device already configured!\n",unit);
			return(EINVAL);
		}

		/* copy-in the configuration information */
		/* data points to a pointer to the configuration structure */

		u_cfg = *((RF_Config_t **) data);
		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
		if (k_cfg == NULL) {
			return (ENOMEM);
		}
		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
		if (retcode) {
			RF_Free(k_cfg, sizeof(RF_Config_t));
			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
				retcode));
			return (retcode);
		}
		goto config;
	config:
		/* allocate a buffer for the layout-specific data, and copy it
		 * in */
		if (k_cfg->layoutSpecificSize) {
			if (k_cfg->layoutSpecificSize > 10000) {
				/* sanity check */
				RF_Free(k_cfg, sizeof(RF_Config_t));
				return (EINVAL);
			}
			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
			    (u_char *));
			if (specific_buf == NULL) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				return (ENOMEM);
			}
			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
			    k_cfg->layoutSpecificSize);
			if (retcode) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				RF_Free(specific_buf,
					k_cfg->layoutSpecificSize);
				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
					retcode));
				return (retcode);
			}
		} else
			specific_buf = NULL;
		k_cfg->layoutSpecific = specific_buf;

		/* should do some kind of sanity check on the configuration.
		 * Store the sum of all the bytes in the last byte? */

		/* configure the system */

		/*
		 * Clear the entire RAID descriptor, just to make sure
		 *  there is no stale data left in the case of a
		 *  reconfiguration
		 */
		memset(raidPtr, 0, sizeof(*raidPtr));
		raidPtr->softc = rs;
		raidPtr->raidid = unit;

		retcode = rf_Configure(raidPtr, k_cfg, NULL);

		if (retcode == 0) {

			/* allow this many simultaneous IO's to
			   this RAID device */
			raidPtr->openings = RAIDOUTSTANDING;

			raidinit(rs);
			rf_markalldirty(raidPtr);
		}
		/* free the buffers.  No return code here. */
		if (k_cfg->layoutSpecificSize) {
			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		}
		RF_Free(k_cfg, sizeof(RF_Config_t));

		return (retcode);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((error = raidlock(rs)) != 0)
			return (error);

		/*
		 * Refuse if any partition other than the one the ioctl
		 * arrived on is open, or if the target partition is open
		 * both block- and character-wise.
		 */
		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
			(rs->sc_dkdev.dk_copenmask & pmask)))
			retcode = EBUSY;
		else {
			rs->sc_flags |= RAIDF_SHUTDOWN;
			rs->sc_dkdev.dk_copenmask &= ~pmask;
			rs->sc_dkdev.dk_bopenmask &= ~pmask;
			rs->sc_dkdev.dk_openmask &= ~pmask;
			retcode = 0;
		}

		raidunlock(rs);

		if (retcode != 0)
			return retcode;

		/* free the pseudo device attach bits */

		cf = device_cfdata(rs->sc_dev);
		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
			free(cf, M_RAIDFRAME);

		return (retcode);
	case RAIDFRAME_GET_COMPONENT_LABEL:
		clabel_ptr = (RF_ComponentLabel_t **) data;
		/* need to read the component label for the disk indicated
		   by row,column in clabel */

		/*
		 * Perhaps there should be an option to skip the in-core
		 * copy and hit the disk, as with disklabel(8).
		 */
		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));

		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));

		if (retcode) {
			RF_Free(clabel, sizeof(*clabel));
			return retcode;
		}

		clabel->row = 0; /* Don't allow looking at anything else.*/

		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol +
		    raidPtr->numSpare)) {
			RF_Free(clabel, sizeof(*clabel));
			return EINVAL;
		}

		/* The scratch copy was only needed for the column check;
		   the result comes from the in-core label. */
		RF_Free(clabel, sizeof(*clabel));

		clabel = raidget_component_label(raidPtr, column);

		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));

#if 0
	case RAIDFRAME_SET_COMPONENT_LABEL:
		clabel = (RF_ComponentLabel_t *) data;

		/* XXX check the label for valid stuff... */
		/* Note that some things *should not* get modified --
		   the user should be re-initing the labels instead of
		   trying to patch things.
		   */

		raidid = raidPtr->raidid;
#ifdef DEBUG
		printf("raid%d: Got component label:\n", raidid);
		printf("raid%d: Version: %d\n", raidid, clabel->version);
		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
		printf("raid%d: Column: %d\n", raidid, clabel->column);
		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
		printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif
		clabel->row = 0;
		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* XXX this isn't allowed to do anything for now :-) */

		/* XXX and before it is, we need to fill in the rest
		   of the fields!?!?!?! */
		memcpy(raidget_component_label(raidPtr, column),
		    clabel, sizeof(*clabel));
		raidflush_component_label(raidPtr, column);
		return (0);
#endif

	case RAIDFRAME_INIT_LABELS:
		clabel = (RF_ComponentLabel_t *) data;
		/*
		   we only want the serial number from
		   the above.  We get all the rest of the information
		   from the config that was used to create this RAID
		   set.
		   */

		raidPtr->serial_number = clabel->serial_number;

		for(column=0;column<raidPtr->numCol;column++) {
			diskPtr = &raidPtr->Disks[column];
			if (!RF_DEAD_DISK(diskPtr->status)) {
				ci_label = raidget_component_label(raidPtr,
				    column);
				/* Zeroing this is important. */
				memset(ci_label, 0, sizeof(*ci_label));
				raid_init_component_label(raidPtr, ci_label);
				ci_label->serial_number =
				    raidPtr->serial_number;
				ci_label->row = 0; /* we dont' pretend to support more */
				rf_component_label_set_partitionsize(ci_label,
				    diskPtr->partitionSize);
				ci_label->column = column;
				raidflush_component_label(raidPtr, column);
			}
			/* XXXjld what about the spares? */
		}

		return (retcode);
	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return(0);
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return(EINVAL);
		}

		/* The rewrite runs asynchronously in its own kthread. */
		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
					   rf_RewriteParityThread,
					   raidPtr,"raid_parity");
		return (retcode);


	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
		retcode = rf_add_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* Not implemented; returns the (zero) retcode. */
		return(retcode);

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_delete_component(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_incorporate_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REBUILD_IN_PLACE:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->recon_in_progress == 1) {
			/* a reconstruct is already in progress! */
			return(EINVAL);
		}

		componentPtr = (RF_SingleComponent_t *) data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		component.row = 0; /* we don't support any more */
		column = component.column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* Validate the component's state under the array mutex
		   before committing to a rebuild. */
		rf_lock_mutex2(raidPtr->mutex);
		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant!!! */
			/* some component other than this has failed.
			   Let's not make things worse than they already
			   are... */
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:     Col: %d   Too many failures.\n",
			       raidPtr->raidid, column);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status ==
		    rf_ds_reconstructing) {
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);

			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status == rf_ds_spared) {
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* The recon thread owns and frees this request copy. */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);

		rrcopy->raidPtr = (void *) raidPtr;
		rrcopy->col = column;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconstructInPlaceThread,
					   rrcopy,"raid_reconip");
		return(retcode);

	case RAIDFRAME_GET_INFO:
		if (!raidPtr->valid)
			return (ENODEV);
		ucfgp = (RF_DeviceConfig_t **) data;
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return (ENOMEM);
		d_cfg->rows = 1; /* there is only 1 row now */
		d_cfg->cols = raidPtr->numCol;
		d_cfg->ndevs = raidPtr->numCol;
		if (d_cfg->ndevs >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->nspares = raidPtr->numSpare;
		if (d_cfg->nspares >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
		d = 0;
		for (j = 0; j < d_cfg->cols; j++) {
			d_cfg->devs[d] = raidPtr->Disks[j];
			d++;
		}
		/* Spares live in Disks[] just past the data columns. */
		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
			d_cfg->spares[i] = raidPtr->Disks[j];
		}
		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

		return (retcode);

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return (0);

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map,
		    (struct rf_pmstat *)data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
			(struct rf_pmparams *)data, 1))
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return (0);

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return (0);

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return (0);

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return (0);

		/* fail a disk & optionally start reconstruction */
	case RAIDFRAME_FAIL_DISK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		rr = (struct rf_recon_req *) data;
		rr->row = 0;
		if (rr->col < 0 || rr->col >= raidPtr->numCol)
			return (EINVAL);


		rf_lock_mutex2(raidPtr->mutex);
		if (raidPtr->status == rf_rs_reconstructing) {
			/* you can't fail a disk while we're reconstructing! */
			/* XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if ((raidPtr->Disks[rr->col].status ==
		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
			/* some other component has failed.  Let's not make
			   things worse. XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
			/* Can't fail a spared disk! */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* make a copy of the recon request so that we don't rely on
		 * the user's buffer */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);
		memcpy(rrcopy, rr, sizeof(*rr));
		rrcopy->raidPtr = (void *) raidPtr;

		/* NOTE(review): the RF_CREATE_THREAD result is stored in
		   retcode but 0 is returned unconditionally here, unlike
		   the other thread-creating cases — confirm intent. */
		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconThread,
					   rrcopy,"raid_recon");
		return (0);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
					   rf_CopybackThread,
					   raidPtr,"raid_copyback");
		return (retcode);

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->status != rf_rs_reconstructing)
			*(int *) data = 100;
		else {
			if (raidPtr->reconControl->numRUsTotal > 0) {
				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			} else {
				*(int *) data = 0;
			}
		}
		return (0);
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->status != rf_rs_reconstructing) {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		} else {
			progressInfo.total =
				raidPtr->reconControl->numRUsTotal;
			progressInfo.completed =
				raidPtr->reconControl->numRUsComplete;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->parity_rewrite_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->parity_rewrite_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->copyback_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->copyback_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table     -- XXX */
#if 0
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return (0);

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the dameon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		/* NOTE(review): rf_broadcast_conf2 looks like a typo for
		   rf_broadcast_cond2; harmless while this is under #if 0,
		   but fix before re-enabling. */
		rf_broadcast_conf2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return (0);

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return (retcode);
#endif

	default:
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
	if (error != EPASSTHROUGH)
		return (error);

	switch (cmd) {
	case DIOCGDINFO:
		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
		break;
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCGDINFO:
		newlabel = *(rs->sc_dkdev.dk_label);
		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
			return ENOTTY;
		memcpy(data, &newlabel, sizeof (struct olddisklabel));
		break;
#endif

	case DIOCGPART:
		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
		((struct partinfo *) data)->part =
		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
		break;

	case DIOCWDINFO:
	case DIOCSDINFO:
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCWDINFO:
	case ODIOCSDINFO:
#endif
	{
		struct disklabel *lp;
#ifdef __HAVE_OLD_DISKLABEL
		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
			memset(&newlabel, 0, sizeof newlabel);
			memcpy(&newlabel, data, sizeof (struct olddisklabel));
			lp = &newlabel;
		} else
#endif
		lp = (struct disklabel *)data;

		if ((error = raidlock(rs)) != 0)
			return (error);

		rs->sc_flags |= RAIDF_LABELLING;

		error = setdisklabel(rs->sc_dkdev.dk_label,
		    lp, 0, rs->sc_dkdev.dk_cpulabel);
		if (error == 0) {
			/* Only the "W" variants persist the label to disk. */
			if (cmd == DIOCWDINFO
#ifdef __HAVE_OLD_DISKLABEL
			    || cmd == ODIOCWDINFO
#endif
			   )
				error = writedisklabel(RAIDLABELDEV(dev),
				    raidstrategy, rs->sc_dkdev.dk_label,
				    rs->sc_dkdev.dk_cpulabel);
		}
		rs->sc_flags &= ~RAIDF_LABELLING;

		raidunlock(rs);

		if (error)
			return (error);
		break;
	}

	case DIOCWLABEL:
		if (*(int *) data != 0)
			rs->sc_flags |= RAIDF_WLABEL;
		else
			rs->sc_flags &= ~RAIDF_WLABEL;
		break;

	case DIOCGDEFLABEL:
		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
		break;

#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCGDEFLABEL:
		raidgetdefaultlabel(raidPtr, rs, &newlabel);
		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
			return ENOTTY;
		memcpy(data, &newlabel, sizeof (struct olddisklabel));
		break;
#endif

	case DIOCAWEDGE:
	case DIOCDWEDGE:
	    	dkw = (void *)data;

		/* If the ioctl happens here, the parent is us. */
		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);

	case DIOCLWEDGES:
		return dkwedge_list(&rs->sc_dkdev,
		    (struct dkwedge_list *)data, l);
	case DIOCCACHESYNC:
		return rf_sync_component_caches(raidPtr);

	case DIOCGSTRATEGY:
	    {
		struct disk_strategy *dks = (void *)data;

		s = splbio();
		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
		    sizeof(dks->dks_name));
		splx(s);
		dks->dks_paramlen = 0;

		return 0;
	    }

	case DIOCSSTRATEGY:
	    {
		struct disk_strategy *dks = (void *)data;
		struct bufq_state *new;
		struct bufq_state *old;

		if (dks->dks_param != NULL) {
			return EINVAL;
		}
		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
		error = bufq_alloc(&new, dks->dks_name,
		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
		if (error) {
			return error;
		}
		/* Swap in the new queue at splbio so no I/O is lost. */
		s = splbio();
		old = rs->buf_queue;
		bufq_move(new, old);
		rs->buf_queue = new;
		splx(s);
		bufq_free(old);

		return 0;
	    }

	default:
		retcode = ENOTTY;
	}
	return (retcode);

}
1951 
1952 
1953 /* raidinit -- complete the rest of the initialization for the
1954    RAIDframe device.  */
1955 
1956 
1957 static void
1958 raidinit(struct raid_softc *rs)
1959 {
1960 	cfdata_t cf;
1961 	int     unit;
1962 	RF_Raid_t *raidPtr = &rs->sc_r;
1963 
1964 	unit = raidPtr->raidid;
1965 
1966 
1967 	/* XXX should check return code first... */
1968 	rs->sc_flags |= RAIDF_INITED;
1969 
1970 	/* XXX doesn't check bounds. */
1971 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
1972 
1973 	/* attach the pseudo device */
1974 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1975 	cf->cf_name = raid_cd.cd_name;
1976 	cf->cf_atname = raid_cd.cd_name;
1977 	cf->cf_unit = unit;
1978 	cf->cf_fstate = FSTATE_STAR;
1979 
1980 	rs->sc_dev = config_attach_pseudo(cf);
1981 
1982 	if (rs->sc_dev == NULL) {
1983 		printf("raid%d: config_attach_pseudo failed\n",
1984 		    raidPtr->raidid);
1985 		rs->sc_flags &= ~RAIDF_INITED;
1986 		free(cf, M_RAIDFRAME);
1987 		return;
1988 	}
1989 
1990 	/* disk_attach actually creates space for the CPU disklabel, among
1991 	 * other things, so it's critical to call this *BEFORE* we try putzing
1992 	 * with disklabels. */
1993 
1994 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1995 	disk_attach(&rs->sc_dkdev);
1996 	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
1997 
1998 	/* XXX There may be a weird interaction here between this, and
1999 	 * protectedSectors, as used in RAIDframe.  */
2000 
2001 	rs->sc_size = raidPtr->totalSectors;
2002 
2003 	dkwedge_discover(&rs->sc_dkdev);
2004 
2005 	rf_set_geometry(rs, raidPtr);
2006 
2007 }
2008 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
2009 /* wake up the daemon & tell it to get us a spare table
2010  * XXX
2011  * the entries in the queues should be tagged with the raidPtr
2012  * so that in the extremely rare case that two recons happen at once,
2013  * we know for which device were requesting a spare table
2014  * XXX
2015  *
2016  * XXX This code is not currently used. GO
2017  */
2018 int
2019 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
2020 {
2021 	int     retcode;
2022 
2023 	rf_lock_mutex2(rf_sparet_wait_mutex);
2024 	req->next = rf_sparet_wait_queue;
2025 	rf_sparet_wait_queue = req;
2026 	rf_broadcast_cond2(rf_sparet_wait_cv);
2027 
2028 	/* mpsleep unlocks the mutex */
2029 	while (!rf_sparet_resp_queue) {
2030 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
2031 	}
2032 	req = rf_sparet_resp_queue;
2033 	rf_sparet_resp_queue = req->next;
2034 	rf_unlock_mutex2(rf_sparet_wait_mutex);
2035 
2036 	retcode = req->fcol;
2037 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
2038 					 * alloc'd */
2039 	return (retcode);
2040 }
2041 #endif
2042 
/* a wrapper around rf_DoAccess that extracts appropriate info from the
 * bp & passes it down.
 * Any calls originating in the kernel must use non-blocking I/O.
 * Do some extra sanity checking to return "appropriate" error values for
 * certain conditions (to make some standard utilities work).
 *
 * Formerly known as: rf_DoAccessKernel
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Label updates must be done without holding the raid
		 * mutex; drop it and re-take it around the call. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* Loop invariant: raidPtr->mutex is held when the condition is
	 * evaluated and released while each request is processed. */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		/* Convert from DEV_BSIZE units to RAIDframe sectors. */
		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		/* pb accounts for a trailing partial sector, if any. */
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		/* XXX the "1 ||" forces this debug path unconditionally;
		 * rf_debugKernelAccess is effectively ignored here. */
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* Reject I/O that runs off the end of the raid, or whose
		 * address arithmetic wrapped around. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* Reject transfers that are not a multiple of the sector
		 * size. */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* Consume one opening for this in-flight access. */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		/* On immediate failure, complete the buf here; on success
		 * completion happens via the I/O done path. */
		if (rc) {
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
2168 
2169 
2170 
2171 
2172 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
2173 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately through the normal I/O
		 * completion path. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up the buf for the component device; completion
		 * comes back through KernelWakeupFunc(). */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	/* Always reports success; errors surface via the callback. */
	return (0);
}
2247 /* this is the callback function associated with a I/O invoked from
2248    kernel code.
2249  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The originating request was stashed in b_private by InitBP /
	 * rf_DispatchKernelIO. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	/* iodone_lock protects the iodone queue and its condvar. */
	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error.  Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a label update from
			 * raidstart(). */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2315 
2316 
2317 /*
2318  * initialize a buf structure for doing an I/O in the kernel.
2319  */
2320 static void
2321 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2322        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2323        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2324        struct proc *b_proc)
2325 {
2326 	/* bp->b_flags       = B_PHYS | rw_flag; */
2327 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
2328 	bp->b_oflags = 0;
2329 	bp->b_cflags = 0;
2330 	bp->b_bcount = numSect << logBytesPerSector;
2331 	bp->b_bufsize = bp->b_bcount;
2332 	bp->b_error = 0;
2333 	bp->b_dev = dev;
2334 	bp->b_data = bf;
2335 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2336 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
2337 	if (bp->b_bcount == 0) {
2338 		panic("bp->b_bcount is zero in InitBP!!");
2339 	}
2340 	bp->b_proc = b_proc;
2341 	bp->b_iodone = cbFunc;
2342 	bp->b_private = cbArg;
2343 }
2344 
2345 static void
2346 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2347 		    struct disklabel *lp)
2348 {
2349 	memset(lp, 0, sizeof(*lp));
2350 
2351 	/* fabricate a label... */
2352 	lp->d_secperunit = raidPtr->totalSectors;
2353 	lp->d_secsize = raidPtr->bytesPerSector;
2354 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2355 	lp->d_ntracks = 4 * raidPtr->numCol;
2356 	lp->d_ncylinders = raidPtr->totalSectors /
2357 		(lp->d_nsectors * lp->d_ntracks);
2358 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2359 
2360 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2361 	lp->d_type = DTYPE_RAID;
2362 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2363 	lp->d_rpm = 3600;
2364 	lp->d_interleave = 1;
2365 	lp->d_flags = 0;
2366 
2367 	lp->d_partitions[RAW_PART].p_offset = 0;
2368 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2369 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2370 	lp->d_npartitions = RAW_PART + 1;
2371 
2372 	lp->d_magic = DISKMAGIC;
2373 	lp->d_magic2 = DISKMAGIC;
2374 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2375 
2376 }
2377 /*
2378  * Read the disklabel from the raid device.  If one is not present, fake one
2379  * up.
2380  */
static void
raidgetdisklabel(dev_t dev)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const char   *errstring;
	struct disklabel *lp;
	struct cpu_disklabel *clp;
	RF_Raid_t *raidPtr;

	if ((rs = raidget(unit)) == NULL)
		return;

	lp = rs->sc_dkdev.dk_label;
	clp = rs->sc_dkdev.dk_cpulabel;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = &rs->sc_r;

	/* Start from a fabricated default in case no label is on disk. */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		raidmakedisklabel(rs);
	else {
		int     i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%" PRIu32 ") != "
			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
			    lp->d_secperunit, rs->sc_size);
		/* Also warn about any partition extending past the raid. */
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				       "exceeds the size of raid (%" PRIu64 ")\n",
				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
		}
	}

}
2439 /*
2440  * Take care of things one might want to take care of in the event
2441  * that a disklabel isn't present.
2442  */
2443 static void
2444 raidmakedisklabel(struct raid_softc *rs)
2445 {
2446 	struct disklabel *lp = rs->sc_dkdev.dk_label;
2447 	db1_printf(("Making a label..\n"));
2448 
2449 	/*
2450 	 * For historical reasons, if there's no disklabel present
2451 	 * the raw partition must be marked FS_BSDFFS.
2452 	 */
2453 
2454 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2455 
2456 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2457 
2458 	lp->d_checksum = dkcksum(lp);
2459 }
2460 /*
2461  * Wait interruptibly for an exclusive lock.
2462  *
2463  * XXX
2464  * Several drivers do this; it should be abstracted and made MP-safe.
2465  * (Hmm... where have we seen this warning before :->  GO )
2466  */
2467 static int
2468 raidlock(struct raid_softc *rs)
2469 {
2470 	int     error;
2471 
2472 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2473 		rs->sc_flags |= RAIDF_WANTED;
2474 		if ((error =
2475 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2476 			return (error);
2477 	}
2478 	rs->sc_flags |= RAIDF_LOCKED;
2479 	return (0);
2480 }
2481 /*
2482  * Unlock and wake up any waiters.
2483  */
2484 static void
2485 raidunlock(struct raid_softc *rs)
2486 {
2487 
2488 	rs->sc_flags &= ~RAIDF_LOCKED;
2489 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2490 		rs->sc_flags &= ~RAIDF_WANTED;
2491 		wakeup(rs);
2492 	}
2493 }
2494 
2495 
2496 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
2497 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
2498 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
2499 
/* Byte offset of the component label area on each component. */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
2506 
2507 static daddr_t
2508 rf_component_info_size(unsigned secsize)
2509 {
2510 	daddr_t info_size;
2511 
2512 	KASSERT(secsize);
2513 	if (secsize > RF_COMPONENT_INFO_SIZE)
2514 		info_size = secsize;
2515 	else
2516 		info_size = RF_COMPONENT_INFO_SIZE;
2517 
2518 	return info_size;
2519 }
2520 
2521 static daddr_t
2522 rf_parity_map_offset(RF_Raid_t *raidPtr)
2523 {
2524 	daddr_t map_offset;
2525 
2526 	KASSERT(raidPtr->bytesPerSector);
2527 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2528 		map_offset = raidPtr->bytesPerSector;
2529 	else
2530 		map_offset = RF_COMPONENT_INFO_SIZE;
2531 	map_offset += rf_component_info_offset();
2532 
2533 	return map_offset;
2534 }
2535 
2536 static daddr_t
2537 rf_parity_map_size(RF_Raid_t *raidPtr)
2538 {
2539 	daddr_t map_size;
2540 
2541 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2542 		map_size = raidPtr->bytesPerSector;
2543 	else
2544 		map_size = RF_PARITY_MAP_SIZE;
2545 
2546 	return map_size;
2547 }
2548 
2549 int
2550 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2551 {
2552 	RF_ComponentLabel_t *clabel;
2553 
2554 	clabel = raidget_component_label(raidPtr, col);
2555 	clabel->clean = RF_RAID_CLEAN;
2556 	raidflush_component_label(raidPtr, col);
2557 	return(0);
2558 }
2559 
2560 
2561 int
2562 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2563 {
2564 	RF_ComponentLabel_t *clabel;
2565 
2566 	clabel = raidget_component_label(raidPtr, col);
2567 	clabel->clean = RF_RAID_DIRTY;
2568 	raidflush_component_label(raidPtr, col);
2569 	return(0);
2570 }
2571 
/* Read component `col's label from disk into its in-core copy. */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2581 
/* Return a pointer to component `col's in-core label (no I/O). */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2587 
/*
 * Write component `col's in-core label out to disk, stamping it with
 * the set's current mod_counter first.
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* Keep the parity map's mod count in sync with the label's. */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
2602 
2603 
/* Read a component label from the label area of the given device. */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
2613 
2614 /* ARGSUSED */
2615 static int
2616 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2617     size_t msize, daddr_t offset, daddr_t dsize)
2618 {
2619 	struct buf *bp;
2620 	const struct bdevsw *bdev;
2621 	int error;
2622 
2623 	/* XXX should probably ensure that we don't try to do this if
2624 	   someone has changed rf_protected_sectors. */
2625 
2626 	if (b_vp == NULL) {
2627 		/* For whatever reason, this component is not valid.
2628 		   Don't try to read a component label from it. */
2629 		return(EINVAL);
2630 	}
2631 
2632 	/* get a block of the appropriate size... */
2633 	bp = geteblk((int)dsize);
2634 	bp->b_dev = dev;
2635 
2636 	/* get our ducks in a row for the read */
2637 	bp->b_blkno = offset / DEV_BSIZE;
2638 	bp->b_bcount = dsize;
2639 	bp->b_flags |= B_READ;
2640  	bp->b_resid = dsize;
2641 
2642 	bdev = bdevsw_lookup(bp->b_dev);
2643 	if (bdev == NULL)
2644 		return (ENXIO);
2645 	(*bdev->d_strategy)(bp);
2646 
2647 	error = biowait(bp);
2648 
2649 	if (!error) {
2650 		memcpy(data, bp->b_data, msize);
2651 	}
2652 
2653 	brelse(bp, 0);
2654 	return(error);
2655 }
2656 
2657 
/* Synchronously write a component label to the given device. */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
2667 
2668 /* ARGSUSED */
2669 static int
2670 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2671     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2672 {
2673 	struct buf *bp;
2674 	const struct bdevsw *bdev;
2675 	int error;
2676 
2677 	/* get a block of the appropriate size... */
2678 	bp = geteblk((int)dsize);
2679 	bp->b_dev = dev;
2680 
2681 	/* get our ducks in a row for the write */
2682 	bp->b_blkno = offset / DEV_BSIZE;
2683 	bp->b_bcount = dsize;
2684 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2685  	bp->b_resid = dsize;
2686 
2687 	memset(bp->b_data, 0, dsize);
2688 	memcpy(bp->b_data, data, msize);
2689 
2690 	bdev = bdevsw_lookup(bp->b_dev);
2691 	if (bdev == NULL)
2692 		return (ENXIO);
2693 	(*bdev->d_strategy)(bp);
2694 	if (asyncp)
2695 		return 0;
2696 	error = biowait(bp);
2697 	brelse(bp, 0);
2698 	if (error) {
2699 #if 1
2700 		printf("Failed to write RAID component info!\n");
2701 #endif
2702 	}
2703 
2704 	return(error);
2705 }
2706 
/*
 * Write the given parity map to the parity map area of every live
 * component.  Write errors are currently ignored (see XXX below).
 */
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
2724 
/*
 * Read the parity map from every live component and merge them into
 * *map: the first live component's map is copied, subsequent ones are
 * folded in via rf_paritymap_merge().
 */
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* NOTE(review): read errors are not checked here; tmp
		 * would then be merged uninitialized/stale. */
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			rf_paritymap_merge(map, &tmp);
		}
	}
}
2749 
/*
 * Bump the set's mod_counter and mark every live component (and every
 * in-use spare) dirty on disk.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare is standing in
			 * for.  NOTE(review): scol stays -1 (or keeps
			 * its previous value) if no match is found. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2809 
2810 
/*
 * Bump the set's mod_counter and rewrite the component labels of all
 * optimal components and in-use spares.  If `final' is
 * RF_FINAL_COMPONENT_UPDATE and parity is known clean, also set the
 * clean bit on each label.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare replaced.
			 * NOTE(review): scol stays -1 if no match. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2885 
2886 void
2887 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2888 {
2889 
2890 	if (vp != NULL) {
2891 		if (auto_configured == 1) {
2892 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2893 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2894 			vput(vp);
2895 
2896 		} else {
2897 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2898 		}
2899 	}
2900 }
2901 
2902 
2903 void
2904 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2905 {
2906 	int r,c;
2907 	struct vnode *vp;
2908 	int acd;
2909 
2910 
2911 	/* We take this opportunity to close the vnodes like we should.. */
2912 
2913 	for (c = 0; c < raidPtr->numCol; c++) {
2914 		vp = raidPtr->raid_cinfo[c].ci_vp;
2915 		acd = raidPtr->Disks[c].auto_configured;
2916 		rf_close_component(raidPtr, vp, acd);
2917 		raidPtr->raid_cinfo[c].ci_vp = NULL;
2918 		raidPtr->Disks[c].auto_configured = 0;
2919 	}
2920 
2921 	for (r = 0; r < raidPtr->numSpare; r++) {
2922 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2923 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2924 		rf_close_component(raidPtr, vp, acd);
2925 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2926 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2927 	}
2928 }
2929 
2930 
/*
 * Kernel thread body: fail the requested component (optionally kicking
 * off reconstruction), then exit.  Frees `req'.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* RF_FDFLAGS_RECON selects fail-with-reconstruction. */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2952 
/*
 * Kernel thread body: rewrite all parity, then mark the set clean on
 * success and wake anyone waiting for shutdown.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2983 
2984 
/*
 * Kernel thread body: copy reconstructed data from spares back onto
 * replaced components, then exit.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2999 
3000 
/*
 * Kernel thread body: reconstruct the given column in place, then
 * exit.  Frees `req'.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
3018 
/*
 * Try to read a component label from (dev, vp).  If one is found and
 * looks reasonable, prepend a new RF_AutoConfig_t to ac_list and keep
 * the vnode open; otherwise close/release the vnode.  Returns the
 * (possibly extended) list, or NULL after freeing the whole list on
 * allocation failure.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* Out of memory: tear down everything collected so
		     * far.  NOTE(review): the vnodes referenced by the
		     * freed entries are not closed here. */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: free the label and release the vnode we were
		 * handed, since the caller expects us to own it. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
3076 
/*
 * Scan every disk-class device in the system for RAIDframe components
 * and return a list of RF_AutoConfig_t structures describing each one
 * found.  Three sources are checked per disk: dk(4) wedges of type
 * DKW_PTYPE_RAIDFRAME, disklabel partitions of type FS_RAID, and --
 * when neither yields a component -- the raw partition itself.
 *
 * Returns the (possibly empty) list; never NULL-dereferences, but
 * panics if a vnode cannot be allocated for a block device.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		/* wedges (dk) are whole devices; everything else uses
		   the conventional raw partition */
		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				/* NOTE(review): closed with FREAD|FWRITE
				   although opened FREAD -- confirm this is
				   intentional for block devices */
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component takes ownership of vp on the
			   paths where it keeps the component */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
3258 
3259 
3260 int
3261 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3262 {
3263 
3264 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3265 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3266 	    ((clabel->clean == RF_RAID_CLEAN) ||
3267 	     (clabel->clean == RF_RAID_DIRTY)) &&
3268 	    clabel->row >=0 &&
3269 	    clabel->column >= 0 &&
3270 	    clabel->num_rows > 0 &&
3271 	    clabel->num_columns > 0 &&
3272 	    clabel->row < clabel->num_rows &&
3273 	    clabel->column < clabel->num_columns &&
3274 	    clabel->blockSize > 0 &&
3275 	    /*
3276 	     * numBlocksHi may contain garbage, but it is ok since
3277 	     * the type is unsigned.  If it is really garbage,
3278 	     * rf_fix_old_label_size() will fix it.
3279 	     */
3280 	    rf_component_label_numblocks(clabel) > 0) {
3281 		/*
3282 		 * label looks reasonable enough...
3283 		 * let's make sure it has no old garbage.
3284 		 */
3285 		if (numsecs)
3286 			rf_fix_old_label_size(clabel, numsecs);
3287 		return(1);
3288 	}
3289 	return(0);
3290 }
3291 
3292 
3293 /*
3294  * For reasons yet unknown, some old component labels have garbage in
3295  * the newer numBlocksHi region, and this causes lossage.  Since those
3296  * disks will also have numsecs set to less than 32 bits of sectors,
3297  * we can determine when this corruption has occurred, and fix it.
3298  *
3299  * The exact same problem, with the same unknown reason, happens to
3300  * the partitionSizeHi member as well.
3301  */
3302 static void
3303 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3304 {
3305 
3306 	if (numsecs < ((uint64_t)1 << 32)) {
3307 		if (clabel->numBlocksHi) {
3308 			printf("WARNING: total sectors < 32 bits, yet "
3309 			       "numBlocksHi set\n"
3310 			       "WARNING: resetting numBlocksHi to zero.\n");
3311 			clabel->numBlocksHi = 0;
3312 		}
3313 
3314 		if (clabel->partitionSizeHi) {
3315 			printf("WARNING: total sectors < 32 bits, yet "
3316 			       "partitionSizeHi set\n"
3317 			       "WARNING: resetting partitionSizeHi to zero.\n");
3318 			clabel->partitionSizeHi = 0;
3319 		}
3320 	}
3321 }
3322 
3323 
3324 #ifdef DEBUG
/*
 * DEBUG helper: pretty-print the contents of a component label to the
 * console.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* root_partition is masked with 3 below, so index 3 catches
	   any out-of-range value */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
3356 #endif
3357 
3358 RF_ConfigSet_t *
3359 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3360 {
3361 	RF_AutoConfig_t *ac;
3362 	RF_ConfigSet_t *config_sets;
3363 	RF_ConfigSet_t *cset;
3364 	RF_AutoConfig_t *ac_next;
3365 
3366 
3367 	config_sets = NULL;
3368 
3369 	/* Go through the AutoConfig list, and figure out which components
3370 	   belong to what sets.  */
3371 	ac = ac_list;
3372 	while(ac!=NULL) {
3373 		/* we're going to putz with ac->next, so save it here
3374 		   for use at the end of the loop */
3375 		ac_next = ac->next;
3376 
3377 		if (config_sets == NULL) {
3378 			/* will need at least this one... */
3379 			config_sets = (RF_ConfigSet_t *)
3380 				malloc(sizeof(RF_ConfigSet_t),
3381 				       M_RAIDFRAME, M_NOWAIT);
3382 			if (config_sets == NULL) {
3383 				panic("rf_create_auto_sets: No memory!");
3384 			}
3385 			/* this one is easy :) */
3386 			config_sets->ac = ac;
3387 			config_sets->next = NULL;
3388 			config_sets->rootable = 0;
3389 			ac->next = NULL;
3390 		} else {
3391 			/* which set does this component fit into? */
3392 			cset = config_sets;
3393 			while(cset!=NULL) {
3394 				if (rf_does_it_fit(cset, ac)) {
3395 					/* looks like it matches... */
3396 					ac->next = cset->ac;
3397 					cset->ac = ac;
3398 					break;
3399 				}
3400 				cset = cset->next;
3401 			}
3402 			if (cset==NULL) {
3403 				/* didn't find a match above... new set..*/
3404 				cset = (RF_ConfigSet_t *)
3405 					malloc(sizeof(RF_ConfigSet_t),
3406 					       M_RAIDFRAME, M_NOWAIT);
3407 				if (cset == NULL) {
3408 					panic("rf_create_auto_sets: No memory!");
3409 				}
3410 				cset->ac = ac;
3411 				ac->next = NULL;
3412 				cset->next = config_sets;
3413 				cset->rootable = 0;
3414 				config_sets = cset;
3415 			}
3416 		}
3417 		ac = ac_next;
3418 	}
3419 
3420 
3421 	return(config_sets);
3422 }
3423 
3424 static int
3425 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3426 {
3427 	RF_ComponentLabel_t *clabel1, *clabel2;
3428 
3429 	/* If this one matches the *first* one in the set, that's good
3430 	   enough, since the other members of the set would have been
3431 	   through here too... */
3432 	/* note that we are not checking partitionSize here..
3433 
3434 	   Note that we are also not checking the mod_counters here.
3435 	   If everything else matches except the mod_counter, that's
3436 	   good enough for this test.  We will deal with the mod_counters
3437 	   a little later in the autoconfiguration process.
3438 
3439 	    (clabel1->mod_counter == clabel2->mod_counter) &&
3440 
3441 	   The reason we don't check for this is that failed disks
3442 	   will have lower modification counts.  If those disks are
3443 	   not added to the set they used to belong to, then they will
3444 	   form their own set, which may result in 2 different sets,
3445 	   for example, competing to be configured at raid0, and
3446 	   perhaps competing to be the root filesystem set.  If the
3447 	   wrong ones get configured, or both attempt to become /,
3448 	   weird behaviour and or serious lossage will occur.  Thus we
3449 	   need to bring them into the fold here, and kick them out at
3450 	   a later point.
3451 
3452 	*/
3453 
3454 	clabel1 = cset->ac->clabel;
3455 	clabel2 = ac->clabel;
3456 	if ((clabel1->version == clabel2->version) &&
3457 	    (clabel1->serial_number == clabel2->serial_number) &&
3458 	    (clabel1->num_rows == clabel2->num_rows) &&
3459 	    (clabel1->num_columns == clabel2->num_columns) &&
3460 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
3461 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3462 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3463 	    (clabel1->parityConfig == clabel2->parityConfig) &&
3464 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3465 	    (clabel1->blockSize == clabel2->blockSize) &&
3466 	    rf_component_label_numblocks(clabel1) ==
3467 	    rf_component_label_numblocks(clabel2) &&
3468 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
3469 	    (clabel1->root_partition == clabel2->root_partition) &&
3470 	    (clabel1->last_unit == clabel2->last_unit) &&
3471 	    (clabel1->config_order == clabel2->config_order)) {
3472 		/* if it get's here, it almost *has* to be a match */
3473 	} else {
3474 		/* it's not consistent with somebody in the set..
3475 		   punt */
3476 		return(0);
3477 	}
3478 	/* all was fine.. it must fit... */
3479 	return(1);
3480 }
3481 
/*
 * Decide whether config set cset still has enough live components to
 * be configured.  Returns 1 if the set is usable, 0 if too many
 * components are missing or stale.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set.
	   The highest mod_counter among the members is the current one;
	   members with a lower count are stale (failed) components. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* for each column, look for a current (mod_counter matches)
	   component filling it */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd (second) half of a
				   mirror pair without bailing.. reset the
				   even_pair_failed flag, and go on to the
				   next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate
	   exactly one.  (RAID 1 was handled pairwise above.) */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3584 
3585 void
3586 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3587 			RF_Raid_t *raidPtr)
3588 {
3589 	RF_ComponentLabel_t *clabel;
3590 	int i;
3591 
3592 	clabel = ac->clabel;
3593 
3594 	/* 1. Fill in the common stuff */
3595 	config->numRow = clabel->num_rows = 1;
3596 	config->numCol = clabel->num_columns;
3597 	config->numSpare = 0; /* XXX should this be set here? */
3598 	config->sectPerSU = clabel->sectPerSU;
3599 	config->SUsPerPU = clabel->SUsPerPU;
3600 	config->SUsPerRU = clabel->SUsPerRU;
3601 	config->parityConfig = clabel->parityConfig;
3602 	/* XXX... */
3603 	strcpy(config->diskQueueType,"fifo");
3604 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3605 	config->layoutSpecificSize = 0; /* XXX ?? */
3606 
3607 	while(ac!=NULL) {
3608 		/* row/col values will be in range due to the checks
3609 		   in reasonable_label() */
3610 		strcpy(config->devnames[0][ac->clabel->column],
3611 		       ac->devname);
3612 		ac = ac->next;
3613 	}
3614 
3615 	for(i=0;i<RF_MAXDBGV;i++) {
3616 		config->debugVars[i][0] = 0;
3617 	}
3618 }
3619 
3620 int
3621 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3622 {
3623 	RF_ComponentLabel_t *clabel;
3624 	int column;
3625 	int sparecol;
3626 
3627 	raidPtr->autoconfigure = new_value;
3628 
3629 	for(column=0; column<raidPtr->numCol; column++) {
3630 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3631 			clabel = raidget_component_label(raidPtr, column);
3632 			clabel->autoconfigure = new_value;
3633 			raidflush_component_label(raidPtr, column);
3634 		}
3635 	}
3636 	for(column = 0; column < raidPtr->numSpare ; column++) {
3637 		sparecol = raidPtr->numCol + column;
3638 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3639 			clabel = raidget_component_label(raidPtr, sparecol);
3640 			clabel->autoconfigure = new_value;
3641 			raidflush_component_label(raidPtr, sparecol);
3642 		}
3643 	}
3644 	return(new_value);
3645 }
3646 
3647 int
3648 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3649 {
3650 	RF_ComponentLabel_t *clabel;
3651 	int column;
3652 	int sparecol;
3653 
3654 	raidPtr->root_partition = new_value;
3655 	for(column=0; column<raidPtr->numCol; column++) {
3656 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3657 			clabel = raidget_component_label(raidPtr, column);
3658 			clabel->root_partition = new_value;
3659 			raidflush_component_label(raidPtr, column);
3660 		}
3661 	}
3662 	for(column = 0; column < raidPtr->numSpare ; column++) {
3663 		sparecol = raidPtr->numCol + column;
3664 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3665 			clabel = raidget_component_label(raidPtr, sparecol);
3666 			clabel->root_partition = new_value;
3667 			raidflush_component_label(raidPtr, sparecol);
3668 		}
3669 	}
3670 	return(new_value);
3671 }
3672 
3673 void
3674 rf_release_all_vps(RF_ConfigSet_t *cset)
3675 {
3676 	RF_AutoConfig_t *ac;
3677 
3678 	ac = cset->ac;
3679 	while(ac!=NULL) {
3680 		/* Close the vp, and give it back */
3681 		if (ac->vp) {
3682 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3683 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
3684 			vput(ac->vp);
3685 			ac->vp = NULL;
3686 		}
3687 		ac = ac->next;
3688 	}
3689 }
3690 
3691 
3692 void
3693 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3694 {
3695 	RF_AutoConfig_t *ac;
3696 	RF_AutoConfig_t *next_ac;
3697 
3698 	ac = cset->ac;
3699 	while(ac!=NULL) {
3700 		next_ac = ac->next;
3701 		/* nuke the label */
3702 		free(ac->clabel, M_RAIDFRAME);
3703 		/* cleanup the config structure */
3704 		free(ac, M_RAIDFRAME);
3705 		/* "next.." */
3706 		ac = next_ac;
3707 	}
3708 	/* and, finally, nuke the config set */
3709 	free(cset, M_RAIDFRAME);
3710 }
3711 
3712 
/*
 * Initialize *clabel from the current state of raidPtr, so it can be
 * written out as a component label for this set.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3745 
/*
 * Configure a RAID set from the components collected in cset.
 * Returns the attached softc on success, or NULL if memory could not
 * be allocated or rf_Configure() failed.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* start at last_unit and walk forward to the first unit whose
	   softc is not already in use */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; release the unit we grabbed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3819 
3820 void
3821 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3822 {
3823 	struct buf *bp;
3824 	struct raid_softc *rs;
3825 
3826 	bp = (struct buf *)desc->bp;
3827 	rs = desc->raidPtr->softc;
3828 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
3829 	    (bp->b_flags & B_READ));
3830 }
3831 
/*
 * Initialize a pool(9) for RAIDframe structures of the given size:
 * cap cached items at xmax, pre-allocate xmin items, and keep at
 * least xmin cached (low watermark).
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3841 
3842 /*
3843  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
3844  * if there is IO pending and if that IO could possibly be done for a
3845  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
3846  * otherwise.
3847  *
3848  */
3849 
3850 int
3851 rf_buf_queue_check(RF_Raid_t *raidPtr)
3852 {
3853 	struct raid_softc *rs = raidPtr->softc;
3854 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
3855 		/* there is work to do */
3856 		return 0;
3857 	}
3858 	/* default is nothing to do */
3859 	return 1;
3860 }
3861 
3862 int
3863 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3864 {
3865 	uint64_t numsecs;
3866 	unsigned secsize;
3867 	int error;
3868 
3869 	error = getdisksize(vp, &numsecs, &secsize);
3870 	if (error == 0) {
3871 		diskPtr->blockSize = secsize;
3872 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
3873 		diskPtr->partitionSize = numsecs;
3874 		return 0;
3875 	}
3876 	return error;
3877 }
3878 
/*
 * Autoconf match function: the raid pseudo-device always matches.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3884 
/*
 * Autoconf attach function: nothing to do at attach time; the real
 * setup happens when a RAID set is configured.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
3890 
3891 
/*
 * Autoconf detach function: look up the softc for this unit and tear
 * it down while holding the softc lock.  Returns 0 on success, ENXIO
 * if there is no such unit, or the error from locking/detaching.
 */
static int
raid_detach(device_t self, int flags)
{
	int error;
	struct raid_softc *rs = raidget(device_unit(self));

	if (rs == NULL)
		return ENXIO;

	/* serialize against other users of this softc */
	if ((error = raidlock(rs)) != 0)
		return (error);

	error = raid_detach_unlocked(rs);

	raidunlock(rs);

	/* XXXkd: raidput(rs) ??? */

	return error;
}
3912 
/*
 * Fill in the disk geometry for the RAID device and push it to the
 * disk(9) layer.  Only secperunit and secsize reflect real values;
 * the sectors/tracks split appears to be a fabricated geometry
 * (4 tracks per column) -- TODO confirm nothing relies on it.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
}
3927 
3928 /*
3929  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3930  * We end up returning whatever error was returned by the first cache flush
3931  * that fails.
3932  */
3933 
3934 int
3935 rf_sync_component_caches(RF_Raid_t *raidPtr)
3936 {
3937 	int c, sparecol;
3938 	int e,error;
3939 	int force = 1;
3940 
3941 	error = 0;
3942 	for (c = 0; c < raidPtr->numCol; c++) {
3943 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
3944 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3945 					  &force, FWRITE, NOCRED);
3946 			if (e) {
3947 				if (e != ENODEV)
3948 					printf("raid%d: cache flush to component %s failed.\n",
3949 					       raidPtr->raidid, raidPtr->Disks[c].devname);
3950 				if (error == 0) {
3951 					error = e;
3952 				}
3953 			}
3954 		}
3955 	}
3956 
3957 	for( c = 0; c < raidPtr->numSpare ; c++) {
3958 		sparecol = raidPtr->numCol + c;
3959 		/* Need to ensure that the reconstruct actually completed! */
3960 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3961 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3962 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
3963 			if (e) {
3964 				if (e != ENODEV)
3965 					printf("raid%d: cache flush to component %s failed.\n",
3966 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3967 				if (error == 0) {
3968 					error = e;
3969 				}
3970 			}
3971 		}
3972 	}
3973 	return error;
3974 }
3975